In [1]:
import pandas as pd

## 1. Limpiando data

In [5]:
# Read the three CSV inputs into their DataFrames, in the order listed.
# NOTE(review): the first two file names spell "wolrcup" while the third spells
# "worldcup"; the literals are left untouched because they have to match the
# files on disk -- confirm the names.
df_data_historica, df_fixture, df_data_faltante = (
    pd.read_csv(csv_path)
    for csv_path in (
        'fifa_wolrcup_historical_data.csv',
        'fifa_wolrcup_fixture.csv',
        'fifa_worldcup_missing_data.csv',
    )
)

### 1.1 Limpiando df_fixture

In [8]:
# Remove leading/trailing whitespace from both team-name columns of the fixture.
for team_col in ('home', 'away'):
    df_fixture[team_col] = df_fixture[team_col].str.strip()

### 1.2 Limpiando df_data_faltante y agregándolo a df_data_historica

In [None]:
# Append the rows of df_data_faltante to the historical results, remove exact
# duplicate rows, and order the matches by year. The whole step is one chain
# that rebinds the name instead of mutating the frame with inplace=True.
df_data_historica = (
    pd.concat([df_data_historica, df_data_faltante], ignore_index=True)
    .drop_duplicates()
    # A stable sort keeps rows of the same year in their pre-sort order; the
    # default quicksort gives no such guarantee.
    .sort_values('year', kind='stable')
)

### 1.3 Limpiando df_data_historica

In [17]:
# Collect the index labels of every row whose home team contains 'Sweden' and
# whose away team contains 'Austria'. Substring matching is used, so values
# carrying surrounding whitespace still match.
home_is_sweden = df_data_historica['home'].str.contains('Sweden')
away_is_austria = df_data_historica['away'].str.contains('Austria')
index_eliminar = df_data_historica[home_is_sweden & away_is_austria].index

In [16]:
# Drop the rows whose index labels were collected in index_eliminar. The result
# is rebound to the same name rather than mutating the frame with inplace=True.
df_data_historica = df_data_historica.drop(index=index_eliminar)

In [24]:
# Show the rows whose score contains any character other than a digit or the
# '–' separator. The pattern is written as a raw string so that \d reaches the
# regex engine as-is instead of relying on Python keeping an unrecognised
# string escape (which emits a warning on recent Python versions).
df_data_historica[df_data_historica['score'].str.contains(r'[^\d–]')]

In [None]:
# Strip every character that is neither a digit nor the '–' separator from the
# score column. The pattern is a raw string so that \d is passed to the regex
# engine unchanged rather than being treated as an invalid string escape.
df_data_historica['score'] = df_data_historica['score'].str.replace(r'[^\d–]', '', regex=True)

In [26]:
# Remove leading/trailing whitespace from both team-name columns.
for team_col in ('home', 'away'):
    df_data_historica[team_col] = df_data_historica[team_col].str.strip()

In [28]:
# Split the score text on the '–' separator; expand=True returns the pieces as
# separate columns, which are stored as HomeGoals and AwayGoals.
score_parts = df_data_historica['score'].str.split('–', expand=True)
df_data_historica[['HomeGoals', 'AwayGoals']] = score_parts

Unnamed: 0,home,score,away,year,HomeGoals,AwayGoals
0,France,4–1,Mexico,1930,4,1
1,Argentina,1–0,France,1930,1,0
2,Chile,3–0,Mexico,1930,3,0
3,Chile,1–0,France,1930,1,0
4,Argentina,6–3,Mexico,1930,6,3
...,...,...,...,...,...,...
860,Russia,2–2,Croatia,2018,2,2
861,France,1–0,Belgium,2018,1,0
862,Croatia,2–1,England,2018,2,1
863,Belgium,2–0,England,2018,2,0


In [29]:
# Remove the score column and display the resulting frame. The column is named
# explicitly via columns= and the result is rebound instead of using
# inplace=True.
df_data_historica = df_data_historica.drop(columns='score')
df_data_historica

Unnamed: 0,home,away,year,HomeGoals,AwayGoals
0,France,Mexico,1930,4,1
1,Argentina,France,1930,1,0
2,Chile,Mexico,1930,3,0
3,Chile,France,1930,1,0
4,Argentina,Mexico,1930,6,3
...,...,...,...,...,...
860,Russia,Croatia,2018,2,2
861,France,Belgium,2018,1,0
862,Croatia,England,2018,2,1
863,Belgium,England,2018,2,0


In [30]:
# Rename the team and year columns to the capitalised naming used by the goal
# columns. The result is rebound instead of using inplace=True.
df_data_historica = df_data_historica.rename(
    columns={'home': 'HomeTeam', 'away': 'AwayTeam', 'year': 'Year'}
)

In [31]:
# Display the frame to check the renamed columns.
df_data_historica

Unnamed: 0,HomeTeam,AwayTeam,Year,HomeGoals,AwayGoals
0,France,Mexico,1930,4,1
1,Argentina,France,1930,1,0
2,Chile,Mexico,1930,3,0
3,Chile,France,1930,1,0
4,Argentina,Mexico,1930,6,3
...,...,...,...,...,...
860,Russia,Croatia,2018,2,2
861,France,Belgium,2018,1,0
862,Croatia,England,2018,2,1
863,Belgium,England,2018,2,0


In [32]:
# Inspect the data type of each column
df_data_historica.dtypes

HomeTeam     object
AwayTeam     object
Year          int64
HomeGoals    object
AwayGoals    object
dtype: object

In [36]:
# Cast the two goal columns and Year to 64-bit integers.
int_columns = ['HomeGoals', 'AwayGoals', 'Year']
df_data_historica = df_data_historica.astype(dict.fromkeys(int_columns, 'int64'))

In [37]:
# Re-check the column data types after the cast.
df_data_historica.dtypes

HomeTeam     object
AwayTeam     object
Year          int64
HomeGoals     int64
AwayGoals     int64
dtype: object

In [40]:
# Add a TotalGoals column (home goals plus away goals) and preview the first rows.
home_goals = df_data_historica['HomeGoals']
away_goals = df_data_historica['AwayGoals']
df_data_historica['TotalGoals'] = home_goals.add(away_goals)
df_data_historica.head()

Unnamed: 0,HomeTeam,AwayTeam,Year,HomeGoals,AwayGoals,TotalGoals
0,France,Mexico,1930,4,1,5
1,Argentina,France,1930,1,0,1
2,Chile,Mexico,1930,3,0,3
3,Chile,France,1930,1,0,1
4,Argentina,Mexico,1930,6,3,9


## 2. Exportar Dataframes Limpios

In [42]:
# Write both cleaned frames to CSV without the index column.
for frame, output_path in (
    (df_data_historica, 'clean_fifa_worldcup_matches.csv'),
    (df_fixture, 'clean_fifa_worldcup_fixture.csv'),
):
    frame.to_csv(output_path, index=False)

In [43]:
# Print how many matches are recorded for each listed year.
# NOTE(review): the recorded output shows 16 matches for 1990 versus 52 for both
# 1986 and 1994, so the 1990 data looks incomplete -- confirm against the source.
# Years run every fourth year, with no entries between 1938 and 1950.
years = list(range(1930, 1939, 4)) + list(range(1950, 2019, 4))

year_column = df_data_historica['Year']
for year in years:
    print(year, year_column.eq(year).sum())

1930 18
1934 17
1938 18
1950 22
1954 26
1958 35
1962 32
1966 32
1970 32
1974 38
1978 38
1982 52
1986 52
1990 16
1994 52
1998 64
2002 64
2006 64
2010 64
2014 64
2018 64
