<h1 style="color:#ffc0cb;font-size:70px;font-family:Georgia;text-align:center;"><strong>FIFA World Cup Matches</strong></h1>

<a id="1"></a>
<h1 style="color:#ffc0cb;font-size:40px;font-family:Georgia;text-align:center;"><strong>1. Data Preparation</strong></h1>

<a id="1.1"></a>
# 1.1 Importing Necessary Libraries and datasets

In [1]:
# Install a conda package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install missingno
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install xgboost
!{sys.executable} -m pip install statsmodels
!{sys.executable} -m pip install imbalanced-learn
!{sys.executable} -m pip install category_encoders


# work with data in tabular representation
from datetime import time
import pandas as pd
# round the data in the correlation matrix
import numpy as np
import os


# Modules for data visualization
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
# encoding
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

plt.rcParams['figure.figsize'] = [6, 6]

# Ensure that our plots are shown and embedded within the Jupyter notebook itself. Without this command, sometimes plots may show up in pop-up windows
%matplotlib inline

# overwrite the style of all the matplotlib graphs
sns.set()

# ignore DeprecationWarning Error Messages
import warnings
warnings.filterwarnings('ignore')



In [2]:
# check the version of the packages
print("Numpy version: ", np.__version__)
print("Pandas version: ",pd.__version__)
! python --version

Numpy version:  1.20.3
Pandas version:  1.3.4
Python 3.9.7


<a id="1.2"></a>
# Data Retrieving
***
In order to load data properly, the data in csv file have to be examined carefully. First of all, all the categories are seperated by the "," and strip the extra-whitespaces at the begin by setting "skipinitialspace = True".

In [3]:
%time

# set the path of the external data from the third party source - Kaggle
external_data_path = os.path.join(os.path.pardir, '', 'data','external')
WorldCupMatches = os.path.join(external_data_path, 'WorldCupMatches.csv')


# import dataset
df = pd.read_csv(WorldCupMatches, delimiter=',', skipinitialspace = True)

print("The shape of the ORGINAL data is (row, column):", str(df.shape))
df

Wall time: 0 ns
The shape of the ORGINAL data is (row, column): (4572, 20)


Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
0,1930.0,13 Jul 1930 - 15:00,Group 1,Pocitos,Montevideo,France,4.0,1.0,Mexico,,4444.0,3.0,0.0,LOMBARDI Domingo (URU),CRISTOPHE Henry (BEL),REGO Gilberto (BRA),201.0,1096.0,FRA,MEX
1,1930.0,13 Jul 1930 - 15:00,Group 4,Parque Central,Montevideo,USA,3.0,0.0,Belgium,,18346.0,2.0,0.0,MACIAS Jose (ARG),MATEUCCI Francisco (URU),WARNKEN Alberto (CHI),201.0,1090.0,USA,BEL
2,1930.0,14 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Yugoslavia,2.0,1.0,Brazil,,24059.0,2.0,0.0,TEJADA Anibal (URU),VALLARINO Ricardo (URU),BALWAY Thomas (FRA),201.0,1093.0,YUG,BRA
3,1930.0,14 Jul 1930 - 14:50,Group 3,Pocitos,Montevideo,Romania,3.0,1.0,Peru,,2549.0,1.0,0.0,WARNKEN Alberto (CHI),LANGENUS Jean (BEL),MATEUCCI Francisco (URU),201.0,1098.0,ROU,PER
4,1930.0,15 Jul 1930 - 16:00,Group 1,Parque Central,Montevideo,Argentina,1.0,0.0,France,,23409.0,0.0,0.0,REGO Gilberto (BRA),SAUCEDO Ulises (BOL),RADULESCU Constantin (ROU),201.0,1085.0,ARG,FRA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567,,,,,,,,,,,,,,,,,,,,
4568,,,,,,,,,,,,,,,,,,,,
4569,,,,,,,,,,,,,,,,,,,,
4570,,,,,,,,,,,,,,,,,,,,


<a id="1.3"></a>
## Rename column
***
Since these columns are not in SQL naming convention will be more straightforward in later process. We will also analyse the column meaning

In [4]:
df.columns

Index(['Year', 'Datetime', 'Stage', 'Stadium', 'City', 'Home Team Name',
       'Home Team Goals', 'Away Team Goals', 'Away Team Name',
       'Win conditions', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Referee', 'Assistant 1', 'Assistant 2',
       'RoundID', 'MatchID', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [5]:
df.columns = ['year', 'date_time', 'stage', 'stadium', 'city', 'home_team_name',
       'home_team_goals', 'away_team_goals', 'away_team_name',
       'win_conditions', 'attendance', 'half_time_home_goals',
       'half_time_away_goals', 'referee', 'assistant_1', 'assistant_2',
       'round_id', 'match_id', 'home_team_initials', 'away_team_initials']
df.head(3)

Unnamed: 0,year,date_time,stage,stadium,city,home_team_name,home_team_goals,away_team_goals,away_team_name,win_conditions,attendance,half_time_home_goals,half_time_away_goals,referee,assistant_1,assistant_2,round_id,match_id,home_team_initials,away_team_initials
0,1930.0,13 Jul 1930 - 15:00,Group 1,Pocitos,Montevideo,France,4.0,1.0,Mexico,,4444.0,3.0,0.0,LOMBARDI Domingo (URU),CRISTOPHE Henry (BEL),REGO Gilberto (BRA),201.0,1096.0,FRA,MEX
1,1930.0,13 Jul 1930 - 15:00,Group 4,Parque Central,Montevideo,USA,3.0,0.0,Belgium,,18346.0,2.0,0.0,MACIAS Jose (ARG),MATEUCCI Francisco (URU),WARNKEN Alberto (CHI),201.0,1090.0,USA,BEL
2,1930.0,14 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Yugoslavia,2.0,1.0,Brazil,,24059.0,2.0,0.0,TEJADA Anibal (URU),VALLARINO Ricardo (URU),BALWAY Thomas (FRA),201.0,1093.0,YUG,BRA


<a id="2"></a>
<h1 style="color:#ffc0cb;font-size:40px;font-family:Georgia;text-align:center;"><strong>2. Data Cleaning</strong></h1>

<a id="2.1"></a>
## About This Dataset
***

<a id="2.2"></a>
## Data types
***

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  852 non-null    float64
 1   date_time             852 non-null    object 
 2   stage                 852 non-null    object 
 3   stadium               852 non-null    object 
 4   city                  852 non-null    object 
 5   home_team_name        852 non-null    object 
 6   home_team_goals       852 non-null    float64
 7   away_team_goals       852 non-null    float64
 8   away_team_name        852 non-null    object 
 9   win_conditions        852 non-null    object 
 10  attendance            850 non-null    float64
 11  half_time_home_goals  852 non-null    float64
 12  half_time_away_goals  852 non-null    float64
 13  referee               852 non-null    object 
 14  assistant_1           852 non-null    object 
 15  assistant_2          

<a id="2.2.1"></a>
### Format date features

In [7]:
# Now let's create new features from date column

# Cast date columns to the Date data type
df['date_time'] = pd.to_datetime(df['date_time'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   year                  852 non-null    float64       
 1   date_time             852 non-null    datetime64[ns]
 2   stage                 852 non-null    object        
 3   stadium               852 non-null    object        
 4   city                  852 non-null    object        
 5   home_team_name        852 non-null    object        
 6   home_team_goals       852 non-null    float64       
 7   away_team_goals       852 non-null    float64       
 8   away_team_name        852 non-null    object        
 9   win_conditions        852 non-null    object        
 10  attendance            850 non-null    float64       
 11  half_time_home_goals  852 non-null    float64       
 12  half_time_away_goals  852 non-null    float64       
 13  referee           

<br><br>
<a id="2.6"></a>
## Check data types & Make the data homogeneous
The dtypes that pandas uses are: `float`, `int`, `bool`, `datetime`, `timedelta`, `category` and `object`. I modify data types in my DataFrames to help me transform them into more meaningful metrics

+ Cast pandas objects to a specified dtype (string)¶
+ Numeric data should have for example the same number of digits after the point.

In [8]:
print("The dataframe BEFORE dropped has {} rows and {} columns".format(df.shape[0], df.shape[1]))

# If all values are NA, drop that row or column.
df = df. dropna()

# display missing values in descending
print("The dataframe AFTER dropped has {} rows and {} columns".format(df.shape[0], df.shape[1]))

The dataframe BEFORE dropped has 4572 rows and 20 columns
The dataframe AFTER dropped has 850 rows and 20 columns


# Save the Intermediate data that has been transformed

In [9]:
# set the path of the cleaned data to data and dash
interim_data_path = os.path.join(os.path.pardir,'data','interim')
write_interim_path = os.path.join(interim_data_path, 'WorldCupMatches.csv')


# To write the data from the data frame into a file, use the to_csv function.
df.to_csv(write_interim_path, index=False)

# df.to_csv('Dash/cleaned_data.csv', index=False)
print("Cleaned data was successfully saved!")

Cleaned data was successfully saved!


<h1 style="color:#ffc0cb;font-size:70px;font-family:Georgia;text-align:center;"><strong>FIFA World Cup Players</strong></h1>

<a id="1.2"></a>
# Data Retrieving
***
In order to load data properly, the data in csv file have to be examined carefully. First of all, all the categories are seperated by the "," and strip the extra-whitespaces at the begin by setting "skipinitialspace = True".

In [10]:
%time

# set the path of the external data from the third party source - Kaggle
external_data_path = os.path.join(os.path.pardir, '', 'data','external')
WorldCupPlayers = os.path.join(external_data_path, 'WorldCupPlayers.csv')


# import dataset
WorldCupPlayers = pd.read_csv(WorldCupPlayers, delimiter=',', skipinitialspace = True)

print("The shape of the ORGINAL data is (row, column):", str(df.shape))
WorldCupPlayers

Wall time: 0 ns
The shape of the ORGINAL data is (row, column): (850, 20)


Unnamed: 0,RoundID,MatchID,Team Initials,Coach Name,Line-up,Shirt Number,Player Name,Position,Event
0,201,1096,FRA,CAUDRON Raoul (FRA),S,0,Alex THEPOT,GK,
1,201,1096,MEX,LUQUE Juan (MEX),S,0,Oscar BONFIGLIO,GK,
2,201,1096,FRA,CAUDRON Raoul (FRA),S,0,Marcel LANGILLER,,G40'
3,201,1096,MEX,LUQUE Juan (MEX),S,0,Juan CARRENO,,G70'
4,201,1096,FRA,CAUDRON Raoul (FRA),S,0,Ernest LIBERATI,,
...,...,...,...,...,...,...,...,...,...
37779,255959,300186501,ARG,SABELLA Alejandro (ARG),N,19,ALVAREZ,,
37780,255959,300186501,GER,LOEW Joachim (GER),N,6,KHEDIRA,,
37781,255959,300186501,ARG,SABELLA Alejandro (ARG),N,20,AGUERO,,IH46' Y65'
37782,255959,300186501,GER,LOEW Joachim (GER),N,21,MUSTAFI,,


<a id="1.3"></a>
## Rename column
***
Since these columns are not in SQL naming convention will be more straightforward in later process. We will also analyse the column meaning

In [11]:
WorldCupPlayers.columns

Index(['RoundID', 'MatchID', 'Team Initials', 'Coach Name', 'Line-up',
       'Shirt Number', 'Player Name', 'Position', 'Event'],
      dtype='object')

In [12]:
WorldCupPlayers.columns = ['round_id', 'match_id', 'team_initials', 'coach_name', 'line_up',
       'shirt_number', 'player_name', 'position', 'event']
WorldCupPlayers.head(3)

Unnamed: 0,round_id,match_id,team_initials,coach_name,line_up,shirt_number,player_name,position,event
0,201,1096,FRA,CAUDRON Raoul (FRA),S,0,Alex THEPOT,GK,
1,201,1096,MEX,LUQUE Juan (MEX),S,0,Oscar BONFIGLIO,GK,
2,201,1096,FRA,CAUDRON Raoul (FRA),S,0,Marcel LANGILLER,,G40'


<a id="2"></a>
<h1 style="color:#ffc0cb;font-size:40px;font-family:Georgia;text-align:center;"><strong>2. Data Cleaning</strong></h1>

<a id="2.1"></a>
## About This Dataset
***

<a id="2.2"></a>
## Data types
***

In [13]:
WorldCupPlayers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   round_id       37784 non-null  int64 
 1   match_id       37784 non-null  int64 
 2   team_initials  37784 non-null  object
 3   coach_name     37784 non-null  object
 4   line_up        37784 non-null  object
 5   shirt_number   37784 non-null  int64 
 6   player_name    37784 non-null  object
 7   position       4143 non-null   object
 8   event          9069 non-null   object
dtypes: int64(3), object(6)
memory usage: 2.6+ MB


<br><br>
<a id="2.6"></a>
## Drop NA

In [14]:
print("The dataframe BEFORE dropped has {} rows and {} columns".format(WorldCupPlayers.shape[0], WorldCupPlayers.shape[1]))

# If all values are NA, drop that row or column.
WorldCupPlayers = WorldCupPlayers. dropna()

# display missing values in descending
print("The dataframe AFTER dropped has {} rows and {} columns".format(WorldCupPlayers.shape[0], WorldCupPlayers.shape[1]))

The dataframe BEFORE dropped has 37784 rows and 9 columns
The dataframe AFTER dropped has 573 rows and 9 columns


# Save the Intermediate data that has been transformed

In [15]:
# set the path of the cleaned data to data and dash
interim_data_path = os.path.join(os.path.pardir,'data','interim')
write_interim_path = os.path.join(interim_data_path, 'WorldCupPlayers.csv')


# To write the data from the data frame into a file, use the to_csv function.
WorldCupPlayers.to_csv(write_interim_path, index=False)

# df.to_csv('Dash/cleaned_data.csv', index=False)
print("Cleaned data was successfully saved!")

Cleaned data was successfully saved!


<a id="5"></a>
<h1 style="color:#ffc0cb;font-size:40px;font-family:Georgia;text-align:center;"><strong>8. Appendix</strong></h1>

# Pipeline - End-to-end

> Makes the pipeline faster and easier to deploy. Ex. cd `../src/models/classification.py`