
<h1 style="color:#ffc0cb;font-size:70px;font-family:Georgia;text-align:center;"><strong>National Leagues 2021-2022</strong></h1>

In [1]:
# Install the missingno package with pip into the environment of the current Jupyter kernel
import sys
!{sys.executable} -m pip install missingno


# work with data in tabular representation
from datetime import time
import pandas as pd
# round the data in the correlation matrix
import numpy as np
import os


# Modules for data visualization
import seaborn as sns
import missingno as msno

# Run the project's cleaning helper script so the names it defines are available in this notebook
%run ../../src/cleaning/function.py

# Suppress all warning messages (not only DeprecationWarning) to keep the cell output clean
import warnings
warnings.filterwarnings('ignore')



<a id="1.2"></a>
# Data Retrieving
***
In order to load the data properly, the CSV files have to be examined carefully. All fields are separated by "," and any extra whitespace at the beginning of a field is stripped by setting "skipinitialspace = True".

In [2]:
# Folder holding the third-party CSV exports for this season, relative to the notebook.
external_data_path = os.path.join(os.path.pardir, os.path.pardir, 'data', 'external', 'National Leagues', 'Season 2021-2022')


def read_league_csv(league_label):
    """Load one league's 2021-2022 CSV from ``external_data_path``.

    Parameters
    ----------
    league_label : str
        File-name stem in front of the season suffix,
        e.g. ``'E0_English Premier League'``.

    Returns
    -------
    pandas.DataFrame
        The parsed file; fields are comma separated and whitespace that
        follows a delimiter is skipped.
    """
    csv_path = os.path.join(external_data_path, f'{league_label}_2021-2022.csv')
    return pd.read_csv(csv_path, delimiter=',', skipinitialspace=True)


# Each *_df name is bound to a DataFrame only (never to an intermediate path string).
english_premier_league_df = read_league_csv('E0_English Premier League')
efl_championship_df = read_league_csv('E1_EFL Championship')
efl_league_one_df = read_league_csv('E2_EFL League One')
efl_league_two_df = read_league_csv('E3_EFL League Two')

# Print one "(row, column)" shape line per league, followed by blank lines.
shape_report = "\n".join(
    f"The shape of the {label} 2021-2022 dataframe is (row, column): {frame.shape}"
    for label, frame in (
        ('E0_English Premier League', english_premier_league_df),
        ('E1_EFL Championship', efl_championship_df),
        ('E2_EFL League One', efl_league_one_df),
        ('E3_EFL League Two', efl_league_two_df),
    )
)
print(shape_report, "\n\n")

The shape of the E0_English Premier League 2021-2022 dataframe is (row, column): (244, 106)
The shape of the E1_EFL Championship 2021-2022 dataframe is (row, column): (380, 106)
The shape of the E2_EFL League One 2021-2022 dataframe is (row, column): (385, 106)
The shape of the E3_EFL League Two 2021-2022 dataframe is (row, column): (370, 106) 




In [3]:
# Preview the first three Premier League rows. `style` is not defined in this
# notebook; presumably it comes from the helper script loaded with %run -- confirm.
style(english_premier_league_df.head(3))

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,13/08/2021,20:00,Brentford,Arsenal,2,0,H,1,0,H,M Oliver,8,22,3,4,12,8,2,5,0,0,0,0,4.0,3.4,1.95,4.0,3.5,1.95,3.8,3.4,2.05,4.05,3.46,2.05,4.0,3.4,1.9,4.1,3.4,2.0,4.62,3.72,2.1,4.02,3.43,2.02,2.1,1.72,2.22,1.73,2.26,1.83,2.16,1.73,0.5,1.86,2.07,1.88,2.06,2.05,2.08,1.87,2.03,3.8,3.25,2.05,3.8,3.3,2.05,3.8,3.25,2.1,3.94,3.33,2.13,3.9,3.0,2.05,3.9,3.25,2.1,4.2,3.5,2.18,3.89,3.28,2.1,2.37,1.57,2.44,1.62,2.47,1.75,2.33,1.62,0.5,1.75,2.05,1.81,2.13,2.05,2.17,1.8,2.09
1,E0,14/08/2021,12:30,Man United,Leeds,5,1,H,1,0,H,P Tierney,16,10,8,3,11,9,5,4,1,2,0,0,1.53,4.5,5.75,1.53,4.5,5.75,1.55,4.4,5.75,1.56,4.57,5.96,1.52,4.33,5.8,1.55,4.4,6.0,1.59,4.65,6.35,1.55,4.48,5.87,1.61,2.3,1.67,2.32,1.71,2.38,1.65,2.29,-1.0,1.95,1.98,1.96,1.96,2.0,2.01,1.93,1.96,1.61,4.2,5.25,1.62,4.1,5.25,1.65,4.2,4.9,1.67,4.2,5.4,1.57,4.2,5.5,1.65,4.1,5.25,1.71,4.33,5.8,1.64,4.19,5.22,1.66,2.2,1.7,2.27,1.75,2.37,1.67,2.25,-1.0,2.05,1.75,2.17,1.77,2.19,1.93,2.1,1.79
2,E0,14/08/2021,15:00,Burnley,Brighton,1,2,A,1,0,H,D Coote,14,14,3,8,10,7,7,6,2,1,0,0,3.1,3.1,2.45,3.2,3.1,2.4,3.15,3.05,2.45,3.3,3.12,2.51,3.2,3.0,2.45,3.13,3.1,2.45,3.33,3.2,2.6,3.19,3.09,2.49,2.5,1.53,2.56,1.56,2.56,1.63,2.46,1.57,0.25,1.8,2.14,1.83,2.12,1.83,2.17,1.79,2.12,3.1,3.1,2.45,3.25,3.1,2.4,3.1,3.05,2.45,3.27,3.14,2.51,3.1,3.0,2.45,3.13,3.13,2.5,3.35,3.2,2.56,3.19,3.1,2.48,2.3,1.61,2.33,1.67,2.42,1.71,2.34,1.62,0.25,1.79,2.15,1.81,2.14,1.82,2.19,1.79,2.12


In [4]:
# Preview the first three EFL Championship rows. `style` is not defined in this
# notebook; presumably it comes from the helper script loaded with %run -- confirm.
style(efl_championship_df.head(3))

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E1,06/08/2021,19:45,Bournemouth,West Brom,2,2,D,1,1,D,D Whitestone,7,15,4,5,11,13,4,8,1,3,0,0,2.3,3.25,3.2,2.3,3.4,3.1,2.3,3.3,3.15,2.37,3.36,3.24,2.3,3.2,3.1,2.3,3.4,3.13,2.42,3.44,3.3,2.31,3.32,3.14,2.0,1.8,2.1,1.79,2.11,1.86,2.06,1.77,-0.25,2.01,1.89,2.03,1.88,2.03,1.94,1.99,1.87,2.45,3.3,2.9,2.4,3.3,3.0,2.4,3.3,2.95,2.49,3.36,3.04,2.35,3.2,3.0,2.4,3.3,3.0,2.55,3.5,3.07,2.44,3.3,2.95,2.0,1.8,2.04,1.86,2.09,1.92,1.97,1.84,-0.25,2.13,1.78,2.13,1.8,2.16,1.83,2.09,1.77
1,E1,07/08/2021,15:00,Blackburn,Swansea,2,1,H,1,0,H,D Webb,21,7,10,4,11,10,7,5,4,2,0,0,2.1,3.4,3.4,2.15,3.3,3.5,2.1,3.3,3.55,2.18,3.38,3.65,2.1,3.2,3.5,2.1,3.3,3.5,2.26,3.41,3.75,2.14,3.29,3.53,2.3,1.61,2.31,1.67,2.33,1.68,2.25,1.65,-0.25,1.86,2.04,1.87,2.04,1.88,2.06,1.84,2.01,2.05,3.3,3.8,2.15,3.3,3.5,2.1,3.25,3.65,2.13,3.38,3.79,2.05,3.25,3.6,2.1,3.4,3.5,2.2,3.46,3.97,2.12,3.31,3.61,2.1,1.72,2.21,1.72,2.27,1.74,2.16,1.7,-0.25,1.77,2.14,1.83,2.09,1.87,2.17,1.81,2.04
2,E1,07/08/2021,15:00,Bristol City,Blackpool,1,1,D,1,0,H,A Davies,11,7,2,1,10,13,2,4,2,0,0,0,2.15,3.4,3.3,2.25,3.4,3.25,2.2,3.35,3.3,2.27,3.39,3.35,2.2,3.2,3.3,2.2,3.3,3.3,2.32,3.45,3.43,2.22,3.32,3.3,2.2,1.66,2.19,1.72,2.25,1.74,2.17,1.7,-0.25,1.94,1.96,1.96,1.94,1.96,1.99,1.91,1.93,2.15,3.3,3.5,2.2,3.3,3.4,2.2,3.15,3.5,2.26,3.2,3.67,2.15,3.1,3.5,2.2,3.25,3.4,2.3,3.37,3.74,2.22,3.2,3.47,2.2,1.66,2.29,1.68,2.38,1.7,2.22,1.66,-0.25,1.9,2.0,1.92,1.99,1.92,2.01,1.89,1.95


In [5]:
# Preview the first three EFL League One rows. `style` is not defined in this
# notebook; presumably it comes from the helper script loaded with %run -- confirm.
style(efl_league_one_df.head(3))

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E2,07/08/2021,15:00,Bolton,Milton Keynes Dons,3,3,D,1,1,D,M Coy,17,10,4,4,15,10,3,2,1,0,0,0,2.3,3.4,3.0,2.3,3.5,2.85,2.35,3.25,2.8,2.33,3.55,3.05,2.25,3.3,2.9,2.3,3.5,2.88,2.4,3.64,3.08,2.3,3.45,2.93,1.88,1.98,1.84,2.02,1.93,2.02,1.83,1.96,-0.25,2.02,1.83,2.02,1.86,2.06,1.88,2.0,1.83,2.15,3.5,3.4,2.2,3.5,2.95,2.2,3.25,3.15,2.21,3.6,3.3,2.2,3.3,3.0,2.2,3.5,3.2,2.35,3.68,3.4,2.2,3.45,3.16,1.8,2.0,1.85,2.03,1.88,2.08,1.83,1.97,-0.25,1.9,1.95,1.91,1.97,1.99,2.01,1.9,1.92
1,E2,07/08/2021,15:00,Cambridge,Oxford,1,1,D,0,1,A,D Rock,9,11,1,3,7,12,3,4,1,2,0,0,3.6,3.5,2.0,3.5,3.4,2.05,3.45,3.25,2.05,3.63,3.45,2.08,3.5,3.3,2.0,3.5,3.4,2.05,3.67,3.64,2.14,3.5,3.4,2.05,1.98,1.88,1.97,1.88,2.01,1.91,1.94,1.85,0.5,1.83,2.02,1.78,2.07,1.86,2.08,1.79,2.04,3.6,3.6,2.0,3.6,3.4,2.0,3.55,3.25,2.05,3.74,3.53,2.07,3.4,3.3,2.05,3.6,3.5,2.05,3.8,3.64,2.14,3.56,3.43,2.05,1.95,1.9,1.97,1.89,2.0,1.92,1.94,1.86,0.25,2.08,1.73,2.12,1.78,2.12,1.82,2.06,1.77
2,E2,07/08/2021,15:00,Crewe,Cheltenham,1,1,D,1,1,D,B Toner,7,12,1,6,15,14,3,2,1,1,0,0,1.9,3.6,3.75,1.98,3.3,3.75,1.95,3.2,3.8,1.98,3.46,4.07,1.95,3.3,3.7,1.93,3.5,3.75,2.05,3.6,4.1,1.97,3.39,3.81,2.07,1.72,2.1,1.77,2.15,1.79,2.09,1.74,-0.5,1.98,1.88,1.98,1.89,2.0,1.92,1.96,1.86,2.25,3.1,3.5,2.2,3.1,3.25,2.3,2.9,3.1,2.36,3.07,3.53,2.2,3.2,3.1,2.3,3.25,3.25,2.38,3.25,3.6,2.29,3.08,3.33,2.15,1.66,2.24,1.69,2.25,1.74,2.14,1.69,-0.25,2.0,1.85,1.99,1.9,2.04,1.95,1.96,1.87


In [6]:
# Preview the first three EFL League Two rows. `style` is not defined in this
# notebook; presumably it comes from the helper script loaded with %run -- confirm.
style(efl_league_two_df.head(3))

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E3,07/08/2021,15:00,Carlisle,Colchester,0,0,D,0,0,D,P Wright,18,10,6,3,17,14,7,8,2,0,0,0,2.1,3.4,3.5,2.1,3.4,3.25,2.05,3.15,3.5,2.09,3.42,3.73,2.05,3.2,3.4,2.1,3.4,3.4,2.2,3.52,3.73,2.09,3.32,3.5,2.1,1.7,2.14,1.74,2.17,1.77,2.1,1.71,-0.25,1.85,2.0,1.79,2.09,1.88,2.09,1.81,2.02,1.95,3.3,4.1,2.05,3.2,3.6,2.05,2.95,3.85,2.1,3.19,4.09,2.0,3.2,3.6,2.05,3.3,3.9,2.14,3.4,4.1,2.05,3.21,3.81,2.4,1.53,2.5,1.56,2.5,1.65,2.37,1.57,-0.25,1.75,2.05,1.78,2.13,1.82,2.13,1.75,2.08
1,E3,07/08/2021,15:00,Exeter,Bradford,0,0,D,0,0,D,S Allison,12,14,4,4,8,7,5,5,2,0,0,0,2.3,3.25,3.0,2.3,3.3,3.0,2.35,3.1,2.95,2.39,3.36,3.1,2.25,3.1,3.1,2.3,3.3,3.0,2.43,3.4,3.2,2.32,3.25,3.04,2.15,1.66,2.19,1.71,2.23,1.73,2.15,1.68,-0.25,2.02,1.83,2.05,1.83,2.05,1.92,2.0,1.83,2.4,3.3,3.0,2.5,3.25,2.7,2.45,2.95,2.8,2.57,3.18,3.03,2.45,3.1,2.8,2.55,3.2,2.9,2.64,3.3,3.07,2.51,3.14,2.89,2.2,1.65,2.27,1.68,2.28,1.71,2.2,1.65,0.0,1.8,2.05,1.79,2.12,1.87,2.12,1.8,2.03
2,E3,07/08/2021,15:00,Forest Green,Sutton,2,1,H,1,0,H,J Bell,12,9,2,2,11,12,4,6,3,0,0,0,1.8,3.75,4.2,1.87,3.6,3.8,1.83,3.45,4.0,1.85,3.72,4.28,1.8,3.5,4.0,1.83,3.6,4.1,1.91,3.84,4.32,1.84,3.6,4.07,1.93,1.93,1.9,1.95,1.94,1.98,1.87,1.92,-0.5,1.85,2.0,1.85,2.02,1.87,2.05,1.84,1.98,2.3,3.5,3.1,2.2,3.2,3.25,2.15,2.95,3.4,2.28,3.18,3.57,2.15,3.2,3.2,2.25,3.3,3.3,2.31,3.5,4.0,2.21,3.17,3.42,2.35,1.57,2.48,1.57,2.53,1.85,2.31,1.6,-0.25,1.98,1.83,1.93,1.95,2.03,1.99,1.91,1.9


<a id="1.3"></a>
# [Rename columns based on their abbreviations](http://www.football-data.co.uk/notes.txt)
***
Since these column names do not follow the SQL naming convention, renaming them will make later processing more straightforward. We will also analyse the meaning of each column.

In [7]:
# List the column names of the Premier League frame (only this one frame is printed).
print(f'The list of E0_English Premier League 2021-2022 columns\' names is: {english_premier_league_df.columns.to_list()}\n\n\n')

The list of E0_English Premier League 2021-2022 columns' names is: ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA', 'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA', 'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5', 'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA']





In [8]:
# NOTE: column renaming is currently disabled -- the rename_cols calls below are
# commented out, so the frames keep their original abbreviated column names.
# rename_cols(english_premier_league_df)
# rename_cols(efl_championship_df)
# rename_cols(efl_league_one_df)
# rename_cols(efl_league_two_df)

# print(f'The list of E0 English Premier League 2021-2022 columns\' names is: {english_premier_league_df.columns.to_list()}\n\n\n')

# Drop unnecessary columns

In [9]:
# Columns removed from every league frame.
COLUMNS_TO_DROP = ['PSCH', 'PSCD', 'PSCA']


def drop_unneeded_columns(frame):
    """Return ``frame`` without the columns listed in ``COLUMNS_TO_DROP``.

    Columns that are already absent are skipped (``errors='ignore'``), so
    re-running this cell on frames that were already trimmed does not raise
    a KeyError.
    """
    return frame.drop(columns=COLUMNS_TO_DROP, errors='ignore')


english_premier_league_df = drop_unneeded_columns(english_premier_league_df)
efl_championship_df = drop_unneeded_columns(efl_championship_df)
efl_league_one_df = drop_unneeded_columns(efl_league_one_df)
efl_league_two_df = drop_unneeded_columns(efl_league_two_df)

In [10]:
# List the Premier League column names again to confirm which columns remain after the drop.
print(f'The list of E0 English Premier League 2021-2022 columns\' names is: {english_premier_league_df.columns.to_list()}\n\n\n')

The list of E0 English Premier League 2021-2022 columns' names is: ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA', 'IWCH', 'IWCD', 'IWCA', 'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5', 'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA']





# Lower Case the content

In this section we will convert all the string values in the dataframes to lowercase so that every string has a uniform format. This simplifies the later analysis and makes string-based operations easier to perform.

In [11]:
# Cast every string value inside the dataframes (column names excluded) to lower case.

# Element-wise apply was renamed from DataFrame.applymap to DataFrame.map in
# pandas 2.1; pick whichever this pandas version provides.
_apply_elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, 'map') else pd.DataFrame.applymap


def lowercase_strings(frame):
    """Return a copy of ``frame`` with every ``str`` cell lower-cased.

    Non-string cells (numbers, missing values, ...) are returned unchanged.
    """
    return _apply_elementwise(frame, lambda value: value.lower() if isinstance(value, str) else value)


english_premier_league_df = lowercase_strings(english_premier_league_df)
efl_championship_df = lowercase_strings(efl_championship_df)
efl_league_one_df = lowercase_strings(efl_league_one_df)
efl_league_two_df = lowercase_strings(efl_league_two_df)


# Concat dataframes
Clear the existing index and reset it in the result by setting the ignore_index option to True.

In [12]:
# Stack the four league frames row-wise into one frame; ignore_index=True
# discards the per-file indexes and numbers the combined rows from 0.
df = pd.concat(
    objs=[
        english_premier_league_df,
        efl_championship_df,
        efl_league_one_df,
        efl_league_two_df,
    ],
    axis=0,
    ignore_index=True,
)
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,e0,13/08/2021,20:00,brentford,arsenal,2,0,h,1,0,h,m oliver,8,22,3,4,12,8,2,5,0,0,0,0,4.0,3.4,1.95,4.0,3.5,1.95,3.8,3.4,2.05,4.05,3.46,2.05,4.0,3.4,1.9,4.1,3.4,2.0,4.62,3.72,2.1,4.02,3.43,2.02,2.1,1.72,2.22,1.73,2.26,1.83,2.16,1.73,0.5,1.86,2.07,1.88,2.06,2.05,2.08,1.87,2.03,3.8,3.25,2.05,3.8,3.3,2.05,3.8,3.25,2.1,3.9,3.0,2.05,3.9,3.25,2.1,4.2,3.5,2.18,3.89,3.28,2.1,2.37,1.57,2.44,1.62,2.47,1.75,2.33,1.62,0.5,1.75,2.05,1.81,2.13,2.05,2.17,1.8,2.09
1,e0,14/08/2021,12:30,man united,leeds,5,1,h,1,0,h,p tierney,16,10,8,3,11,9,5,4,1,2,0,0,1.53,4.5,5.75,1.53,4.5,5.75,1.55,4.4,5.75,1.56,4.57,5.96,1.52,4.33,5.8,1.55,4.4,6.0,1.59,4.65,6.35,1.55,4.48,5.87,1.61,2.3,1.67,2.32,1.71,2.38,1.65,2.29,-1.0,1.95,1.98,1.96,1.96,2.0,2.01,1.93,1.96,1.61,4.2,5.25,1.62,4.1,5.25,1.65,4.2,4.9,1.57,4.2,5.5,1.65,4.1,5.25,1.71,4.33,5.8,1.64,4.19,5.22,1.66,2.2,1.7,2.27,1.75,2.37,1.67,2.25,-1.0,2.05,1.75,2.17,1.77,2.19,1.93,2.1,1.79
2,e0,14/08/2021,15:00,burnley,brighton,1,2,a,1,0,h,d coote,14,14,3,8,10,7,7,6,2,1,0,0,3.1,3.1,2.45,3.2,3.1,2.4,3.15,3.05,2.45,3.3,3.12,2.51,3.2,3.0,2.45,3.13,3.1,2.45,3.33,3.2,2.6,3.19,3.09,2.49,2.5,1.53,2.56,1.56,2.56,1.63,2.46,1.57,0.25,1.8,2.14,1.83,2.12,1.83,2.17,1.79,2.12,3.1,3.1,2.45,3.25,3.1,2.4,3.1,3.05,2.45,3.1,3.0,2.45,3.13,3.13,2.5,3.35,3.2,2.56,3.19,3.1,2.48,2.3,1.61,2.33,1.67,2.42,1.71,2.34,1.62,0.25,1.79,2.15,1.81,2.14,1.82,2.19,1.79,2.12
3,e0,14/08/2021,15:00,chelsea,crystal palace,3,0,h,2,0,h,j moss,13,4,6,1,15,11,5,2,0,0,0,0,1.25,5.75,13.0,1.28,5.75,10.5,1.25,6.0,13.0,1.26,6.24,12.74,1.25,5.5,13.0,1.25,5.75,13.0,1.3,6.3,15.0,1.26,5.92,12.8,1.8,2.0,1.8,2.09,1.84,2.12,1.79,2.06,-1.5,1.84,2.09,1.79,2.12,1.93,2.12,1.83,2.07,1.3,5.25,11.0,1.33,5.0,10.0,1.3,5.25,11.0,1.3,5.25,10.0,1.33,5.0,11.0,1.36,5.5,11.5,1.33,5.17,10.58,1.9,1.9,1.93,1.98,1.96,2.07,1.9,1.94,-1.5,2.05,1.75,2.12,1.81,2.16,1.93,2.06,1.82
4,e0,14/08/2021,15:00,everton,southampton,3,1,h,0,1,a,a madley,14,6,6,3,13,15,6,8,2,0,0,0,1.9,3.5,4.0,1.95,3.5,3.9,1.95,3.45,3.95,2.01,3.56,4.1,1.95,3.4,4.0,1.95,3.4,4.1,2.04,3.66,4.25,1.97,3.53,4.04,2.0,1.8,2.14,1.78,2.14,1.85,2.07,1.79,-0.5,2.0,1.93,2.01,1.92,2.01,1.97,1.96,1.92,2.0,3.4,3.9,2.05,3.4,3.75,2.0,3.35,4.0,1.95,3.4,3.9,2.0,3.3,4.2,2.12,3.5,4.2,2.04,3.39,3.95,2.2,1.66,2.28,1.69,2.34,1.77,2.24,1.67,-0.5,2.05,1.88,2.05,1.88,2.08,1.9,2.03,1.86


# Data types 

In [13]:
# Let pandas pick the best-fitting dtype for every column (e.g. object -> string),
# then print the frame summary to check the resulting dtypes.
df = df.convert_dtypes()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Columns: 103 entries, Div to AvgCAHA
dtypes: Float64(79), Int64(16), string(8)
memory usage: 1.2 MB


<a id="2.2.1"></a>
# Format date features

In [14]:
# Cast the Date column to datetime. The source files write dates day-first
# (e.g. '13/08/2021'), so the format is given explicitly: without it an
# ambiguous value such as '06/08/2021' can be read month-first (8 June
# instead of 6 August).
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Columns: 103 entries, Div to AvgCAHA
dtypes: Float64(79), Int64(16), datetime64[ns](1), string(7)
memory usage: 1.2 MB


# DROP NA & DUPLICATION

+ national leagues dataframe: Div, Date, HomeTeam, AwayTeam are the primary key

In [15]:
# A match is identified by these four columns; the duplicate counts reported
# below use the same key as the drop itself, so the numbers describe exactly
# the rows that drop_duplicates removes.
match_key = ['Div', 'Date', 'HomeTeam', 'AwayTeam']

print("Number of rows before drop of duplicates in National Leagues:", len(df.index))
print("Number of duplicated records in National Leagues: ", df.duplicated(subset=match_key).sum())
# Keep the first occurrence of every match and discard later repeats.
df = df.drop_duplicates(subset=match_key, keep="first")
print("Number of duplicated records AFTER DROP in National Leagues: ", df.duplicated(subset=match_key).sum())
print("Number of rows after drop of duplicates in National Leagues:", len(df.index), "\n\n")

Number of rows before drop of duplicates in National Leagues: 1379
Number of duplicated records in National Leagues:  0
Number of duplicated records AFTER DROP in National Leagues:  0
Number of rows after drop of duplicates in National Leagues: 1379 




In [16]:
# Report the frame size, drop every row in which any of Div / Date / HomeTeam /
# AwayTeam is missing, then report the size again.
print("The National Leagues dataframe BEFORE dropped has {} rows and {} columns".format(*df.shape))
df = df.dropna(axis=0, how='any', subset=['Div', 'Date', 'HomeTeam', 'AwayTeam'])
print("The National Leagues dataframe AFTER dropped has {} rows and {} columns".format(*df.shape), "\n\n\n")

The National Leagues dataframe BEFORE dropped has 1379 rows and 103 columns
The National Leagues dataframe AFTER dropped has 1379 rows and 103 columns 





<a id="3"></a>
<h1 style="color:#ffc0cb;font-size:40px;font-family:Georgia;text-align:center;"><strong>3. Save the Intermediate data that has been transformed</strong></h1>

In [17]:
# Persist the cleaned frame. `write_interim_path` is not defined in this notebook
# (presumably provided by the helper script loaded with %run -- confirm); the
# output location is decided inside that helper.
write_interim_path(df, '2021-2022.csv', 'national_leagues')

cleaned 2021-2022.csv data was successfully saved!



