In [1]:
import numpy as np
import pandas as pd

In [2]:
# read the train and test csv files

train_df = pd.read_csv('original_data/train.csv')
test_df = pd.read_csv('original_data/test-3.csv', index_col='index') # no need to read the index column

## Data Exploration, Data Cleaning

Let us examine the dataframes and their columns

In [3]:
train_df

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
0,3.0,12.0,0.0,14.0,4.0,0.0,Stuttgart,07/08/09,H,6.0,12.0,0.0,13.0,7.0,0.0,0.0,0.0,Wolfsburg,bundesliga
1,1.0,10.0,0.0,7.0,0.0,1.0,FC Koln,08/08/09,H,16.0,8.0,0.0,24.0,11.0,0.0,0.0,0.0,Dortmund,bundesliga
2,3.0,20.0,0.0,15.0,3.0,2.0,Hannover,08/08/09,H,5.0,16.0,0.0,10.0,4.0,0.0,0.0,3.0,Hertha,bundesliga
3,10.0,28.0,0.0,9.0,3.0,2.0,Bayern Munich,08/08/09,D,3.0,10.0,0.0,9.0,1.0,1.0,1.0,0.0,Hoffenheim,bundesliga
4,5.0,28.0,0.0,13.0,7.0,2.0,Leverkusen,08/08/09,D,3.0,22.0,0.0,8.0,4.0,2.0,1.0,1.0,Mainz,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,7.0,1.0,0.0,17.0,4.0,1.0,Udinese,28/05/17,H,6.0,15.0,0.0,17.0,6.0,0.0,3.0,2.0,Inter,serie-a
12785,7.0,16.0,0.0,17.0,4.0,4.0,Empoli,28/05/17,H,0.0,18.0,0.0,10.0,5.0,0.0,0.0,3.0,Palermo,serie-a
12786,2.0,14.0,0.0,13.0,4.0,0.0,Genoa,28/05/17,H,6.0,8.0,0.0,24.0,7.0,1.0,1.0,2.0,Roma,serie-a
12787,10.0,8.0,0.0,22.0,7.0,0.0,Napoli,28/05/17,A,5.0,9.0,0.0,12.0,2.0,2.0,0.0,1.0,Sampdoria,serie-a


In [4]:
test_df

Unnamed: 0_level_0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,Referee,league
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,5,17,0,19,4,2,Leverkusen,18/08/17,4,13,0,13,8,0,2,1,Bayern Munich,,bundesliga
1,7,16,0,13,1,3,Augsburg,19/08/17,3,18,0,11,5,0,1,4,Hamburg,,bundesliga
2,1,12,0,9,2,1,Stuttgart,19/08/17,5,18,0,10,3,0,0,2,Hertha,,bundesliga
3,4,15,0,11,2,3,Werder Bremen,19/08/17,6,17,0,14,3,0,0,1,Hoffenheim,,bundesliga
4,1,14,0,6,2,5,Hannover,19/08/17,13,18,0,14,6,0,0,3,Mainz,,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1,5,1,9,3,1,Fiorentina,20/05/18,6,10,0,18,11,1,2,2,Milan,,serie-a
376,9,9,0,9,4,0,Crotone,20/05/18,6,3,0,19,9,0,2,0,Napoli,,serie-a
377,9,8,0,15,6,1,Roma,20/05/18,5,14,0,16,3,1,0,1,Sassuolo,,serie-a
378,3,11,0,9,3,4,Sampdoria,20/05/18,5,10,0,16,6,0,1,1,Spal,,serie-a


In [5]:
train_df.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'FTR', 'HC',
       'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'league'],
      dtype='object')

In [6]:
test_df.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'HC', 'HF',
       'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'Referee',
       'league'],
      dtype='object')

At first glance, there is a `Referee` column in the test dataset which is not in the train dataset.
In the train df, the FTR (what we should predict) appears but not in the test df, as it should.  

Let us find out if there are more such columns.  



In [7]:
commonColumns = set(list(train_df.columns)).intersection(set(list(test_df.columns))) # between train and test dataframes

In [8]:
set(list(train_df.columns)) - commonColumns  # columns only in train_df

{'FTR'}

In [9]:
set(list(test_df.columns)) - commonColumns  # columns only in test_df

{'Referee'}

In [10]:
print(sorted(commonColumns))

['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'league']


In [11]:
len(commonColumns)

18

The referee should not influence the outcome of the game. So, we can safely drop that column in test_df.  

As we train our model, FTR column is not one of the predictors. 

So, both the train and test dataframes have the same columns that we will be looking as we train our model and predicting using it. 

In [12]:
# we can use the date information to identify the season and things like no. of days since last match to capture player fatigue

train_df["Date"] = train_df["Date"].apply(pd.to_datetime)
test_df["Date"] = test_df["Date"].apply(pd.to_datetime)

Let us test for missing values and check for datatypes in the two DFs.  

We start with `train_df`.

In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12789 entries, 0 to 12788
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AC        12780 non-null  float64       
 1   AF        12778 non-null  float64       
 2   AR        12780 non-null  float64       
 3   AS        12780 non-null  float64       
 4   AST       12780 non-null  float64       
 5   AY        12780 non-null  float64       
 6   AwayTeam  12782 non-null  object        
 7   Date      12782 non-null  datetime64[ns]
 8   FTR       12782 non-null  object        
 9   HC        12780 non-null  float64       
 10  HF        12778 non-null  float64       
 11  HR        12780 non-null  float64       
 12  HS        12780 non-null  float64       
 13  HST       12780 non-null  float64       
 14  HTAG      12779 non-null  float64       
 15  HTHG      12779 non-null  float64       
 16  HY        12779 non-null  float64       
 17  HomeTeam  12

In [14]:
# the following columns are always integers and can never be float values. 
columnNames = ['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY']
for columnName in columnNames:
    train_df[columnName] = train_df[columnName].astype('Int64')


In [15]:
train_df

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
0,3,12,0,14,4,0,Stuttgart,2009-07-08,H,6,12,0,13,7,0,0,0,Wolfsburg,bundesliga
1,1,10,0,7,0,1,FC Koln,2009-08-08,H,16,8,0,24,11,0,0,0,Dortmund,bundesliga
2,3,20,0,15,3,2,Hannover,2009-08-08,H,5,16,0,10,4,0,0,3,Hertha,bundesliga
3,10,28,0,9,3,2,Bayern Munich,2009-08-08,D,3,10,0,9,1,1,1,0,Hoffenheim,bundesliga
4,5,28,0,13,7,2,Leverkusen,2009-08-08,D,3,22,0,8,4,2,1,1,Mainz,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,7,1,0,17,4,1,Udinese,2017-05-28,H,6,15,0,17,6,0,3,2,Inter,serie-a
12785,7,16,0,17,4,4,Empoli,2017-05-28,H,0,18,0,10,5,0,0,3,Palermo,serie-a
12786,2,14,0,13,4,0,Genoa,2017-05-28,H,6,8,0,24,7,1,1,2,Roma,serie-a
12787,10,8,0,22,7,0,Napoli,2017-05-28,A,5,9,0,12,2,2,0,1,Sampdoria,serie-a


In [16]:
train_df.isnull().sum()

AC           9
AF          11
AR           9
AS           9
AST          9
AY           9
AwayTeam     7
Date         7
FTR          7
HC           9
HF          11
HR           9
HS           9
HST          9
HTAG        10
HTHG        10
HY          10
HomeTeam     7
league       0
dtype: int64

Almost every column in train_df except the "league" has null values.  Even the FTR column i.e. result column has 7 missing values.  

Let's list those rows.

In [17]:
train_df[train_df["FTR"].isnull()]

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
7082,,,,,,,,NaT,,,,,,,,,,,ligue-1
9363,,,,,,,,NaT,,,,,,,,,,,premier-league
11644,,,,,,,,NaT,,,,,,,,,,,serie-a
11645,,,,,,,,NaT,,,,,,,,,,,serie-a
11646,,,,,,,,NaT,,,,,,,,,,,serie-a
12027,,,,,,,,NaT,,,,,,,,,,,serie-a
12408,,,,,,,,NaT,,,,,,,,,,,serie-a


We can see we can reduce the null values in multiple columns by dropping these rows.

In [18]:
print(f"{round((train_df.isnull().sum()['FTR'] / len(train_df))*100, 4)}% have no values under FTR column")

0.0547% have no values under FTR column


Since more than 99% of the rows are retained on dropping rows with null FTR values, we can go ahead and drop such rows. 

But before we do that, 

In [19]:
train_df.index[train_df["FTR"].isnull()] # get indices of rows with NaN under FTR

Int64Index([7082, 9363, 11644, 11645, 11646, 12027, 12408], dtype='int64')

In [20]:
train_df.drop(axis=1, inplace=True, index = train_df.index[train_df["FTR"].isnull()]) # drop from train_df where FTR and others are null

In [21]:
train_df

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
0,3,12,0,14,4,0,Stuttgart,2009-07-08,H,6,12,0,13,7,0,0,0,Wolfsburg,bundesliga
1,1,10,0,7,0,1,FC Koln,2009-08-08,H,16,8,0,24,11,0,0,0,Dortmund,bundesliga
2,3,20,0,15,3,2,Hannover,2009-08-08,H,5,16,0,10,4,0,0,3,Hertha,bundesliga
3,10,28,0,9,3,2,Bayern Munich,2009-08-08,D,3,10,0,9,1,1,1,0,Hoffenheim,bundesliga
4,5,28,0,13,7,2,Leverkusen,2009-08-08,D,3,22,0,8,4,2,1,1,Mainz,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,7,1,0,17,4,1,Udinese,2017-05-28,H,6,15,0,17,6,0,3,2,Inter,serie-a
12785,7,16,0,17,4,4,Empoli,2017-05-28,H,0,18,0,10,5,0,0,3,Palermo,serie-a
12786,2,14,0,13,4,0,Genoa,2017-05-28,H,6,8,0,24,7,1,1,2,Roma,serie-a
12787,10,8,0,22,7,0,Napoli,2017-05-28,A,5,9,0,12,2,2,0,1,Sampdoria,serie-a


In [22]:
# next, we check for null values in train_df again.

train_df.isnull().sum()

AC          2
AF          4
AR          2
AS          2
AST         2
AY          2
AwayTeam    0
Date        0
FTR         0
HC          2
HF          4
HR          2
HS          2
HST         2
HTAG        3
HTHG        3
HY          3
HomeTeam    0
league      0
dtype: int64

Let us examine the first column with null values.

In [23]:
train_df[train_df["AC"].isnull()]

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
7408,,,,,,,Lyon,2017-04-16,A,,,,,,,,,Bastia,ligue-1
11298,,,,,,,Roma,2012-09-23,A,,,,,,,,,Cagliari,serie-a


Let's examine the other rows where these null values are. These two rows also can be dropped as they do not contribute very little information towards determining the outcome of the match except telling us that  

Lyon won in Bastia vs. Lyon match in 2016-17 Ligue-1 season.  
Roma won in Cagliari vs. Roma match in 2012-13 Serie-A season.

In [24]:
# list indices of empty null under each column of train_df

for colname in train_df.isnull():
    print(colname, "indices :: ", list(train_df.index[train_df[colname].isnull()]))

AC indices ::  [7408, 11298]
AF indices ::  [5620, 5733, 7408, 11298]
AR indices ::  [7408, 11298]
AS indices ::  [7408, 11298]
AST indices ::  [7408, 11298]
AY indices ::  [7408, 11298]
AwayTeam indices ::  []
Date indices ::  []
FTR indices ::  []
HC indices ::  [7408, 11298]
HF indices ::  [5620, 5733, 7408, 11298]
HR indices ::  [7408, 11298]
HS indices ::  [7408, 11298]
HST indices ::  [7408, 11298]
HTAG indices ::  [7408, 11298, 12426]
HTHG indices ::  [7408, 11298, 12426]
HY indices ::  [7408, 11298, 11894]
HomeTeam indices ::  []
league indices ::  []


In [25]:
# clearly 7408, 11298 occur in all relevant columns

train_df.drop(axis=1, inplace=True, index = [7408, 11298])  # drop from train_df where FTR is null

In [26]:
train_df

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
0,3,12,0,14,4,0,Stuttgart,2009-07-08,H,6,12,0,13,7,0,0,0,Wolfsburg,bundesliga
1,1,10,0,7,0,1,FC Koln,2009-08-08,H,16,8,0,24,11,0,0,0,Dortmund,bundesliga
2,3,20,0,15,3,2,Hannover,2009-08-08,H,5,16,0,10,4,0,0,3,Hertha,bundesliga
3,10,28,0,9,3,2,Bayern Munich,2009-08-08,D,3,10,0,9,1,1,1,0,Hoffenheim,bundesliga
4,5,28,0,13,7,2,Leverkusen,2009-08-08,D,3,22,0,8,4,2,1,1,Mainz,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,7,1,0,17,4,1,Udinese,2017-05-28,H,6,15,0,17,6,0,3,2,Inter,serie-a
12785,7,16,0,17,4,4,Empoli,2017-05-28,H,0,18,0,10,5,0,0,3,Palermo,serie-a
12786,2,14,0,13,4,0,Genoa,2017-05-28,H,6,8,0,24,7,1,1,2,Roma,serie-a
12787,10,8,0,22,7,0,Napoli,2017-05-28,A,5,9,0,12,2,2,0,1,Sampdoria,serie-a


In [27]:
# next, we check for null values in `train_df` again.

train_df.isnull().sum()

AC          0
AF          2
AR          0
AS          0
AST         0
AY          0
AwayTeam    0
Date        0
FTR         0
HC          0
HF          2
HR          0
HS          0
HST         0
HTAG        1
HTHG        1
HY          1
HomeTeam    0
league      0
dtype: int64

In [28]:
# list indices of empty null under each column of train_df after the above changes
for colname in train_df.isnull():
    print(colname, "indices :: ", list(train_df.index[train_df[colname].isnull()]))

AC indices ::  []
AF indices ::  [5620, 5733]
AR indices ::  []
AS indices ::  []
AST indices ::  []
AY indices ::  []
AwayTeam indices ::  []
Date indices ::  []
FTR indices ::  []
HC indices ::  []
HF indices ::  [5620, 5733]
HR indices ::  []
HS indices ::  []
HST indices ::  []
HTAG indices ::  [12426]
HTHG indices ::  [12426]
HY indices ::  [11894]
HomeTeam indices ::  []
league indices ::  []


In [29]:
train_df[train_df.isnull().any(axis=1)] # we should see the same indices as above

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
5620,5,,0,6,3,3,Marseille,2011-09-18,H,3,,0,10,5,0.0,2.0,0.0,Lyon,ligue-1
5733,1,,0,9,3,5,Nancy,2011-12-17,A,8,,0,9,1,1.0,0.0,2.0,Caen,ligue-1
11894,5,16.0,0,10,1,5,Juventus,2015-02-03,D,1,11.0,1,8,3,0.0,0.0,,Roma,serie-a
12426,7,21.0,0,15,5,2,Pescara,2016-08-28,A,2,12.0,0,13,3,,,2.0,Sassuolo,serie-a


We will be imputing these null values making use of the current season information (since teams go thru different forms in different seasons), team that is playing (stronger teams have more corners, shots, shots on target) etc.

In [30]:
train_df

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
0,3,12,0,14,4,0,Stuttgart,2009-07-08,H,6,12,0,13,7,0,0,0,Wolfsburg,bundesliga
1,1,10,0,7,0,1,FC Koln,2009-08-08,H,16,8,0,24,11,0,0,0,Dortmund,bundesliga
2,3,20,0,15,3,2,Hannover,2009-08-08,H,5,16,0,10,4,0,0,3,Hertha,bundesliga
3,10,28,0,9,3,2,Bayern Munich,2009-08-08,D,3,10,0,9,1,1,1,0,Hoffenheim,bundesliga
4,5,28,0,13,7,2,Leverkusen,2009-08-08,D,3,22,0,8,4,2,1,1,Mainz,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,7,1,0,17,4,1,Udinese,2017-05-28,H,6,15,0,17,6,0,3,2,Inter,serie-a
12785,7,16,0,17,4,4,Empoli,2017-05-28,H,0,18,0,10,5,0,0,3,Palermo,serie-a
12786,2,14,0,13,4,0,Genoa,2017-05-28,H,6,8,0,24,7,1,1,2,Roma,serie-a
12787,10,8,0,22,7,0,Napoli,2017-05-28,A,5,9,0,12,2,2,0,1,Sampdoria,serie-a


In [31]:
train_df.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'FTR', 'HC',
       'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'league'],
      dtype='object')

In [32]:
# reorder columns as needed

train_df = train_df[['Date', 'league', 'HomeTeam', 'AwayTeam', 'AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'FTR']]

In [33]:
train_df

Unnamed: 0,Date,league,HomeTeam,AwayTeam,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HY,FTR
0,2009-07-08,bundesliga,Wolfsburg,Stuttgart,3,12,0,14,4,0,6,12,0,13,7,0,0,0,H
1,2009-08-08,bundesliga,Dortmund,FC Koln,1,10,0,7,0,1,16,8,0,24,11,0,0,0,H
2,2009-08-08,bundesliga,Hertha,Hannover,3,20,0,15,3,2,5,16,0,10,4,0,0,3,H
3,2009-08-08,bundesliga,Hoffenheim,Bayern Munich,10,28,0,9,3,2,3,10,0,9,1,1,1,0,D
4,2009-08-08,bundesliga,Mainz,Leverkusen,5,28,0,13,7,2,3,22,0,8,4,2,1,1,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,2017-05-28,serie-a,Inter,Udinese,7,1,0,17,4,1,6,15,0,17,6,0,3,2,H
12785,2017-05-28,serie-a,Palermo,Empoli,7,16,0,17,4,4,0,18,0,10,5,0,0,3,H
12786,2017-05-28,serie-a,Roma,Genoa,2,14,0,13,4,0,6,8,0,24,7,1,1,2,H
12787,2017-05-28,serie-a,Sampdoria,Napoli,10,8,0,22,7,0,5,9,0,12,2,2,0,1,A


Next, let's look at the test_df

In [34]:
test_df

Unnamed: 0_level_0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,Referee,league
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,5,17,0,19,4,2,Leverkusen,2017-08-18,4,13,0,13,8,0,2,1,Bayern Munich,,bundesliga
1,7,16,0,13,1,3,Augsburg,2017-08-19,3,18,0,11,5,0,1,4,Hamburg,,bundesliga
2,1,12,0,9,2,1,Stuttgart,2017-08-19,5,18,0,10,3,0,0,2,Hertha,,bundesliga
3,4,15,0,11,2,3,Werder Bremen,2017-08-19,6,17,0,14,3,0,0,1,Hoffenheim,,bundesliga
4,1,14,0,6,2,5,Hannover,2017-08-19,13,18,0,14,6,0,0,3,Mainz,,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1,5,1,9,3,1,Fiorentina,2018-05-20,6,10,0,18,11,1,2,2,Milan,,serie-a
376,9,9,0,9,4,0,Crotone,2018-05-20,6,3,0,19,9,0,2,0,Napoli,,serie-a
377,9,8,0,15,6,1,Roma,2018-05-20,5,14,0,16,3,1,0,1,Sassuolo,,serie-a
378,3,11,0,9,3,4,Sampdoria,2018-05-20,5,10,0,16,6,0,1,1,Spal,,serie-a


In [35]:
test_df.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'HC', 'HF',
       'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'Referee',
       'league'],
      dtype='object')

In [36]:
# reorder columns as needed

test_df = test_df[['Date', 'league', 'HomeTeam', 'AwayTeam', 'Referee', 'AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY']]

In [37]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1826 entries, 0 to 379
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      1826 non-null   datetime64[ns]
 1   league    1826 non-null   object        
 2   HomeTeam  1826 non-null   object        
 3   AwayTeam  1826 non-null   object        
 4   Referee   380 non-null    object        
 5   AC        1826 non-null   int64         
 6   AF        1826 non-null   int64         
 7   AR        1826 non-null   int64         
 8   AS        1826 non-null   int64         
 9   AST       1826 non-null   int64         
 10  AY        1826 non-null   int64         
 11  HC        1826 non-null   int64         
 12  HF        1826 non-null   int64         
 13  HR        1826 non-null   int64         
 14  HS        1826 non-null   int64         
 15  HST       1826 non-null   int64         
 16  HTAG      1826 non-null   int64         
 17  HTHG      1826 



Almost no column in test_df except the "Referee" has null values. We can even disregard this column since he/she has no influence on the game.

In [38]:
test_df

Unnamed: 0_level_0,Date,league,HomeTeam,AwayTeam,Referee,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,2017-08-18,bundesliga,Bayern Munich,Leverkusen,,5,17,0,19,4,2,4,13,0,13,8,0,2,1
1,2017-08-19,bundesliga,Hamburg,Augsburg,,7,16,0,13,1,3,3,18,0,11,5,0,1,4
2,2017-08-19,bundesliga,Hertha,Stuttgart,,1,12,0,9,2,1,5,18,0,10,3,0,0,2
3,2017-08-19,bundesliga,Hoffenheim,Werder Bremen,,4,15,0,11,2,3,6,17,0,14,3,0,0,1
4,2017-08-19,bundesliga,Mainz,Hannover,,1,14,0,6,2,5,13,18,0,14,6,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2018-05-20,serie-a,Milan,Fiorentina,,1,5,1,9,3,1,6,10,0,18,11,1,2,2
376,2018-05-20,serie-a,Napoli,Crotone,,9,9,0,9,4,0,6,3,0,19,9,0,2,0
377,2018-05-20,serie-a,Sassuolo,Roma,,9,8,0,15,6,1,5,14,0,16,3,1,0,1
378,2018-05-20,serie-a,Spal,Sampdoria,,3,11,0,9,3,4,5,10,0,16,6,0,1,1


In [39]:
test_df = test_df.drop(['Referee'], axis=1, inplace=False) # drop Referee column

Let us delete any duplicated rows in both the dataframes, if any exist.

In [40]:
np.unique(train_df.duplicated(keep=False)) # check for duplicate rows in train_df

array([False])

In [41]:
np.unique(test_df.duplicated(keep=False)) # check for duplicate rows in test_df

array([False])

Thus, there are no duplicate rows in train_df and test_df. 

Next, let's verify the number of null values in the two dataframes.

In [42]:
train_df.isnull().sum().sum()

7

In [45]:
train_df.isnull().sum() # list out columns that have null values

Date        0
league      0
HomeTeam    0
AwayTeam    0
AC          0
AF          2
AR          0
AS          0
AST         0
AY          0
HC          0
HF          2
HR          0
HS          0
HST         0
HTAG        1
HTHG        1
HY          1
FTR         0
dtype: int64

We will be imputing these values now.

In [43]:
test_df.isnull().sum().sum()

0