In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

In [2]:
# read the train and test csv files

train_df = pd.read_csv('original_data/train.csv')
test_df = pd.read_csv('original_data/test-3.csv', index_col='index') # no need to read the index column

## Data Exploration, Data Cleaning

Let us examine the dataframes and their columns

In [3]:
train_df

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
0,3.0,12.0,0.0,14.0,4.0,0.0,Stuttgart,07/08/09,H,6.0,12.0,0.0,13.0,7.0,0.0,0.0,0.0,Wolfsburg,bundesliga
1,1.0,10.0,0.0,7.0,0.0,1.0,FC Koln,08/08/09,H,16.0,8.0,0.0,24.0,11.0,0.0,0.0,0.0,Dortmund,bundesliga
2,3.0,20.0,0.0,15.0,3.0,2.0,Hannover,08/08/09,H,5.0,16.0,0.0,10.0,4.0,0.0,0.0,3.0,Hertha,bundesliga
3,10.0,28.0,0.0,9.0,3.0,2.0,Bayern Munich,08/08/09,D,3.0,10.0,0.0,9.0,1.0,1.0,1.0,0.0,Hoffenheim,bundesliga
4,5.0,28.0,0.0,13.0,7.0,2.0,Leverkusen,08/08/09,D,3.0,22.0,0.0,8.0,4.0,2.0,1.0,1.0,Mainz,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12784,7.0,1.0,0.0,17.0,4.0,1.0,Udinese,28/05/17,H,6.0,15.0,0.0,17.0,6.0,0.0,3.0,2.0,Inter,serie-a
12785,7.0,16.0,0.0,17.0,4.0,4.0,Empoli,28/05/17,H,0.0,18.0,0.0,10.0,5.0,0.0,0.0,3.0,Palermo,serie-a
12786,2.0,14.0,0.0,13.0,4.0,0.0,Genoa,28/05/17,H,6.0,8.0,0.0,24.0,7.0,1.0,1.0,2.0,Roma,serie-a
12787,10.0,8.0,0.0,22.0,7.0,0.0,Napoli,28/05/17,A,5.0,9.0,0.0,12.0,2.0,2.0,0.0,1.0,Sampdoria,serie-a


In [4]:
test_df

Unnamed: 0_level_0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,Referee,league
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,5,17,0,19,4,2,Leverkusen,18/08/17,4,13,0,13,8,0,2,1,Bayern Munich,,bundesliga
1,7,16,0,13,1,3,Augsburg,19/08/17,3,18,0,11,5,0,1,4,Hamburg,,bundesliga
2,1,12,0,9,2,1,Stuttgart,19/08/17,5,18,0,10,3,0,0,2,Hertha,,bundesliga
3,4,15,0,11,2,3,Werder Bremen,19/08/17,6,17,0,14,3,0,0,1,Hoffenheim,,bundesliga
4,1,14,0,6,2,5,Hannover,19/08/17,13,18,0,14,6,0,0,3,Mainz,,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1,5,1,9,3,1,Fiorentina,20/05/18,6,10,0,18,11,1,2,2,Milan,,serie-a
376,9,9,0,9,4,0,Crotone,20/05/18,6,3,0,19,9,0,2,0,Napoli,,serie-a
377,9,8,0,15,6,1,Roma,20/05/18,5,14,0,16,3,1,0,1,Sassuolo,,serie-a
378,3,11,0,9,3,4,Sampdoria,20/05/18,5,10,0,16,6,0,1,1,Spal,,serie-a


In [5]:
train_df.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'FTR', 'HC',
       'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'league'],
      dtype='object')

In [6]:
test_df.columns

Index(['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'HC', 'HF',
       'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'Referee',
       'league'],
      dtype='object')

In [7]:
print(f"Number of columns in train_df = {len(train_df.columns)}")
print(f"Number of columns in test_df = {len(test_df.columns)}")

Number of columns in train_df = 19
Number of columns in test_df = 19


At first glance, there is a `Referee` column in the test dataset which is not in the train dataset.
In the train df, the FTR (what we should predict) appears but not in the test df, as it should.  

Let us find out if there are more such columns.

In [8]:
commonColumns = set(list(train_df.columns)).intersection(set(list(test_df.columns))) # between train and test dataframes
print(sorted(commonColumns), f"i.e. {len(commonColumns)} columns are common between train_df and test_df")

['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'league'] i.e. 18 columns are common between train_df and test_df


In [9]:
set(list(train_df.columns)) - commonColumns  # columns only in train_df

{'FTR'}

In [10]:
set(list(test_df.columns)) - commonColumns  # columns only in test_df

{'Referee'}

The referee should not influence the outcome of the game. So, we can safely drop that column in test_df.  

As we train our model, FTR column is not one of the predictors since that is what we are predicting. We drop this too for training. 

So, both the train and test dataframes will have the same columns once we drop these columns from the respective dataframes.

Let us check the datatypes and missing values in the two DFs.

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12789 entries, 0 to 12788
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   AC        12780 non-null  float64
 1   AF        12778 non-null  float64
 2   AR        12780 non-null  float64
 3   AS        12780 non-null  float64
 4   AST       12780 non-null  float64
 5   AY        12780 non-null  float64
 6   AwayTeam  12782 non-null  object 
 7   Date      12782 non-null  object 
 8   FTR       12782 non-null  object 
 9   HC        12780 non-null  float64
 10  HF        12778 non-null  float64
 11  HR        12780 non-null  float64
 12  HS        12780 non-null  float64
 13  HST       12780 non-null  float64
 14  HTAG      12779 non-null  float64
 15  HTHG      12779 non-null  float64
 16  HY        12779 non-null  float64
 17  HomeTeam  12782 non-null  object 
 18  league    12789 non-null  object 
dtypes: float64(14), object(5)
memory usage: 1.9+ MB


Almost every column in train_df except the "league" has null values.  Even the FTR column i.e. the result column has 7 missing values. We will deal with these missing values later in this notebook.

In [12]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1826 entries, 0 to 379
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   AC        1826 non-null   int64 
 1   AF        1826 non-null   int64 
 2   AR        1826 non-null   int64 
 3   AS        1826 non-null   int64 
 4   AST       1826 non-null   int64 
 5   AY        1826 non-null   int64 
 6   AwayTeam  1826 non-null   object
 7   Date      1826 non-null   object
 8   HC        1826 non-null   int64 
 9   HF        1826 non-null   int64 
 10  HR        1826 non-null   int64 
 11  HS        1826 non-null   int64 
 12  HST       1826 non-null   int64 
 13  HTAG      1826 non-null   int64 
 14  HTHG      1826 non-null   int64 
 15  HY        1826 non-null   int64 
 16  HomeTeam  1826 non-null   object
 17  Referee   380 non-null    object
 18  league    1826 non-null   object
dtypes: int64(14), object(5)
memory usage: 285.3+ KB


Almost no column in test_df except Referee has null values. Since this column can be safely dropped, test_df will have no null values!!

We can make use of date information in many ways like identify the season, number of days since last match (to capture player fatigue), fouls or goals per game UNTIL THE LAST GAME etc.  

#### Since we are dealing with time sensitive data here, the model should not have access to data/statistics from future matches)

In [13]:
# change string column to datetime64 column

train_df["Date"] = train_df["Date"].apply(pd.to_datetime)
test_df["Date"] = test_df["Date"].apply(pd.to_datetime)

In [133]:
print(f"\nOldest date in train_df is {min(train_df['Date'])}")
print(f"Latest date in train_df is {max(train_df['Date'])}")

print(f"\nOldest date in test_df is {min(test_df['Date'])}")
print(f"Latest date in test_df is {max(test_df['Date'])}")


Oldest date in train_df is 2009-01-11 00:00:00
Latest date in train_df is 2017-12-05 00:00:00

Oldest date in test_df is 2017-01-10 00:00:00
Latest date in test_df is 2018-12-05 00:00:00


The date column has been read correctly since it matches the information given in the problem statement i.e. the matches in the training date from 2009 to 2017.  

The dates tell us that it's calendar year 2009 and NOT start-of-season 2009 for train_df. Similar observations can be made for latest date in train_df, oldest date of test_df, latest_date of test_df. We will cross-check this later in this notebook.

In [14]:
# the following columns are always non-negative integers and can never be float values. The values are always less than the max value of Int8 since there can't something like 127 (max value of signed int) corners/shots in a match. Let us change them to integer values.

columnNames = ['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY']
for columnName in columnNames:
    if min(train_df[columnName]) < 0 or min(test_df[columnName]) < 0:
        raise Exception(f"Value under {columnName} in train_df or test_df is negative. This cannot happen. Some issue with the data!!")
    # if no issue found, change that column to integer type.
    train_df[columnName] = train_df[columnName].astype('Int8')
    test_df[columnName] = test_df[columnName].astype('Int8')

In [15]:
# the following columns are/should always be strings

columnNames = ['AwayTeam', 'HomeTeam', 'league'] # skipped Referee since we will dropping it. FTR will be looked into.
for columnName in columnNames:
    train_df[columnName] = train_df[columnName].astype('string')
    test_df[columnName] = test_df[columnName].astype('string')

In [16]:
# check for any unexpected values under the FTR column

Counter(list(train_df["FTR"].astype('string')))

Counter({'H': 5931, 'D': 3283, 'A': 3568, <NA>: 7})

In [17]:
# change FTR column to string.
train_df["FTR"] = train_df["FTR"].astype('string') # null values are taken care of automatically

In [18]:
# check for duplicated rows and keep only one occurence of the row. We'll choose the first occurence of the row. 
print(f"Number of rows in train_df = {len(train_df)}")
print(f"Number of rows in test_df = {len(test_df)}")

# drop duplicates
train_df = train_df.drop_duplicates(keep='first')
test_df = test_df.drop_duplicates(keep='first')
print("After dropping duplicates ...")

print(f"Number of rows in train_df = {len(train_df)}")
print(f"Number of rows in test_df = {len(test_df)}")


Number of rows in train_df = 12789
Number of rows in test_df = 1826
After dropping duplicates ...
Number of rows in train_df = 12785
Number of rows in test_df = 1826


Four rows from train_df where there were duplicates have been dropped.

In [24]:
# let us drop the Referee column in the test_df
test_df = test_df.drop(['Referee'], axis=1)

Once again, let us list the datatypes and number of null values under each column for train_df and test_df

In [25]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12785 entries, 0 to 12788
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AC        12780 non-null  Int8          
 1   AF        12778 non-null  Int8          
 2   AR        12780 non-null  Int8          
 3   AS        12780 non-null  Int8          
 4   AST       12780 non-null  Int8          
 5   AY        12780 non-null  Int8          
 6   AwayTeam  12782 non-null  string        
 7   Date      12782 non-null  datetime64[ns]
 8   FTR       12782 non-null  string        
 9   HC        12780 non-null  Int8          
 10  HF        12778 non-null  Int8          
 11  HR        12780 non-null  Int8          
 12  HS        12780 non-null  Int8          
 13  HST       12780 non-null  Int8          
 14  HTAG      12779 non-null  Int8          
 15  HTHG      12779 non-null  Int8          
 16  HY        12779 non-null  Int8          
 17  HomeTeam  12

In [27]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1826 entries, 0 to 379
Data columns (total 18 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AC        1826 non-null   Int8          
 1   AF        1826 non-null   Int8          
 2   AR        1826 non-null   Int8          
 3   AS        1826 non-null   Int8          
 4   AST       1826 non-null   Int8          
 5   AY        1826 non-null   Int8          
 6   AwayTeam  1826 non-null   string        
 7   Date      1826 non-null   datetime64[ns]
 8   HC        1826 non-null   Int8          
 9   HF        1826 non-null   Int8          
 10  HR        1826 non-null   Int8          
 11  HS        1826 non-null   Int8          
 12  HST       1826 non-null   Int8          
 13  HTAG      1826 non-null   Int8          
 14  HTHG      1826 non-null   Int8          
 15  HY        1826 non-null   Int8          
 16  HomeTeam  1826 non-null   string        
 17  league    1826 

Get the count of null values in all columns in the two dataframes

In [30]:
train_df.isnull().sum() # number of null values by column in train_df

AC          5
AF          7
AR          5
AS          5
AST         5
AY          5
AwayTeam    3
Date        3
FTR         3
HC          5
HF          7
HR          5
HS          5
HST         5
HTAG        6
HTHG        6
HY          6
HomeTeam    3
league      0
dtype: int64

In [31]:
test_df.isnull().sum() # number of null values by column in test_df

AC          0
AF          0
AR          0
AS          0
AST         0
AY          0
AwayTeam    0
Date        0
HC          0
HF          0
HR          0
HS          0
HST         0
HTAG        0
HTHG        0
HY          0
HomeTeam    0
league      0
dtype: int64

Let us look at the rows with one or more values in the dataframes.

In [40]:
train_df[train_df.isnull().any(axis=1)] # shape = 9 rows, 19 columns

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
5620,5.0,,0.0,6.0,3.0,3.0,Marseille,2011-09-18,H,3.0,,0.0,10.0,5.0,0.0,2.0,0.0,Lyon,ligue-1
5733,1.0,,0.0,9.0,3.0,5.0,Nancy,2011-12-17,A,8.0,,0.0,9.0,1.0,1.0,0.0,2.0,Caen,ligue-1
7082,,,,,,,,NaT,,,,,,,,,,,ligue-1
7408,,,,,,,Lyon,2017-04-16,A,,,,,,,,,Bastia,ligue-1
9363,,,,,,,,NaT,,,,,,,,,,,premier-league
11298,,,,,,,Roma,2012-09-23,A,,,,,,,,,Cagliari,serie-a
11644,,,,,,,,NaT,,,,,,,,,,,serie-a
11894,5.0,16.0,0.0,10.0,1.0,5.0,Juventus,2015-02-03,D,1.0,11.0,1.0,8.0,3.0,0.0,0.0,,Roma,serie-a
12426,7.0,21.0,0.0,15.0,5.0,2.0,Pescara,2016-08-28,A,2.0,12.0,0.0,13.0,3.0,,,2.0,Sassuolo,serie-a


Row with indices 7082, 9363, 11644 have null under all columns without any information other than the leagues they belong to.   If we knew the Home and Away teams along with the season information, we could have retrieved from outside sources and filled in the information. **Unfortunately, we will have to drop these rows.**

In [46]:
train_df = train_df.drop(axis=1, index = [7082, 9363, 11644])  # drop from train_df where FTR is null

In [48]:
train_df[train_df.isnull().any(axis=1)] # check rows with one or more null values again

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
5620,5.0,,0.0,6.0,3.0,3.0,Marseille,2011-09-18,H,3.0,,0.0,10.0,5.0,0.0,2.0,0.0,Lyon,ligue-1
5733,1.0,,0.0,9.0,3.0,5.0,Nancy,2011-12-17,A,8.0,,0.0,9.0,1.0,1.0,0.0,2.0,Caen,ligue-1
7408,,,,,,,Lyon,2017-04-16,A,,,,,,,,,Bastia,ligue-1
11298,,,,,,,Roma,2012-09-23,A,,,,,,,,,Cagliari,serie-a
11894,5.0,16.0,0.0,10.0,1.0,5.0,Juventus,2015-02-03,D,1.0,11.0,1.0,8.0,3.0,0.0,0.0,,Roma,serie-a
12426,7.0,21.0,0.0,15.0,5.0,2.0,Pescara,2016-08-28,A,2.0,12.0,0.0,13.0,3.0,,,2.0,Sassuolo,serie-a


In [41]:
test_df[test_df.isnull().any(axis=1)] # shape = 0 rows, 18 columns

Unnamed: 0_level_0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


We can get information for the missing values using Date, HomeTeam and AwayTeam using the CSVs we can download. For that, let's identify the leagues we need to download.  

Please look into **0_downloadMoreData.py** file on how we can download data for each season for different leagues. Using the data downloaded, we can fill in the NA values we see train_df. Notes about these can be found [here](https://www.football-data.co.uk/notes.txt)

In [160]:
# list the leagues in both the dfs
print("Leagues in train_df ::", list(np.unique(train_df["league"]))) 
print("Leagues in test_df :: ", list(np.unique(test_df["league"])))

Leagues in train_df :: ['bundesliga', 'la-liga', 'ligue-1', 'premier-league', 'serie-a']
Leagues in test_df ::  ['bundesliga', 'la-liga', 'ligue-1', 'premier-league', 'serie-a']


In [184]:
# identify rows in train_df that have one or more null rows
train_df_oneOrMoreNulls = train_df[train_df.isnull().any(axis=1)]
train_df_oneOrMoreNulls

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
5620,5.0,,0.0,6.0,3.0,3.0,Marseille,2011-09-18,H,3.0,,0.0,10.0,5.0,0.0,2.0,0.0,Lyon,ligue-1
5733,1.0,,0.0,9.0,3.0,5.0,Nancy,2011-12-17,A,8.0,,0.0,9.0,1.0,1.0,0.0,2.0,Caen,ligue-1
7408,,,,,,,Lyon,2017-04-16,A,,,,,,,,,Bastia,ligue-1
11298,,,,,,,Roma,2012-09-23,A,,,,,,,,,Cagliari,serie-a
11894,5.0,16.0,0.0,10.0,1.0,5.0,Juventus,2015-02-03,D,1.0,11.0,1.0,8.0,3.0,0.0,0.0,,Roma,serie-a
12426,7.0,21.0,0.0,15.0,5.0,2.0,Pescara,2016-08-28,A,2.0,12.0,0.0,13.0,3.0,,,2.0,Sassuolo,serie-a


In [None]:
# impute data 

In [None]:
for 

Our priliminary analysis of the train_df above showed that the matches are from the beginning of calendar year 2009 (not season 2009-10) and end of calendar year 2017 (not season 2016-17). Let us verify that claim.

In [125]:
for league in list(np.unique(train_df["league"])):
    print(f"League :: {league}")
    print(sorted(Counter(train_df[(train_df["league"] == league)]["Date"].dt.strftime("%m")).items()))
    print("=======\n")

League :: bundesliga
[('01', 194), ('02', 214), ('03', 185), ('04', 242), ('05', 142), ('06', 70), ('07', 72), ('08', 184), ('09', 240), ('10', 229), ('11', 213), ('12', 157)]

League :: la-liga
[('01', 296), ('02', 268), ('03', 250), ('04', 302), ('05', 201), ('06', 80), ('07', 90), ('08', 197), ('09', 289), ('10', 270), ('11', 235), ('12', 182)]

League :: ligue-1
[('01', 250), ('02', 257), ('03', 211), ('04', 260), ('05', 205), ('06', 92), ('07', 106), ('08', 269), ('09', 271), ('10', 262), ('11', 247), ('12', 230)]

League :: premier-league
[('01', 288), ('02', 217), ('03', 215), ('04', 276), ('05', 185), ('06', 79), ('07', 67), ('08', 255), ('09', 238), ('10', 273), ('11', 242), ('12', 325)]

League :: serie-a
[('01', 278), ('02', 266), ('03', 226), ('04', 265), ('05', 188), ('06', 124), ('07', 91), ('08', 173), ('09', 310), ('10', 315), ('11', 236), ('12', 188)]



We see matches in every month of the season. There seems to be some issue with the data since football leagues take a Summer break in these months.  

Let us verify that further by checking if any league has any break during these months. We expect to see zero matches.

In [158]:
# count the number of matches in the summer months of the year for all the leagues in the training data

for year in range(2009, 2017+1): # since train_df is from calendar year 2009 to 2017
    for league in list(np.unique(train_df["league"])):
        print(f"League :: {str(year)}" f" {league}")
        print(sorted(Counter(train_df[
                                (train_df["league"] == league) & \
                                (train_df["Date"].dt.year == year) & \
                                ((train_df["Date"].dt.month == 6) | (train_df["Date"].dt.month == 7)) # summer months
                            ]["Date"].dt.strftime("%m")).items() ))
        print("=======")
    print("***"*10)

League :: 2009 bundesliga
[('06', 3), ('07', 6)]
League :: 2009 la-liga
[('06', 6), ('07', 4)]
League :: 2009 ligue-1
[('06', 4), ('07', 7)]
League :: 2009 premier-league
[('06', 2), ('07', 5)]
League :: 2009 serie-a
[('06', 8), ('07', 2)]
******************************
League :: 2010 bundesliga
[('06', 18), ('07', 7)]
League :: 2010 la-liga
[('06', 9), ('07', 20)]
League :: 2010 ligue-1
[('06', 19), ('07', 21)]
League :: 2010 premier-league
[('06', 19), ('07', 6)]
League :: 2010 serie-a
[('06', 17), ('07', 24)]
******************************
League :: 2011 bundesliga
[('06', 11), ('07', 11)]
League :: 2011 la-liga
[('06', 16), ('07', 9)]
League :: 2011 ligue-1
[('06', 23), ('07', 10)]
League :: 2011 premier-league
[('06', 7), ('07', 7)]
League :: 2011 serie-a
[('06', 32), ('07', 2)]
******************************
League :: 2012 bundesliga
[('06', 5), ('07', 11)]
League :: 2012 la-liga
[('06', 5), ('07', 16)]
League :: 2012 ligue-1
[('06', 10), ('07', 19)]
League :: 2012 premier-league

In [159]:
# let us list some Premier League matches in July 2014

train_df[(train_df["league"] == "premier-league") & (train_df["Date"].dt.year == 2014) & (train_df["Date"].dt.month == 7)]

Unnamed: 0,AC,AF,AR,AS,AST,AY,AwayTeam,Date,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HY,HomeTeam,league
9130,5,14,1,10,4,4,Leicester,2014-07-12,H,5,9,0,16,7,1,1,4,Aston Villa,premier-league
9131,4,18,1,11,2,1,Swansea,2014-07-12,H,7,11,0,15,7,1,1,0,West Ham,premier-league


We see two matches here but looking at data on the internet on Wikipedia and football-data.co.uk, we see that these matches did not actually happen. 

The [2013-14 PL season](https://en.wikipedia.org/wiki/2013–14_Premier_League) started on 17 August 2013, and concluded on 11 May 2014.  The [football-data.co.uk for 2013-14 season](https://www.football-data.co.uk/mmz4281/1314/E0.csv) source also confirms this.  
The [2014-15 PL season](https://en.wikipedia.org/wiki/2014–15_Premier_League) started on 16 August 2014 and concluded on 24 May 2015. The [football-data.co.uk for 2014-15 season](https://www.football-data.co.uk/mmz4281/1415/E0.csv) source also confirms this.  

**However, for the purposes of this exercise we will ignore this issue FOR NOW.**
