# Beat The Bookies: Predicting EPL Matches
_Team C_

__Mohammad Ali Syed, Abdul Al-Fahim, Dylan Hoi, Henry Chen, Chris Wong & Yolanne Lee__

**Contents:**

[Section 1](#section1): Introduction

[Section 2](#section2): Data Import

[Section 3](#section3): Data Transformation & Exploration

[Section 4](#section4): Methodology Overview

[Section 5](#section5): Model Training & Validation

[Section 6](#section6): Results

[Section 7](#section7): Final Predictions on Test Set

## Introduction
<a name='section1'></a>

## Data Import
<a name='section2'></a>

In [1]:
#Import packages
import numpy as np
import pandas as pd
import datetime as datetime

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import matplotlib.pyplot as plt

In [2]:
#Load data

#Change this to your directory
dirName = 'Data_Files/'
filePath = dirName + 'epl-training.csv'

data = pd.read_csv(filePath)
#Remove empty nan columns at the end
data = data.iloc[:, 0:22]
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,16/08/08,Arsenal,West Brom,1,0,H,1,0,H,H Webb,24,5,14,4,11,8,7,5,0,0,0,0
1,16/08/08,Bolton,Stoke,3,1,H,3,0,H,C Foy,14,8,8,2,13,12,4,3,1,2,0,0
2,16/08/08,Everton,Blackburn,2,3,A,1,1,D,A Marriner,10,15,5,11,11,9,3,5,2,2,0,0
3,16/08/08,Hull,Fulham,2,1,H,1,1,D,P Walton,11,12,6,6,10,9,5,6,3,0,0,0
4,16/08/08,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,14,8,10,5,11,12,7,9,1,2,0,0


## Data Transformation & Exploration
<a name='section3'></a>

In [3]:
############################################# Feature Visualisation
#Visualise correlations between different statistics
from pandas.plotting import scatter_matrix

#Sort data by teams
teams = {}
referees = {}
for i in data.groupby('HomeTeam').mean().T.columns:
    teams[i] = []
for i in data.groupby('Referee').mean().T.columns:
    referees[i] = []

#Compute summary stats per team
temp = data[(data["HomeTeam"] == "Arsenal")]
temp_ = temp.iloc[:, [3,6,10,12,14,16,18,20]]
temp_.sum()

#make data frame for both away teams and home teams for summary stats

test = temp.iloc[:, 5]
test.value_counts()[0]/len(test) #note the 0 index is H so may need to be changed for away 
# temp2 = data[(data["AwayTeam"] == "Arsenal")]
# temp2.head()

0.6396761133603239

In [4]:
#Correlation matrix between full time goals and other features
corr= data.corr()
corr_y = abs(corr["FTHG"])
highest_corr = corr_y[corr_y >0.4]
highest_corr.sort_values(ascending=True)
print("FTHG: \n" + str(highest_corr))

corr= data.corr()
corr_y = abs(corr["FTAG"])
highest_corr = corr_y[corr_y >0.4]
highest_corr.sort_values(ascending=True)
print("FTAG: \n" + str(highest_corr))

FTHG: 
FTHG    1.000000
HTHG    0.686279
HST     0.447116
Name: FTHG, dtype: float64
FTAG: 
FTAG    1.000000
HTAG    0.689755
AST     0.455887
Name: FTAG, dtype: float64


In [5]:
#Split dataset into input and output data

#Output variable
y = data.iloc[:, 5:6]
#Reformat y to make it suitable for LabelEncoder
y = np.array(y).reshape(len(y))
#Encode y
y = LabelEncoder().fit_transform(y)

#Input variables
#Remove give away columns such as goals scored
data_filtered = data.drop(labels = data.columns[[3, 4, 5, 6, 7, 8]], axis = 1)

In [6]:
#Data preprocessing

#Dates
data_filtered['Date'] = pd.to_datetime(data_filtered['Date'])
#year has been removed as we need to predict future results -> https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96
data_filtered['Month'] = data_filtered['Date'].dt.month
data_filtered['Week'] = data_filtered['Date'].dt.isocalendar().week
data_filtered['Day'] = data_filtered['Date'].dt.day
#Extract encoded dates
dates_split = data_filtered.iloc[:, 16:19]
#Remove encoded dates and original date column
data_filtered = data_filtered.drop(labels = data_filtered.columns[[0, 16, 17, 18]], axis = 1)

#Encode categorical data
encoder = OneHotEncoder(handle_unknown='ignore')

#Teams
home_t = data_filtered.iloc[:, 0:1]
home_t = encoder.fit_transform(home_t)

away_t = data_filtered.iloc[:, 1:2]
away_t = encoder.fit_transform(away_t)
data_filtered = data_filtered.drop(labels = data_filtered.columns[[0,1]], axis = 1)

#Referees 
ref = data_filtered.iloc[:, 0:1]
ref = encoder.fit_transform(ref)
data_filtered = data_filtered.drop(labels = data_filtered.columns[[0]], axis = 1)

#Re-stack columns
data_filtered = data_filtered.join(pd.DataFrame(ref.toarray()), rsuffix = '_ref')
data_filtered = data_filtered.join(pd.DataFrame(home_t.toarray()), rsuffix = '_home')
data_filtered = data_filtered.join(pd.DataFrame(away_t.toarray()), rsuffix = '_away')
data_filtered = dates_split.join(data_filtered)
data_filtered.columns = data_filtered.columns.astype(str)

In [7]:
#Train model on entire featureset
rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
base_accuracy = preds == y_test
base_accuracy = (np.sum(base_accuracy) / len(y_test)) * 100
print("Accuracy on entire featureset is: " + str(base_accuracy) + "%")

Accuracy on entire featureset is: 57.827260458839405%


In [8]:
#Train model without Referee feature
data_filtered = data_filtered.iloc[:, 0:15].join(data_filtered.iloc[:, 58:])
rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
accuracy = preds == y_test
accuracy = (np.sum(accuracy) / len(y_test)) * 100
print("Accuracy without Referee: " + str(accuracy) + "%")
print("Difference from before: " + str(accuracy - base_accuracy) + "%")

Accuracy without Referee: 57.962213225371116%
Difference from before: 0.13495276653171118%


In [9]:
#Train model without Date feature
data_filtered = data_filtered.iloc[:, 3:]
rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
accuracy = preds == y_test
accuracy = (np.sum(accuracy) / len(y_test)) * 100
print("Accuracy without Dates: " + str(accuracy) + "%")
print("Difference from before: " + str(accuracy - base_accuracy) + "%")

Accuracy without Dates: 56.81511470985156%
Difference from before: -1.012145748987848%


In [10]:
#Train model on only in game stats to identify most important ones
data_filtered = data_filtered.iloc[:, 0:12]
rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
all_stats_accuracy = preds == y_test
all_stats_accuracy = (np.sum(all_stats_accuracy) / len(y_test)) * 100
print("Accuracy on all game stats: " + str(all_stats_accuracy) + "%")

Accuracy on all game stats: 55.06072874493927%


In [11]:
#Visualise and analyse initial results

#Display feature importances in descending order
feature_importances = list(zip(X_train, rf.feature_importances_))
feature_importances_ranked = sorted(feature_importances, key = lambda x: x[1], reverse = True)
print("Feature Importances: ")
[print('Feature: {:35} Importance: {}'.format(*pair)) for pair in feature_importances_ranked];

print("\nConfusion Matrix: ")
print(confusion_matrix(y_test,preds))
print("\nClassification Report: ")
print(classification_report(y_test,preds))

Feature Importances: 
Feature: HST                                 Importance: 0.13373568693956137
Feature: AST                                 Importance: 0.11086929727346594
Feature: HS                                  Importance: 0.10711965963253876
Feature: AS                                  Importance: 0.10459574408160308
Feature: AF                                  Importance: 0.10278875719257885
Feature: HF                                  Importance: 0.10274903136845186
Feature: HC                                  Importance: 0.09664473425445547
Feature: AC                                  Importance: 0.09167319027635455
Feature: AY                                  Importance: 0.06454575651360132
Feature: HY                                  Importance: 0.0615627757744895
Feature: AR                                  Importance: 0.012278852078380604
Feature: HR                                  Importance: 0.011436514614518712

Confusion Matrix: 
[[244  46 127]
 [134  50 186]
 [1

In [12]:
#Feature Selection
#change names and display selected features more nicely, ideally with their importance, gini impurity...
sel = SelectFromModel(RandomForestClassifier(random_state = 42))
temp = sel.fit_transform(X_train, y_train)
selected_feat= X_train.columns[(sel.get_support())]
print(selected_feat)

Index(['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC'], dtype='object')


In [13]:
#Train model on selected game stats only
indexes = []
for feat in selected_feat:
    indexes.append(data_filtered.columns.get_loc(feat))
    
data_filtered = data_filtered.iloc[:, indexes]

rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
reduced_stats_accuracy = preds == y_test
reduced_stats_accuracy = (np.sum(reduced_stats_accuracy) / len(y_test)) * 100
print("Accuracy on reduced in game stats: " + str(reduced_stats_accuracy) + "%")
print("Difference compared to all game stats: " + str(all_stats_accuracy - reduced_stats_accuracy) + "%")

print("\nConfusion Matrix: ")
print(confusion_matrix(y_test,preds))
print("\nClassification Report: ")
print(classification_report(y_test,preds))

Accuracy on reduced in game stats: 54.79082321187584%
Difference compared to all game stats: 0.26990553306342946%

Confusion Matrix: 
[[234  39 144]
 [107  64 199]
 [118  63 514]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.51      0.56      0.53       417
           1       0.39      0.17      0.24       370
           2       0.60      0.74      0.66       695

    accuracy                           0.55      1482
   macro avg       0.50      0.49      0.48      1482
weighted avg       0.52      0.55      0.52      1482



In [14]:
#Visualisation of new featureset/tree

In [15]:
#Produce new dataset
#Fix column names
#Restack teams and dates

#Original teams are needed to be able to compute priors
data_filtered = data.iloc[:, [1, 2]].join(data_filtered)
data_filtered = dates_split.join(data_filtered)

#Stack previously removed giveaway columns
data_filtered = data_filtered.join(data.iloc[:, [3, 4, 6, 7, 8]])

#Feature engineer second half goals
#Second half home goals
SHHG = np.array(data.iloc[:, [3]]) - np.array(data.iloc[:, [6]])
#Second half away goals
SHAG = np.array(data.iloc[:, [4]]) - np.array(data.iloc[:, [7]])
data_filtered = data_filtered.join(pd.DataFrame(SHHG), rsuffix = 'SHHG')
data_filtered = data_filtered.join(pd.DataFrame(SHAG), rsuffix = 'SHAG')
data_filtered.columns = data_filtered.columns.astype(str)
#One hot encode dates? -> make the one hot columns but dont stack now, will be better to do after priors all together with teams

## Methodology Overview
<a name='section4'></a>

## Model Training & Validation
<a name='section5'></a>

## Results
<a name='section6'></a>

## Final Predictions on Test Set
<a name='section7'></a>