# Beat The Bookies: Predicting EPL Matches
_Team C_

__Mohammad Ali Syed, Abdul Al-Fahim, Dylan Hoi, Henry Chen, Chris Wong & Yolanne Lee__

**Contents:**

[Section 1](#section1): Introduction

[Section 2](#section2): Data Import

[Section 3](#section3): Data Transformation & Exploration

[Section 4](#section4): Methodology Overview

[Section 5](#section5): Model Training & Validation

[Section 6](#section6): Results

[Section 7](#section7): Final Predictions on Test Set

## Introduction
<a name='section1'></a>

## Data Import
<a name='section2'></a>

In [1]:
#import packages
import numpy as np
import pandas as pd
import datetime as datetime

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import matplotlib.pyplot as plt

In [2]:
#Load data

#Change this to your directory
dirName = 'Data_Files/'
filePath = dirName + 'epl-training.csv'

#Read the training CSV and keep only its first 22 columns; the columns after
#those are the empty (all-NaN) ones at the end of the file
data = pd.read_csv(filePath).iloc[:, :22]
#Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

## Data Transformation & Exploration
<a name='section3'></a>

In [3]:
############################################# Feature Visualisation
#Visualise correlations between different statistics
from pandas.plotting import scatter_matrix

#Sort data by teams
#Create one empty list per distinct home team and per distinct referee.
#Only the group keys are needed, so they are read from groupby(...).size()
#instead of groupby(...).mean(): averaging the whole frame also tries to average
#the text columns, which raises a TypeError on pandas >= 2.0.
#The keys come out in the same (sorted, NaN-free) order as before.
teams = {team: [] for team in data.groupby('HomeTeam').size().index}
referees = {referee: [] for referee in data.groupby('Referee').size().index}

In [19]:
#Split dataset into input and output data

#Output variable: column 5 of the raw frame, pulled out as a flat 1-D array
#(the shape LabelEncoder expects) and encoded as integer class labels
y = data.iloc[:, 5].to_numpy()
y = LabelEncoder().fit_transform(y)

#Input variables
#Remove give away columns such as goals scored (raw columns 3 to 8 inclusive)
#NOTE(review): the match statistics that remain are presumably only known once a
#game has been played - confirm they will exist for the fixtures to be predicted
data_filtered = data.drop(columns = data.columns[3:9])

In [20]:
#Dates
#NOTE(review): to_datetime is called without dayfirst/format; if the CSV stores
#dates day-first, ambiguous dates would be read month-first - confirm against the file
match_dates = pd.to_datetime(data_filtered['Date'])
#The year is deliberately not extracted because the model has to predict future results
# -> https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96
data_filtered = data_filtered.assign(
    Date = match_dates,
    Month = match_dates.dt.month,
    Week = match_dates.dt.isocalendar().week,
    Day = match_dates.dt.day,
)
#Extract encoded dates: the three new columns are appended at positions 16-18
dates_split = data_filtered.iloc[:, 16:19]
#Remove encoded dates and original date column (position 0)
data_filtered = data_filtered.drop(columns = data_filtered.columns[[0, 16, 17, 18]])

In [21]:
#Encode categorical data
#One shared one-hot encoder; categories not seen during fitting are ignored at transform time
encoder = OneHotEncoder(handle_unknown = "ignore")

In [22]:
#Teams
#One-hot encode the first column (home side) and the second column (away side).
#The shared encoder is re-fitted by every fit_transform call, so afterwards it
#only holds the categories of the last column it was fitted on.
home_t = encoder.fit_transform(data_filtered.iloc[:, [0]])
away_t = encoder.fit_transform(data_filtered.iloc[:, [1]])
#The raw team columns are now redundant
data_filtered = data_filtered.drop(columns = data_filtered.columns[:2])

In [23]:
#Referees
#With the team columns gone the referee is the first remaining column:
#one-hot encode it, then drop the raw column
ref = encoder.fit_transform(data_filtered.iloc[:, [0]])
data_filtered = data_filtered.drop(columns = data_filtered.columns[:1])

In [24]:
#Re-stack columns
#Append the one-hot blocks in the order referee, home team, away team.
#The blocks carry integer column labels; rsuffix is only applied by join to
#labels that clash with ones already present in the frame.
for encoded, suffix in ((ref, '_ref'), (home_t, '_home'), (away_t, '_away')):
    data_filtered = data_filtered.join(pd.DataFrame(encoded.toarray()), rsuffix = suffix)
#Date parts go in front of everything else
data_filtered = dates_split.join(data_filtered)
#Make every column label a string
data_filtered.columns = data_filtered.columns.astype(str)

In [10]:
#Scale data

#TODO: scale the appropriate columns -> check how the example notebook handled scaling with respect to the one-hot encoded columns
# scaler = StandardScaler()
# scaled = scaler.fit_transform(data_filtered.iloc[:, 4:15])

In [25]:
#Train model
#Baseline: random forest on the complete feature set, scored on a 30% hold-out split
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
#Percentage of hold-out rows whose predicted label matches the true label
n_correct = np.sum(preds == y_test)
base_accuracy = (n_correct / len(y_test)) * 100
print("Accuracy is: " + str(base_accuracy) + "%")

Accuracy is: 57.827260458839405%


In [26]:
#Ablation: retrain without the one-hot referee columns.
#Layout of the fully stacked frame: 3 date parts + 12 match statistics (15
#leading columns), then the referee block, then the home and away team blocks.
n_leading_cols = 15
#Width of the referee block is read from the encoded matrix instead of being
#hard-coded, so the cut stays correct if the number of referees changes
n_ref_cols = ref.shape[1]
full_width = n_leading_cols + n_ref_cols + home_t.shape[1] + away_t.shape[1]
#Only cut while the frame still has its full stacked width; re-running the cell
#must not slice away team columns a second time
if data_filtered.shape[1] == full_width:
    data_filtered = data_filtered.iloc[:, :n_leading_cols].join(data_filtered.iloc[:, n_leading_cols + n_ref_cols:])
rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
#Percentage of hold-out rows predicted correctly
accuracy = (np.sum(preds == y_test) / len(y_test)) * 100
print("Accuracy without Referee: " + str(accuracy) + "%")
#Compared against base_accuracy, the score of the all-features model
print("Difference from before: " + str(accuracy - base_accuracy))

Accuracy without Referee: 57.962213225371116%
Difference from before: 0.13495276653171118


In [28]:
#Ablation: retrain without the date-derived columns.
#They are dropped by name instead of slicing off the first three positions:
#the result is the same on the first run, and re-running the cell no longer
#strips three further (real) feature columns.
data_filtered = data_filtered.drop(columns = ['Month', 'Week', 'Day'], errors = 'ignore')
rf=RandomForestClassifier(random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
#Percentage of hold-out rows predicted correctly
accuracy = (np.sum(preds == y_test) / len(y_test)) * 100
print("Accuracy without Dates: " + str(accuracy) + "%")
#Compared against base_accuracy, the score of the all-features model
print("Difference from before: " + str(accuracy - base_accuracy))
#Dates provide valuable information

Accuracy without Referee: 56.81511470985156%
Difference from before: -1.012145748987848


In [29]:
#Ablation: keep only the first 12 columns (the game statistics) and retrain
data_filtered = data_filtered.iloc[:, :12]
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
#Percentage of hold-out rows predicted correctly
n_correct = np.sum(preds == y_test)
accuracy = (n_correct / len(y_test)) * 100
print("Accuracy with only game stats: " + str(accuracy) + "%")

Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,24,5,14,4,11,8,7,5,0,0,0,0
1,14,8,8,2,13,12,4,3,1,2,0,0
2,10,15,5,11,11,9,3,5,2,2,0,0
3,11,12,6,6,10,9,5,6,3,0,0,0
4,14,8,10,5,11,12,7,9,1,2,0,0


In [14]:
#Visualise and analyse initial results

#Pair every training column with the importance the fitted forest assigned to it
feature_importances = list(zip(X_train.columns, rf.feature_importances_))
#Most important feature first
feature_importances_ranked = list(feature_importances)
feature_importances_ranked.sort(key = lambda pair: pair[1], reverse = True)
for pair in feature_importances_ranked:
    print('Feature: {:35} Importance: {}'.format(*pair))

Feature: HST                                 Importance: 0.13373568693956137
Feature: AST                                 Importance: 0.11086929727346594
Feature: HS                                  Importance: 0.10711965963253876
Feature: AS                                  Importance: 0.10459574408160308
Feature: AF                                  Importance: 0.10278875719257885
Feature: HF                                  Importance: 0.10274903136845186
Feature: HC                                  Importance: 0.09664473425445547
Feature: AC                                  Importance: 0.09167319027635455
Feature: AY                                  Importance: 0.06454575651360132
Feature: HY                                  Importance: 0.0615627757744895
Feature: AR                                  Importance: 0.012278852078380604
Feature: HR                                  Importance: 0.011436514614518712


In [15]:
#Feature Selection
#Fit a selector backed by a random forest on the training split and keep only
#the columns it supports
sel = SelectFromModel(RandomForestClassifier(random_state = 42))
temp = sel.fit_transform(X_train, y_train)
selected_feat= X_train.columns[(sel.get_support())]
print(selected_feat)
#Refit the current forest on the reduced training matrix and evaluate it on the
#identically reduced hold-out split
rf.fit(temp, y_train)
preds = rf.predict(sel.transform(X_test))
print(confusion_matrix(y_test,preds))
print(classification_report(y_test,preds))

#Stored in `accuracy`, like the other experiment cells. Writing it to
#base_accuracy would overwrite the all-features baseline that the
#"Difference from before" comparisons rely on.
accuracy = (np.sum(preds == y_test) / len(y_test)) * 100
print("Accuracy with filtered game stats: " + str(accuracy) + "%")

Index(['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC'], dtype='object')
[[234  39 144]
 [107  64 199]
 [118  63 514]]
              precision    recall  f1-score   support

           0       0.51      0.56      0.53       417
           1       0.39      0.17      0.24       370
           2       0.60      0.74      0.66       695

    accuracy                           0.55      1482
   macro avg       0.50      0.49      0.48      1482
weighted avg       0.52      0.55      0.52      1482

Accuracy with filtered game stats: 54.79082321187584%




In [16]:
#Narrow the frame to the columns picked by the feature selector
indexes = [data_filtered.columns.get_loc(feat) for feat in selected_feat]
data_filtered = data_filtered.iloc[:, indexes]
#The one-hot team columns and the date parts are not re-attached here

#Retrain from scratch on the narrowed frame and score on a 30% hold-out split
X_train, X_test, y_train, y_test = train_test_split(data_filtered, y, test_size=0.3, random_state=42)
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
n_correct = np.sum(preds == y_test)
accuracy = (n_correct / len(y_test)) * 100
print("Accuracy: " + str(accuracy) + "%")

#List the features of the retrained forest, most important first
feature_importances = list(zip(X_train.columns, rf.feature_importances_))
feature_importances_ranked = list(feature_importances)
feature_importances_ranked.sort(key = lambda pair: pair[1], reverse = True)
for pair in feature_importances_ranked:
    print('Feature: {:35} Importance: {}'.format(*pair))

Accuracy: 54.79082321187584%
Feature: HST                                 Importance: 0.1481086731561694
Feature: HS                                  Importance: 0.12712081313495965
Feature: AST                                 Importance: 0.1260112346081479
Feature: AS                                  Importance: 0.12539776540908287
Feature: HF                                  Importance: 0.12447530553237358
Feature: AF                                  Importance: 0.12329108123989384
Feature: HC                                  Importance: 0.11444302002549617
Feature: AC                                  Importance: 0.11115210689387661


In [17]:
#Visualisation of new featureset/tree

In [18]:
#Produce new datasets

## Methodology Overview
<a name='section4'></a>

## Model Training & Validation
<a name='section5'></a>

## Results
<a name='section6'></a>

## Final Predictions on Test Set
<a name='section7'></a>