# Beat The Bookies: Predicting EPL Matches
_Team C_

__Mohammad Ali Syed, Abdul Al-Fahim, Dylan Hoi, Henry Chen, Chris Wong & Yolanne Lee__

**Contents:**

[Section 1](#section1): Introduction

[Section 2](#section2): Data Import

[Section 3](#section3): Data Transformation & Exploration

[Section 4](#section4): Methodology Overview

[Section 5](#section5): Model Training & Validation

[Section 6](#section6): Results

[Section 7](#section7): Final Predictions on Test Set

## Introduction
<a name='section1'></a>

## Data Import
<a name='section2'></a>

In [1]:
#import packages
import numpy as np
import pandas as pd
import datetime as datetime

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import matplotlib.pyplot as plt

In [2]:
#Load data

#Change this to your directory
dirName = 'Data_Files/'
filePath = f'{dirName}epl-training.csv'

#Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

#Read the training CSV and keep only its first 22 columns
#(the columns after those are empty NaN padding at the end of the file)
data = pd.read_csv(filePath).iloc[:, :22]

## Data Transformation & Exploration
<a name='section3'></a>

In [3]:
############################################# Feature Visualisation
#Visualise correlations between different statistics
from pandas.plotting import scatter_matrix

#Sort data by teams
#Build one empty list per home team and per referee, keyed in sorted order.
#The keys are read straight from the group labels: averaging every column first
#(groupby(...).mean()) is wasted work and raises a TypeError on pandas >= 2.0,
#where non-numeric columns are no longer dropped silently before aggregating.
teams = {team: [] for team in data.groupby('HomeTeam').size().index}
referees = {referee: [] for referee in data.groupby('Referee').size().index}

In [50]:
#Split dataset into input and output data

#Output variable
#Column 5 is the target; take it as a flat 1-D array (the shape LabelEncoder
#expects) and encode its values as integer class labels
y = LabelEncoder().fit_transform(data.iloc[:, 5].to_numpy())

#Input variables
#Remove give away columns such as goals scored (positions 3 to 8 inclusive)
data_filtered = data.drop(columns = data.columns[3:9])

In [51]:
#Dates
#Parse the raw date strings into datetimes
#NOTE(review): if the CSV stores dates day-first (e.g. dd/mm/yy), ambiguous values
#may need pd.to_datetime(..., dayfirst=True) - confirm against the file
data_filtered['Date'] = pd.to_datetime(data_filtered['Date'])
#year has been removed as we need to predict future results -> https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96
data_filtered['Month'] = data_filtered['Date'].dt.month
data_filtered['Week'] = data_filtered['Date'].dt.isocalendar().week
data_filtered['Day'] = data_filtered['Date'].dt.day
#Extract encoded dates
#Selected by name rather than by position (previously 16:19), so the right
#columns are picked even if the number of columns in the CSV changes
date_feature_names = ['Month', 'Week', 'Day']
dates_split = data_filtered[date_feature_names]
#Remove encoded dates and original date column
data_filtered = data_filtered.drop(columns = ['Date'] + date_feature_names)

In [52]:
#Encode categorical data
#One-hot encoder for the categorical columns; with handle_unknown='ignore' a
#category that was not seen while fitting does not raise an error at transform time
encoder = OneHotEncoder(
    handle_unknown='ignore',
)

In [53]:
#Teams
#Each team column gets its own fitted encoder. Re-fitting the single shared
#`encoder` for every column overwrites the categories it learned before, which
#leaves no way to apply the same home/away encoding to new data later on.
home_encoder = OneHotEncoder(handle_unknown='ignore')
away_encoder = OneHotEncoder(handle_unknown='ignore')

#Columns 0 and 1 are the home-team and away-team columns at this point
home_t = home_encoder.fit_transform(data_filtered.iloc[:, 0:1])
away_t = away_encoder.fit_transform(data_filtered.iloc[:, 1:2])

#The raw team columns are replaced by their encoded form, so drop them
data_filtered = data_filtered.drop(labels = data_filtered.columns[[0,1]], axis = 1)

In [54]:
#Referees
#Use a dedicated encoder for the referee column so its fitted categories stay
#available afterwards instead of living in a shared encoder that other columns re-fit
ref_encoder = OneHotEncoder(handle_unknown='ignore')

#Column 0 is the referee column at this point
ref = ref_encoder.fit_transform(data_filtered.iloc[:, 0:1])

#The raw referee column is replaced by its encoded form, so drop it
data_filtered = data_filtered.drop(labels = data_filtered.columns[[0]], axis = 1)

In [55]:
#Re-stack columns
#Turn the sparse one-hot matrices into dense frames so they can be joined on the row index
ref_frame = pd.DataFrame(ref.toarray())
home_frame = pd.DataFrame(home_t.toarray())
away_frame = pd.DataFrame(away_t.toarray())

#Append the referee, home-team and away-team indicator columns in that order;
#a right-hand column whose label clashes with an existing one receives the suffix
restacked = (
    data_filtered
    .join(ref_frame, rsuffix = '_ref')
    .join(home_frame, rsuffix = '_home')
    .join(away_frame, rsuffix = '_away')
)

#Put the date features in front and make every column label a string
restacked = dates_split.join(restacked)
restacked.columns = restacked.columns.astype(str)

#Keep only the columns at positions 3 to 14
#NOTE(review): this slice appears to discard the date features and every one-hot
#column joined above - confirm that is intended
data_filtered = restacked.iloc[:, 3:15]
data_filtered.head()

Unnamed: 0,Month,Week,Day,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,0_home,1_home,2_home,3_home,4_home,5_home,6_home,7_home,8_home,9_home,10_home,11_home,12_home,13_home,14_home,15_home,16_home,17_home,18_home,19_home,20_home,21_home,22_home,23_home,24_home,25_home,26_home,27_home,28_home,29_home,30_home,31_home,32_home,33_home,34_home,35_home,36_home,37_home,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1,12.1,13.1,14.1,15.1,16.1,17.1,18.1,19.1,20.1,21.1,22.1,23.1,24.1,25.1,26.1,27.1,28.1,29.1,30.1,31.1,32.1,33.1,34.1,35.1,36.1,37.1
0,8,33,16,24,5,14,4,11,8,7,5,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,8,33,16,14,8,8,2,13,12,4,3,1,2,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,33,16,10,15,5,11,11,9,3,5,2,2,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,33,16,11,12,6,6,10,9,5,6,3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,33,16,14,8,10,5,11,12,7,9,1,2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#Scale data

#TODO: scale only the appropriate (numeric) columns -> check how the example notebook handles scaling with respect to the one-hot encoded columns
# scaler = StandardScaler()
# scaled = scaler.fit_transform(data_filtered.iloc[:, 4:15])

In [47]:
#Train model
#Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data_filtered, y, test_size=0.3, random_state=42
)

#Fit the random forest on the training rows and predict the held-out rows
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)

#Percentage of held-out rows whose predicted label matches the true label
base_accuracy = (preds == y_test).sum() / len(y_test) * 100
print(f"Accuracy is: {base_accuracy}%")

Accuracy is: 49.59514170040486%


In [48]:
#Visualise and analyse initial results

#Pair every training column with the importance the fitted forest assigned to it
feature_importances = list(zip(X_train.columns, rf.feature_importances_))

#Most important feature first
feature_importances_ranked = sorted(feature_importances, key = lambda pair: pair[1], reverse = True)

#One line per feature: name padded to 35 characters, followed by its importance
for name, importance in feature_importances_ranked:
    print('Feature: {:35} Importance: {}'.format(name, importance))

Feature: HF                                  Importance: 0.16992815611908976
Feature: AF                                  Importance: 0.16629696303971198
Feature: HC                                  Importance: 0.16164107804061992
Feature: AC                                  Importance: 0.14857356649372386
Feature: AST                                 Importance: 0.14535724105010933
Feature: AY                                  Importance: 0.09929227109608879
Feature: HY                                  Importance: 0.08018471312405809
Feature: AR                                  Importance: 0.014732129997910754
Feature: HR                                  Importance: 0.013993881038687666


In [49]:
#Feature Selection

#Fit a selector around a fresh random forest and keep only the features it
#marks as important (selector's default threshold)
sel = SelectFromModel(RandomForestClassifier(random_state = 42))
X_train_selected = sel.fit_transform(X_train, y_train)
X_test_selected = sel.transform(X_test)
selected_feat = X_train.columns[sel.get_support()]
print(selected_feat)

#Train a separate classifier on the reduced feature set. Re-fitting `rf` in place
#(and overwriting `preds` / `base_accuracy`) would destroy the baseline results and
#leave rf.feature_importances_ out of step with X_train's columns, so re-running
#the importance cell would silently pair names with the wrong values.
rf_selected = RandomForestClassifier(random_state = 42)
rf_selected.fit(X_train_selected, y_train)
preds_selected = rf_selected.predict(X_test_selected)

#Per-class breakdown of the predictions on the held-out rows
print(confusion_matrix(y_test, preds_selected))
print(classification_report(y_test, preds_selected))

#Percentage of held-out rows predicted correctly with the selected features
selected_accuracy = (np.sum(preds_selected == y_test) / len(y_test)) * 100
print("Accuracy is: " + str(selected_accuracy) + "%")

Index(['AST', 'HF', 'AF', 'HC', 'AC'], dtype='object')
[[215  47 155]
 [122  52 196]
 [187 101 407]]
              precision    recall  f1-score   support

           0       0.41      0.52      0.46       417
           1       0.26      0.14      0.18       370
           2       0.54      0.59      0.56       695

    accuracy                           0.45      1482
   macro avg       0.40      0.41      0.40      1482
weighted avg       0.43      0.45      0.44      1482

Accuracy is: 45.47908232118758%




## Methodology Overview
<a name='section4'></a>

## Model Training & Validation
<a name='section5'></a>

## Results
<a name='section6'></a>

## Final Predictions on Test Set
<a name='section7'></a>