# Beat The Bookies: Predicting EPL Matches
_Team C_

__Mohammad Ali Syed, Abdul Al-Fahim, Dylan Hoi, Henry Chen, Chris Wong & Yolanne Lee__

**Contents:**

[Section 1](#section1): Introduction

[Section 2](#section2): Data Import

[Section 3](#section3): Data Transformation & Exploration

[Section 4](#section4): Methodology Overview

[Section 5](#section5): Model Training & Validation

[Section 6](#section6): Results

[Section 7](#section7): Final Predictions on Test Set

## Introduction
<a name='section1'></a>

## Data Import
<a name='section2'></a>

In [1]:
#import packages
import numpy as np
import pandas as pd
import datetime as datetime

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import matplotlib.pyplot as plt

In [2]:
#Load data

#Location of the training CSV - change dirName to your own directory
dirName = 'Data_Files/'
filePath = f'{dirName}epl-training.csv'

#Read the file and keep only the first 22 columns (per the original note, the
#columns after those are empty NaN columns)
data = pd.read_csv(filePath).iloc[:, :22]
#Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

## Data Transformation & Exploration
<a name='section3'></a>

In [3]:
############################################# Feature Visualisation
#Visualise correlations between different statistics
from pandas.plotting import scatter_matrix

#Sort data by teams
#Build one empty list per home team and per referee, keyed by name.
#The names are read straight from the key column (sorted, NaN skipped - the
#same keys and order a default groupby produces) instead of going through
#groupby(...).mean(), which averages every other column just to get at the
#keys and raises a TypeError on non-numeric columns in pandas >= 2.0.
teams = {team: [] for team in sorted(data['HomeTeam'].dropna().unique())}
referees = {referee: [] for referee in sorted(data['Referee'].dropna().unique())}

In [4]:
#Split dataset into input and output data

#Output variable
#Take column 5 as a flat 1-D array (the shape LabelEncoder works on) and
#encode its labels as integers
y = data.iloc[:, 5:6].to_numpy().ravel()
y = LabelEncoder().fit_transform(y)

#Input variables
#Drop columns 3 to 8, which give the result away (e.g. goals scored)
data_filtered = data.drop(columns = data.columns[[3, 4, 5, 6, 7, 8]])

In [5]:
#Dates
#NOTE(review): to_datetime is called without dayfirst/format; if the CSV stores
#dates day-first (dd/mm/yyyy), ambiguous dates may be parsed month-first - confirm
data_filtered['Date'] = pd.to_datetime(data_filtered['Date'])
#year has been removed as we need to predict future results -> https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96
data_filtered['Month'] = data_filtered['Date'].dt.month
data_filtered['Week'] = data_filtered['Date'].dt.isocalendar().week
data_filtered['Day'] = data_filtered['Date'].dt.day
#Extract encoded dates
#Columns are picked by name rather than by position ([0, 16, 17, 18]) so the
#right ones are taken even if the number of columns in the data changes
date_columns = ['Month', 'Week', 'Day']
dates_split = data_filtered[date_columns]
#Remove encoded dates and original date column
data_filtered = data_filtered.drop(columns = ['Date'] + date_columns)

In [6]:
#Encode categorical data
#One encoder object is reused (and re-fitted) for the team and referee columns
#in the cells that follow; handle_unknown="ignore" makes transform() tolerate
#categories that were not present when the encoder was fitted
encoder = OneHotEncoder(handle_unknown = "ignore")

In [None]:
#Teams
#One-hot encode column 0 (home team) and column 1 (away team); the shared
#encoder is re-fitted for each column, so the two encodings are independent
home_t = encoder.fit_transform(data_filtered.iloc[:, 0:1])
away_t = encoder.fit_transform(data_filtered.iloc[:, 1:2])

#The raw team columns are not needed once encoded
data_filtered = data_filtered.drop(columns = data_filtered.columns[[0, 1]])

In [None]:
#Referees
#One-hot encode what is now the first column, then remove the raw column
ref = encoder.fit_transform(data_filtered.iloc[:, 0:1])
data_filtered = data_filtered.drop(columns = data_filtered.columns[[0]])

In [None]:
#Re-stack columns
#Label every one-hot column with its source explicitly. join()'s rsuffix is
#only applied to column names that clash, so relying on it left the referee
#columns (and any team columns that happened not to clash) without a suffix.
ref_cols = pd.DataFrame(ref.toarray()).add_suffix('_ref')
home_cols = pd.DataFrame(home_t.toarray()).add_suffix('_home')
away_cols = pd.DataFrame(away_t.toarray()).add_suffix('_away')
#Append the encoded columns on the right (rows are aligned on the index)
data_filtered = data_filtered.join(ref_cols).join(home_cols).join(away_cols)
#Put the date features at the front
data_filtered = dates_split.join(data_filtered)
data_filtered.columns = data_filtered.columns.astype(str)
#NOTE(review): this keeps only positions 3-14, i.e. it drops the three date
#columns at the front and (with the 22-column input) every one-hot column
#appended above - confirm that discarding the encoded features is intended
data_filtered = data_filtered.iloc[:, 3:15]
data_filtered.head()

In [None]:
#Scale data

#scale appropriate columns -> look at how the example Notebook scaled with respect to one-hot encoded columns
# scaler = StandardScaler()
# scaled = scaler.fit_transform(data_filtered.iloc[:, 4:15])

In [None]:
#Train model
#Baseline: random forest with default settings and a fixed seed
rf = RandomForestClassifier(random_state = 42)
#Hold out 30% of the rows for testing (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    data_filtered, y, test_size = 0.3, random_state = 42
)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
#Percentage of held-out rows whose label was predicted correctly
base_accuracy = (np.sum(preds == y_test) / len(y_test)) * 100
print(f"Accuracy is: {base_accuracy}%")

In [None]:
#Visualise and analyse initial results

# print(confusion_matrix(y_test,preds))
# print(classification_report(y_test,preds))
#refs: 15:58
# print(rf.feature_importances_)
#Pair every training column with the importance the fitted forest assigned it
feature_importances = list(zip(X_train.columns, rf.feature_importances_))
#Rank the pairs, most important first
feature_importances_ranked = sorted(feature_importances, key = lambda pair: pair[1], reverse = True)
for feat_name, feat_importance in feature_importances_ranked:
    print('Feature: {:35} Importance: {}'.format(feat_name, feat_importance))
# print(feature_importances)

In [None]:
#Feature Selection

#Let a second random forest (same seed) decide which features to keep, using
#SelectFromModel with its default threshold
sel = SelectFromModel(RandomForestClassifier(random_state = 42))
temp = sel.fit_transform(X_train, y_train)
#Names of the columns that survived the selection
selected_feat = X_train.columns[sel.get_support()]
print(selected_feat)

#Refit the classifier on the reduced training set and evaluate on the
#equally reduced test set
#NOTE(review): this refits the shared `rf`, so afterwards its
#feature_importances_ no longer line up with the columns of X_train
rf.fit(temp, y_train)
preds = rf.predict(sel.transform(X_test))
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))

#Percentage of held-out rows whose label was predicted correctly
base_accuracy = (np.sum(preds == y_test) / len(y_test)) * 100
print(f"Accuracy is: {base_accuracy}%")
#scaled_home = scaler.fit_transform(data_filtered.iloc[:, 4:15])

## Methodology Overview
<a name='section4'></a>

## Model Training & Validation
<a name='section5'></a>

## Results
<a name='section6'></a>

## Final Predictions on Test Set
<a name='section7'></a>