# ML model which predicts number of goals in EPL

### Import necessary libraries

In [67]:
import pandas as pd
import numpy as np


### Load the datasets

In [68]:
X=pd.read_csv('PL_features.csv')  # Load features dataset prepared from data_preparation file
X.head() # Display loaded dataset

Unnamed: 0,HomeTeam,AwayTeam,Referee,BWH,BWD,BWA,Avg<2.5,Avg>2.5,Day_of_year,Day_of_week,Month,Year,Season
0,Aston Villa,West Ham,M Dean,1.85,3.4,4.1,1.75,2.01,226,Saturday,8,2010,2010/2011
1,Blackburn,Everton,P Dowd,2.75,3.25,2.45,1.7,2.08,226,Saturday,8,2010,2010/2011
2,Bolton,Fulham,S Attwell,2.15,3.25,3.25,1.69,2.09,226,Saturday,8,2010,2010/2011
3,Chelsea,West Brom,M Clattenburg,1.15,6.75,16.0,2.49,1.49,226,Saturday,8,2010,2010/2011
4,Sunderland,Birmingham,A Taylor,2.15,3.2,3.3,1.65,2.16,226,Saturday,8,2010,2010/2011


In [69]:
y=pd.read_csv('Goals_labels.csv')   # Load labels dataset prepared from data_preparation file
y.head()

Unnamed: 0,Number_of_goals
0,3+
1,0-2
2,0-2
3,3+
4,3+


### Make pipeline

In [70]:
list_cols=list(X.columns)  # Make a list with column names from features dataset
print (list_cols)

['HomeTeam', 'AwayTeam', 'Referee', 'BWH', 'BWD', 'BWA', 'Avg<2.5', 'Avg>2.5', 'Day_of_year', 'Day_of_week', 'Month', 'Year', 'Season']


In [114]:
X_cat=X[['HomeTeam', 'AwayTeam', 'Referee','Day_of_week','Season','Month', 'Year']]   # Define categorical colums
X_num=X[['BWH', 'BWD', 'BWA', 'Avg<2.5', 'Avg>2.5', 'Day_of_year']]  # Define numerical columns
print (X_cat.head())
print (X_num.head())

      HomeTeam    AwayTeam        Referee Day_of_week     Season  Month  Year
0  Aston Villa    West Ham         M Dean    Saturday  2010/2011      8  2010
1    Blackburn     Everton         P Dowd    Saturday  2010/2011      8  2010
2       Bolton      Fulham      S Attwell    Saturday  2010/2011      8  2010
3      Chelsea   West Brom  M Clattenburg    Saturday  2010/2011      8  2010
4   Sunderland  Birmingham       A Taylor    Saturday  2010/2011      8  2010
    BWH   BWD    BWA  Avg<2.5  Avg>2.5  Day_of_year
0  1.85  3.40   4.10     1.75     2.01          226
1  2.75  3.25   2.45     1.70     2.08          226
2  2.15  3.25   3.25     1.69     2.09          226
3  1.15  6.75  16.00     2.49     1.49          226
4  2.15  3.20   3.30     1.65     2.16          226


### One hot encoder

In [115]:
from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder 
encoder = OneHotEncoder()
encoder.fit(X_cat)   # Fit categorical columns to encoder

OneHotEncoder()

In [116]:
encoder.categories_

[array(['Arsenal', 'Aston Villa', 'Birmingham', 'Blackburn', 'Blackpool',
        'Bolton', 'Bournemouth', 'Brighton', 'Burnley', 'Cardiff',
        'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield',
        'Hull', 'Leeds', 'Leicester', 'Liverpool', 'Man City',
        'Man United', 'Middlesbrough', 'Newcastle', 'Norwich', 'QPR',
        'Reading', 'Sheffield United', 'Southampton', 'Stoke',
        'Sunderland', 'Swansea', 'Tottenham', 'Watford', 'West Brom',
        'West Ham', 'Wigan', 'Wolves'], dtype=object),
 array(['Arsenal', 'Aston Villa', 'Birmingham', 'Blackburn', 'Blackpool',
        'Bolton', 'Bournemouth', 'Brighton', 'Burnley', 'Cardiff',
        'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield',
        'Hull', 'Leeds', 'Leicester', 'Liverpool', 'Man City',
        'Man United', 'Middlesbrough', 'Newcastle', 'Norwich', 'QPR',
        'Reading', 'Sheffield United', 'Southampton', 'Stoke',
        'Sunderland', 'Swansea', 'Tottenham', 'Watfor

In [117]:
X_cat_1hot = encoder.transform(X_cat).toarray()   # Transform categorical columns to OneHot encoded columns

In [118]:
X_cat_1hot  # Display new array

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

### Label Binarizer

In [76]:
# If it's needed, labels can be converted to binar values
from sklearn.preprocessing import LabelBinarizer  # Import LabelBinarizer
encoder_lb = LabelBinarizer()
y_lb = encoder_lb.fit_transform(y)  # Fit labels values to encoder
y_lb  # Display new array

array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])

### SK learn pipeline

In [77]:
# For complete preprocessing of data:
from sklearn.pipeline import Pipeline  # Import Pipeline
from sklearn.preprocessing import StandardScaler  # Import StandarScaler
from sklearn.pipeline import FeatureUnion  #Import FeatureUnion

In [119]:
num_attribs = list (X_num)  # Create list with columns names of numerical attributes
cat_attribs = list (X_cat)  # Create list with columns names of categorical attributes

In [120]:
# Define class for selection of columns from dataframes
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [121]:
# Define pipeline for numerical features
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('std_scaler', StandardScaler()),
])

In [122]:
# Define pipeline for categorical features
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('one_hot_encoder', OneHotEncoder()),
])

In [123]:
# Merge both pipelines in one
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])

In [124]:
X_prepared = full_pipeline.fit_transform(X)  # Apply full pipeline on features dataset
X_prepared.toarray()   # Switch new dataset to Numpy array type
X_prepared.shape  # Display shape of new array

(3878, 156)

### Split training set

In [125]:
#Shuffle and split prepared datasets to train and test (20%) datasets. It's possible to use y_lb nstead of y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42)


## Train the models

### Linear Support Vector Machine Model

In [126]:
from sklearn import svm
clf_svm = svm.SVC(kernel='linear',probability=True)
clf_svm.fit(X_train, np.ravel(y_train))

SVC(kernel='linear', probability=True)

### Decision Tree Model

In [127]:
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(X_train,  np.ravel(y_train))

DecisionTreeClassifier()

### Random Forest Model

In [128]:
from sklearn.ensemble import RandomForestClassifier
clf_forest=RandomForestClassifier(random_state=42)
clf_forest.fit(X_train,  np.ravel(y_train))

RandomForestClassifier(random_state=42)

### Logistic Regression

In [129]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(max_iter=10000)
clf_log.fit(X_train,  np.ravel(y_train))


LogisticRegression(max_iter=10000)

### Perceptron

In [130]:
from sklearn.linear_model import Perceptron

clf_per = Perceptron(penalty='l2', max_iter=100000)
clf_per.fit(X_train, np.ravel(y_train))

Perceptron(max_iter=100000, penalty='l2')

### MLP Classifier

In [131]:
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(hidden_layer_sizes=(300, ),early_stopping=True,random_state=1, max_iter=1000)
clf_mlp.fit(X_train, np.ravel(y_train))

MLPClassifier(early_stopping=True, hidden_layer_sizes=(300,), max_iter=1000,
              random_state=1)

### Models performance (comparision between models)

In [132]:
#Mean accuracy
print(clf_svm.score(X_test, y_test))
print(clf_dec.score(X_test, y_test))
print(clf_log.score(X_test, y_test))
print(clf_forest.score(X_test, y_test))
print(clf_per.score(X_test, y_test))
print(clf_mlp.score(X_test, y_test))

0.5541237113402062
0.4948453608247423
0.5425257731958762
0.5154639175257731
0.529639175257732
0.5528350515463918


In [133]:
#f1 score
from sklearn.metrics import f1_score

print(f1_score(y_test,clf_svm.predict(X_test), average=None))
print(f1_score(y_test,clf_dec.predict(X_test), average=None))
print(f1_score(y_test,clf_log.predict(X_test), average=None))
print(f1_score(y_test,clf_forest.predict(X_test), average=None))
print(f1_score(y_test,clf_per.predict(X_test), average=None))
print(f1_score(y_test,clf_mlp.predict(X_test), average=None))

[0.44012945 0.62955032]
[0.45251397 0.53110048]
[0.47251114 0.59613197]
[0.44868035 0.56781609]
[0.43234837 0.59845985]
[0.4376013  0.62887701]


#### Based on evaluated results, choosen model is Logistic Regression

## Fine tuning

### GridSearchCV

In [29]:
from sklearn.model_selection import GridSearchCV

In [149]:
parameters = {'solver':('lbfgs','liblinear','newton-cg'),'max_iter':(1000, 10000)}
clf = GridSearchCV(clf_log, parameters, cv=5)
clf.fit(X_train,  np.ravel(y_train))

print(clf.best_params_)
print (clf.best_estimator_)
print(clf.score(X_test, y_test))
print(f1_score(y_test,clf.predict(X_test), average=None))

{'max_iter': 1000, 'solver': 'liblinear'}
LogisticRegression(max_iter=1000, solver='liblinear')
0.5438144329896907
[0.47321429 0.59772727]


## Prediction

In [134]:
X_predict=pd.read_csv('new_round.csv')
X_predict.head()

Unnamed: 0,HomeTeam,AwayTeam,Referee,BWH,BWD,BWA,Avg<2.5,Avg>2.5,Day_of_year,Day_of_week,Month,Year,Season
0,Newcastle,Chelsea,C Pawson,6.0,4.6,1.5,1.62,2.2,326,Saturday,11,2020,2020/2021
1,Aston Villa,Brighton,M Oliver,2.2,3.6,3.1,1.72,2.05,326,Saturday,11,2020,2020/2021
2,Tottenham,Man City,M Dean,4.1,3.8,1.83,1.6,2.25,326,Saturday,11,2020,2020/2021
3,Man United,West Brom,D Coote,1.3,5.5,9.75,1.62,2.2,326,Saturday,11,2020,2020/2021
4,Fulham,Everton,A Madley,4.0,3.8,1.85,1.72,2.05,327,Sunday,11,2020,2020/2021


In [135]:
X_pred_prep = full_pipeline.transform(X_predict)
X_pred_prep.toarray()

print (X_pred_prep.shape[1])
print (X_prepared.shape[1])

156
156


In [150]:
y_predict=clf.predict(X_pred_prep)
y_predict.reshape((10,1))
print(y_predict)

['0-2' '0-2' '0-2' '0-2' '3+' '0-2' '0-2' '0-2' '3+' '3+']


In [151]:
prediction=pd.DataFrame(columns=X_predict.columns)

In [152]:
prediction[['Home']]=X_predict[['HomeTeam']]
prediction[['Away']]=X_predict[['AwayTeam']]
prediction[['Prediction']]=y_predict

In [153]:
prediction[['Home','Away','Prediction']].head(10)

Unnamed: 0,Home,Away,Prediction
0,Newcastle,Chelsea,0-2
1,Aston Villa,Brighton,0-2
2,Tottenham,Man City,0-2
3,Man United,West Brom,0-2
4,Fulham,Everton,3+
5,Sheffield United,West Ham,0-2
6,Leeds,Arsenal,0-2
7,Liverpool,Leicester,0-2
8,Burnley,Crystal Palace,3+
9,Wolves,Southampton,3+
