In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.display import display
%matplotlib inline

In [2]:
# Read data and drop redundant column.
data = pd.read_csv('epl_train.csv')

# Preview data.
data = data[data.MW > 3]

data.drop(['Unnamed: 0','HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC','HomeTeamLP', 'AwayTeamLP','DiffPts','HTFormPts','ATFormPts',
           'HM4','HM5','AM4','AM5','HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
           'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3'],1, inplace=True)

cleanup_nums = {"FTR":     {"H": 0, "D": 1, "A": 2}}
data = data.replace(cleanup_nums)
display(data.head())

Unnamed: 0,FTR,HTP,ATP,HM1,HM2,HM3,AM1,AM2,AM3,B365H,B365D,B365A,HTGD,ATGD,DiffFormPts,DiffLP
30,0,1.0,1.5,D,L,W,W,L,W,2.38,3.25,3.2,-0.5,0.0,-0.5,-2.0
31,0,2.25,1.0,W,W,W,D,W,L,1.36,4.75,9.0,1.5,0.0,1.25,-5.0
32,1,0.75,0.75,L,W,L,W,L,L,2.1,3.3,3.6,-0.5,-1.0,0.0,-4.0
33,0,1.5,0.75,W,W,L,L,L,W,1.73,3.6,5.25,0.5,-0.75,0.75,-6.0
34,0,0.75,0.75,W,L,L,L,L,W,1.73,3.6,5.25,-1.25,-1.0,0.0,-8.0


In [3]:
X_all = data.drop(['FTR'],1)
y_all = data['FTR']

cols = [['HTGD','ATGD','HTP','ATP','DiffLP']]

In [4]:
#last 3 wins
X_all.HM1 = X_all.HM1.astype('str')
X_all.HM2 = X_all.HM2.astype('str')
X_all.HM3 = X_all.HM3.astype('str')
X_all.AM1 = X_all.AM1.astype('str')
X_all.AM2 = X_all.AM2.astype('str')
X_all.AM3 = X_all.AM3.astype('str')

def features(X):
    
    temp = pd.DataFrame(index = X.index)

    # Check each column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised version
        temp = temp.join(col_data)
    
    return temp

X_all = features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (27 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'B365H', 'B365D', 'B365A', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [5]:
print("Feature values:")
display(X_all.head())

Feature values:


Unnamed: 0,HTP,ATP,HM1_D,HM1_L,HM1_W,HM2_D,HM2_L,HM2_W,HM3_D,HM3_L,...,AM3_D,AM3_L,AM3_W,B365H,B365D,B365A,HTGD,ATGD,DiffFormPts,DiffLP
30,1.0,1.5,1,0,0,0,1,0,0,0,...,0,0,1,2.38,3.25,3.2,-0.5,0.0,-0.5,-2.0
31,2.25,1.0,0,0,1,0,0,1,0,0,...,0,1,0,1.36,4.75,9.0,1.5,0.0,1.25,-5.0
32,0.75,0.75,0,1,0,0,0,1,0,1,...,0,1,0,2.1,3.3,3.6,-0.5,-1.0,0.0,-4.0
33,1.5,0.75,0,0,1,0,0,1,0,1,...,0,0,1,1.73,3.6,5.25,0.5,-0.75,0.75,-6.0
34,0.75,0.75,0,0,1,0,1,0,0,1,...,0,0,1,1.73,3.6,5.25,-1.25,-1.0,0.0,-8.0


In [6]:
from sklearn.model_selection import train_test_split

# Random split dataset into training and testing set 4:1 ratio.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size = 0.2,
                                                    random_state = 2,
                                                    stratify = y_all)

In [7]:
from time import time 
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, pos_label='H',average='micro'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print(f1, acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [9]:
# Initialize Logistic Regression
clf_lr= LogisticRegression(random_state = 39)
train_predict(clf_lr, X_train, y_train, X_test, y_test)

Training a LogisticRegression using a training set size of 3079. . .
Trained model in 0.1706 seconds
Made predictions in 0.0030 seconds.
0.570315037349789 0.570315037349789
F1 score and accuracy score for training set: 0.5703 , 0.5703.
Made predictions in 0.0030 seconds.
F1 score and accuracy score for test set: 0.5818 , 0.5818.


In [11]:
# Hyper paramter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# parameter grid
parameters = {
    'penalty' : ['l1','l2','elasticnet'], 
    'C'       : [-3,1,3,7],
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear','saga'],
    'random_state' :[39]
            }

logreg = LogisticRegression()
grid = GridSearchCV(logreg,                    
                   param_grid = parameters,  
                   scoring='accuracy',        
                   cv=10)    

grid.fit(X_train,y_train)

# Get the estimator
print("Best Parameters: ",grid.best_params_)
print("Best Estimators: ",grid.best_estimator_)
print(classification_report(y_test, grid.predict(X_test)))

Best Parameters:  {'C': 3, 'penalty': 'l2', 'random_state': 39, 'solver': 'newton-cg'}
Best Estimators:  LogisticRegression(C=3, random_state=39, solver='newton-cg')
              precision    recall  f1-score   support

           0       0.62      0.81      0.70       360
           1       0.41      0.11      0.18       189
           2       0.56      0.63      0.59       221

    accuracy                           0.59       770
   macro avg       0.53      0.52      0.49       770
weighted avg       0.55      0.59      0.54       770



In [13]:
# Metric results after tuning
clf_lr = LogisticRegression(C=3, random_state=39, solver='newton-cg')
train_predict(clf_lr, X_train, y_train, X_test, y_test)

Training a LogisticRegression using a training set size of 3079. . .
Trained model in 0.2005 seconds
Made predictions in 0.0040 seconds.
0.5716141604417019 0.5716141604417019
F1 score and accuracy score for training set: 0.5716 , 0.5716.
Made predictions in 0.0030 seconds.
F1 score and accuracy score for test set: 0.5870 , 0.5870.


In [14]:
#importing test set
from sklearn.metrics import classification_report, confusion_matrix
epl = pd.read_csv('epl_test.csv')

epl = epl[epl.MW > 3]

epl.drop(['Unnamed: 0','HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC','HomeTeamLP', 'AwayTeamLP','DiffPts','HTFormPts','ATFormPts',
           'HM4','HM5','AM4','AM5','HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
           'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3'],1, inplace=True)
cleanup_nums = {"FTR":     {"H": 0, "D": 1, "A": 2}}
epl = epl.replace(cleanup_nums)
X1_all = epl.drop(['FTR'],1)
y1_all = epl['FTR']
X1_all.HM1 = X1_all.HM1.astype('str')
X1_all.HM2 = X1_all.HM2.astype('str')
X1_all.HM3 = X1_all.HM3.astype('str')
X1_all.AM1 = X1_all.AM1.astype('str')
X1_all.AM2 = X1_all.AM2.astype('str')
X1_all.AM3 = X1_all.AM3.astype('str')

#we want continous vars that are integers for our input data, so lets remove any categorical vars
def preprocess_features(X):
    ''' Preprocesses the football data and converts catagorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X1_all = preprocess_features(X1_all)
print("Processed feature columns ({} total features):\n{}".format(len(X1_all.columns), list(X1_all.columns)))

Processed feature columns (27 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'B365H', 'B365D', 'B365A', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [17]:
# Checking metrics using test set
clf_lr = LogisticRegression(C=3, random_state=39, solver='newton-cg')
clf_lr.fit(X1_all,y1_all)
f1, acc = predict_labels(clf_lr, X1_all, y1_all)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
print(classification_report(y1_all, clf_lr.predict(X1_all)))

Made predictions in 0.0020 seconds.
F1 score and accuracy score for training set: 0.5943 , 0.5943.
              precision    recall  f1-score   support

           0       0.63      0.70      0.66       133
           1       0.51      0.27      0.35        81
           2       0.58      0.68      0.63       136

    accuracy                           0.59       350
   macro avg       0.57      0.55      0.55       350
weighted avg       0.58      0.59      0.58       350



In [19]:
#Predictions
y1_predict = clf_lr.predict(X1_all)
print(y1_predict)

[2 0 2 0 0 0 1 2 2 0 0 2 2 0 2 2 1 0 2 2 2 1 0 2 2 2 1 2 2 2 2 0 0 0 0 0 1
 2 1 0 2 2 0 1 2 0 1 2 2 0 0 0 2 2 2 1 0 2 2 1 0 2 2 2 2 1 0 0 2 0 2 0 2 2
 0 0 0 2 0 0 1 2 0 2 2 0 0 2 0 0 0 0 1 1 0 1 2 2 2 0 1 0 2 0 1 0 1 2 0 1 2
 0 2 0 0 0 2 2 0 2 0 2 1 0 1 2 0 2 0 1 2 2 2 2 2 2 2 0 0 0 0 1 0 2 0 2 2 0
 0 0 2 0 2 0 2 2 2 2 2 2 0 0 2 0 2 0 0 0 2 0 2 0 1 0 2 0 2 0 2 2 2 1 2 0 2
 2 2 0 2 1 0 2 2 2 0 2 0 0 2 2 2 2 0 0 0 0 2 2 2 0 2 0 0 2 2 0 0 0 0 1 0 1
 0 0 0 2 2 0 0 1 2 2 2 1 2 2 2 1 2 0 0 0 0 0 0 0 1 2 0 1 2 2 0 0 0 2 2 2 2
 2 0 0 2 2 0 2 0 0 2 2 2 2 0 2 2 0 2 2 0 2 0 1 0 0 0 2 0 0 2 0 1 1 2 2 0 1
 0 0 2 2 1 0 2 0 0 1 2 2 1 0 2 0 1 2 0 0 0 0 2 0 1 2 2 2 0 2 2 0 1 0 0 2 2
 2 0 0 0 2 2 2 0 2 2 0 2 0 2 2 0 2]


In [22]:
# Checking returns using Logistic Regression
funds = 100
wager = 10
favourites = 0
no_bets = 0
min_diff = 0.00

for i in range(len(X1_all)):
    addition = 30+i
    prediction = y1_predict[i]
    print('\nMatch',i+1)
    print('\nPrediction', prediction)
    print('Actual', y1_all.iloc[i])
    print('Favourite', np.argmin([epl.loc[addition,'B365H'], epl.loc[addition,'B365D'], epl.loc[addition,'B365A']]))
    print('Home, Draw and Away odds', epl.loc[addition,'B365H'], epl.loc[addition,'B365D'], epl.loc[addition,'B365A'])      
          
    
    if prediction == 2:
            if prediction == np.argmin([epl.loc[addition,'B365H'], epl.loc[addition,'B365D'], epl.loc[addition,'B365A']]): 
                                
                favourites +=1
                
            if  prediction == y1_all.iloc[i]:
                funds += (wager * epl.loc[addition,'B365A']) - wager
            else:
                funds -= wager
            
    elif prediction == 1:
            if prediction == np.argmin([epl.loc[addition,'B365H'], epl.loc[addition,'B365D'], epl.loc[addition,'B365A']]):  

                favourites +=1
            if  prediction == y1_all.iloc[i]:
                funds +=( wager * epl.loc[addition,'B365D']) - wager
            else:
                funds -= wager

    else:

            if prediction == np.argmin([epl.loc[addition,'B365H'], epl.loc[addition,'B365D'], epl.loc[addition,'B365A']]):
                favourites +=1
            if  prediction == y1_all.iloc[i]:
                funds += (wager * epl.loc[addition,'B365A']) - wager
            else:
                funds -= wager
   
    print('Funds', funds)
    

print(f'No bet placed {no_bets} times')


Match 1

Prediction 2
Actual 1
Favourite 2
Home, Draw and Away odds 7.5 5.25 1.36
Funds 90

Match 2

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 2.4 3.2 3.1
Funds 111.0

Match 3

Prediction 2
Actual 2
Favourite 0
Home, Draw and Away odds 1.61 4.2 5.25
Funds 153.5

Match 4

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.72 3.8 4.75
Funds 191.0

Match 5

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.57 4.0 6.0
Funds 241.0

Match 6

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.5 4.0 7.5
Funds 306.0

Match 7

Prediction 1
Actual 2
Favourite 0
Home, Draw and Away odds 1.85 3.8 4.0
Funds 296.0

Match 8

Prediction 2
Actual 0
Favourite 2
Home, Draw and Away odds 10.0 5.5 1.3
Funds 286.0

Match 9

Prediction 2
Actual 1
Favourite 2
Home, Draw and Away odds 3.75 4.2 1.83
Funds 276.0

Match 10

Prediction 0
Actual 1
Favourite 0
Home, Draw and Away odds 1.45 4.75 6.5
Funds 266.0

Match 11

Prediction 0
Actual 0
Favourite 0
Home, Draw and

Home, Draw and Away odds 1.8 3.5 4.75
Funds 2334.0

Match 144

Prediction 2
Actual 2
Favourite 2
Home, Draw and Away odds 6.5 4.33 1.5
Funds 2339.0

Match 145

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.8 3.75 4.5
Funds 2374.0

Match 146

Prediction 2
Actual 2
Favourite 2
Home, Draw and Away odds 6.25 3.75 1.6
Funds 2380.0

Match 147

Prediction 2
Actual 1
Favourite 0
Home, Draw and Away odds 1.95 3.8 3.6
Funds 2370.0

Match 148

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.14 8.5 17.0
Funds 2530.0

Match 149

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.4 4.5 8.0
Funds 2600.0

Match 150

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.57 4.2 5.5
Funds 2645.0

Match 151

Prediction 2
Actual 0
Favourite 2
Home, Draw and Away odds 2.87 3.4 2.45
Funds 2635.0

Match 152

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.22 6.5 12.0
Funds 2745.0

Match 153

Prediction 2
Actual 2
Favourite 2
Home, Draw and Away odds 6.

Actual 1
Favourite 2
Home, Draw and Away odds 4.5 4.33 1.66
Funds 4830.7

Match 287

Prediction 0
Actual 1
Favourite 0
Home, Draw and Away odds 1.57 4.0 6.0
Funds 4820.7

Match 288

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.75 3.8 4.5
Funds 4855.7

Match 289

Prediction 2
Actual 2
Favourite 2
Home, Draw and Away odds 9.0 4.75 1.36
Funds 4859.3

Match 290

Prediction 0
Actual 0
Favourite 0
Home, Draw and Away odds 1.53 3.8 6.0
Funds 4909.3

Match 291

Prediction 1
Actual 2
Favourite 0
Home, Draw and Away odds 2.0 3.5 3.75
Funds 4899.3

Match 292

Prediction 1
Actual 1
Favourite 0
Home, Draw and Away odds 1.22 6.5 12.0
Funds 4954.3

Match 293

Prediction 2
Actual 2
Favourite 2
Home, Draw and Away odds 4.75 3.6 1.75
Funds 4961.8

Match 294

Prediction 2
Actual 0
Favourite 2
Home, Draw and Away odds 5.25 3.4 1.75
Funds 4951.8

Match 295

Prediction 0
Actual 2
Favourite 0
Home, Draw and Away odds 2.0 3.4 3.9
Funds 4941.8

Match 296

Prediction 1
Actual 1
Favourite 2
Home,