In [90]:
# Import the necessary libraries.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.display import display

%matplotlib inline

In [144]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [145]:
# Read football stats dataset from CSV File
football_ds = pd.read_csv('/content/drive/My Drive/EPLDataset.csv')

# Remove inital matchweeks to adjust for early weeks
football_ds = football_ds[football_ds.MW > 3]


In [129]:
all_labels = football_ds['FTR']

football_ds = football_ds[['HTP','ATP','Attendance','HM1','HM2','HM3','AM1','AM2','AM3' ,'DiffFormPts']]

# Preview attendance.
print(football_ds)

          HTP       ATP  Attendance HM1 HM2 HM3 AM1 AM2 AM3  DiffFormPts
30   0.750000  0.750000       30526   L   W   L   W   L   L     0.000000
31   2.250000  1.750000       40178   W   W   W   D   W   W     0.500000
32   0.750000  0.250000       25495   L   L   W   L   L   D     0.500000
33   1.250000  0.250000       38767   D   W   D   D   L   L     1.000000
34   1.500000  2.250000       32149   W   W   L   W   W   W    -0.750000
..        ...       ...         ...  ..  ..  ..  ..  ..  ..          ...
375  2.473684  1.500000       53331   W   W   W   W   W   W     0.131579
376  1.736842  0.815789       74457   D   D   L   L   L   L     0.052632
377  1.000000  0.394737       30367   L   D   D   D   L   L     0.105263
378  1.842105  1.394737       60124   L   L   W   W   D   W    -0.105263
379  1.315789  1.289474       20067   L   L   D   W   W   D    -0.078947

[350 rows x 10 columns]


In [130]:
all_labels

30     D
31     H
32     A
33     D
34     A
      ..
375    H
376    A
377    D
378    D
379    A
Name: FTR, Length: 350, dtype: object

In [131]:
# Get the total # of matches.
total_matches = football_ds.shape[0]

#Get the total wins by away team
total_awayw = len(all_labels[all_labels == 'A'])
total_homew = len(all_labels[all_labels == 'H'])
total_draws = len(all_labels[all_labels == 'D'])

# Get the aggregate of Games by team type
print(f'Aggregate No. of Games: {total_matches}')
print(f'Aggregate No. of Games Won By Away Teams: {total_awayw}')
print(f'Aggregate No. of Games Won By Home Teams: {total_homew}')
print(f'Aggregate No. of Games Ended in Draw: {total_draws}')


Aggregate No. of Games: 350
Aggregate No. of Games Won By Away Teams: 118
Aggregate No. of Games Won By Home Teams: 167
Aggregate No. of Games Ended in Draw: 65


In [133]:
#Feature scaling some of the data
from sklearn.preprocessing import scale

required_columns = [['HTP','ATP']]
for j in required_columns:
    all_features[j] = scale(all_features[j])

In [134]:
#Change Type To String
football_ds.HM1 = football_ds.HM1.astype('str')
football_ds.HM2 = football_ds.HM2.astype('str')
football_ds.HM3 = football_ds.HM3.astype('str')
football_ds.AM1 = football_ds.AM1.astype('str')
football_ds.AM2 = football_ds.AM2.astype('str')
football_ds.AM3 = football_ds.AM3.astype('str')


#Method to change cat vars to dummy vars
def convert_cat_dummy(input):
    
    # Initialize new output DataFrame
    result_df = pd.DataFrame(index = input.index)

    # Iterate through every column
    for clmn, clmn_info in input.iteritems():

        # If data type is categorical, convert to dummy variables
        if clmn_info.dtype == object:
            clmn_info = pd.get_dummies(clmn_info, prefix = clmn)
                    
        # Collect the revised columns
        result_df = result_df.join(clmn_info)
    
    return result_df

football_ds = convert_cat_dummy(football_ds)


In [135]:
print (f'Features {len(football_ds.columns)}')
print(f'Total Features: {list(football_ds.columns)}')

Features 22
Total Features: ['HTP', 'ATP', 'Attendance', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'DiffFormPts']


In [136]:
# Display the first few rows in the data
football_ds.head()

Unnamed: 0,HTP,ATP,Attendance,HM1_D,HM1_L,HM1_W,HM2_D,HM2_L,HM2_W,HM3_D,HM3_L,HM3_W,AM1_D,AM1_L,AM1_W,AM2_D,AM2_L,AM2_W,AM3_D,AM3_L,AM3_W,DiffFormPts
30,0.75,0.75,30526,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0.0
31,2.25,1.75,40178,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0.5
32,0.75,0.25,25495,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0.5
33,1.25,0.25,38767,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,1.0
34,1.5,2.25,32149,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,-0.75


In [137]:
from sklearn.model_selection import train_test_split

#Divide the data in 80/20 way to decide the split
train_feat, test_feat, train_lbl, test_lbl = train_test_split(football_ds, all_labels, 
                                                    train_size = 0.80,
                                                    random_state = 3,
                                                    stratify = all_labels)

In [138]:
# Training and Evaluating Models

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

def predict_outcomes(csf, train_feat, train_lbl, test_feat, test_lbl):
    print("Training & Predicting Outcomes")    
    #Print the name of the classifier
    print (f'Predicting using {csf.__class__.__name__} algorithm')
    
    # Train the classifier
    print("Training the ml model")
    csf.fit(train_feat, train_lbl)
    print("Finished training the ml model")

    # Print the results of prediction for both training and testing
    print("Predicting outcomes")
    y_pred = csf.predict(train_feat)
    accSc = accuracy_score(train_lbl,y_pred)* 100
    print("The Train Accuracy Score is:",accSc)
    f1Sc = f1_score(train_lbl, y_pred,average='macro')* 100
    print("The Training F1 Score is:",f1Sc)

    #Get Accuracy & F1 Score for Testing Set
    predictedTestLabel = csf.predict(test_feat)
    testAccSc = accuracy_score(test_lbl,predictedTestLabel)*100
    print("The Test Accuracy Score is:",testAccSc)

    precisionSc = precision_score(test_lbl,predictedTestLabel,average='macro')*100
    print("The Test Precision Score is:",precisionSc)

    testf1Sc = f1_score(test_lbl, predictedTestLabel,average='macro')* 100
    print("The Test F1 Score is:",testf1Sc)


In [143]:
#Import classifiers 
from sklearn import tree
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
#Set up classifiers
csf_xgb = xgb.XGBClassifier(seed = 64)
csf_lr = LogisticRegression(random_state =12,multi_class='ovr')
csf_svm = SVC(random_state = 234, kernel='rbf')
csf_nc = NearestCentroid()
csf_rf = RandomForestClassifier()
csf_knn = KNeighborsClassifier(n_neighbors=4)
csf_vc = VotingClassifier(estimators=[('lr', csf_lr), ('rf', csf_rf), ('svm', csf_svm)], voting='hard')


predict_outcomes(csf_rf,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')
predict_outcomes(csf_xgb,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')
predict_outcomes(csf_lr,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')
predict_outcomes(csf_svm,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')
predict_outcomes(csf_nc,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')
predict_outcomes(csf_knn,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')
predict_outcomes(csf_vc,train_feat,train_lbl,test_feat, test_lbl)
print ('   ')

Training & Predicting Outcomes
Predicting using RandomForestClassifier algorithm
Training the ml model
Finished training the ml model
Predicting outcomes
The Train Accuracy Score is: 100.0
The Training F1 Score is: 100.0
The Test Accuracy Score is: 60.0
The Test Precision Score is: 56.744956338452276
The Test F1 Score is: 47.88199964670554
   
Training & Predicting Outcomes
Predicting using XGBClassifier algorithm
Training the ml model
Finished training the ml model
Predicting outcomes
The Train Accuracy Score is: 89.64285714285715
The Training F1 Score is: 88.64617471409458
The Test Accuracy Score is: 50.0
The Test Precision Score is: 40.71428571428572
The Test F1 Score is: 41.191553544494724
   
Training & Predicting Outcomes
Predicting using LogisticRegression algorithm
Training the ml model
Finished training the ml model
Predicting outcomes
The Train Accuracy Score is: 57.14285714285714
The Training F1 Score is: 40.677851555240174
The Test Accuracy Score is: 54.285714285714285
The 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The Train Accuracy Score is: 53.57142857142857
The Training F1 Score is: 43.46420689416877
The Test Accuracy Score is: 47.14285714285714
The Test Precision Score is: 35.0
The Test F1 Score is: 35.99137931034483
   
Training & Predicting Outcomes
Predicting using KNeighborsClassifier algorithm
Training the ml model
Finished training the ml model
Predicting outcomes
The Train Accuracy Score is: 66.07142857142857
The Training F1 Score is: 64.64828707819362
The Test Accuracy Score is: 50.0
The Test Precision Score is: 43.136223344556676
The Test F1 Score is: 43.343388637506294
   
Training & Predicting Outcomes
Predicting using VotingClassifier algorithm
Training the ml model
Finished training the ml model
Predicting outcomes
The Train Accuracy Score is: 65.71428571428571
The Training F1 Score is: 47.74410774410775
The Test Accuracy Score is: 57.14285714285714
The Test Precision Score is: 37.89836347975883
The Test F1 Score is: 41.53766769865842
   


  _warn_prf(average, modifier, msg_start, len(result))


In [125]:
import pickle

In [127]:
with open('fypmodelrf', 'wb') as f:
  pickle.dump(csf_rf, f)