## Machine Learning : In-Depth Analysis


Objective of this exercise is to apply different Machine learning algorithm to find match scores and outcome

## Content

1. Poisson Model to predict the score and probability of winnning
    2. Predict the score for last 10 games
4. Classification models to predict match outcome
    5. KNeighborsClassifier
    6. RandomForestClassifier
    7. SVC

** 1. Poisson Model to predict the score and probability of winnning **

** Import necessary libraries and set required configurations **

In [1]:
import pandas as pd
import chardet
import dateutil.parser
from dateutil.tz import gettz
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import poisson
import numpy as np
import seaborn as sns
from scipy.special  import comb
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import auc
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_curve,plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [2]:
pd.set_option('display.max_columns', 18)
pd.set_option('display.width', 900)

In [3]:
with open('data/premier_league_data_8seasons-cleaned.tsv', 'rb') as f:
    result = chardet.detect(f.read())
df= pd.read_csv('data/premier_league_data_8seasons-cleaned.tsv',delimiter='\t',encoding = result['encoding'])
sns.set(style='darkgrid')

In [4]:
def fill_missing_value(df1):
    df1['tot_offside_a'] = df1['tot_offside_a'].fillna(0)
    df1['tot_offside_h'] = df1['tot_offside_h'].fillna(0)
    df1['shot_off_target_h'] = df1['shot_off_target_h'].fillna(0)
    df1['shot_off_target_a'] = df1['shot_off_target_a'].fillna(0)
    df1['corner_taken_h'] = df1['corner_taken_h'].fillna(0)
    df1['corner_taken_a'] = df1['corner_taken_a'].fillna(0)
    df1['interception_h'] = df1['interception_h'].fillna(0)
    df1['interception_a'] = df1['interception_a'].fillna(0)
    df1['first_half_goals_h'] = df1['first_half_goals_h'].fillna(0)
    df1['first_half_goals_a'] = df1['first_half_goals_a'].fillna(0)
    df1['goals_openplay_h'] = df1['goals_openplay_h'].fillna(0)
    df1['goals_openplay_a'] = df1['goals_openplay_a'].fillna(0)
    df1['big_chance_created_h'] = df1['big_chance_created_h'].fillna(0)
    df1['big_chance_created_a'] = df1['big_chance_created_a'].fillna(0)
    df1['big_chance_scored_h'] = df1['big_chance_scored_h'].fillna(0)
    df1['big_chance_scored_a'] = df1['big_chance_scored_a'].fillna(0)
    df1['big_chance_missed_h'] = df1['big_chance_missed_h'].fillna(0)
    df1['big_chance_missed_a'] = df1['big_chance_missed_a'].fillna(0)
    df1['own_goal_accrued_h'] = df1['own_goal_accrued_h'].fillna(0)
    df1['own_goal_accrued_a'] = df1['own_goal_accrued_a'].fillna(0)
    df1['saves_h'] = df1['saves_h'].fillna(0)
    df1['saves_a'] = df1['saves_a'].fillna(0)
    df1['tot_yel_card_h'] = df1['tot_yel_card_h'].fillna(0)
    df1['tot_yel_card_a'] = df1['tot_yel_card_a'].fillna(0)
    df1['forward_goals_h'] = df1['forward_goals_h'].fillna(0)
    df1['forward_goals_a'] = df1['forward_goals_a'].fillna(0)
    df1['defender_goals_h'] = df1['defender_goals_h'].fillna(0)
    df1['defender_goals_a'] = df1['defender_goals_a'].fillna(0)
    df1['midfielder_goals_h'] = df1['midfielder_goals_h'].fillna(0)
    df1['midfielder_goals_a'] = df1['midfielder_goals_a'].fillna(0)
    df1['subs_made_h'] = df1['subs_made_h'].fillna(0)
    df1['subs_made_a'] = df1['subs_made_a'].fillna(0)
    df1['first_half_goals_h'] = (df1['first_half_goals_h'].fillna(0)).astype(int)
    df1['first_half_goals_a'] = (df1['first_half_goals_a'].fillna(0)).astype(int)
    df1['seasonid'] = df1['seasonlabel'].str.split("/",n=1,expand=True)[0].astype(int)
    return df1

def get_seasons_data1 (df1 ,seasonlabel1):
    if (seasonlabel1 =='All'):
        print ('Getting All Seasons data')
        return df1
    else:
        print('Getting data for season:',seasonlabel1)
        return df1[df1['seasonlabel']==seasonlabel1]

def get_seasons_data (df1 ,keyval,seasonlabel1,duration):
    df1 = df1.sort_values(['gameid'],ascending=True)
    if(duration=='past') :
        if seasonlabel1 in keyval.keys():
            val = int(keyval[seasonlabel1])
            l1 = [keyval[i] for i in keyval if keyval[i] <= val]
        return(df1[df1.seasonid.isin(l1)][:-20],df1[df1.seasonid.isin(l1)][-20:])
    else:
        return (df1[df1['seasonlabel']==seasonlabel1][:-20] ,df1[df1['seasonlabel']==seasonlabel1][-20:] )


def get_probability (team_h_sc, team_a_sc, max_goals=10):
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [team_h_sc, team_a_sc]]
    return(np.outer(np.array(team_pred[0]), np.array(team_pred[1])))

def win_probability (array1):
    return np.sum(np.tril(array1, -1)) * 100

def draw_probability (array1):
    return np.sum(np.diag(array1))*100

def loss_probability (array1):
    return np.sum(np.triu(array1, 1))*100

In [5]:
df_cleaned = fill_missing_value(df)

In [6]:
df_cleaned_subset = df_cleaned[['gameid','seasonid','seasonlabel','gamedate','team_h','team_a','score_h','score_a','matchtime','formation_h','formation_a','possession_pct_h','possession_pct_a','corner_taken_h','corner_taken_a','first_half_goals_h','first_half_goals_a']]
frame1 = df_cleaned_subset[['gameid','seasonlabel','seasonid','gamedate','matchtime','team_h','team_a','formation_h','possession_pct_h','corner_taken_h','score_h','corner_taken_h','corner_taken_a','first_half_goals_h']].assign(game_type='H').rename(columns={'team_h':'team', 'team_a':'opponent','score_h':'goals','formation_h':'teamFormation','formation_a':'opponentFormation','possession_pct_h':'possession','corner_taken_h':'teamCorners','corner_taken_a':'opponentCorners','first_half_goals_h':'htgoals'})
frame2 = df_cleaned_subset[['gameid','seasonlabel','seasonid','gamedate','matchtime','team_a','team_h','formation_a','possession_pct_a','corner_taken_a','score_a','corner_taken_a','corner_taken_h','first_half_goals_a']].assign(game_type='A').rename(columns={'team_a':'team', 'team_h':'opponent','score_a':'goals','formation_a':'teamFormation','formation_h':'opponentFormation','possession_pct_a':'possession','corner_taken_a':'teamCorners','corner_taken_h':'opponentCorners','first_half_goals_a':'htgoals'})
df_model_data = pd.concat([frame1,frame2])
df_model_data_season = df_model_data

# Get only seasons data
model_season = '2018/19'
current_or_past = 'past'
season_key= df_model_data[['seasonid','seasonlabel']].drop_duplicates().reset_index().set_index('seasonlabel')['seasonid'].to_dict()


In [7]:
ret_data = get_seasons_data(df_model_data,season_key, model_season,current_or_past)
df_model_data_season = ret_data[0]
df_predict_data_season = ret_data[1]
print(df_predict_data_season.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 2660 to 2669
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gameid           20 non-null     int64  
 1   seasonlabel      20 non-null     object 
 2   seasonid         20 non-null     int32  
 3   gamedate         20 non-null     object 
 4   matchtime        20 non-null     int64  
 5   team             20 non-null     object 
 6   opponent         20 non-null     object 
 7   teamFormation    20 non-null     int64  
 8   possession       20 non-null     int64  
 9   teamCorners      20 non-null     float64
 10  goals            20 non-null     int64  
 11  teamCorners      20 non-null     float64
 12  opponentCorners  20 non-null     float64
 13  htgoals          20 non-null     int32  
 14  game_type        20 non-null     object 
dtypes: float64(3), int32(2), int64(5), object(5)
memory usage: 2.3+ KB
None


** Poisson model with initial set of Feature list to predict last 20 match of season **

In [8]:
pmodel = smf.glm(formula="goals ~ game_type + team + opponent+teamFormation+possession", data=df_model_data_season,family=sm.families.Poisson()).fit()
print(pmodel.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  goals   No. Observations:                 6060
Model:                            GLM   Df Residuals:                     5992
Model Family:                 Poisson   Df Model:                           67
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -8693.0
Date:                Sun, 31 May 2020   Deviance:                       6714.2
Time:                        22:28:03   Pearson chi2:                 5.89e+03
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [9]:
lgameid=  df_predict_data_season.gameid.unique()
df_pred_output = pd.DataFrame(columns=['Date','Team','Opponent','Actual Score','Predicted Score','Win Prob.(T)','Draw Prob','Loss Prob.(T)'])
lst_pred_output = []
for index , row in df_cleaned_subset[df_cleaned_subset.gameid.isin(lgameid)].iterrows():
    tzinfos = {"BST": -7200, "CST": gettz("America/Chicago")}
    gd = dateutil.parser.parse(row['gamedate'],tzinfos=tzinfos).strftime("%m/%d/%Y")
    tm1 = row['team_h']
    tm2 = row['team_a']
    fm1=  row['formation_h']
    fm2=  row['formation_a']
    pos1 = row['possession_pct_h']
    pos2 = row['possession_pct_a']
    htg1 = row['first_half_goals_h']
    htg2 = row['first_half_goals_a']
    tm1sc = pmodel.predict(pd.DataFrame(data={'team': tm1, 'opponent': tm2,'possession':pos1,
                                                'game_type': 'H','teamFormation':fm1}, index=[1])).values[0]
    tm2sc = pmodel.predict(pd.DataFrame(data={'team': tm2, 'opponent': tm1,'possession':pos2,
                                                'game_type': 'A','teamFormation':fm2}, index=[1])).values[0]
    sc_mat = get_probability( tm1sc, tm2sc,max_goals=8)
    wp = win_probability(sc_mat)
    dp = draw_probability(sc_mat)
    lp = loss_probability(sc_mat)
    lst_pred_output.append({'Date':gd,'Team':tm1,'Opponent':tm2,'Actual Score':str(row['score_h'])+':'+str(row['score_a']),'Predicted Score':str(round(tm1sc,2))+':'+str(round(tm2sc,2)),'Win Prob.(T)':round(wp,2),'Draw Prob':round(dp,2),'Loss Prob.(T)':round(lp,2)})
df_pred_output = df_pred_output.append(lst_pred_output)
print(df_pred_output)

         Date                      Team                 Opponent Actual Score Predicted Score  Win Prob.(T)  Draw Prob  Loss Prob.(T)
0  05/12/2019  Brighton and Hove Albion          Manchester City          1:4       0.77:1.85         14.88      21.95          63.16
1  05/12/2019                   Burnley                  Arsenal          1:3        0.9:1.72         19.38      23.71          56.90
2  05/12/2019            Crystal Palace          AFC Bournemouth          5:3       1.64:1.18         47.92      24.45          27.62
3  05/12/2019                    Fulham         Newcastle United          0:4       1.16:1.58         27.91      24.96          47.13
4  05/12/2019            Leicester City                  Chelsea          0:0       1.16:1.68         26.29      24.14          49.57
5  05/12/2019                 Liverpool  Wolverhampton Wanderers          2:0        2.7:0.76         78.06      13.74           8.00
6  05/12/2019         Manchester United             Cardiff Ci

** Classification models to predict match outcome **

In [10]:
df_model = df_cleaned[['gameid','team_h','team_a','matchtime','formation_h','formation_a','possession_pct_h','possession_pct_a','corner_taken_h','corner_taken_a','score_h','score_a','matchoutcome']]
X = df_model[['team_h','team_a','formation_h','formation_a','possession_pct_h','corner_taken_h','corner_taken_a']]
y = np.array(df_model['matchoutcome'])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
numerical_features = ['formation_h','formation_a','possession_pct_h','corner_taken_h','corner_taken_a']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

categorical_features = ['team_h','team_a']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

** KNeighborsClassifier **

In [12]:
clf_k = Pipeline(steps=[('preprocessor', preprocessor),('classifier', KNeighborsClassifier(n_neighbors=10))])
clf_k.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['formation_h', 'formation_a',
                                                   'possession_pct_h',
                                                   'corner_taken_h',
                                                   'corner_taken_a']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                 

In [13]:
test_pred_k = clf_k.predict(X_test)
test_pred_prob_k = clf_k.predict_proba(X_test)

In [14]:
print('confusion_matrix:', confusion_matrix(y_test, test_pred_k))
print('classification_report:', classification_report(y_test, test_pred_k))
print('accuracy_score:', accuracy_score(y_test, test_pred_k))


confusion_matrix: [[ 66  31 102]
 [ 36  22  88]
 [ 54  31 178]]
classification_report:               precision    recall  f1-score   support

           A       0.42      0.33      0.37       199
           D       0.26      0.15      0.19       146
           H       0.48      0.68      0.56       263

    accuracy                           0.44       608
   macro avg       0.39      0.39      0.38       608
weighted avg       0.41      0.44      0.41       608

accuracy_score: 0.4375


** RandomForestClassifier **

In [15]:
clf_r = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=200))])
clf_r.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['formation_h', 'formation_a',
                                                   'possession_pct_h',
                                                   'corner_taken_h',
                                                   'corner_taken_a']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                 

In [16]:
test_pred_r = clf_r.predict(X_test)
test_pred_prob_r = clf_r.predict_proba(X_test)

In [17]:
print('confusion_matrix:', confusion_matrix(y_test, test_pred_r))
print('classification_report:', classification_report(y_test, test_pred_r))
print('accuracy_score:', accuracy_score(y_test, test_pred_r))

confusion_matrix: [[ 88  22  89]
 [ 38  15  93]
 [ 34  24 205]]
classification_report:               precision    recall  f1-score   support

           A       0.55      0.44      0.49       199
           D       0.25      0.10      0.14       146
           H       0.53      0.78      0.63       263

    accuracy                           0.51       608
   macro avg       0.44      0.44      0.42       608
weighted avg       0.47      0.51      0.47       608

accuracy_score: 0.506578947368421


** SVC **

In [18]:
clf_s = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', svm.SVC(kernel='linear',C=1.0,gamma=0.2))])
clf_s.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['formation_h', 'formation_a',
                                                   'possession_pct_h',
                                                   'corner_taken_h',
                                                   'corner_taken_a']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                 

In [20]:
test_pred_s = clf_s.predict(X_test)

In [21]:
print('confusion_matrix:', confusion_matrix(y_test, test_pred_s))
print('classification_report:', classification_report(y_test, test_pred_s))
print('accuracy_score:', accuracy_score(y_test, test_pred_s))

confusion_matrix: [[ 71   5 123]
 [ 20   2 124]
 [ 18   5 240]]
classification_report:               precision    recall  f1-score   support

           A       0.65      0.36      0.46       199
           D       0.17      0.01      0.03       146
           H       0.49      0.91      0.64       263

    accuracy                           0.51       608
   macro avg       0.44      0.43      0.38       608
weighted avg       0.47      0.51      0.43       608

accuracy_score: 0.5148026315789473
