In [1]:
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error as mae

`gms_all` holds all match data, `gms_10` holds only 2010 data, `gms_14` holds on 2014 data and `gms_exc_14` holds all data excluding the 2014 match data

In [2]:
gms_all = pd.read_csv('matches.csv')
gms_10 = gms_all.loc[gms_all['key']=='world.2010']
gms_14 = gms_all.loc[gms_all['key']=='world.2014']
gms_exc_14 = gms_all[gms_all['key']!='world.2014']
display(gms_10.describe())
display(gms_14.describe())

Unnamed: 0,id,round_id,score1,score2
count,64.0,64.0,64.0,64.0
mean,1606.5,391.6875,1.1875,1.046875
std,18.618987,5.279114,1.34371,0.966579
min,1575.0,382.0,0.0,0.0
25%,1590.75,387.0,0.0,0.0
50%,1606.5,392.5,1.0,1.0
75%,1622.25,396.25,2.0,1.25
max,1638.0,401.0,7.0,4.0


Unnamed: 0,id,round_id,score1,score2
count,64.0,64.0,64.0,64.0
mean,834.5,121.71875,1.359375,1.53125
std,18.618987,5.226516,1.186795,1.583333
min,803.0,112.0,0.0,0.0
25%,818.75,117.0,0.0,0.0
50%,834.5,122.5,1.0,1.0
75%,850.25,126.25,2.0,3.0
max,866.0,131.0,5.0,7.0


It's necessary to 'mirror' all the matches due to symmetry before fitting the model

In [3]:
gls_10 = pd.concat([gms_10.rename(columns={'score1':'score'}),
                    gms_10.rename(columns={'score2':'score','team1':'team2','team2':'team1'})])
poisson_model = smf.glm(formula="score ~ team1 + team2", data=gls_10, 
                        family=sm.families.Poisson()).fit()
poisson_model.summary()

0,1,2,3
Dep. Variable:,score,No. Observations:,128.0
Model:,GLM,Df Residuals:,65.0
Model Family:,Poisson,Df Model:,62.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-136.98
Date:,"Mon, 25 Jun 2018",Deviance:,76.045
Time:,18:13:57,Pearson chi2:,66.7
No. Iterations:,21,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-22.2783,1.69e+04,-0.001,0.999,-3.31e+04,3.31e+04
team1[T.Argentina],22.0959,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.Australia],22.4852,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.Brazil],22.4561,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.Cameroon],21.7755,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.Chile],22.4197,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.Côte d'Ivoire],22.1038,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.Denmark],22.3803,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04
team1[T.England],21.4962,1.69e+04,0.001,0.999,-3.31e+04,3.31e+04


Now define the wrapper function for the predictions. In the event of previously unseen teams, resort to using the arithmetic means.

In [4]:
pd.set_option('mode.chained_assignment', None)
def pred_team_gls(model, team1, team2, gls_df, have_teams=None):
    if have_teams is None: #initialises when first run
        have_teams = set(gls_df['team1'])|set(gls_df['team2'])
    if team1 in have_teams: 
        if team2 in have_teams:
            pred = model.predict(pd.DataFrame(data={'team1': team1, 'team2': team2},index=[0])).values[0]
        else:
            team1_gls = pd.concat([gls_df.loc[gls_df['team1']==team1,'score1'],
                                gls_df.loc[gls_df['team2']==team1,'score2']])
            pred = team1_gls.mean()
    else:
        if team2 in have_teams:
            team2_gls = pd.concat([gls_df.loc[gls_df['team1']==team2,'score1'],
                                gls_df.loc[gls_df['team2']==team2,'score2']])
            pred = team2_gls.mean()        
        else:
            pred = gls_df['score'].mean()
    return np.round(pred,8)

gms_14['pred1'] = gms_14.apply(lambda r: pred_team_gls(poisson_model, r['team1'],r['team2'], gls_10), axis=1)
gms_14['pred2'] = gms_14.apply(lambda r: pred_team_gls(poisson_model, r['team2'],r['team1'], gls_10), axis=1)
pred_14_df = gms_14[['score1', 'score2', 'team1', 'team2', 'pred1', 'pred2']]
pred_14_df.head()

Unnamed: 0,score1,score2,team1,team2,pred1,pred2
0,3,1,Brazil,Croatia,0.75,0.75
1,1,0,Mexico,Cameroon,0.652688,1.338506
2,1,5,Spain,Netherlands,0.885424,0.436429
3,3,1,Chile,Australia,2.303845,2.222653
4,3,0,Colombia,Greece,1.0,1.0


In [5]:
def evaluate_predictions(pred_df, score1='score1', score2='score2',
                        pred1='pred1', pred2='pred2'):
    score1_mae = mae(gms_14[score1], gms_14[pred1])
    score2_mae = mae(gms_14[score2], gms_14[pred2])
    display('score1_mae is {}'.format(score1_mae))
    display('score2_mae is {}'.format(score2_mae))
    display('combined mae is {}'.format((score1_mae+score2_mae)/2))
    
evaluate_predictions(gms_14)

'score1_mae is 1.14221197'

'score2_mae is 1.3622858896875'

'combined mae is 1.25224892984375'

The Negative Binomial model offers a slight improvement over the Poisson

In [6]:
nbinom_model = smf.glm(formula="score ~ team1 + team2", data=gls_10, 
                        family=sm.families.NegativeBinomial()).fit()
gms_14.loc[:,'pred1'] = gms_14.apply(lambda r: pred_team_gls(nbinom_model, r['team1'],r['team2'], gls_10), axis=1)
gms_14.loc[:,'pred2'] = gms_14.apply(lambda r: pred_team_gls(nbinom_model, r['team2'],r['team1'], gls_10), axis=1)

evaluate_predictions(gms_14)

'score1_mae is 1.1256913885937498'

'score2_mae is 1.3099595985937498'

'combined mae is 1.2178254935937498'

Time to train on more data (excluding 2014) and see what the results look like

In [7]:
gls_exc_14 = pd.concat([gms_exc_14.rename(columns={'score1':'score'}),
                    gms_exc_14.rename(columns={'score2':'score','team1':'team2','team2':'team1'})])
regr = smf.glm(formula="score ~ team1 + team2", data=gls_exc_14, 
                        family=sm.families.Poisson()).fit()
display(regr.summary())
gms_14['pred1'] = gms_14.apply(lambda r: pred_team_gls(regr, r['team1'],r['team2'], gls_exc_14), axis=1)
gms_14['pred2'] = gms_14.apply(lambda r: pred_team_gls(regr, r['team2'],r['team1'], gls_exc_14), axis=1)

0,1,2,3
Dep. Variable:,score,No. Observations:,1544.0
Model:,GLM,Df Residuals:,1385.0
Model Family:,Poisson,Df Model:,158.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2192.0
Date:,"Mon, 25 Jun 2018",Deviance:,1721.8
Time:,18:13:58,Pearson chi2:,1560.0
No. Iterations:,22,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.4847,0.507,-0.955,0.339,-1.479,0.510
team1[T.Angola],-0.8761,1.089,-0.804,0.421,-3.011,1.259
team1[T.Argentina],0.8823,0.426,2.073,0.038,0.048,1.717
team1[T.Australia],0.3657,0.551,0.664,0.507,-0.714,1.445
team1[T.Austria],0.8315,0.444,1.873,0.061,-0.038,1.701
team1[T.Belgium],0.5162,0.444,1.163,0.245,-0.354,1.386
team1[T.Bolivia],-1.3171,1.084,-1.215,0.224,-3.441,0.807
team1[T.Brazil],1.1728,0.421,2.787,0.005,0.348,1.998
team1[T.Bulgaria],0.2234,0.468,0.477,0.633,-0.694,1.141


By using more training data, the prediction error goes down significantly

In [8]:
evaluate_predictions(gms_14)

'score1_mae is 1.0343669218749998'

'score2_mae is 1.2575602362499998'

'combined mae is 1.1459635790624998'

The Negative Binomial again offers a tiny improvement

In [9]:
regr = smf.glm(formula="score ~ team1 + team2", data=gls_exc_14, 
                        family=sm.families.NegativeBinomial()).fit()
gms_14['pred1'] = gms_14.apply(lambda r: pred_team_gls(regr, r['team1'],r['team2'], gls_exc_14), axis=1)
gms_14['pred2'] = gms_14.apply(lambda r: pred_team_gls(regr, r['team2'],r['team1'], gls_exc_14), axis=1)
evaluate_predictions(gms_14)

'score1_mae is 1.02920588984375'

'score2_mae is 1.2507499242187499'

'combined mae is 1.13997790703125'

Now use the all the available World Cup to predict results for the 2018 matches

In [11]:
gls_all = pd.concat([gms_all.rename(columns={'score1':'score'}),
                    gms_all.rename(columns={'score2':'score','team1':'team2','team2':'team1'})])
pred_df = pd.read_csv('group_matches_2018_raw.csv')
regr_all = smf.glm(formula="score ~ team1 + team2", data=gls_all, 
                        family=sm.families.Poisson()).fit()
pred_df['pred1'] = pred_df.apply(lambda r: pred_team_gls(regr_all, r['team1'],r['team2'], gls_all), axis=1)
pred_df['pred2'] = pred_df.apply(lambda r: pred_team_gls(regr_all, r['team2'],r['team1'], gls_all), axis=1)
pred_df['pred1'] = np.round(pred_df['pred1'],0)
pred_df['pred2'] = np.round(pred_df['pred2'],0)
pred_df

Unnamed: 0,team1,team2,score1,score2,pred1,pred2
0,Russia,Saudi Arabia,5.0,0.0,3.0,1.0
1,Egypt,Uruguay,0.0,1.0,1.0,1.0
2,Russia,Egypt,,,1.0,1.0
3,Uruguay,Saudi Arabia,,,3.0,1.0
4,Uruguay,Russia,,,2.0,1.0
5,Saudi Arabia,Egypt,,,0.0,2.0
6,Morocco,Iran,0.0,1.0,1.0,1.0
7,Portugal,Spain,3.0,3.0,1.0,1.0
8,Portugal,Morocco,,,2.0,1.0
9,Iran,Spain,,,0.0,2.0
