In [1]:
# importing the tools required for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn


In [2]:
df = pd.read_csv("http://www.football-data.co.uk/mmz4281/1617/E0.csv")

In [3]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,E0,13/08/16,Burnley,Swansea,0,1,A,0,0,D,...,1.61,32,-0.25,2.13,2.06,1.86,1.81,2.79,3.16,2.89
1,E0,13/08/16,Crystal Palace,West Brom,0,1,A,0,0,D,...,1.52,33,-0.5,2.07,2.0,1.9,1.85,2.25,3.15,3.86
2,E0,13/08/16,Everton,Tottenham,1,1,D,1,0,H,...,1.77,32,0.25,1.91,1.85,2.09,2.0,3.64,3.54,2.16
3,E0,13/08/16,Hull,Leicester,2,1,H,1,0,H,...,1.67,31,0.25,2.35,2.26,2.03,1.67,4.68,3.5,1.92
4,E0,13/08/16,Man City,Sunderland,2,1,H,1,0,H,...,2.48,34,-1.5,1.81,1.73,2.2,2.14,1.25,6.5,14.5


In [41]:
def get_home_team_advantage(goals_df,pval = 0.05):
    
    #extract relavant columns
    model_goals_df = goals_df[['HomeTeam','AwayTeam','FTHG','FTAG']]
    # rename goal columns
    model_goals_df = model_goals_df.rename(columns = {'FTHG':'HomeGoals','FTAG':'AwayGoals'})
    
     # reformat dataframe for the model
    goal_model_data = pd.concat([model_goals_df[['HomeTeam','AwayTeam','HomeGoals']].assign(home=1).rename(
                columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'}),
               model_goals_df[['AwayTeam','HomeTeam','AwayGoals']].assign(home=0).rename(
                columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})])
    
    # build poisson model
    poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data, 
                            family=sm.families.Poisson()).fit()
    # output model parameters
    poisson_model.summary()
    
    #return poisson_model.summary()
    return np.concatenate((np.array([poisson_model.params['home']]), 
                    poisson_model.conf_int(alpha=pval).values[-1]))
    

In [15]:
get_home_team_advantage(df)

0,1,2,3
Dep. Variable:,goals,No. Observations:,760.0
Model:,GLM,Df Residuals:,720.0
Model Family:,Poisson,Df Model:,39.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-1072.1
Date:,"Sun, 08 Aug 2021",Deviance:,794.94
Time:,08:57:12,Pearson chi2:,678.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3830,0.195,1.964,0.050,0.001,0.765
team[T.Bournemouth],-0.3144,0.177,-1.778,0.075,-0.661,0.032
team[T.Burnley],-0.6711,0.197,-3.410,0.001,-1.057,-0.285
team[T.Chelsea],0.0881,0.158,0.559,0.576,-0.221,0.397
team[T.Crystal Palace],-0.4141,0.182,-2.276,0.023,-0.771,-0.058
team[T.Everton],-0.2173,0.171,-1.272,0.203,-0.552,0.118
team[T.Hull],-0.6991,0.200,-3.489,0.000,-1.092,-0.306
team[T.Leicester],-0.4551,0.184,-2.470,0.013,-0.816,-0.094
team[T.Liverpool],0.0109,0.161,0.068,0.946,-0.304,0.326


In [6]:
k.head()

Unnamed: 0,team,opponent,goals,home
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1


In [13]:
k.tail()

Unnamed: 0,team,opponent,goals,home
375,Middlesbrough,Liverpool,0,0
376,Crystal Palace,Man United,0,0
377,Stoke,Southampton,1,0
378,West Brom,Swansea,1,0
379,Man City,Watford,5,0


In [9]:

#extract relavant columns
model_goals_df = df[['HomeTeam','AwayTeam','FTHG','FTAG']]
# rename goal columns
model_goals_df = model_goals_df.rename(columns = {'FTHG':'HomeGoals','FTAG':'AwayGoals'})

model_goals_df[['HomeTeam','AwayTeam','HomeGoals']].assign(home=1).rename(
                columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'})

Unnamed: 0,team,opponent,goals,home
0,Burnley,Swansea,0,1
1,Crystal Palace,West Brom,0,1
2,Everton,Tottenham,1,1
3,Hull,Leicester,2,1
4,Man City,Sunderland,2,1
...,...,...,...,...
375,Liverpool,Middlesbrough,3,1
376,Man United,Crystal Palace,2,1
377,Southampton,Stoke,0,1
378,Swansea,West Brom,2,1


In [12]:
model_goals_df[['AwayTeam','HomeTeam','AwayGoals']].assign(home=0).rename(
                columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})

Unnamed: 0,team,opponent,goals,home
0,Swansea,Burnley,1,0
1,West Brom,Crystal Palace,1,0
2,Tottenham,Everton,1,0
3,Leicester,Hull,1,0
4,Sunderland,Man City,1,0
...,...,...,...,...
375,Middlesbrough,Liverpool,0,0
376,Crystal Palace,Man United,0,0
377,Stoke,Southampton,1,0
378,West Brom,Swansea,1,0


In [16]:
all_league_goals = pd.read_json(
    "https://raw.githubusercontent.com/dashee87/blogScripts/master/files/all_league_goals.json")
# reorder the columns to it a bit more logical
all_league_goals = all_league_goals[['country', 'league', 'date', 'HomeTeam', 
                                     'AwayTeam', 'FTHG', 'FTAG', 'awarded']]
all_league_goals.head()

Unnamed: 0,country,league,date,HomeTeam,AwayTeam,FTHG,FTAG,awarded
0,Albania,Super League 2016/2017,2017-05-27,Korabi Peshkopi,Flamurtari,0,3,False
1,Albania,Super League 2016/2017,2017-05-27,Laci,Teuta,2,1,False
2,Albania,Super League 2016/2017,2017-05-27,Luftetari Gjirokastra,Kukesi,1,0,False
3,Albania,Super League 2016/2017,2017-05-27,Skenderbeu,Partizani,2,2,False
4,Albania,Super League 2016/2017,2017-05-27,Vllaznia,KF Tirana,0,0,False


In [30]:
all_league_goals.to_csv(r"C:\Users\Suwani Gunasekara\OneDrive\Documents\Sem 7 ML\Mid term\ex1.csv")

In [17]:
# little bit of data cleansing to remove fixtures that were abandoned/awarded/postponed
all_league_goals = all_league_goals[~all_league_goals['awarded']]
all_league_goals = all_league_goals[all_league_goals['FTAG']!='POSTP.']
all_league_goals = all_league_goals[all_league_goals['FTAG']!='CAN.']
all_league_goals[['FTAG', 'FTHG']] = all_league_goals[['FTAG', 'FTHG']].astype(int)

In [18]:
all_league_goals.head()

Unnamed: 0,country,league,date,HomeTeam,AwayTeam,FTHG,FTAG,awarded
0,Albania,Super League 2016/2017,2017-05-27,Korabi Peshkopi,Flamurtari,0,3,False
1,Albania,Super League 2016/2017,2017-05-27,Laci,Teuta,2,1,False
2,Albania,Super League 2016/2017,2017-05-27,Luftetari Gjirokastra,Kukesi,1,0,False
3,Albania,Super League 2016/2017,2017-05-27,Skenderbeu,Partizani,2,2,False
4,Albania,Super League 2016/2017,2017-05-27,Vllaznia,KF Tirana,0,0,False


In [28]:
all_league_goals.loc[all_league_goals.country == 'Albania']

Unnamed: 0,country,league,date,HomeTeam,AwayTeam,FTHG,FTAG,awarded
0,Albania,Super League 2016/2017,2017-05-27,Korabi Peshkopi,Flamurtari,0,3,False
1,Albania,Super League 2016/2017,2017-05-27,Laci,Teuta,2,1,False
2,Albania,Super League 2016/2017,2017-05-27,Luftetari Gjirokastra,Kukesi,1,0,False
3,Albania,Super League 2016/2017,2017-05-27,Skenderbeu,Partizani,2,2,False
4,Albania,Super League 2016/2017,2017-05-27,Vllaznia,KF Tirana,0,0,False
...,...,...,...,...,...,...,...,...
4284,Albania,Super League 2016/2017,2016-08-09,Vllaznia,Kukesi,0,0,False
4285,Albania,Super League 2016/2017,2016-07-09,Korabi Peshkopi,Laci,0,0,False
4286,Albania,Super League 2016/2017,2016-07-09,Luftetari Gjirokastra,Partizani,0,1,False
4287,Albania,Super League 2016/2017,2016-07-09,Skenderbeu,Flamurtari,2,1,False


In [42]:
home_advantage_country = pd.DataFrame(all_league_goals.assign(match_goals = all_league_goals['FTHG'] +  
                                      all_league_goals['FTHG']).groupby(['country','league']).agg(
        {'HomeTeam':['size','nunique'], 'match_goals':'mean'}).to_records())
home_advantage_country.columns = ['country', 'league', 'num_games', 'num_teams', 'avg_goals']
temp_set = []
for i in range(80):
    temp_set.append(get_home_team_advantage(all_league_goals[(
                    all_league_goals['country']==home_advantage_country['country'][i]) & (
                all_league_goals['league']==home_advantage_country['league'][i])]))
temp_set = pd.DataFrame(temp_set,columns= ['home_advantage_score', 'left_tail', 'right_tail'])
home_advantage_country = pd.concat([home_advantage_country, temp_set], axis=1).sort_values('home_advantage_score', 
                                            ascending=False).reset_index(drop=True)
home_advantage_country.index = home_advantage_country.index + 1
# if you want display more/less rows than the default option
pd.options.display.max_rows = 40
home_advantage_country.assign(avg_goals= pd.Series.round(home_advantage_country['avg_goals'], 3),
                             home_advantage_score= pd.Series.round(home_advantage_country['home_advantage_score'], 3),
                             left_tail= pd.Series.round(home_advantage_country['left_tail'], 3),
                             right_tail= pd.Series.round(home_advantage_country['right_tail'], 3))

Unnamed: 0,country,league,num_games,num_teams,avg_goals,home_advantage_score,left_tail,right_tail
1,Haiti,Championnat National 2017,237,16,2.329,0.741,0.533,0.949
2,Algeria,Ligue 1 2016/2017,238,16,2.790,0.698,0.512,0.884
3,Ghana,Premier League 2017,238,16,2.924,0.676,0.494,0.857
4,Bolivia,Liga de Futbol Prof 2016/2017,132,12,4.470,0.624,0.431,0.818
5,Guatemala,Liga Nacional 2016/2017,264,12,2.803,0.620,0.448,0.792
...,...,...,...,...,...,...,...,...
156,Vietnam,V-League 2017,182,14,3.165,,,
157,Wales,Premier League 2016/2017,132,12,3.333,,,
158,Yemen,Division 1 2013/2014,180,14,2.678,,,
159,Zambia,Super League 2017,379,20,2.306,,,


In [43]:
home_advantage_country = pd.DataFrame(all_league_goals.assign(match_goals = all_league_goals['FTHG'] +  
                                      all_league_goals['FTHG']).groupby(['country','league']).agg(
        {'HomeTeam':['size','nunique'], 'match_goals':'mean'}).to_records())
home_advantage_country.columns = ['country', 'league', 'num_games', 'num_teams', 'avg_goals']
temp_set = []
for i in range(home_advantage_country.shape[0]):
    temp_set.append(get_home_team_advantage(all_league_goals[(
                    all_league_goals['country']==home_advantage_country['country'][i]) & (
                all_league_goals['league']==home_advantage_country['league'][i])]))
temp_set = pd.DataFrame(temp_set,columns= ['home_advantage_score', 'left_tail', 'right_tail'])
home_advantage_country = pd.concat([home_advantage_country, temp_set], axis=1).sort_values('home_advantage_score', 
                                            ascending=False).reset_index(drop=True)
home_advantage_country.index = home_advantage_country.index + 1
# if you want display more/less rows than the default option
pd.options.display.max_rows = 40
home_advantage_country.assign(avg_goals= pd.Series.round(home_advantage_country['avg_goals'], 3),
                             home_advantage_score= pd.Series.round(home_advantage_country['home_advantage_score'], 3),
                             left_tail= pd.Series.round(home_advantage_country['left_tail'], 3),
                             right_tail= pd.Series.round(home_advantage_country['right_tail'], 3))

  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))


ValueError: NaN, inf or invalid value detected in weights, estimation infeasible.

In [20]:
home_advantage_country = pd.DataFrame(all_league_goals.assign(match_goals = all_league_goals['FTHG'] +  
                                      all_league_goals['FTHG']).groupby(['country','league']).agg(
        {'HomeTeam':['size','nunique'], 'match_goals':'mean'}).to_records())
home_advantage_country.columns = ['country', 'league', 'num_games', 'num_teams', 'avg_goals']

In [22]:
home_advantage_country.head()

Unnamed: 0,country,league,num_games,num_teams,avg_goals
0,Albania,Super League 2016/2017,180,10,2.333333
1,Algeria,Ligue 1 2016/2017,238,16,2.789916
2,Andorra,Primera Divisió 2016/2017,83,8,3.204819
3,Angola,Girabola 2017,239,16,2.677824
4,Argentina,Primera Division 2016/2017,450,30,2.497778


In [25]:
home_advantage_country.groupby(['country','league']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_games,num_teams,avg_goals
country,league,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Albania,Super League 2016/2017,180,10,2.333333
Algeria,Ligue 1 2016/2017,238,16,2.789916
Andorra,Primera Divisió 2016/2017,83,8,3.204819
Angola,Girabola 2017,239,16,2.677824
Argentina,Primera Division 2016/2017,450,30,2.497778
...,...,...,...,...
Vietnam,V-League 2017,182,14,3.164835
Wales,Premier League 2016/2017,132,12,3.333333
Yemen,Division 1 2013/2014,180,14,2.677778
Zambia,Super League 2017,379,20,2.306069


In [27]:
home_advantage_country.loc[home_advantage_country.country == 'England']

Unnamed: 0,country,league,num_games,num_teams,avg_goals
40,England,Premier League 2016/2017,380,20,3.194737


In [33]:
home_advantage_country = pd.DataFrame(all_league_goals.assign(match_goals = all_league_goals['FTHG'] +  
                                      all_league_goals['FTHG']).groupby(['country','league']).agg(
        {'HomeTeam':['size','nunique'], 'match_goals':'mean'}).to_records())
home_advantage_country.columns = ['country', 'league', 'num_games', 'num_teams', 'avg_goals']
temp_set = []
for i in range(home_advantage_country.shape[0]):
    temp_set.append(all_league_goals[(
                    all_league_goals['country']==home_advantage_country['country'][i]) & (
                all_league_goals['league']==home_advantage_country['league'][i])])

In [36]:
temp_set

[      country                  league       date               HomeTeam  \
 0     Albania  Super League 2016/2017 2017-05-27        Korabi Peshkopi   
 1     Albania  Super League 2016/2017 2017-05-27                   Laci   
 2     Albania  Super League 2016/2017 2017-05-27  Luftetari Gjirokastra   
 3     Albania  Super League 2016/2017 2017-05-27             Skenderbeu   
 4     Albania  Super League 2016/2017 2017-05-27               Vllaznia   
 ...       ...                     ...        ...                    ...   
 4284  Albania  Super League 2016/2017 2016-08-09               Vllaznia   
 4285  Albania  Super League 2016/2017 2016-07-09        Korabi Peshkopi   
 4286  Albania  Super League 2016/2017 2016-07-09  Luftetari Gjirokastra   
 4287  Albania  Super League 2016/2017 2016-07-09             Skenderbeu   
 4439  Albania  Super League 2016/2017 2016-07-09                  Teuta   
 
         AwayTeam  FTHG  FTAG  awarded  
 0     Flamurtari     0     3    False  
 1  