<a href="https://colab.research.google.com/github/weraines/sandbox/blob/master/soccer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from scipy.stats import poisson,skellam
import statsmodels.api as sm
import statsmodels.formula.api as smf

df_soccer = pd.read_csv(
    'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv')

df_soccer_epl = df_soccer[df_soccer['league_id'] == 2411]

df_soccer_epl_2018 = df_soccer_epl[df_soccer_epl['date'] > '2018-08-01']

df_soccer_epl_2018_lite =  df_soccer_epl_2018[['team1', 'team2', 'score1', 
                                               'score2', 'xg1', 'xg2',]]

df_soccer_epl_2018_lite = df_soccer_epl_2018_lite.rename(
    columns={'team1': 'home', 'team2': 'away', 'score1': 'home_goals', 
             'score2': 'away_goals', 'xg1': 'home_xg', 'xg2': 'away_xg'})

goal_model_data = pd.concat([df_soccer_epl_2018_lite[['home', 'away', 'home_goals']].assign(ht=1).rename(columns={'home': 'team', 'away': 'opponent', 'home_goals': 'goals'}), 
                             df_soccer_epl_2018_lite[['away', 'home', 'away_goals']].assign(ht=0).rename(columns={'away': 'team', 'home': 'opponent', 'away_goals': 'goals'})])

poisson_model = smf.glm(formula="goals ~ ht + team +opponent", data=goal_model_data, family=sm.families.Poisson()).fit()

def simulate_match(foot_model, homeTeam, awayTeam, max_goals=10):
    home_goals_avg = foot_model.predict(pd.DataFrame(data={'team': homeTeam, 
                                                            'opponent': awayTeam,'ht':1},
                                                      index=[1])).values[0]
    away_goals_avg = foot_model.predict(pd.DataFrame(data={'team': awayTeam, 
                                                            'opponent': homeTeam,'ht':0},
                                                      index=[1])).values[0]
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]]
    return(np.outer(np.array(team_pred[0]), np.array(team_pred[1])))
  
def total_offset(g_total):
  if g_total - int(g_total) == .25:
    u_adjust = -1
    o_adjust = 0 
  elif g_total -int(g_total) == .75:
    u_adjust = 0
    o_adjust = 1
  elif g_total%1 == 0:
    u_adjust = 1
    o_adjust = 0
  else:
    u_adjust = 0
    o_adjust = 0
    
  u = int(g_total) - 5 + u_adjust
  o = int(g_total) - 5 + 1 + o_adjust
  
  return[u, o]
  
def spread_home(gl_spread):
  if gl_spread <= -3.75:
    h_gl = -5
  elif gl_spread <= -2.75:
    h_gl = -4
  elif gl_spread <= -1.75:
    h_gl = -3
  elif gl_spread <= -.75:
    h_gl = -2
  elif gl_spread <= .25:
    h_gl = -1
  elif gl_spread <= 1.25:
    h_gl = 0
  elif gl_spread <= 2.25:
    h_gl = 1
  elif gl_spread <= 3.25:
    h_gl = 2
  elif gl_spread <= 4.25:
    h_gl = 3
  return(h_gl)

def spread_away(gl_spread):
  if gl_spread <= -4.5:
    a_gl = -4
  if gl_spread <= -3.5:
    a_gl = -3
  elif gl_spread <= -2.5:
    a_gl = -2
  elif gl_spread <= -1.5:
    a_gl = -1
  elif gl_spread <= -.5:
    a_gl = 0
  elif gl_spread <= .5:
    a_gl = 1
  elif gl_spread <= 1.5:
    a_gl = 2
  elif gl_spread <= 2.5:
    a_gl = 3
  elif gl_spread <= 3.5:
    a_gl = 4
  elif gl_spread <= 4.5:
    a_gl = 5
  return(a_gl)

df_games = pd.read_csv('gd37.csv')

h_gl_list = []
a_gl_list = []
h_ml_list = []
a_ml_list = []
t_ml_list = []
t_over = []
t_under = []

for index, row in df_games.iterrows():
  sim_match = simulate_match(poisson_model, row['Home'], row['Away'], max_goals=5)
  h_gl_list.append(np.sum(np.tril(sim_match, spread_home(row['home_gl']))))
  a_gl_list.append(np.sum(np.triu(sim_match, spread_away(row['home_gl']))))
  h_ml_list.append(np.sum(np.tril(sim_match, -1)))
  a_ml_list.append(np.sum(np.triu(sim_match, 1)))
  t_ml_list.append(np.sum(np.diag(sim_match)))
  
  sim_match_total = np.flip(sim_match, 1).T
  
  t_under.append(np.sum(np.tril(sim_match_total, total_offset(row['total'])[0])))
  t_over.append(np.sum(np.triu(sim_match_total, total_offset(row['total'])[1])))
  
df_games['pred_h_gl'] = h_gl_list
df_games['pred_a_gl'] = a_gl_list
df_games['pre_h_ml'] = h_ml_list
df_games['pred_a_ml'] = a_ml_list
df_games['pred_t_ml'] = t_ml_list
df_games['pred_over'] = t_over
df_games['pred_under'] = t_under

df_games.to_csv('gd37_pred.csv')

In [0]:
import matplotlib.pyplot as plt
import seaborn
from scipy.stats import poisson,skellam
import statsmodels.api as sm
import statsmodels.formula.api as smf

goal_model_data = pd.concat([df_soccer_epl_2018_lite[['home', 'away', 'home_goals']].assign(ht=1).rename(columns={'home': 'team', 'away': 'opponent', 'home_goals': 'goals'}), 
                             df_soccer_epl_2018_lite[['away', 'home', 'away_goals']].assign(ht=0).rename(columns={'away': 'team', 'home': 'opponent', 'away_goals': 'goals'})])

poisson_model = smf.glm(formula="goals ~ ht + team +opponent", data=goal_model_data, family=sm.families.Poisson()).fit()
poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,720
Model:,GLM,Df Residuals:,680
Model Family:,Poisson,Df Model:,39
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-1001.5
Date:,"Fri, 03 May 2019",Deviance:,708.85
Time:,14:16:04,Pearson chi2:,614.
No. Iterations:,5,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5059,0.193,2.626,0.009,0.128,0.883
team[T.Arsenal],0.2849,0.184,1.547,0.122,-0.076,0.646
team[T.Brighton and Hove Albion],-0.4729,0.223,-2.121,0.034,-0.910,-0.036
team[T.Burnley],-0.1629,0.205,-0.793,0.428,-0.565,0.240
team[T.Cardiff City],-0.5423,0.230,-2.361,0.018,-0.992,-0.092
team[T.Chelsea],0.1248,0.190,0.657,0.511,-0.248,0.497
team[T.Crystal Palace],-0.1837,0.206,-0.890,0.373,-0.588,0.221
team[T.Everton],-0.0512,0.198,-0.258,0.797,-0.440,0.338
team[T.Fulham],-0.4117,0.221,-1.862,0.063,-0.845,0.022


In [0]:
def simulate_match(foot_model, homeTeam, awayTeam, max_goals=10):
    home_goals_avg = foot_model.predict(pd.DataFrame(data={'team': homeTeam, 
                                                            'opponent': awayTeam,'ht':1},
                                                      index=[1])).values[0]
    away_goals_avg = foot_model.predict(pd.DataFrame(data={'team': awayTeam, 
                                                            'opponent': homeTeam,'ht':0},
                                                      index=[1])).values[0]
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]]
    return(np.outer(np.array(team_pred[0]), np.array(team_pred[1])))
  
def total_offset(g_total):
  if g_total - int(g_total) == .25:
    u_adjust = -1
    o_adjust = 0 
  elif g_total -int(g_total) == .75:
    u_adjust = 0
    o_adjust = 1
  elif g_total%1 == 0:
    u_adjust = 1
    o_adjust = 0
  else:
    u_adjust = 0
    o_adjust = 0
    
  u = int(g_total) - 5 + u_adjust
  o = int(g_total) - 5 + 1 + o_adjust
  
  return[u, o]
  
def spread_home(gl_spread):
  if gl_spread <= -3.75:
    h_gl = -5
  elif gl_spread <= -2.75:
    h_gl = -4
  elif gl_spread <= -1.75:
    h_gl = -3
  elif gl_spread <= -.75:
    h_gl = -2
  elif gl_spread <= .25:
    h_gl = -1
  elif gl_spread <= 1.25:
    h_gl = 0
  elif gl_spread <= 2.25:
    h_gl = 1
  elif gl_spread <= 3.25:
    h_gl = 2
  elif gl_spread <= 4.25:
    h_gl = 3
  return(h_gl)

def spread_away(gl_spread):
  if gl_spread <= -4.5:
    a_gl = -4
  if gl_spread <= -3.5:
    a_gl = -3
  elif gl_spread <= -2.5:
    a_gl = -2
  elif gl_spread <= -1.5:
    a_gl = -1
  elif gl_spread <= -.5:
    a_gl = 0
  elif gl_spread <= .5:
    a_gl = 1
  elif gl_spread <= 1.5:
    a_gl = 2
  elif gl_spread <= 2.5:
    a_gl = 3
  elif gl_spread <= 3.5:
    a_gl = 4
  elif gl_spread <= 4.5:
    a_gl = 5
  return(a_gl)

In [0]:
df_games = pd.read_csv('gd37.csv')

Unnamed: 0,Home,Away,home_gl,home_gl_prob,away_gl,away_gl_prob,home_ml,away_ml,tie_ml,total,over,under
0,Everton,Burnley,-1.0,0.512195,1.0,0.555556,0.64539,0.181818,0.25641,2.5,0.555556,0.512195
1,AFC Bournemouth,Tottenham Hotspur,0.5,0.52381,-0.5,0.545455,0.273973,0.541284,0.27027,3.0,0.512195,0.555556
2,West Ham United,Southampton,-0.25,0.512195,0.25,0.555556,0.444444,0.344828,0.285714,3.0,0.512195,0.555556
3,Wolverhampton,Fulham,-1.0,0.545455,1.0,0.52381,0.666667,0.166667,0.243902,2.5,0.574468,0.487805
4,Cardiff City,Crystal Palace,0.0,0.555556,0.0,0.512195,0.4,0.377358,0.298507,2.5,0.52381,0.545455


In [0]:
h_gl_list = []
a_gl_list = []
h_ml_list = []
a_ml_list = []
t_ml_list = []
t_over = []
t_under = []

for index, row in df_games.iterrows():
  sim_match = simulate_match(poisson_model, row['Home'], row['Away'], max_goals=5)
  h_gl_list.append(np.sum(np.tril(sim_match, spread_home(row['home_gl']))))
  a_gl_list.append(np.sum(np.triu(sim_match, spread_away(row['home_gl']))))
  h_ml_list.append(np.sum(np.tril(sim_match, -1)))
  a_ml_list.append(np.sum(np.triu(sim_match, 1)))
  t_ml_list.append(np.sum(np.diag(sim_match)))
  
  sim_match_total = np.flip(sim_match, 1).T
  
  t_under.append(np.sum(np.tril(sim_match_total, total_offset(row['total'])[0])))
  t_over.append(np.sum(np.triu(sim_match_total, total_offset(row['total'])[1])))
  
df_games['pred_h_gl'] = h_gl_list
df_games['pred_a_gl'] = a_gl_list
df_games['pre_h_ml'] = h_ml_list
df_games['pred_a_ml'] = a_ml_list
df_games['pred_t_ml'] = t_ml_list
df_games['pred_over'] = t_over
df_games['pred_under'] = t_under

df_games.to_csv('gd37_pred.csv')


In [0]:
liv_hud = simulate_match(poisson_model, 'West Ham United', 'Southampton', max_goals=5)

print(np.sum(np.tril(liv_hud, -1)))
print(np.sum(np.triu(liv_hud, 1)))
print(np.sum(np.diag(liv_hud)))

0.4830517712938376
0.26644517261415435
0.241733505930464


In [0]:
liv_hud_tot = np.flip(liv_hud,1).T

total = 3

print(np.sum(np.tril(liv_hud_tot, total_offset(total)[0])))
print(np.sum(np.triu(liv_hud_tot, total_offset(total)[1])))

0.6814757338059017
0.3097547160325544


In [0]:
spread = -1

print(np.sum(np.tril(liv_hud, spread_home(spread))))
print(np.sum(np.triu(liv_hud, spread_away(spread))))

0.34657905231739317
0.39590641045409075


In [0]:
liv_hud

array([[0.05802101, 0.06809629, 0.03996057, 0.01563322, 0.00458698,
        0.0010767 ],
       [0.09708663, 0.11394561, 0.06686606, 0.02615909, 0.0076754 ,
        0.00180164],
       [0.08122759, 0.09533267, 0.05594353, 0.02188602, 0.00642163,
        0.00150735],
       [0.04530608, 0.05317342, 0.03120346, 0.0122073 , 0.00358177,
        0.00084075],
       [0.01895268, 0.02224379, 0.0130532 , 0.00510662, 0.00149835,
        0.00035171],
       [0.00634271, 0.00744411, 0.00436839, 0.00170898, 0.00050144,
        0.0001177 ]])