# Set up

In [1]:
# update path with data dir
import sys
sys.path.append('../data/')
sys.path.append('../modelling/')

In [2]:
import player_data as player
import goodness_fit
import pandas as pd
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import patsy

  from pandas.core import datetools


# Get data

In [3]:
file_name = 'csvs/element_gameweek_' + datetime.today().strftime('%Y-%m-%d') + '.csv'

In [4]:
try:
    element_gameweek_df = pd.read_csv(file_name)
except:
    element_gameweek_df = player.get_element_gameweek_df()
    element_gameweek_df.to_csv(file_name)

In [5]:
current_event = 30
element_types = [1, 2, 3, 4]
threshold_minutes = 1

In [6]:
element_gameweek_df =\
element_gameweek_df[
    (element_gameweek_df['event'] <= current_event)
    & (element_gameweek_df['element_type'].isin(element_types))
    & (element_gameweek_df['minutes'] >= threshold_minutes)
]

# Models

## Clean sheets model

### Preparing data

In [7]:
csm_df = element_gameweek_df.copy()

In [8]:
csm_formula = 'clean_sheets ~ minutes + C(opposition_team) + C(own_team)'

csm_response_df, csm_explanatory_df = patsy.dmatrices(csm_formula, csm_df, return_type='dataframe')

### Fitting model

In [9]:
csm_results = sm.GLM(
    csm_response_df,
    csm_explanatory_df,
    family=sm.families.Binomial(),
    missing='drop'
).fit()

In [10]:
print(csm_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           clean_sheets   No. Observations:                 8250
Model:                            GLM   Df Residuals:                     8210
Model Family:                Binomial   Df Model:                           39
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -3467.2
Date:                Sat, 23 Mar 2019   Deviance:                       6934.4
Time:                        16:34:38   Pearson chi2:                 7.08e+03
No. Iterations:                     7                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -6.8933      0.316    -21.827      0.000      -7.512      -6.274
C(oppositi

### Predictions

In [44]:
csm_results.predict()

<bound method Results.predict of <statsmodels.genmod.generalized_linear_model.GLMResults object at 0x7f33c671b5c0>>

In [45]:
csm_explanatory_df

Unnamed: 0,Intercept,C(opposition_team)[T.2],C(opposition_team)[T.3],C(opposition_team)[T.4],C(opposition_team)[T.5],C(opposition_team)[T.6],C(opposition_team)[T.7],C(opposition_team)[T.8],C(opposition_team)[T.9],C(opposition_team)[T.10],...,C(own_team)[T.12],C(own_team)[T.13],C(own_team)[T.14],C(own_team)[T.15],C(own_team)[T.16],C(own_team)[T.17],C(own_team)[T.18],C(own_team)[T.19],C(own_team)[T.20],minutes
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
36,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
37,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
38,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0


## Goals model

### Preparing data

In [23]:
gm_df = element_gameweek_df.copy()

In [32]:
gm_formula = 'goals_scored ~ np.log(minutes) + C(opposition_team) + C(element)'

gm_response_df, gm_explanatory_df = patsy.dmatrices(gm_formula, gm_df, return_type='dataframe')

### Fitting model

In [33]:
gm_results = sm.GLM(
    gm_response_df,
    gm_explanatory_df,
    family=sm.families.Poisson(),
    missing='drop'
).fit()

In [34]:
print(gm_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           goals_scored   No. Observations:                 8250
Model:                            GLM   Df Residuals:                     7737
Model Family:                 Poisson   Df Model:                          512
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -2048.1
Date:                Sat, 23 Mar 2019   Deviance:                       2585.8
Time:                        16:39:22   Pearson chi2:                 5.61e+03
No. Iterations:                    25                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                  -31.1378   2.72e+05     -0.000      1.000   -5.33e+05    5.33e+05
C(oppositi

### Predictions

In [58]:
gm_results.predict(gm_explanatory_df.iloc[:1])

0    4.345173e-13
dtype: float64

True

## Assists model

### Preparing data

In [36]:
am_df = element_gameweek_df.copy()

In [37]:
am_formula = 'assists ~ np.log(minutes) + C(opposition_team) + C(element)'

am_response_df, am_explanatory_df = patsy.dmatrices(am_formula, am_df, return_type='dataframe')

### Fitting model

In [38]:
am_results = sm.GLM(
    am_response_df,
    am_explanatory_df,
    family=sm.families.Poisson(),
    missing='drop'
).fit()

In [39]:
print(am_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                assists   No. Observations:                 8250
Model:                            GLM   Df Residuals:                     7737
Model Family:                 Poisson   Df Model:                          512
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -2007.5
Date:                Sat, 23 Mar 2019   Deviance:                       2623.4
Time:                        16:40:35   Pearson chi2:                 5.07e+03
No. Iterations:                    25                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                  -30.7248   2.85e+05     -0.000      1.000   -5.58e+05    5.58e+05
C(oppositi