# Set up

In [1]:
# Extend the module search path with the sibling data and modelling directories
# (the local modules imported in the next cell are presumably found there).
import sys

sys.path.extend(['../data/', '../modelling/'])

In [2]:
import player_data as player
import goodness_fit
import pandas as pd
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import patsy

  from pandas.core import datetools


# Get data

In [3]:
# Path of the cached gameweek extract; the name embeds today's date, so a new
# cache file is used each day.
today_stamp = datetime.today().strftime('%Y-%m-%d')
file_name = ''.join(['csvs/element_gameweek_', today_stamp, '.csv'])

In [4]:
# Load today's cached extract if it exists; otherwise fetch the data through
# player.get_element_gameweek_df() and write the cache for later runs.
try:
    element_gameweek_df = pd.read_csv(file_name)
except FileNotFoundError:
    # Only a missing cache file triggers a refetch. Any other failure (a corrupt
    # CSV, a KeyboardInterrupt, a typo in this cell) now surfaces instead of
    # being silently masked by a bare `except:`.
    element_gameweek_df = player.get_element_gameweek_df()
    element_gameweek_df.to_csv(file_name)

In [5]:
# Selection parameters consumed by the row-filtering cell that follows.
current_event = 30          # keep rows whose 'event' is <= this gameweek
threshold_minutes = 1       # keep rows with at least this many minutes (keeps log(minutes) finite)
element_types = [2, 3, 4]   # 'element_type' codes to keep; presumably outfield positions - confirm

In [6]:
# Keep only rows up to current_event, for the selected element types, with at
# least threshold_minutes played.
# .copy() makes the result an independent frame: the next cell passes it to
# player.add_home_categorical_variable and discards the return value, so that
# helper presumably adds a column in place. Doing that on an un-copied
# boolean-mask selection can raise pandas' SettingWithCopyWarning.
element_gameweek_df = element_gameweek_df[
    (element_gameweek_df['event'] <= current_event)
    & (element_gameweek_df['element_type'].isin(element_types))
    & (element_gameweek_df['minutes'] >= threshold_minutes)
].copy()

In [7]:
# The return value is not captured, so this helper is expected to modify
# element_gameweek_df in place (presumably adding a home/away indicator
# column - confirm in player_data).
player.add_home_categorical_variable(element_gameweek_df)

# Poisson player minutes model

## Preparing data

In [8]:
# Work on an independent copy so the shared element_gameweek_df stays untouched
# by the model-specific preparation below.
sppmm_df = element_gameweek_df.copy(deep=True)

In [9]:
# Expand each categorical column through the player_data helper, feeding the
# result of one call into the next (opposition team first, then player).
for categorical_column in ('opposition_team', 'element'):
    sppmm_df = player.add_categorical_variables(sppmm_df, categorical_column)

In [10]:
# Column names handed to player.remove_redundant_columns before fitting.
# 'goals_scored' is deliberately absent (it is the response). The raw 'minutes',
# 'element' and 'opposition_team' columns are listed because the model uses
# log_minutes and the generated categorical columns instead.
# NOTE(review): both 'opponent_team' and 'opposition_team' appear - presumably two
# distinct source columns; confirm against the data extract.
sppmm_redundant_columns = [
    'row_id', 'assists', 'attempted_passes', 'big_chances_created',
    'big_chances_missed', 'bonus', 'bps', 'clean_sheets',
    'clearances_blocks_interceptions', 'completed_passes', 'creativity',
    'dribbles', 'ea_index', 'element', 'errors_leading_to_goal',
    'errors_leading_to_goal_attempt', 'fixture', 'fouls', 'goals_conceded',
    'ict_index', 'id', 'influence', 'key_passes', 'kickoff_time',
    'kickoff_time_formatted', 'loaned_in', 'loaned_out', 'offside',
    'open_play_crosses', 'opponent_team', 'own_goals', 'penalties_conceded',
    'penalties_missed', 'penalties_saved', 'recoveries', 'red_cards', 'round',
    'saves', 'selected', 'tackled', 'tackles', 'target_missed', 'team_a_score',
    'team_h_score', 'threat', 'total_points', 'transfers_balance',
    'transfers_in', 'transfers_out', 'value', 'winning_goals',
    'yellow_cards', 'event', 'team_a', 'team_a_difficulty', 'team_h',
    'team_h_difficulty', 'element_type', 'team', 'web_name', 'safe_web_name',
    'own_team', 'opposition_team', 'was_home', 'minutes'
]

In [11]:
# Add the natural log of minutes played as a regressor. Rows were filtered to
# minutes >= threshold_minutes earlier, so the log is finite.
sppmm_df = sppmm_df.assign(log_minutes=np.log(sppmm_df['minutes']))

In [12]:
# Strip the columns named in sppmm_redundant_columns via the player_data helper
# (presumably a plain column drop - confirm in player_data).
sppmm_df = player.remove_redundant_columns(sppmm_df, sppmm_redundant_columns)

In [13]:
# Split the prepared frame into the response ('goals_scored') and the
# explanatory design matrix. The fitted summary lists a 'const' term, so the
# helper presumably also adds an intercept column - confirm in player_data.
sppmm_response_df, sppmm_explanatory_df = player.get_response_explanatory_dfs(sppmm_df, 'goals_scored')

## Fitting model

In [14]:
# Poisson GLM of goals scored on the prepared design matrix; rows containing
# missing values are dropped before fitting.
sppmm_model = sm.GLM(
    endog=sppmm_response_df,
    exog=sppmm_explanatory_df,
    family=sm.families.Poisson(),
    missing='drop',
)
sppmm_results = sppmm_model.fit()

In [15]:
# Print the plain-text regression summary (fit statistics plus one row per
# coefficient) for the hand-built design matrix model.
print(sppmm_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           goals_scored   No. Observations:                 7649
Model:                            GLM   Df Residuals:                     7170
Model Family:                 Poisson   Df Model:                          478
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -2048.1
Date:                Fri, 22 Mar 2019   Deviance:                       2585.8
Time:                        16:32:14   Pearson chi2:                 5.61e+03
No. Iterations:                    25                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -4.5824      0.712     -6.434      0.000      -5.978      -3.186
opposition_team_2      0.313

## Model fit

In [16]:
# Akaike information criterion of the fitted model. The value shown is consistent
# with -2 * log-likelihood + 2 * (Df Model + 1) from the summary above.
sppmm_results.aic

5054.161947454179

In [17]:
# NOTE: the value shown equals deviance - df_resid * log(nobs)
# (2585.8 - 7170 * log(7649) is approx. -61530.7), i.e. a deviance-based BIC.
# It is therefore negative and not on the same scale as the log-likelihood-based
# AIC; compare it only with BICs computed the same way.
sppmm_results.bic

-61530.68640581876

In [18]:
# Pearson chi-squared goodness-of-fit check from the local goodness_fit module.
# The helper returns a 2-tuple. NOTE(review): presumably (reject-null flag, p-value) -
# confirm against goodness_fit.pearson_chi_squared_test before interpreting it.
sppmm_pearson_results = goodness_fit.pearson_chi_squared_test(sppmm_results)
sppmm_pearson_results

(False, 1.0)

# Poisson player minutes model with Patsy

## Preparing data

In [26]:
# Independent copy of the filtered gameweek data for the formula-based model.
patsy_df = element_gameweek_df.copy(deep=True)

In [74]:
# Same specification as the hand-built model, expressed as a formula: log of
# minutes plus categorical terms for opposition team and player.
patsy_formula = 'goals_scored ~ np.log(minutes) + C(opposition_team) + C(element)'

# dmatrices returns the (response, explanatory) pair; unpack it in a second step.
patsy_design = patsy.dmatrices(patsy_formula, data=patsy_df, return_type='dataframe')
patsy_response_df, patsy_explanatory_df = patsy_design

## Fitting model

In [75]:
# Poisson GLM fitted on the patsy-generated design matrices; rows containing
# missing values are dropped before fitting.
patsy_model = sm.GLM(
    endog=patsy_response_df,
    exog=patsy_explanatory_df,
    family=sm.families.Poisson(),
    missing='drop',
)
patsy_results = patsy_model.fit()

In [76]:
# Print the plain-text regression summary for the formula-based model. Its fit
# statistics can be checked against the hand-built model's summary earlier on.
print(patsy_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           goals_scored   No. Observations:                 7649
Model:                            GLM   Df Residuals:                     7170
Model Family:                 Poisson   Df Model:                          478
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -2048.1
Date:                Fri, 22 Mar 2019   Deviance:                       2585.8
Time:                        16:47:12   Pearson chi2:                 5.61e+03
No. Iterations:                    25                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -4.5824      0.712     -6.434      0.000      -5.978      -3.186
C(oppositi

## Model fit

In [77]:
# Akaike information criterion of the formula-based fit. The value shown is consistent
# with -2 * log-likelihood + 2 * (Df Model + 1) from its summary.
patsy_results.aic

5054.1619474541785

In [78]:
# NOTE: the value shown equals deviance - df_resid * log(nobs)
# (2585.8 - 7170 * log(7649) is approx. -61530.7), i.e. a deviance-based BIC.
# It is therefore negative and not on the same scale as the log-likelihood-based
# AIC; compare it only with BICs computed the same way.
patsy_results.bic

-61530.68640581876

In [79]:
# Pearson chi-squared goodness-of-fit check from the local goodness_fit module.
# The helper returns a 2-tuple. NOTE(review): presumably (reject-null flag, p-value) -
# confirm against goodness_fit.pearson_chi_squared_test before interpreting it.
patsy_pearson_results = goodness_fit.pearson_chi_squared_test(patsy_results)
patsy_pearson_results

(False, 1.0)