# Set up

In [1]:
# update path with data dir
import sys
sys.path.append('../data/')
sys.path.append('../modelling/')

In [2]:
import player_data as player
import goodness_fit
import pandas as pd
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import patsy

  from pandas.core import datetools


In [3]:
response_variable = 'goals_scored'

# Get data

In [4]:
file_name = 'csvs/element_gameweek_' + datetime.today().strftime('%Y-%m-%d') + '.csv'

In [5]:
try:
    element_gameweek_df = pd.read_csv(file_name)
except:
    element_gameweek_df = player.get_element_gameweek_df()
    element_gameweek_df.to_csv(file_name)

In [6]:
current_event = 32
element_types = [2, 3, 4]
threshold_minutes = 1

In [7]:
element_gameweek_df =\
element_gameweek_df[
    (element_gameweek_df['event'] <= current_event)
    & (element_gameweek_df['element_type'].isin(element_types))
    & (element_gameweek_df['minutes'] >= threshold_minutes)
]

In [8]:
player.add_home_categorical_variable(element_gameweek_df)

In [9]:
element_gameweek_df['row_id'] = element_gameweek_df['row_id'].rank()

In [10]:
element_gameweek_df['element_fixture_rank'] = element_gameweek_df.groupby('element')['row_id'].rank()

In [11]:
rolling_df = \
element_gameweek_df.copy().groupby('element', as_index=True)[['minutes', 'goals_scored', 'assists']]\
.rolling(38, min_periods=1).sum().reset_index()[['element', 'minutes', 'goals_scored', 'assists']]

In [12]:
rolling_df['element_fixture_rank'] = rolling_df.groupby('element')['minutes'].rank()

In [13]:
rolling_df['element_fixture_rank'] = rolling_df['element_fixture_rank'] + 1

In [None]:
element_gameweek_df =\
element_gameweek_df.join(
    rolling_df.set_index(['element', 'element_fixture_rank']),
    on=['element', 'element_fixture_rank'],
    rsuffix='_rolling')

In [None]:
element_gameweek_df['goals_scored_per_minute_rolling'] = \
element_gameweek_df['goals_scored_rolling'] / element_gameweek_df['minutes_rolling']

element_gameweek_df['assists_per_minute_rolling'] = \
element_gameweek_df['assists_rolling'] / element_gameweek_df['minutes_rolling']

In [None]:
rolling_p5_df = \
element_gameweek_df.copy().groupby('element', as_index=True)[['minutes', 'goals_scored', 'assists']]\
.rolling(5, min_periods=1).sum().reset_index()[['element', 'minutes', 'goals_scored', 'assists']]

In [None]:
rolling_p5_df['element_fixture_rank'] = rolling_p5_df.groupby('element')['minutes'].rank()

In [None]:
rolling_p5_df['element_fixture_rank'] = rolling_p5_df['element_fixture_rank'] + 1

In [None]:
element_gameweek_df =\
element_gameweek_df.join(
    rolling_p5_df.set_index(['element', 'element_fixture_rank']),
    on=['element', 'element_fixture_rank'],
    rsuffix='_rolling_p5')

In [None]:
element_gameweek_df['goals_scored_per_minute_rolling_p5'] = \
element_gameweek_df['goals_scored_rolling_p5'] / element_gameweek_df['minutes_rolling_p5']

element_gameweek_df['assists_per_minute_rolling_p5'] = \
element_gameweek_df['assists_rolling_p5'] / element_gameweek_df['minutes_rolling_p5']

In [None]:
element_gameweek_df.head()

Unnamed: 0,row_id,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,completed_passes,...,minutes_rolling,goals_scored_rolling,assists_rolling,goals_scored_per_minute_rolling,assists_per_minute_rolling,minutes_rolling_p5,goals_scored_rolling_p5,assists_rolling_p5,goals_scored_per_minute_rolling_p5,assists_per_minute_rolling_p5
80,1.0,0,92,0,0,0,11,0,6,85,...,,,,,,,,,,
82,2.0,0,74,0,0,0,17,0,4,70,...,90.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0
83,3.0,0,38,0,0,0,10,0,2,36,...,159.0,0.0,0.0,0.0,0.0,159.0,0.0,0.0,0.0,0.0
84,4.0,0,64,0,0,0,19,0,10,57,...,204.0,0.0,0.0,0.0,0.0,204.0,0.0,0.0,0.0,0.0
85,5.0,0,89,0,0,0,17,0,2,87,...,294.0,0.0,0.0,0.0,0.0,294.0,0.0,0.0,0.0,0.0


# Models

## Helpers

In [None]:
def get_all_columns(formula, df):
    return patsy.dmatrices(formula, df, return_type='dataframe')[1].columns

def add_missing_columns(df, columns):
    for col in set(columns) - set(df.columns):
        df[col] = np.zeros(len(df))
    
    return df[columns]

In [None]:
def kfold_cross_validation(formula, df, splits):
    columns = get_all_columns(formula, df)
    
    folds = []
    
    for train_index, test_index in KFold(n_splits=splits, shuffle=True).split(df):
        folds.append((df.iloc[train_index], df.iloc[test_index]))
    
    eval_metrics = []
    for f in folds:
        try:
            response_train, explanatory_train = patsy.dmatrices(formula, f[0], return_type='dataframe')
            explanatory_train = add_missing_columns(explanatory_train, columns)
            
            response_test, explanatory_test = patsy.dmatrices(formula, f[1], return_type='dataframe')
            explanatory_test = add_missing_columns(explanatory_test, columns)
            
            model = sm.GLM(response_train, explanatory_train, family=sm.families.Poisson(), missing='drop').fit()
            reponse_test_pred = model.predict(explanatory_test)
            reponse_train_pred = model.predict(explanatory_train)

            mean_squared_error_test = mean_squared_error(response_test, reponse_test_pred)
            mean_squared_error_train = mean_squared_error(response_train, reponse_train_pred)
            mean_absolute_error_test = mean_absolute_error(response_test, reponse_test_pred)
            mean_absolute_error_train = mean_absolute_error(response_train, reponse_train_pred)

            m = {
                'mean_squared_error_test': mean_squared_error_test,
                'mean_absolute_error_test': mean_absolute_error_test,
                'mean_squared_error_train': mean_squared_error_train,
                'mean_absolute_error_train': mean_absolute_error_train,
            }

            eval_metrics.append(m)
        except Exception as e:
            print(e)
            
    return pd.DataFrame(eval_metrics)

In [None]:
def n_kfold_cross_validation(formula, df, splits, n):
    eval_metrics_arr = []
    for i in range(0, n):
        print('iteration ', i + 1)
        eval_metrics_arr.append(kfold_cross_validation(formula, df, splits))
    
    eval_metrics = pd.concat(eval_metrics_arr)
    return pd.DataFrame(eval_metrics)

## Poisson teams minutes model

In [None]:
sptmm_df = element_gameweek_df.copy()

In [None]:
sptmm_formula = \
f'{response_variable} ~ np.log(minutes) + C(element_type) + C(own_team) + C(opposition_team) + was_home'

In [None]:
sptmm_eval_df = n_kfold_cross_validation(sptmm_formula, sptmm_df, 10, 10)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5
iteration  6
iteration  7
iteration  8
iteration  9
iteration  10


In [None]:
sptmm_eval_df['model'] = 'Poisson teams minutes'
sptmm_eval = sptmm_eval_df.groupby('model').mean()
sptmm_eval

Unnamed: 0_level_0,mean_absolute_error_test,mean_absolute_error_train,mean_squared_error_test,mean_squared_error_train
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Poisson teams minutes,0.174362,0.172976,0.10702,0.105167


## Poisson teams minutes season form model

In [None]:
sptmmf_df = element_gameweek_df.copy()

In [None]:
sptmmf_formula = \
f'''
{response_variable} ~ 
np.log(minutes) + C(element_type) + C(own_team) + C(opposition_team) + was_home
+ goals_scored_per_minute_rolling + assists_per_minute_rolling
'''

In [None]:
sptmmf_eval_df = n_kfold_cross_validation(sptmmf_formula, sptmmf_df, 10, 10)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5
iteration  6
iteration  7
iteration  8
iteration  9
iteration  10


In [None]:
sptmmf_eval_df['model'] = 'Poisson teams minutes form'
sptmmf_eval = sptmmf_eval_df.groupby('model').mean()
sptmmf_eval

Unnamed: 0_level_0,mean_absolute_error_test,mean_absolute_error_train,mean_squared_error_test,mean_squared_error_train
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Poisson teams minutes form,0.177906,0.175007,0.124746,0.106857


## Poisson teams minutes recent form model

In [None]:
sptmm5_df = element_gameweek_df.copy()

In [None]:
sptmm5_formula = \
f'''
{response_variable} ~ 
np.log(minutes) + C(element_type) + C(own_team) + C(opposition_team) + was_home
+ goals_scored_per_minute_rolling_p5 + assists_per_minute_rolling_p5
'''

In [None]:
sptmm5_eval_df = n_kfold_cross_validation(sptmm5_formula, sptmm5_df, 10, 10)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5
iteration  6
iteration  7
iteration  8
iteration  9
iteration  10


In [50]:
sptmm5_eval_df['model'] = 'Poisson teams minutes recent form'
sptmm5_eval = sptmm5_eval_df.groupby('model').median()
sptmm5_eval

Unnamed: 0_level_0,mean_absolute_error_test,mean_absolute_error_train,mean_squared_error_test,mean_squared_error_train
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Poisson teams minutes recent form,0.182082,0.178784,0.110627,0.108089


## Poisson players minutes model

In [None]:
sppmm_df = element_gameweek_df.copy()

In [None]:
sppmm_formula = \
f'{response_variable} ~ np.log(minutes) + C(element) + C(opposition_team) + was_home'

In [44]:
sppmm_eval_df = n_kfold_cross_validation(sppmm_formula, sppmm_df, 10, 10)

iteration  1
iteration  2
iteration  4
iteration  5
iteration  6
iteration  7
SVD did not converge
iteration  8
iteration  9
iteration  10


In [45]:
sppmm_eval_df['model'] = 'Poisson players minutes'
sppmm_eval = sppmm_eval_df.groupby('model').mean()
sppmm_eval

Unnamed: 0_level_0,mean_absolute_error_test,mean_absolute_error_train,mean_squared_error_test,mean_squared_error_train
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Poisson players minutes,0.160746,0.151465,0.104168,0.092732


# Model comparison

In [52]:
pd.concat([
    sptmm_eval,
    sptmmf_eval,
    sptmm5_eval,
    sppmm_eval,
]).round(3)

Unnamed: 0_level_0,mean_absolute_error_test,mean_absolute_error_train,mean_squared_error_test,mean_squared_error_train
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Poisson teams minutes,0.174,0.173,0.107,0.105
Poisson teams minutes form,0.178,0.175,0.125,0.107
Poisson teams minutes recent form,0.182,0.179,0.111,0.108
Poisson players minutes,0.161,0.151,0.104,0.093
