# Set up

In [1]:
# Extend the module search path with the sibling data and modelling
# directories so that modules stored there can be imported by later cells.
# The paths are relative, so they depend on the notebook's working directory.
import sys
sys.path.append('../data/')
sys.path.append('../modelling/')

In [36]:
import player_data as player
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, precision_score, recall_score
import patsy
from datetime import datetime
import numpy as np

In [3]:
# Column used as the response (left-hand side) when the model formula is built.
response_variable = 'clean_sheets'

# Getting data

In [4]:
# Path of the cached extract, keyed by today's date (YYYY-MM-DD).
file_name = f"csvs/element_gameweek_{datetime.today().strftime('%Y-%m-%d')}.csv"

In [5]:
# Load today's cached extract if it exists; otherwise build the frame via
# player.get_element_gameweek_df() and write it to the cache path.
try:
    element_gameweek_df = pd.read_csv(file_name)
except FileNotFoundError:
    # Only a missing cache file should trigger a rebuild. A bare `except:` would
    # also hide parse errors and swallow KeyboardInterrupt.
    element_gameweek_df = player.get_element_gameweek_df()
    element_gameweek_df.to_csv(file_name)

In [6]:
# Settings for the row filter applied to element_gameweek_df.
current_event = 32  # rows with event <= this value are kept
element_types = [2, 3, 4]  # element_type values to keep (presumably position codes - TODO confirm)
threshold_minutes = 1  # rows with minutes >= this value are kept

In [7]:
# Keep only rows up to the current event, for the selected element types, in
# which at least `threshold_minutes` minutes were recorded.
# `.copy()` makes the result an independent frame, so the column assignments
# made in later cells do not raise pandas' SettingWithCopyWarning.
element_gameweek_df = element_gameweek_df[
    (element_gameweek_df['event'] <= current_event)
    & (element_gameweek_df['element_type'].isin(element_types))
    & (element_gameweek_df['minutes'] >= threshold_minutes)
].copy()

In [8]:
# NOTE(review): helper from the local `player_data` module. Its return value is
# discarded, so it presumably modifies element_gameweek_df in place - confirm.
player.add_home_categorical_variable(element_gameweek_df)

In [9]:
# Binary flag: 1 where goals_conceded equals 0, otherwise 0.
element_gameweek_df['clean_sheets_no_time'] = (
    element_gameweek_df['goals_conceded'].eq(0).astype('int64')
)

In [10]:
# Replace row_id with its ascending rank (starting at 1) over the filtered rows.
element_gameweek_df['row_id'] = element_gameweek_df['row_id'].rank()

In [11]:
# Counter of each row within its `element` group, ordered by row_id.
# It is the key used to join the rolling totals back onto this frame.
element_gameweek_df['element_fixture_rank'] = element_gameweek_df.groupby('element')['row_id'].rank()

In [12]:
# Per-row statistics that are summed into rolling totals and then converted
# into per-minute rates.
rolling_cols = [
    'clearances_blocks_interceptions',
    'errors_leading_to_goal',
    'errors_leading_to_goal_attempt',
    'goals_conceded',
    'saves',
    'penalties_saved',
    'tackles'
]

In [13]:
# For every element, sum minutes and the tracked statistics over a trailing
# window of up to 38 rows (min_periods=1 allows shorter windows early on).
rolling_df = (
    element_gameweek_df.copy()
    .groupby('element', as_index=True)[['minutes'] + rolling_cols]
    .rolling(38, min_periods=1)
    .sum()
    .reset_index()
    [['element', 'minutes'] + rolling_cols]
)

In [14]:
# Number each element's rows 1, 2, 3, ... in the order they appear in rolling_df.
# Counting positions instead of ranking the summed minutes stays correct even
# when the running minutes total ties or stops increasing. Adding 1.0 keeps the
# key as float, like the rank-based key on element_gameweek_df.
rolling_df['element_fixture_rank'] = rolling_df.groupby('element').cumcount() + 1.0

In [15]:
# Shift the key forward by one so that, once joined on element_fixture_rank,
# each row receives the totals accumulated up to the element's previous row
# (its own statistics are excluded from its features).
# Re-running this cell shifts the key again.
rolling_df['element_fixture_rank'] = rolling_df['element_fixture_rank'] + 1

In [16]:
# Attach the shifted rolling totals, matching on (element, element_fixture_rank).
# Right-hand columns whose names clash with existing ones get the '_rolling' suffix.
element_gameweek_df = element_gameweek_df.join(
    rolling_df.set_index(['element', 'element_fixture_rank']),
    on=['element', 'element_fixture_rank'],
    rsuffix='_rolling',
)

In [17]:
# Turn each rolling total into a per-minute rate using the rolling minutes.
for i in rolling_cols:
    element_gameweek_df[f'{i}_per_minute_rolling'] = element_gameweek_df[f'{i}_rolling'].div(
        element_gameweek_df['minutes_rolling']
    )

In [18]:
# Same aggregation as the 38-row totals, but over a trailing window of up to
# 5 rows per element.
rolling_p5_df = (
    element_gameweek_df.copy()
    .groupby('element', as_index=True)[['minutes'] + rolling_cols]
    .rolling(5, min_periods=1)
    .sum()
    .reset_index()
    [['element', 'minutes'] + rolling_cols]
)

In [19]:
# Number each element's rows 1, 2, 3, ... in the order they appear in
# rolling_p5_df. The previous version ranked the 5-row minutes total, which
# plateaus or drops once the window is full (e.g. 90, 180, ..., 450, 450, 450),
# producing tied or out-of-order keys and a mismatched join. A positional count
# does not depend on the summed values. Adding 1.0 keeps the key as float, like
# the rank-based key on element_gameweek_df.
rolling_p5_df['element_fixture_rank'] = rolling_p5_df.groupby('element').cumcount() + 1.0

In [20]:
# Shift the key forward by one so that, once joined on element_fixture_rank,
# each row receives the 5-row totals ending at the element's previous row.
# Re-running this cell shifts the key again.
rolling_p5_df['element_fixture_rank'] = rolling_p5_df['element_fixture_rank'] + 1

In [21]:
# Attach the shifted 5-row totals, matching on (element, element_fixture_rank).
# Right-hand columns whose names clash with existing ones get the '_rolling_p5' suffix.
element_gameweek_df = element_gameweek_df.join(
    rolling_p5_df.set_index(['element', 'element_fixture_rank']),
    on=['element', 'element_fixture_rank'],
    rsuffix='_rolling_p5',
)

In [22]:
# Turn each 5-row rolling total into a per-minute rate.
for i in rolling_cols:
    # The denominator must be the minutes summed over the same 5-row window
    # ('minutes_rolling_p5'), not the 38-row total ('minutes_rolling'); using
    # the latter understates the rate once more than 5 rows of history exist.
    element_gameweek_df[i + '_per_minute_rolling_p5'] = \
    element_gameweek_df[i + '_rolling_p5'] / element_gameweek_df['minutes_rolling_p5']

In [23]:
# Preview the first rows to check the engineered rolling columns.
element_gameweek_df.head()

Unnamed: 0,row_id,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,completed_passes,...,saves_rolling_p5,penalties_saved_rolling_p5,tackles_rolling_p5,clearances_blocks_interceptions_per_minute_rolling_p5,errors_leading_to_goal_per_minute_rolling_p5,errors_leading_to_goal_attempt_per_minute_rolling_p5,goals_conceded_per_minute_rolling_p5,saves_per_minute_rolling_p5,penalties_saved_per_minute_rolling_p5,tackles_per_minute_rolling_p5
80,1.0,0,92,0,0,0,11,0,6,85,...,,,,,,,,,,
82,2.0,0,74,0,0,0,17,0,4,70,...,0.0,0.0,0.0,0.066667,0.0,0.0,0.033333,0.0,0.0,0.0
83,3.0,0,38,0,0,0,10,0,2,36,...,0.0,0.0,1.0,0.062893,0.0,0.0,0.025157,0.0,0.0,0.006289
84,4.0,0,64,0,0,0,19,0,10,57,...,0.0,0.0,1.0,0.058824,0.0,0.0,0.02451,0.0,0.0,0.004902
85,5.0,0,89,0,0,0,17,0,2,87,...,0.0,0.0,2.0,0.07483,0.0,0.0,0.020408,0.0,0.0,0.006803


In [24]:
# List every column now present on the frame.
element_gameweek_df.columns

Index(['row_id', 'assists', 'attempted_passes', 'big_chances_created',
       'big_chances_missed', 'bonus', 'bps', 'clean_sheets',
       'clearances_blocks_interceptions', 'completed_passes', 'creativity',
       'dribbles', 'ea_index', 'element', 'errors_leading_to_goal',
       'errors_leading_to_goal_attempt', 'fixture', 'fouls', 'goals_conceded',
       'goals_scored', 'ict_index', 'id', 'influence', 'key_passes',
       'kickoff_time', 'kickoff_time_formatted', 'loaned_in', 'loaned_out',
       'minutes', 'offside', 'open_play_crosses', 'opponent_team', 'own_goals',
       'penalties_conceded', 'penalties_missed', 'penalties_saved',
       'recoveries', 'red_cards', 'round', 'saves', 'selected', 'tackled',
       'tackles', 'target_missed', 'team_a_score', 'team_h_score', 'threat',
       'total_points', 'transfers_balance', 'transfers_in', 'transfers_out',
       'value', 'was_home', 'winning_goals', 'yellow_cards', 'event', 'team_a',
       'team_a_difficulty', 'team_h', 'team

# Neural network

In [116]:
def add_missing_columns(df, columns):
    """Return ``df`` restricted to ``columns``, creating any that are absent.

    Labels in ``columns`` that ``df`` lacks are added as columns filled with
    0.0. The result's columns follow the order of ``columns``.

    The input frame is not modified. The previous implementation inserted the
    missing columns into the caller's frame, which also raised
    SettingWithCopyWarning when ``df`` was a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to conform.
    columns : sequence of labels
        Desired column labels, in the desired order.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly ``columns``.
    """
    return df.reindex(columns=columns, fill_value=0.0)

In [117]:
# Formula terms: one per-minute rolling feature per tracked statistic.
rolling_vars = ' + '.join(f'{col}_per_minute_rolling' for col in rolling_cols)
rolling_p5_vars = ' + '.join(f'{col}_per_minute_rolling_p5' for col in rolling_cols)

# Response ~ categorical element + categorical opposition team + home flag +
# the 38-row rolling rates. The 5-row terms (rolling_p5_vars) are built above
# but are currently not appended to the formula.
formula = (
    f'{response_variable} ~ C(element) + C(opposition_team) + was_home  + '
    + rolling_vars
)

print(formula)

clean_sheets ~ C(element) + C(opposition_team) + was_home  + clearances_blocks_interceptions_per_minute_rolling + errors_leading_to_goal_per_minute_rolling + errors_leading_to_goal_attempt_per_minute_rolling + goals_conceded_per_minute_rolling + saves_per_minute_rolling + penalties_saved_per_minute_rolling + tackles_per_minute_rolling


In [118]:
# Build the response (y) and design matrix (X) from the formula, as DataFrames.
# NOTE(review): patsy's default NA handling drops rows with missing values, so
# rows lacking rolling history are presumably excluded here - confirm.
y, X = patsy.dmatrices(formula, element_gameweek_df, return_type='dataframe')

In [119]:
# Number of columns in the design matrix; also used as the network's input size.
len(X.columns)

486

In [120]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the reported test metrics, reproducible across re-runs.
# NOTE(review): rows are shuffled without regard to fixture order; consider a
# time-based split if leakage between fixtures is a concern.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [121]:
# Make both splits expose exactly the design-matrix columns, in X's order.
X_train = add_missing_columns(X_train, X.columns)
X_test = add_missing_columns(X_test, X.columns)

In [122]:
# Standardise the explanatory variables to zero mean and unit variance.
# The scaling parameters are learned from the training split only and then
# reused for the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Apply the training-set transformation to the test split.
X_test = scaler.transform(X_test)

In [123]:
# Feed-forward binary classifier: four hidden layers of 25 ReLU units each,
# followed by a single sigmoid output unit.
model = Sequential([
    # first hidden layer; also declares the input width (one value per design-matrix column)
    Dense(units=25, activation='relu', input_shape=(len(X.columns),)),
    # remaining hidden layers
    Dense(units=25, activation='relu'),
    Dense(units=25, activation='relu'),
    Dense(units=25, activation='relu'),
    # output layer
    Dense(units=1, activation='sigmoid'),
])

In [124]:
# Set the training configuration: binary cross-entropy loss (matching the
# single sigmoid output unit) optimised with Adam.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam'
)

In [125]:
# Train the model on the standardised training split.
model.fit(
    X_train, # explanatory variable training data
    y_train, # response variable training data
    epochs=20, # number of passes over the training data
    batch_size=10, # TODO: tune - number of samples propagated through the network per update
    verbose=1 # monitor training progress
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f38900de278>

In [126]:
# Predicted probabilities for the test split, flattened to one dimension.
y_pred = model.predict(X_test).flatten()

# Hard labels: each probability rounded to the nearest integer.
y_pred_labels = y_pred.round()

# Test-set metrics: mean absolute error on the probabilities, plus precision
# and recall on the rounded labels.
test_loss = dict(
    mean_absolute_error=mean_absolute_error(y_test, y_pred),
    precision_score=precision_score(y_test, y_pred_labels),
    recall_score=recall_score(y_test, y_pred_labels),
)
test_loss

{'mean_absolute_error': 0.23264430048976129,
 'precision_score': 0.48459958932238195,
 'recall_score': 0.4609375}

In [106]:
# Show the predicted probabilities rounded to three decimal places.
print(y_pred.round(3))

[0.    0.    0.    ... 0.    0.    0.997]
