# Set up

In [1]:
from footbot.data import utils
import pandas as pd
import patsy
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from footbot.optimiser import team_selector
import scipy.stats

In [2]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 1000)

# Getting data

In [3]:
client = utils.set_up_bigquery(secrets_path='../secrets/service_account.json')

In [4]:
sql = \
'''
SELECT
  event,
  fixture,
  element,
  total_points,
  minutes,
  safe_web_name,
  element_type,
  team,
  value,
  total_points_per_minute_previous_season,
  goals_scored_per_minute_previous_season,
  assists_per_minute_previous_season,
  clean_sheets_per_minute_previous_season,
  goals_conceded_per_minute_previous_season,
  own_goals_per_minute_previous_season,
  penalties_saved_per_minute_previous_season,
  penalties_missed_per_minute_previous_season,
  yellow_cards_per_minute_previous_season,
  red_cards_per_minute_previous_season,
  saves_per_minute_previous_season,
  bonus_per_minute_previous_season,
  bps_per_minute_previous_season,
  opponent_team,
  was_home,
  was_sunday,
  was_weekday,
  was_late,
  was_early,
  rolling_avg_total_points_element,
  rolling_avg_minutes_element,
  rolling_avg_goals_scored_element,
  rolling_avg_assists_element,
  rolling_avg_clean_sheets_element,
  rolling_avg_goals_conceded_element,
  rolling_avg_own_goals_element,
  rolling_avg_penalties_saved_element,
  rolling_avg_penalties_missed_element,
  rolling_avg_yellow_cards_element,
  rolling_avg_red_cards_element,
  rolling_avg_saves_element,
  rolling_avg_bonus_element,
  rolling_avg_bps_element,
  rolling_avg_total_points_element_p1,
  rolling_avg_total_points_element_p2,
  rolling_avg_total_points_element_p3,
  rolling_avg_total_points_element_p4,
  rolling_avg_total_points_element_p5,
  rolling_avg_total_points_element_p10,
  rolling_avg_goals_scored_element_p1,
  rolling_avg_goals_scored_element_p2,
  rolling_avg_goals_scored_element_p3,
  rolling_avg_goals_scored_element_p4,
  rolling_avg_goals_scored_element_p5,
  rolling_avg_goals_scored_element_p10,
  rolling_avg_assists_element_p1,
  rolling_avg_assists_element_p2,
  rolling_avg_assists_element_p3,
  rolling_avg_assists_element_p4,
  rolling_avg_assists_element_p5,
  rolling_avg_assists_element_p10,
  rolling_avg_clean_sheets_element_p1,
  rolling_avg_clean_sheets_element_p2,
  rolling_avg_clean_sheets_element_p3,
  rolling_avg_clean_sheets_element_p4,
  rolling_avg_clean_sheets_element_p5,
  rolling_avg_clean_sheets_element_p10,
  rolling_avg_goals_conceded_element_p1,
  rolling_avg_goals_conceded_element_p2,
  rolling_avg_goals_conceded_element_p3,
  rolling_avg_goals_conceded_element_p4,
  rolling_avg_goals_conceded_element_p5,
  rolling_avg_goals_conceded_element_p10,
  rolling_avg_saves_element_p1,
  rolling_avg_saves_element_p2,
  rolling_avg_saves_element_p3,
  rolling_avg_saves_element_p4,
  rolling_avg_saves_element_p5,
  rolling_avg_saves_element_p10,
  rolling_avg_minutes_element_p1,
  rolling_avg_minutes_element_p2,
  rolling_avg_minutes_element_p3,
  rolling_avg_minutes_element_p4,
  rolling_avg_minutes_element_p5,
  rolling_avg_minutes_element_p10,
  rolling_avg_total_points_against_opponent_team_element_type,
  rolling_avg_minutes_against_opponent_team_element_type,
  rolling_avg_goals_scored_against_opponent_team_element_type,
  rolling_avg_assists_against_opponent_team_element_type,
  rolling_avg_clean_sheets_against_opponent_team_element_type,
  rolling_avg_goals_conceded_against_opponent_team_element_type,
  rolling_avg_own_goals_against_opponent_team_element_type,
  rolling_avg_penalties_saved_against_opponent_team_element_type,
  rolling_avg_penalties_missed_against_opponent_team_element_type,
  rolling_avg_yellow_cards_against_opponent_team_element_type,
  rolling_avg_red_cards_against_opponent_team_element_type,
  rolling_avg_saves_against_opponent_team_element_type,
  rolling_avg_bonus_against_opponent_team_element_type,
  rolling_avg_bps_against_opponent_team_element_type,
  rolling_avg_total_points_element_type,
  rolling_avg_minutes_element_type,
  rolling_avg_goals_scored_element_type,
  rolling_avg_assists_element_type,
  rolling_avg_clean_sheets_element_type,
  rolling_avg_goals_conceded_element_type,
  rolling_avg_own_goals_element_type,
  rolling_avg_penalties_saved_element_type,
  rolling_avg_penalties_missed_element_type,
  rolling_avg_yellow_cards_element_type,
  rolling_avg_red_cards_element_type,
  rolling_avg_saves_element_type,
  rolling_avg_bonus_element_type,
  rolling_avg_bps_element_type,
  expected_total_points_against_opponent_team_element_type,
  expected_minutes_against_opponent_team_element_type,
  expected_goals_scored_against_opponent_team_element_type,
  expected_assists_against_opponent_team_element_type,
  expected_clean_sheets_against_opponent_team_element_type,
  expected_goals_conceded_against_opponent_team_element_type,
  expected_own_goals_against_opponent_team_element_type,
  expected_penalties_saved_against_opponent_team_element_type,
  expected_penalties_missed_against_opponent_team_element_type,
  expected_yellow_cards_against_opponent_team_element_type,
  expected_red_cards_against_opponent_team_element_type,
  expected_saves_against_opponent_team_element_type,
  expected_bonus_against_opponent_team_element_type,
  expected_bps_against_opponent_team_element_type,
  rolling_avg_squad,
  rolling_avg_first_team,
  rolling_avg_vice_or_captain,
  rolling_avg_squad_p1,
  rolling_avg_first_team_p1,
  rolling_avg_vice_or_captain_p1
FROM
  `footbot-001.fpl.element_gameweeks_features_1920_v01`
'''

In [5]:
# get dataframe
df_all = client.query(sql).to_dataframe()

In [6]:
last_train_event = 3
last_test_event = 12
element_filter_field = 'rolling_avg_total_points_element'
element_filter_value = 0

In [7]:
# impute missing values by element type
df = pd.concat([
    df_all[df_all['element_type'] == i].fillna(
        df_all[
            (df_all['event'] <= last_train_event)
            & (df_all['element_type'] == i)
        ].mean()).fillna(0)
    for i in range(1, 5)
])

In [None]:
# impute missing values by element type
df = pd.concat([
    df_all[df_all['element_type'] == i].fillna(
        df_all[
            df_all['element_type'] == i
        ].mean()).fillna(0)
    for i in range(1, 5)
])

In [8]:
# df = df[df['element_type'] != 1]

In [9]:
df_all['predicted_total_points'] = 0

# Evaluating models

## Helpers

### Modelling

In [10]:
scaled_feature_cols = [
    i for i in df.columns if i not in [
        'total_points',
        'goals_scored',
        'assists',
        'clean_sheets',
        'goals_conceded',
        'saves',
        'minutes',
        'element',
        'safe_web_name',
        'element_type',
        'cluster',
        'team',
        'event',
        'fixture',
        'opponent_team',
        'was_home',
        'was_sunday',
        'was_weekday',
        'was_late',
        'was_early',
    ]
]

scaled_feature_cols

['value',
 'total_points_per_minute_previous_season',
 'goals_scored_per_minute_previous_season',
 'assists_per_minute_previous_season',
 'clean_sheets_per_minute_previous_season',
 'goals_conceded_per_minute_previous_season',
 'own_goals_per_minute_previous_season',
 'penalties_saved_per_minute_previous_season',
 'penalties_missed_per_minute_previous_season',
 'yellow_cards_per_minute_previous_season',
 'red_cards_per_minute_previous_season',
 'saves_per_minute_previous_season',
 'bonus_per_minute_previous_season',
 'bps_per_minute_previous_season',
 'rolling_avg_total_points_element',
 'rolling_avg_minutes_element',
 'rolling_avg_goals_scored_element',
 'rolling_avg_assists_element',
 'rolling_avg_clean_sheets_element',
 'rolling_avg_goals_conceded_element',
 'rolling_avg_own_goals_element',
 'rolling_avg_penalties_saved_element',
 'rolling_avg_penalties_missed_element',
 'rolling_avg_yellow_cards_element',
 'rolling_avg_red_cards_element',
 'rolling_avg_saves_element',
 'rolling_avg

In [11]:
formula = \
'''
total_points
~ C(element_type)
+ C(opponent_team)
+ C(team)
+ C(was_home)
+ C(was_sunday)
+ C(was_weekday)
+ C(was_late)
+ C(was_early)
+
'''  + ' + '.join(scaled_feature_cols)

In [12]:
def calculate_expected_dimension_against_opponent_team_element_type(
    row,
    feature
):
    dimension = feature[9:-35]
    a = 'rolling_avg_' + dimension + '_against_opponent_team_element_type'
    b = 'rolling_avg_' + dimension + '_element'
    c = 'rolling_avg_' + dimension + '_element_type'
    try:
        return row[a] * row[b] / row[c]
    except ZeroDivisionError:
        return 0

def munge_data(df, e, prediction_events, minute_threshold=60):
    # filter weeks after test week
    event_df = df.copy()
    event_df = event_df[event_df['event'] <= e + prediction_events - 1]

    cols = event_df.columns

    # columns that we wouldn't know ahead of time, but can fill down
    unknown_element_cols = [
        'value',
        'rolling_avg_total_points_element',
        'rolling_avg_minutes_element',
        'rolling_avg_goals_scored_element',
        'rolling_avg_assists_element',
        'rolling_avg_clean_sheets_element',
        'rolling_avg_goals_conceded_element',
        'rolling_avg_own_goals_element',
        'rolling_avg_penalties_saved_element',
        'rolling_avg_penalties_missed_element',
        'rolling_avg_yellow_cards_element',
        'rolling_avg_red_cards_element',
        'rolling_avg_saves_element',
        'rolling_avg_bonus_element',
        'rolling_avg_bps_element',
        'rolling_avg_total_points_element_p1',
        'rolling_avg_total_points_element_p2',
        'rolling_avg_total_points_element_p3',
        'rolling_avg_total_points_element_p4',
        'rolling_avg_total_points_element_p5',
        'rolling_avg_total_points_element_p10',
        'rolling_avg_goals_scored_element_p1',
        'rolling_avg_goals_scored_element_p2',
        'rolling_avg_goals_scored_element_p3',
        'rolling_avg_goals_scored_element_p4',
        'rolling_avg_goals_scored_element_p5',
        'rolling_avg_goals_scored_element_p10',
        'rolling_avg_assists_element_p1',
        'rolling_avg_assists_element_p2',
        'rolling_avg_assists_element_p3',
        'rolling_avg_assists_element_p4',
        'rolling_avg_assists_element_p5',
        'rolling_avg_assists_element_p10',
        'rolling_avg_clean_sheets_element_p1',
        'rolling_avg_clean_sheets_element_p2',
        'rolling_avg_clean_sheets_element_p3',
        'rolling_avg_clean_sheets_element_p4',
        'rolling_avg_clean_sheets_element_p5',
        'rolling_avg_clean_sheets_element_p10',
        'rolling_avg_goals_conceded_element_p1',
        'rolling_avg_goals_conceded_element_p2',
        'rolling_avg_goals_conceded_element_p3',
        'rolling_avg_goals_conceded_element_p4',
        'rolling_avg_goals_conceded_element_p5',
        'rolling_avg_goals_conceded_element_p10',
        'rolling_avg_saves_element_p1',
        'rolling_avg_saves_element_p2',
        'rolling_avg_saves_element_p3',
        'rolling_avg_saves_element_p4',
        'rolling_avg_saves_element_p5',
        'rolling_avg_saves_element_p10',
        'rolling_avg_minutes_element_p1',
        'rolling_avg_minutes_element_p2',
        'rolling_avg_minutes_element_p3',
        'rolling_avg_minutes_element_p4',
        'rolling_avg_minutes_element_p5',
        'rolling_avg_minutes_element_p10',
        'rolling_avg_total_points_element_type',
        'rolling_avg_minutes_element_type',
        'rolling_avg_goals_scored_element_type',
        'rolling_avg_assists_element_type',
        'rolling_avg_clean_sheets_element_type',
        'rolling_avg_goals_conceded_element_type',
        'rolling_avg_own_goals_element_type',
        'rolling_avg_penalties_saved_element_type',
        'rolling_avg_penalties_missed_element_type',
        'rolling_avg_yellow_cards_element_type',
        'rolling_avg_red_cards_element_type',
        'rolling_avg_saves_element_type',
        'rolling_avg_bonus_element_type',
        'rolling_avg_bps_element_type',
        'rolling_avg_squad',
        'rolling_avg_first_team',
        'rolling_avg_vice_or_captain',
        'rolling_avg_squad_p1',
        'rolling_avg_first_team_p1',
        'rolling_avg_vice_or_captain_p1']

    # columns that we wouldn't know ahead of time and need to look up
    unknown_opponent_cols = [
        'rolling_avg_total_points_against_opponent_team_element_type',
        'rolling_avg_minutes_against_opponent_team_element_type',
        'rolling_avg_goals_scored_against_opponent_team_element_type',
        'rolling_avg_assists_against_opponent_team_element_type',
        'rolling_avg_clean_sheets_against_opponent_team_element_type',
        'rolling_avg_goals_conceded_against_opponent_team_element_type',
        'rolling_avg_own_goals_against_opponent_team_element_type',
        'rolling_avg_penalties_saved_against_opponent_team_element_type',
        'rolling_avg_penalties_missed_against_opponent_team_element_type',
        'rolling_avg_yellow_cards_against_opponent_team_element_type',
        'rolling_avg_red_cards_against_opponent_team_element_type',
        'rolling_avg_saves_against_opponent_team_element_type',
        'rolling_avg_bonus_against_opponent_team_element_type',
        'rolling_avg_bps_against_opponent_team_element_type',
    ]

    # columns that we wouldn't know ahead of time and we need to calculate
    unknown_engineered_cols = [
        'expected_total_points_against_opponent_team_element_type',
        'expected_minutes_against_opponent_team_element_type',
        'expected_goals_scored_against_opponent_team_element_type',
        'expected_assists_against_opponent_team_element_type',
        'expected_clean_sheets_against_opponent_team_element_type',
        'expected_goals_conceded_against_opponent_team_element_type',
        'expected_own_goals_against_opponent_team_element_type',
        'expected_penalties_saved_against_opponent_team_element_type',
        'expected_penalties_missed_against_opponent_team_element_type',
        'expected_yellow_cards_against_opponent_team_element_type',
        'expected_red_cards_against_opponent_team_element_type',
        'expected_saves_against_opponent_team_element_type',
        'expected_bonus_against_opponent_team_element_type',
        'expected_bps_against_opponent_team_element_type',
    ]

    # fill in nans for future data we wouldn't know
    event_df.loc[event_df['event'] > e,
                 unknown_element_cols + unknown_opponent_cols + unknown_engineered_cols
                ] = np.nan
    event_df.sort_values(['element', 'event', 'fixture'], inplace=True)
    
    # fill down the element data
    event_df[unknown_element_cols] = event_df[unknown_element_cols].fillna(method='ffill')

    # create look up tables for opponent team data
    # we have to look two events back, as some teams won't have played last event
    against_opponent_event_df_1 = event_df[event_df['event'] == e][
        ['opponent_team','element_type','event'] + unknown_opponent_cols].drop_duplicates()


    against_opponent_event_df_2 = event_df[event_df['event'] == e - 1][
        ['opponent_team','element_type','event'] + unknown_opponent_cols].drop_duplicates()

    against_opponent_event_df = pd.concat([against_opponent_event_df_1, against_opponent_event_df_2])

    # get the most recent opponent team data
    against_opponent_event_df = against_opponent_event_df.join(
        against_opponent_event_df.groupby(['opponent_team', 'element_type'])['event'].max(),
        on=['opponent_team', 'element_type'],
        rsuffix='_most_recent')

    against_opponent_event_df = \
    against_opponent_event_df[against_opponent_event_df['event'] == against_opponent_event_df['event_most_recent']]

    event_df = event_df.join(
        against_opponent_event_df.set_index(['opponent_team', 'element_type']),
        on=['opponent_team', 'element_type'],
        rsuffix='_fill')

    # fill in opponent team data from lookup table
    for i in unknown_opponent_cols:
        event_df.loc[event_df['event'] > e, i] = event_df[event_df['event'] > e][i+'_fill']

    # calculate calculated fields
    for i in unknown_engineered_cols:
        event_df[i] = event_df.apply(
            calculate_expected_dimension_against_opponent_team_element_type,
            axis=1,
            args=(i,)
        )


    # filter out irrelevant players
    event_df = event_df[
        event_df[element_filter_field] > element_filter_value
    ][cols]
    
    return event_df


def split_data(event_df, last_train_event, last_test_event):
    # define train-test split
    test_fold = [-1 if i <= last_train_event else 0 for i in event_df['event'] if i <= last_test_event]
    ps = PredefinedSplit(test_fold)

    # split df into train and test
    for train_index, test_index in ps.split():
        event_df_train, event_df_test = \
        event_df.copy().iloc[train_index], event_df.copy().iloc[test_index]
    
    return event_df_train, event_df_test, ps


def standardise_data(event_df, event_df_train, event_df_test, scaled_feature_cols):
    scale_train = event_df_train.copy()
    scale_test = event_df_test.copy()
    scale_df = event_df.copy()
    scaled_event_df_train = event_df_train.copy()
    scaled_event_df_test = event_df_test.copy()
    scaled_event_df = event_df.copy()

    scaler = StandardScaler().fit(scale_train[scaled_feature_cols].values)

    scale_train = scaler.transform(scale_train[scaled_feature_cols].values)
    scale_test = scaler.transform(scale_test[scaled_feature_cols].values)
    scale_df = scaler.transform(scale_df[scaled_feature_cols].values)

    scaled_event_df_train[scaled_feature_cols] = scale_train
    scaled_event_df_test[scaled_feature_cols] = scale_test
    scaled_event_df[scaled_feature_cols] = scale_df
    
    return scaled_event_df_train, scaled_event_df_test, scaled_event_df


def get_pcs(event_X_train, event_X_test, n_categorical_features):
    pca = PCA()
    pca.fit(event_X_train[:,(n_categorical_features + 1):])
    
    event_X_train_pca = np.concatenate(
        (
            event_X_train[:,:(n_categorical_features + 1)],
            pca.transform(event_X_train[:,(n_categorical_features + 1):])
        ), axis=1)
    
    event_X_test_pca = np.concatenate(
        (
            event_X_test[:,:(n_categorical_features + 1)],
            pca.transform(event_X_test[:,(n_categorical_features + 1):])
        ), axis=1)
    
    return event_X_train_pca, event_X_test_pca


def split_matrices(event_X, event_y, ps):
    for train_index, test_index in ps.split():
        event_X_train, event_X_test = event_X[train_index], event_X[test_index]
        event_y_train, event_y_test = event_y[train_index], event_y[test_index]
    
    return event_X_train, event_X_test, event_y_train, event_y_test


def select_features(event_X, event_X_train, event_X_test, features_index):    
    event_X_train_sel = event_X_train[:,features_index]
    event_X_test_sel = event_X_test[:,features_index]
    event_X_sel = event_X[:,features_index]
    
    return event_X_train_sel, event_X_test_sel, event_X_sel


def retune_model(event_df,
                 last_train_event,
                 last_validation_event,
                 standardise,
                 features_index,
                 model,
                 parameter_space,
                 n_iter_tune=100):
    
    # split data into train and validation set
    event_df_train, event_df_validation, ps_tune = split_data(event_df, last_train_event, last_validation_event)

    # standardise appropriate variables if necessary
    scaled_event_df = event_df.copy()
    if standardise:
        scaled_event_df_train, scaled_event_df_validation, scaled_event_df = \
        standardise_data(event_df, event_df_train, event_df_validation, scaled_feature_cols)        

    # get reponse vector and feature matrix
    event_y, event_X = patsy.dmatrices(formula, scaled_event_df, return_type='matrix')

    # split response vector and feature matrix into train and test
    event_X_train, event_X_validation, event_y_train, event_y_validation = \
    split_matrices(event_X, event_y, ps_tune)

    # if only certain features selected, get their indices
    event_X_train_sel = event_X_train
    event_X_validation_sel = event_X_validation
    event_X_sel = event_X
    if features_index:
        event_X_train_sel, event_X_validation_sel, event_X_sel = \
        select_features(event_X, event_X_train, event_X_validation, features_index)
    
    # search hyperparameter space
    tuner = RandomizedSearchCV(
        model,
        parameter_space,
        n_iter=n_iter_tune,
        scoring='neg_mean_squared_error',
        refit=True,
        cv=ps_tune,
        error_score=100, n_jobs=1)
    
    tuner.fit(event_X_sel, event_y.ravel())
    
    return tuner.best_estimator_

In [13]:
def calculate_team_total_points(df,
                                first_team_elements,
                                captain_elements,
                                bench_elements,
                                event,
                                num_transfers=0,
                                carried_over_transfers=0
                               ):
    df = df.copy()
    df = df[df['event'] == event]
    df = df[df['element'].isin(list(first_team_elements) + list(bench_elements))]
    df['is_first_team'] = 0
    df.loc[df['element'].isin(list(first_team_elements)),'is_first_team'] = 1
    df['is_captain'] = 0
    df.loc[df['element'].isin(list(captain_elements)),'is_captain'] = 1


    df_group = df.groupby('element')[['predicted_total_points', 'total_points', 'minutes']].sum()
    df = df[['safe_web_name', 'element', 'value', 'element_type', 'is_first_team', 'is_captain']].drop_duplicates()
    df = df.join(df_group, on='element')
    df.sort_values('predicted_total_points', ascending=False, inplace=True)

    captain_selection = captain_elements[0]
    vice_selection = df.iloc[1]['element']

    is_captain_missing = len(df[(df['element'] == captain_selection) & (df['minutes'] == 0)])

    if is_captain_missing:
        df['is_captain'] = df['element'].apply(lambda x: 1 if x == vice_selection else 0)
    else:
        df['is_captain'] = df['element'].apply(lambda x: 1 if x == captain_selection else 0)

    missing_players = list(df[(df['minutes'] == 0) & (df['is_first_team'] == 1)]['element'])
    present_bench_players = list(df[(df['minutes'] > 0) & (df['is_first_team'] == 0)]['element'])
    num_missing_players = len(missing_players)
    num_present_bench_players = len(present_bench_players)

    if num_missing_players > 0:

        num_keepers = 1
        min_defenders = 3
        min_midfielders = 2
        min_strikers = 1

        df[df['minutes'] == 0]

        for i in range(0, min(3, num_missing_players, num_present_bench_players)):
            substitute = df[df['is_first_team'] == 0].iloc[i]['element']

            for missing_player in missing_players:
                sub_loop_df = df.copy()

                sub_loop_df.loc[sub_loop_df['element'] == substitute,'is_first_team'] = 1
                sub_loop_df.loc[sub_loop_df['element'] == missing_player,'is_first_team'] = 0

                num_team_keepers = len(
                    sub_loop_df[(sub_loop_df['is_first_team'] == 1) & (sub_loop_df['element_type'] == 1)])
                num_team_defenders = len(
                    sub_loop_df[(sub_loop_df['is_first_team'] == 1) & (sub_loop_df['element_type'] == 2)])
                num_team_midfielders = len(
                    sub_loop_df[(sub_loop_df['is_first_team'] == 1) & (sub_loop_df['element_type'] == 3)])
                num_team_strikers = len(
                    sub_loop_df[(sub_loop_df['is_first_team'] == 1) & (sub_loop_df['element_type'] == 4)])

                if (
                    (num_team_keepers == num_keepers)
                    & (num_team_defenders >= min_defenders)
                    & (num_team_midfielders >= min_midfielders)
                    & (num_team_strikers >= min_strikers)
                ):
                    df = sub_loop_df.copy()
                    missing_players = list(df[(df['minutes'] == 0) & (df['is_first_team'] == 1)]['element'])
                    num_missing_players = len(missing_players)
                    break


    transfer_cost = max(num_transfers - carried_over_transfers - 1, 0) * 4

    team_total_points = \
    sum(df[df['is_first_team'] == 1]['total_points'] * (df[df['is_first_team'] == 1]['is_captain'] + 1))

    team_predicted_total_points = \
    sum(df[df['is_first_team'] == 1]['predicted_total_points'] * (df[df['is_first_team'] == 1]['is_captain'] + 1))

    return team_total_points - transfer_cost, team_predicted_total_points, df

### Predicting points

In [14]:
def predict_test_set(df,
                     model,
                     parameter_space=False,
                     prediction_events=1,
                     prediction_weight=1,
                     features_index=False,
                     standardise=False,
                     pcs=False,
                     start=21,
                     end=38,
                     n_iter_tune=100,
                     verbose=0):
    y_pred_arr = []
    y_test_arr = []
    event_df_test_arr = []

    # for each event we want to predict
    for e in range(start, end + 1):
        if verbose > 0:
            print('predicting event', e)
        
        # munge data
        event_df = munge_data(df, e, prediction_events)

        # split df into train and test
        event_df_train, event_df_test, ps = split_data(event_df, e - 1, 38)
        
        # standardise appropriate variables if necessary
        scaled_event_df = event_df.copy()
        if standardise:
            scaled_event_df_train, scaled_event_df_test, scaled_event_df = \
            standardise_data(event_df, event_df_train, event_df_test, scaled_feature_cols)

        # get reponse vector and feature matrix
        event_y, event_X = patsy.dmatrices(formula, scaled_event_df, return_type='matrix')
        
        # split response vector and feature matrix into train and test
        event_X_train, event_X_test, event_y_train, event_y_test = \
        split_matrices(event_X, event_y, ps)
        
        # get pcs if necessary
        if pcs:
            n_categorical_features = event_X.design_info.column_names.index('value') - 1
            event_X_train, event_X_test = get_pcs(event_X_train, event_X_test, n_categorical_features)
        
        # if only certain features selected, get their indices
        event_X_train_sel = event_X_train
        event_X_test_sel = event_X_test
        event_X_sel = event_X
        if features_index:
            event_X_train_sel, event_X_test_sel, event_X_sel = \
            select_features(event_X, event_X_train, event_X_test, features_index)
            
        # retune hyperparameters
        if parameter_space:
            model = retune_model(event_df,
                                 e - 6,
                                 e - 1,
                                 standardise,
                                 features_index,
                                 model,
                                 parameter_space,
                                 n_iter_tune)

        # fit model on training data
        model.fit(event_X_train_sel, event_y_train.ravel())
        # predict test event
        event_y_pred = model.predict(event_X_test_sel).flatten()
        
        # collect predictions and observations 
        y_pred_arr.append(event_y_pred)
        y_test_arr.append(event_y_test)
        
        event_df_test['predicted_total_points'] = event_y_pred
        event_df_test['prediction_event'] = e
        
        event_df_test_arr.append(event_df_test)
        
    return np.concatenate(y_pred_arr).ravel(), np.concatenate(y_test_arr).ravel(), pd.concat(event_df_test_arr)

### Constructing teams

In [15]:
def get_event_players(
    df,
    e,
    prediction_events,
    optimise_key,
    prediction_weight):
    
    event_players = df.copy()
    
    event_players = \
    event_players[
        (event_players['prediction_event'] == e)
        & (event_players['event'] <= e + prediction_events - 1)
    ]        
    present_elements = event_players['element'].drop_duplicates().values
    df_all_players = df_all.copy()
    df_all_players = df_all_players[df_all_players['event'] == e]
    df_all_players['prediction_event'] = e
    event_players = pd.concat([event_players, df_all_players[~df_all_players['element'].isin(present_elements)]])
    
    
    event_players['event_diff'] = event_players['event'] - event_players['prediction_event']
    event_players['prediction_weight'] = prediction_weight**(event_players['event_diff'])
    event_players['optimise_key_weighted'] = event_players['prediction_weight'] * event_players[optimise_key]
    
    event_players_df = event_players.copy()
    
    event_players_group = event_players.groupby('element')['optimise_key_weighted'].sum()
    event_players = event_players[['element', 'value', 'element_type', 'team']].drop_duplicates()
    event_players = event_players.join(event_players_group, on='element')
    event_players = event_players.to_dict('records')
    
    return event_players, event_players_df

In [102]:
def construct_event_teams(df,
                          prediction_events=1,
                          prediction_weight=1,
                          from_scratch_prediction_events=1,
                          optimise_key='predicted_total_points',
                          start=21,
                          end=38,
                          total_budget=1000,
                          captain_factor=2,
                          bench_factor=0.1,
                          transfer_penalty=4,
                          transfer_limit=15,
                          start_event_squad=None,
                          verbose=0
                         ):
    first_team_arr = []
    captain_arr = []
    bench_arr = []
    transfers_arr = []
    team_total_points_arr = []
    predicted_team_total_points_arr = []
    team_df_arr = []
    carried_over_transfers = 0
    
    for e in range(start, end + 1):
        if verbose > 0:
            print('selecting team for event', e)
        
        event_players, event_players_df = get_event_players(
            df,
            e,
            prediction_events,
            optimise_key,
            prediction_weight
        )
        
        from_scratch_event_players, _ = get_event_players(
            df,
            e,
            from_scratch_prediction_events,
            optimise_key,
            prediction_weight
        )
        
        if e == start and not start_event_squad:
            try:
                event_first_team, event_captain, event_bench, event_transfers = \
                team_selector.select_team(
                    from_scratch_event_players,
                    optimise_key='optimise_key_weighted',
                    total_budget=total_budget,
                    captain_factor=captain_factor,
                    bench_factor=bench_factor,
                    existing_squad_elements=None,
                    transfer_penalty=transfer_penalty,
                    transfer_limit=transfer_limit
                )

                first_team_arr.append(event_first_team)
                captain_arr.append(event_captain)
                bench_arr.append(event_bench)
                transfers_arr.append(event_transfers)

                event_num_transfers = 1
            except Exception as ex:
                print(e, ex)
                first_team_arr.append([])
                captain_arr.append([])
                bench_arr.append([])
                transfers_arr.append([])

                event_num_transfers = 1
        else:
            try:
                if e == start and start_event_squad:
                    existing_squad_elements = start_event_squad
                else:
                    existing_squad_elements = event_first_team + event_bench
                
                event_first_team, event_captain, event_bench, event_transfers = \
                team_selector.select_team(
                    event_players,
                    optimise_key='optimise_key_weighted',
                    total_budget=total_budget,
                    captain_factor=captain_factor,
                    bench_factor=bench_factor,
                    existing_squad_elements=existing_squad_elements,
                    transfer_penalty=transfer_penalty,
                    transfer_limit=transfer_limit
                )
                
                _, event_captain, _, _ = \
                team_selector.select_team(
                    event_players,
                    optimise_key='optimise_key_weighted',
                    total_budget=total_budget,
                    captain_factor=captain_factor,
                    bench_factor=bench_factor,
                    existing_squad_elements=event_first_team+event_bench,
                    transfer_penalty=0,
                    transfer_limit=0
                )
                

                first_team_arr.append(event_first_team)
                captain_arr.append(event_captain)
                bench_arr.append(event_bench)
                transfers_arr.append(event_transfers)
                
                event_num_transfers = len(event_transfers['transfers_in'])
            except Exception as ex:
                print(e, ex)
                first_team_arr.append(event_first_team)
                captain_arr.append(event_captain)
                bench_arr.append(event_bench)
                transfers_arr.append({
                    'transfers_in': set(),
                    'transfers_out': set()})
                
                event_num_transfers = 0
        
        event_num_transfers = max(event_num_transfers - carried_over_transfers, 0)
        
        event_team_total_points, event_team_predicted_total_points, event_team_df = \
        calculate_team_total_points(event_players_df,
                                    event_first_team,
                                    event_captain,
                                    event_bench,
                                    e,
                                    event_num_transfers,
                                    carried_over_transfers)
        
        if event_num_transfers == 0 and carried_over_transfers == 0:
            carried_over_transfers = 1
        if event_num_transfers in (0, 1) and carried_over_transfers == 1:
            carried_over_transfers = 1
        if event_num_transfers == 1 and carried_over_transfers == 0:
            carried_over_transfers = 0
        if event_num_transfers > 1:
            carried_over_transfers = 0


        team_total_points_arr.append(event_team_total_points)
        predicted_team_total_points_arr.append(event_team_predicted_total_points)
        team_df_arr.append(event_team_df)
    
    return (
        first_team_arr, bench_arr,
        team_total_points_arr,
        predicted_team_total_points_arr,
        team_df_arr,
        transfers_arr
    )

## Constructing teams

In [103]:
start_event_squad = [
    134,
    214,
    48,
    130,
    181,
    191,
    409,
    183,
    141,
    103,
    234,
    271,
    119,
    212,
    202
]

### Least squares

In [32]:
ls_model = LinearRegression()

In [33]:
ls_y_pred, ls_y_test, ls_df_test = predict_test_set(
    df,
    ls_model,
    standardise=True,
    pcs=False,
    prediction_events=5,
    start=2,
    end=12,
    verbose=True
)

predicting event 2
predicting event 3
predicting event 4
predicting event 5
predicting event 6
predicting event 7
predicting event 8
predicting event 9
predicting event 10
predicting event 11
predicting event 12


In [34]:
ls_test_loss = (
    mean_absolute_error(ls_y_test, ls_y_pred),
    mean_squared_error(ls_y_test, ls_y_pred),
    r2_score(ls_y_test, ls_y_pred)
)
ls_test_loss

(193553291172894.66, 8.660559647384349e+29, -1.0631978572868254e+29)

In [48]:
(
    ls_first_team_arr,
    ls_bench_arr,
    ls_total_points_arr,
    ls_predicted_total_points_arr,
    ls_team_df_arr,
    ls_transfers_arr
) = \
construct_event_teams(
    ls_df_test,
    total_budget=1000,
    prediction_events=2,
    from_scratch_prediction_events=5,
    transfer_penalty=4,
    transfer_limit=1,
    start=2,
    end=12,
    start_event_squad=start_event_squad,
    verbose=1)
    

ls_total_points = sum(ls_total_points_arr)
ls_total_points

selecting team for event 2
selecting team for event 3
selecting team for event 4
selecting team for event 5
selecting team for event 6
selecting team for event 7
selecting team for event 8
selecting team for event 9
selecting team for event 10
selecting team for event 11
selecting team for event 12


425

In [58]:
(
    ls_first_team_arr,
    ls_bench_arr,
    ls_total_points_arr,
    ls_predicted_total_points_arr,
    ls_team_df_arr,
    ls_transfers_arr
) = \
construct_event_teams(
    ls_df_test,
    total_budget=1000,
    prediction_events=2,
    from_scratch_prediction_events=5,
    transfer_penalty=4,
    transfer_limit=1,
    start=6,
    end=12,
    verbose=1)
    

ls_total_points = sum(ls_total_points_arr)
ls_total_points

selecting team for event 6
selecting team for event 7
selecting team for event 8
selecting team for event 9
selecting team for event 10
selecting team for event 11
selecting team for event 12


240

In [60]:
240 + 258

498

### Ridge

In [104]:
ridge_model = Ridge(alpha=300)

In [108]:
ridge_y_pred, ridge_y_test, ridge_df_test = predict_test_set(
    df,
    ridge_model,
    standardise=True,
    pcs=False,
    prediction_events=5,
    start=2,
    end=16,
    verbose=True
)

predicting event 2
predicting event 3
predicting event 4
predicting event 5
predicting event 6
predicting event 7
predicting event 8
predicting event 9
predicting event 10
predicting event 11
predicting event 12
predicting event 13


UnboundLocalError: local variable 'event_df_train' referenced before assignment

In [None]:
ridge_test_loss = (
    mean_absolute_error(ridge_y_test, ridge_y_pred),
    mean_squared_error(ridge_y_test, ridge_y_pred),
    r2_score(ridge_y_test, ridge_y_pred)
)
ridge_test_loss

In [107]:
(
    ridge_first_team_arr,
    ridge_bench_arr,
    ridge_total_points_arr,
    ridge_predicted_total_points_arr,
    ridge_team_df_arr,
    ridge_transfers_arr
) = \
construct_event_teams(
    ridge_df_test,
    total_budget=1000,
    prediction_events=2,
    from_scratch_prediction_events=5,
    transfer_penalty=0,
    transfer_limit=1,
    bench_factor=0.1,
    start=2,
    end=16,
    verbose=1)
    

ridge_total_points = sum(ridge_total_points_arr)
ridge_total_points + 78

selecting team for event 6
selecting team for event 7
selecting team for event 8
selecting team for event 9
selecting team for event 10
selecting team for event 11
selecting team for event 12


377

### Lasso

In [40]:
lasso_model = Lasso(alpha=0.01)

In [41]:
lasso_y_pred, lasso_y_test, lasso_df_test = predict_test_set(
    df,
    lasso_model,
    standardise=True,
    pcs=False,
    prediction_events=5,
    start=2,
    end=12,
    verbose=True
)

predicting event 2
predicting event 3


  positive)


predicting event 4
predicting event 5
predicting event 6
predicting event 7
predicting event 8
predicting event 9
predicting event 10
predicting event 11
predicting event 12


In [42]:
lasso_test_loss = (
    mean_absolute_error(lasso_y_test, lasso_y_pred),
    mean_squared_error(lasso_y_test, lasso_y_pred),
    r2_score(lasso_y_test, lasso_y_pred)
)
lasso_test_loss

(2.1171152949050556, 58.18899964206488, -6.143466733791509)

In [63]:
(
    lasso_first_team_arr,
    lasso_bench_arr,
    lasso_total_points_arr,
    lasso_predicted_total_points_arr,
    lasso_team_df_arr,
    lasso_transfers_arr
) = \
construct_event_teams(
    lasso_df_test,
    total_budget=1000,
    prediction_events=2,
    prediction_weight=1,
    from_scratch_prediction_events=5,
    transfer_penalty=4,
    transfer_limit=1,
    bench_factor=0.1,
    start=6,
    end=12,
    verbose=1)
    

lasso_total_points = sum(lasso_total_points_arr)
lasso_total_points

selecting team for event 6
selecting team for event 7
selecting team for event 8
selecting team for event 9
selecting team for event 10
selecting team for event 11
selecting team for event 12


332

In [64]:
332+258

590

### Gradient boosted trees

In [None]:
boost_teams_total_points_arr = []

for i in range(0, 100):
    print('iteration ', i)
    boost_model = GradientBoostingRegressor(
        criterion='mse',
        min_samples_leaf=0.0175,
        max_features=1.0,
        learning_rate=0.02,
        subsample=0.3,
        n_estimators=200
    )
    
    boost_y_pred, boost_y_test, boost_df_test = predict_test_set(
    df,
    boost_model,
    standardise=False,
    prediction_events=5,
    verbose=0)
    
    (
        boost_first_team_arr,
        boost_bench_arr,
        boost_total_points_arr,
        boost_predicted_total_points_arr,
        boost_team_df_arr,
        boost_transfers_arr
    ) = \
    construct_event_teams_from_existing(
        boost_df_test,
        total_budget=1057.5,
        prediction_events=2,
        from_scratch_prediction_events=5,
        transfer_penalty=4,
        transfer_limit=1,
        verbose=0)
    
    boost_teams_total_points = sum(boost_total_points_arr)
    boost_teams_total_points_arr.append(boost_teams_total_points)

In [None]:
boost_teams_total_points_mean = np.mean(boost_teams_total_points_arr) 
boost_teams_total_points_std = np.std(boost_teams_total_points_arr)
    
boost_teams_total_points_mean, boost_teams_total_points_std

### Random team benchmark

In [None]:
random_benchmark_df_test = lr_df_test.copy()

In [None]:
random_benchmark_teams_total_points_arr = []

for i in range(0, 100):
    random_benchmark_df_test['predicted_total_points'] = \
    np.random.random(len(lr_df_test['predicted_total_points'])) * 7
    
    (
        random_benchmark_first_team_arr,
        random_benchmark_bench_arr,
        random_benchmark_total_points_arr,
        random_benchmark_predicted_total_points_arr,
        random_benchmark_team_df_arr,
        random_benchmark_transfers_arr
    ) = \
    construct_event_teams_from_existing(
        random_benchmark_df_test,
        total_budget=1057.5,
        prediction_events=2,
        from_scratch_prediction_events=5,
        transfer_penalty=4,
        transfer_limit=1,
        verbose=1)
    
    random_benchmark_teams_total_points = sum(random_benchmark_total_points_arr)
    random_benchmark_teams_total_points_arr.append(random_benchmark_teams_total_points)

In [None]:
random_benchmark_teams_total_points_mean = np.mean(random_benchmark_teams_total_points_arr) 
random_benchmark_teams_total_points_std = np.std(random_benchmark_teams_total_points_arr)
    
random_benchmark_teams_total_points_mean, random_benchmark_teams_total_points_std