# Set up

In [1]:
# add the optimiser package directory to the import path
import sys
sys.path.append('../../optimiser/')

In [2]:
import pandas as pd
import patsy
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import PredefinedSplit
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import optimiser

# Getting data

In [3]:
# Relative path to the element/gameweek features CSV (v06) read in the next cell.
file_name = '../csvs/element_gameweek_features_v06.csv'

In [4]:
# Load the feature table (one row per element/event/fixture, judging by its columns).
element_gameweek_df = pd.read_csv(file_name)

In [5]:
# Impute missing values with column means computed from events <= 20 only, so the
# imputation never uses statistics from the events predicted later (21 onwards).
# numeric_only=True restricts the mean to numeric columns: on pandas >= 2.0
# DataFrame.mean() raises on non-numeric columns (e.g. the player-name column)
# instead of silently skipping them.
element_gameweek_df = element_gameweek_df.fillna(
    element_gameweek_df[element_gameweek_df['event'] <= 20].mean(numeric_only=True)
)

In [6]:
# Independent copy of the full table; construct_event_teams_from_existing reads it
# as a notebook-level global to add players that have no model prediction.
all_element_gameweek_df = element_gameweek_df.copy()

In [7]:
# Default prediction of 0 points for every row of the full table.
all_element_gameweek_df['predicted_total_points'] = 0

In [8]:
# Display every column name of the feature table.
list(element_gameweek_df.columns)

['safe_web_name',
 'element',
 'event',
 'fixture',
 'team',
 'opposition_team',
 'was_home',
 'is_sunday',
 'is_weekday',
 'is_early',
 'is_late',
 'element_type',
 'value',
 'rolling_value_points',
 'rolling_avg_total_points_element',
 'rolling_max_total_points_element',
 'has_high_points_ever_element',
 'rolling_avg_bps_element',
 'rolling_avg_bonus_element',
 'rolling_avg_goals_scored_element',
 'rolling_avg_minutes_element',
 'rolling_avg_clean_sheets_element',
 'rolling_avg_completed_passes_element',
 'rolling_avg_recoveries_element',
 'rolling_avg_assists_element',
 'rolling_avg_key_passes_element',
 'rolling_avg_winning_goals_element',
 'rolling_avg_tackled_element',
 'rolling_avg_dribbles_element',
 'rolling_avg_clearances_blocks_interceptions_element',
 'rolling_avg_big_chances_created_element',
 'rolling_avg_target_missed_element',
 'rolling_avg_fouls_element',
 'rolling_avg_tackles_element',
 'rolling_avg_open_play_crosses_element',
 'rolling_avg_offside_element',
 'rolling

# Evaluating models

## Helpers

### Modelling

In [9]:
# Patsy formula for the design matrix: total_points as the response, the four
# identifier-like columns as categorical terms, and every remaining column as a
# plain term. The excluded names are either identifiers, already covered by a
# C(...) term, or the response itself.
formula = (
    'total_points ~ C(element) + C(element_type) + C(team) + C(opposition_team) + '
    + ' + '.join(
        column
        for column in element_gameweek_df.columns
        if column not in {
            'safe_web_name',
            'element',
            'event',
            'fixture',
            'team',
            'opposition_team',
            'element_type',
            'minutes',
            'rolling_value_points',
            'total_points',
        }
    )
)

In [10]:
# Columns that predict_test_set standardises: everything except identifiers,
# 0/1-style flag columns, the response and the columns left out of the formula.
scaled_feature_cols = [
    column
    for column in element_gameweek_df.columns
    if column not in {
        'safe_web_name',
        'element',
        'event',
        'fixture',
        'team',
        'opposition_team',
        'element_type',
        'was_home',
        'is_sunday',
        'is_weekday',
        'is_late',
        'is_early',
        'has_high_points_ever_element',
        'total_points',
        'minutes',
        'rolling_value_points',
    }
]

### Predicting points

In [11]:
def calculate_expected_total_points_against_opposition_team(row):
    """Return the against-opposition average scaled by the element's relative average.

    Computes ``against_opposition * element_average / element_type_average`` from
    the corresponding rolling-average entries of ``row`` (any mapping-like row,
    e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    """
    against_opposition = row['rolling_avg_total_points_against_opposition_team_element_type']
    element_average = row['rolling_avg_total_points_element']
    element_type_average = row['rolling_avg_total_points_element_type']
    return against_opposition * element_average / element_type_average


def calculate_expected_total_points_against_opposition_team_home_away(row):
    """Venue-specific variant of the against-opposition expectation.

    Picks the ``..._home`` against-opposition average when ``row['was_home'] == 1``
    and the ``..._away`` one otherwise, then scales it by the element's rolling
    average divided by the element type's rolling average.
    """
    venue_key = (
        'rolling_avg_total_points_against_opposition_team_element_type_home'
        if row['was_home'] == 1
        else 'rolling_avg_total_points_against_opposition_team_element_type_away'
    )
    element_average = row['rolling_avg_total_points_element']
    element_type_average = row['rolling_avg_total_points_element_type']
    return row[venue_key] * element_average / element_type_average


def calculate_expected_total_points_element_home_away(row):
    """Return the element's home rolling average for home rows, else the away one.

    A row counts as "home" when ``row['was_home'] == 1``.
    """
    venue_key = (
        'rolling_avg_total_points_element_home'
        if row['was_home'] == 1
        else 'rolling_avg_total_points_element_away'
    )
    return row[venue_key]

In [16]:
def predict_test_set(df,
                     model,
                     prediction_events=1,
                     features=False,
                     standardise=True,
                     start=21,
                     end=38,
                     verbose=0):
    """Walk forward through events, refitting ``model`` and predicting each one.

    For every event ``e`` in ``start..end`` the model is fitted on rows with
    ``event <= e - 1`` and predicts the remaining rows up to event
    ``e + prediction_events - 1``. Feature values of rows after ``e`` are rebuilt
    from what is available at ``e``: element-level columns are carried forward
    within each element, opposition-level columns are looked up from the most
    recent row (event ``e`` or ``e - 1``) of the same opposition team and element
    type, and the engineered columns are recomputed from those. Only rows with
    ``rolling_avg_minutes_element_p3 >= 60`` are modelled.

    Relies on the notebook-level ``formula``, ``scaled_feature_cols`` and the
    three ``calculate_expected_total_points_*`` helpers.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing at least the columns used by ``formula``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted in place
        for every event.
    prediction_events : int
        Number of events predicted from each event ``e`` (``e`` itself included).
    features : list of str or False
        Design-matrix column names to keep; a falsy value keeps every column.
    standardise : bool
        Standardise ``scaled_feature_cols`` with statistics of the training rows.
    start, end : int
        First and last event (inclusive) to predict from.
    verbose : int
        Print progress when greater than 0.

    Returns
    -------
    tuple
        ``(y_pred, y_test, test_df)``: flat arrays of predictions and observed
        responses, and the concatenated test rows with ``predicted_total_points``
        and ``prediction_event`` columns added.

    Raises
    ------
    ValueError
        If an event ends up with no training rows or no rows to predict.
    """
    # element-level columns: future values are carried forward per element
    unknown_element_cols = [
        'value', 'rolling_avg_total_points_element',
        'rolling_max_total_points_element', 'has_high_points_ever_element',
        'rolling_avg_bps_element', 'rolling_avg_bonus_element',
        'rolling_avg_goals_scored_element', 'rolling_avg_minutes_element',
        'rolling_avg_clean_sheets_element', 'rolling_avg_completed_passes_element',
        'rolling_avg_recoveries_element', 'rolling_avg_assists_element',
        'rolling_avg_key_passes_element', 'rolling_avg_winning_goals_element',
        'rolling_avg_tackled_element', 'rolling_avg_dribbles_element',
        'rolling_avg_clearances_blocks_interceptions_element',
        'rolling_avg_big_chances_created_element',
        'rolling_avg_target_missed_element', 'rolling_avg_fouls_element',
        'rolling_avg_tackles_element', 'rolling_avg_open_play_crosses_element',
        'rolling_avg_offside_element', 'rolling_avg_big_chances_missed_element',
        'rolling_avg_saves_element', 'rolling_avg_goals_conceded_element',
        'rolling_avg_minutes_element_p3', 'rolling_avg_total_points_element_p3',
        'total_points_element_premium_p3', 'rolling_avg_bps_element_p3',
        'rolling_avg_total_points_own_team',
        'rolling_avg_total_points_element_type',
        'rolling_avg_total_points_element_home',
        'rolling_avg_total_points_element_away'
    ]

    # opposition-level columns: future values are looked up per opposition team
    unknown_opposition_cols = [
        'rolling_avg_total_points_opposition_team',
        'rolling_avg_total_points_against_opposition_team',
        'rolling_avg_total_points_against_opposition_team_element_type',
        'rolling_avg_total_points_against_opposition_team_element_type_home',
        'rolling_avg_total_points_against_opposition_team_element_type_away',
    ]

    # engineered columns: recomputed once the inputs above have been rebuilt
    unknown_engineered_cols = [
        'expected_total_points_against_opposition_team',
        'expected_total_points_against_opposition_team_home_away',
        'expected_total_points_element_home_away'
    ]

    unknown_cols = unknown_element_cols + unknown_opposition_cols + unknown_engineered_cols
    lookup_keys = ['opposition_team', 'element_type']
    selected_features = set(features) if features else None

    y_pred_arr = []
    y_test_arr = []
    event_df_test_arr = []

    # for each event we want to predict
    for e in range(start, end + 1):
        if verbose > 0:
            print('predicting event', e)

        # keep history plus the events being predicted; copy so df is never mutated
        event_df = df[df['event'] <= e + prediction_events - 1].copy()
        cols = event_df.columns
        event_df = event_df.sort_values(['element', 'event', 'fixture'])
        is_future = (event_df['event'] > e).to_numpy()

        # blank out everything we would not know yet at event e
        event_df.loc[is_future, unknown_cols] = np.nan

        # Carry each element's last known values forward. The fill is grouped by
        # element so that an element with no history keeps NaN instead of
        # inheriting the values of whichever element sorts before it.
        event_df[unknown_element_cols] = (
            event_df.groupby('element')[unknown_element_cols].ffill()
        )

        # Opposition lookup: most recent row per (opposition team, element type).
        # We look two events back, as some teams won't have played the last event.
        # Exactly one row is kept per key so the join below cannot duplicate rows.
        opposition_lookup = (
            event_df.loc[event_df['event'].isin([e - 1, e]),
                         lookup_keys + ['event'] + unknown_opposition_cols]
            .sort_values('event', kind='mergesort')
            .drop_duplicates(subset=lookup_keys, keep='last')
            .set_index(lookup_keys)[unknown_opposition_cols]
        )
        if is_future.any():
            opposition_fill = event_df[lookup_keys].join(opposition_lookup, on=lookup_keys)
            event_df.loc[is_future, unknown_opposition_cols] = \
                opposition_fill.loc[is_future, unknown_opposition_cols].to_numpy()

        # calculate engineered fields
        event_df['expected_total_points_against_opposition_team'] = event_df.apply(
            calculate_expected_total_points_against_opposition_team, axis=1)

        event_df['expected_total_points_against_opposition_team_home_away'] = event_df.apply(
            calculate_expected_total_points_against_opposition_team_home_away, axis=1)

        event_df['expected_total_points_element_home_away'] = event_df.apply(
            calculate_expected_total_points_element_home_away, axis=1)

        # filter for frequently appearing players and restore the original columns
        event_df = event_df[event_df['rolling_avg_minutes_element_p3'] >= 60][cols]

        # patsy drops rows with missing values by default, which would shift the
        # design-matrix rows relative to this frame. Drop rows that could not be
        # rebuilt here, and make patsy raise on any NaN that is still left.
        event_df = event_df.dropna(subset=[c for c in unknown_cols if c in cols])

        # define train-test split: rows up to event e - 1 train, the rest test
        is_test = ~(event_df['event'] <= e - 1).to_numpy()
        if is_test.all() or not is_test.any():
            raise ValueError(
                'event {}: need both training rows (event <= {}) and rows to predict'
                .format(e, e - 1)
            )
        event_df_train = event_df[~is_test]
        event_df_test = event_df[is_test]

        # standardise appropriate variables with training-set statistics only
        scaled_event_df = event_df
        if standardise:
            scaler = StandardScaler().fit(event_df_train[scaled_feature_cols].values)
            scaled_event_df = event_df.copy()
            scaled_event_df[scaled_feature_cols] = scaler.transform(
                event_df[scaled_feature_cols].values)

        # get response vector and feature matrix
        event_y, event_X = patsy.dmatrices(
            formula, scaled_event_df, return_type='matrix', NA_action='raise')

        # split response vector and feature matrix into train and test
        event_X_train, event_X_test = event_X[~is_test], event_X[is_test]
        event_y_train, event_y_test = event_y[~is_test], event_y[is_test]

        # boolean mask over design-matrix columns: selected features, or all
        column_names = event_X.design_info.column_names
        if selected_features is not None:
            features_index = np.array(
                [name in selected_features for name in column_names], dtype=bool)
        else:
            features_index = np.ones(len(column_names), dtype=bool)

        # throw away features that are not selected
        event_X_train_sel = event_X_train[:, features_index]
        event_X_test_sel = event_X_test[:, features_index]

        # fit model on training data
        model.fit(event_X_train_sel, event_y_train.ravel())
        # predict test event
        event_y_pred = model.predict(event_X_test_sel).flatten()

        # collect predictions and observations
        y_pred_arr.append(event_y_pred)
        y_test_arr.append(event_y_test)

        # assign() returns a new frame, so the slice of event_df is never written to
        event_df_test_arr.append(
            event_df_test.assign(predicted_total_points=event_y_pred, prediction_event=e)
        )

    return np.concatenate(y_pred_arr).ravel(), np.concatenate(y_test_arr).ravel(), pd.concat(event_df_test_arr)

### Constructing teams

In [None]:
def construct_event_teams_from_scratch(df,
                                       prediction_events=1,
                                       optimise_key='predicted_total_points',
                                       start=21,
                                       end=38,
                                       total_budget=1000,
                                       verbose=0):
    """Build a fresh optimal team for every event and score it.

    For each event ``e`` in ``start..end`` the predictions made at ``e``
    (``prediction_event == e``) for events up to ``e + prediction_events - 1`` are
    summed per element under ``optimise_key`` and handed to
    ``optimiser.construct_optimal_team_from_scratch``. The resulting team is then
    scored with ``optimiser.calculate_team_total_points`` on the rows of event
    ``e``. Events for which the optimiser raises are reported with ``print`` and
    skipped, so the returned lists can be shorter than the event range.

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction frame with ``prediction_event``, ``event``, ``element``,
        ``value``, ``element_type``, ``team`` and ``optimise_key`` columns.
    prediction_events : int
        Number of events (from ``e`` onwards) whose predictions are summed.
    optimise_key : str
        Column the optimiser maximises.
    start, end : int
        First and last event (inclusive).
    total_budget : int
        Budget passed to the optimiser.
    verbose : int
        Print progress when greater than 0.

    Returns
    -------
    tuple of lists
        ``(first_teams, benches, team_total_points, predicted_team_total_points,
        team_dfs)`` with one entry per successfully constructed event.
    """
    first_team_arr = []
    bench_arr = []
    team_total_points_arr = []
    predicted_team_total_points_arr = []
    team_df_arr = []

    for e in range(start, end + 1):
        if verbose > 0:
            print('predicting event', e)

        # predictions made at event e for the events inside the horizon
        event_predictions = df[
            (df['prediction_event'] == e)
            & (df['event'] <= e + prediction_events - 1)
        ]

        # one record per player with the optimisation target summed over the horizon
        event_players_group = event_predictions.groupby('element')[optimise_key].sum()
        event_players = (
            event_predictions[['element', 'value', 'element_type', 'team']]
            .drop_duplicates()
            .join(event_players_group, on='element')
            .to_dict('records')
        )

        try:
            event_first_team, event_bench = \
            optimiser.construct_optimal_team_from_scratch(
                event_players,
                optimise_key=optimise_key,
                total_budget=total_budget)
        except Exception as ex:
            # best effort: report the failing event and move on to the next one
            print(e, ex)
            continue

        first_team_arr.append(event_first_team)
        bench_arr.append(event_bench)

        # Score on event e using only the predictions made at event e. Filtering on
        # `event` alone would also pick up the rows for event e that were predicted
        # from earlier prediction events whenever prediction_events > 1.
        event_team_total_points, event_team_predicted_total_points, event_team_df = \
        optimiser.calculate_team_total_points(
            event_predictions[event_predictions['event'] == e],
            event_first_team,
            event_bench,
            e)

        team_total_points_arr.append(event_team_total_points)
        predicted_team_total_points_arr.append(event_team_predicted_total_points)
        team_df_arr.append(event_team_df)

    return first_team_arr, bench_arr, team_total_points_arr, predicted_team_total_points_arr, team_df_arr

In [None]:
def construct_event_teams_from_existing(df,
                                        prediction_events=1,
                                        optimise_key='predicted_total_points',
                                        total_budget=1000,
                                        start=21,
                                        end=38,
                                        transfer_penalty=4,
                                        transfer_limit=11,
                                        verbose=0):
    """Build a team at ``start`` and evolve it through transfers up to ``end``.

    At ``start`` the team comes from
    ``optimiser.construct_optimal_team_from_scratch``; at every later event it is
    updated with ``optimiser.construct_optimal_team_from_existing`` and scored with
    ``optimiser.calculate_team_total_points``. Players that have no prediction for
    an event are added from the notebook-level ``all_element_gameweek_df`` (whose
    ``predicted_total_points`` is set at notebook level), so they stay selectable.

    If a later event fails in the optimiser, the error is printed, empty
    placeholders are recorded and the previous team is scored with zero transfers.

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction frame with ``prediction_event``, ``event``, ``element``,
        ``value``, ``element_type``, ``team`` and ``optimise_key`` columns.
    prediction_events : int
        Number of events (from ``e`` onwards) whose predictions are summed.
    optimise_key : str
        Column the optimiser maximises.
    total_budget : int
        Budget passed to the optimiser.
    start, end : int
        First and last event (inclusive).
    transfer_penalty, transfer_limit : int
        Passed through to ``optimiser.construct_optimal_team_from_existing``.
    verbose : int
        Print progress when greater than 0.

    Returns
    -------
    tuple of lists
        ``(first_teams, benches, team_total_points, predicted_team_total_points,
        team_dfs, transfers)`` with one entry per event.

    Raises
    ------
    RuntimeError
        If the initial team at ``start`` cannot be constructed; no team exists to
        score or to carry into later events in that case.
    """
    first_team_arr = []
    bench_arr = []
    team_total_points_arr = []
    predicted_team_total_points_arr = []
    team_df_arr = []
    transfers_arr = []
    carried_over_transfers = 0

    for e in range(start, end + 1):
        if verbose > 0:
            print('predicting event', e)

        # predictions made at event e for the events inside the horizon
        event_predictions = df[
            (df['prediction_event'] == e)
            & (df['event'] <= e + prediction_events - 1)
        ]
        present_elements = event_predictions['element'].drop_duplicates().values

        # add every other player appearing in event e from the full table
        unpredicted_df = all_element_gameweek_df[all_element_gameweek_df['event'] == e].copy()
        unpredicted_df['prediction_event'] = e
        unpredicted_df = unpredicted_df[~unpredicted_df['element'].isin(present_elements)]

        event_players_df = pd.concat([event_predictions, unpredicted_df])

        # one record per player with the optimisation target summed over the horizon
        event_players_group = event_players_df.groupby('element')[optimise_key].sum()
        event_players = (
            event_players_df[['element', 'value', 'element_type', 'team']]
            .drop_duplicates()
            .join(event_players_group, on='element')
            .to_dict('records')
        )

        if e == start:
            try:
                event_first_team, event_bench = \
                optimiser.construct_optimal_team_from_scratch(
                    event_players,
                    optimise_key=optimise_key,
                    total_budget=total_budget
                )
            except Exception as ex:
                # Without an initial team nothing below can run: the scoring call
                # would otherwise fail on an unbound team variable.
                raise RuntimeError(
                    'could not construct the initial team for event {}'.format(e)
                ) from ex

            first_team_arr.append(event_first_team)
            bench_arr.append(event_bench)
            transfers_arr.append([])

            event_num_transfers = 1
        else:
            try:
                event_first_team, event_bench, event_transfers = \
                optimiser.construct_optimal_team_from_existing(
                    event_players,
                    event_first_team,
                    event_bench,
                    total_budget=total_budget,
                    optimise_key=optimise_key,
                    transfer_penalty=transfer_penalty,
                    transfer_limit=transfer_limit
                )

                first_team_arr.append(event_first_team)
                bench_arr.append(event_bench)
                transfers_arr.append(event_transfers)

                event_num_transfers = len(event_transfers[0])
            except Exception as ex:
                # best effort: keep the previous team and record empty placeholders
                print(e, ex)
                first_team_arr.append([])
                bench_arr.append([])
                transfers_arr.append([])

                event_num_transfers = 0

        # transfers beyond the carried-over allowance
        event_num_transfers = max(event_num_transfers - carried_over_transfers, 0)

        event_team_total_points, event_team_predicted_total_points, event_team_df = \
        optimiser.calculate_team_total_points(event_players_df,
                                              event_first_team,
                                              event_bench,
                                              e,
                                              event_num_transfers,
                                              carried_over_transfers)

        # A transfer is carried into the next event when none was used beyond the
        # allowance, or when exactly one was used while one was already carried.
        carried_over_transfers = int(
            event_num_transfers == 0
            or (event_num_transfers == 1 and carried_over_transfers == 1)
        )

        team_total_points_arr.append(event_team_total_points)
        predicted_team_total_points_arr.append(event_team_predicted_total_points)
        team_df_arr.append(event_team_df)

    return (
        first_team_arr, bench_arr,
        team_total_points_arr,
        predicted_team_total_points_arr,
        team_df_arr,
        transfers_arr
    )

## Predicting points

### Linear regression

In [14]:
# Ordinary least squares model with scikit-learn's default settings.
lr_model = LinearRegression()

In [None]:
# Subset of design-matrix column names (patsy naming, e.g. 'C(team)[T.2]') for the
# linear regression, in the form accepted by the `features` argument of
# predict_test_set. How the subset was chosen is not shown in this notebook.
lr_features = [
    'C(element_type)[T.2]', 'C(element_type)[T.3]', 'C(element_type)[T.4]',
    'C(team)[T.2]', 'C(team)[T.3]', 'C(team)[T.4]', 'C(team)[T.6]',
    'C(team)[T.7]', 'C(team)[T.8]', 'C(team)[T.9]', 'C(team)[T.10]',
    'C(team)[T.12]', 'C(team)[T.13]', 'C(team)[T.14]', 'C(team)[T.15]',
    'C(team)[T.16]', 'C(team)[T.17]', 'C(team)[T.19]', 'C(team)[T.20]',
    'C(opposition_team)[T.2]', 'C(opposition_team)[T.3]',
    'C(opposition_team)[T.4]', 'C(opposition_team)[T.5]',
    'C(opposition_team)[T.7]', 'C(opposition_team)[T.8]',
    'C(opposition_team)[T.9]', 'C(opposition_team)[T.10]',
    'C(opposition_team)[T.11]', 'C(opposition_team)[T.12]',
    'C(opposition_team)[T.13]', 'C(opposition_team)[T.14]',
    'C(opposition_team)[T.15]', 'C(opposition_team)[T.16]',
    'C(opposition_team)[T.18]', 'C(opposition_team)[T.19]',
    'C(opposition_team)[T.20]', 'was_home', 'is_sunday', 'is_weekday', 'value',
    'rolling_avg_bps_element', 'rolling_avg_minutes_element',
    'rolling_avg_key_passes_element', 'rolling_avg_winning_goals_element',
    'rolling_avg_offside_element', 'rolling_avg_big_chances_missed_element',
    'rolling_avg_goals_conceded_element', 'rolling_avg_minutes_element_p3',
    'rolling_avg_total_points_element_p3', 'rolling_avg_total_points_own_team',
    'expected_total_points_element_home_away',
    'rolling_avg_total_points_element_type',
    'rolling_avg_total_points_opposition_team',
    'rolling_avg_total_points_against_opposition_team',
    'expected_total_points_against_opposition_team'
]

In [18]:
# Evaluate the linear regression from event 21 only (start == end), predicting
# 5 events ahead. The feature subset is commented out, so every design-matrix
# column is used.
lr_y_pred, lr_y_test, lr_df_test = predict_test_set(
    element_gameweek_df,
    lr_model,
    prediction_events=5,
#     features=lr_features,
    standardise=True,
    start=21,
    end=21
)

In [None]:
# (MAE, MSE, R^2) of the linear regression predictions against the observed points
lr_test_loss = tuple(
    metric(lr_y_test, lr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
lr_test_loss

In [None]:
# Overlaid, normalised histograms of predicted and observed points.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(lr_y_pred), bins=range(-5,20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(lr_y_test), bins=range(-5,20), alpha=0.75, density=True, label='observed')
ax.legend()
plt.show()

In [None]:
# Predicted against observed points, with the y = x line for reference.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(lr_y_test), pd.Series(lr_y_pred), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Ridge regression (lush)

In [None]:
# Ridge regression with regularisation strength alpha=13.8 (tuning not shown here).
lush_ridge_model = Ridge(alpha=13.8)

In [None]:
# Design-matrix column names (patsy naming) kept for the "lush" ridge model; passed
# as `features` to predict_test_set. Mostly per-element dummy columns plus team,
# opposition and rolling-average terms.
lush_ridge_features = [
    'C(element)[T.5]', 'C(element)[T.6]', 'C(element)[T.12]',
    'C(element)[T.14]', 'C(element)[T.18]', 'C(element)[T.23]',
    'C(element)[T.24]', 'C(element)[T.27]', 'C(element)[T.28]',
    'C(element)[T.33]', 'C(element)[T.37]', 'C(element)[T.39]',
    'C(element)[T.40]', 'C(element)[T.43]', 'C(element)[T.48]',
    'C(element)[T.49]', 'C(element)[T.50]', 'C(element)[T.56]',
    'C(element)[T.58]', 'C(element)[T.59]', 'C(element)[T.60]',
    'C(element)[T.61]', 'C(element)[T.62]', 'C(element)[T.69]',
    'C(element)[T.71]', 'C(element)[T.72]', 'C(element)[T.73]',
    'C(element)[T.75]', 'C(element)[T.76]', 'C(element)[T.80]',
    'C(element)[T.84]', 'C(element)[T.85]', 'C(element)[T.88]',
    'C(element)[T.90]', 'C(element)[T.91]', 'C(element)[T.94]',
    'C(element)[T.98]', 'C(element)[T.99]', 'C(element)[T.101]',
    'C(element)[T.102]', 'C(element)[T.110]', 'C(element)[T.115]',
    'C(element)[T.116]', 'C(element)[T.122]', 'C(element)[T.124]',
    'C(element)[T.125]', 'C(element)[T.126]', 'C(element)[T.127]',
    'C(element)[T.134]', 'C(element)[T.137]', 'C(element)[T.140]',
    'C(element)[T.145]', 'C(element)[T.147]', 'C(element)[T.149]',
    'C(element)[T.150]', 'C(element)[T.151]', 'C(element)[T.152]',
    'C(element)[T.156]', 'C(element)[T.157]', 'C(element)[T.160]',
    'C(element)[T.168]', 'C(element)[T.169]', 'C(element)[T.172]',
    'C(element)[T.177]', 'C(element)[T.178]', 'C(element)[T.181]',
    'C(element)[T.183]', 'C(element)[T.192]', 'C(element)[T.195]',
    'C(element)[T.197]', 'C(element)[T.198]', 'C(element)[T.206]',
    'C(element)[T.210]', 'C(element)[T.211]', 'C(element)[T.215]',
    'C(element)[T.220]', 'C(element)[T.221]', 'C(element)[T.226]',
    'C(element)[T.227]', 'C(element)[T.234]', 'C(element)[T.236]',
    'C(element)[T.239]', 'C(element)[T.243]', 'C(element)[T.245]',
    'C(element)[T.246]', 'C(element)[T.247]', 'C(element)[T.249]',
    'C(element)[T.251]', 'C(element)[T.252]', 'C(element)[T.253]',
    'C(element)[T.254]', 'C(element)[T.256]', 'C(element)[T.258]',
    'C(element)[T.262]', 'C(element)[T.263]', 'C(element)[T.267]',
    'C(element)[T.268]', 'C(element)[T.269]', 'C(element)[T.271]',
    'C(element)[T.275]', 'C(element)[T.276]', 'C(element)[T.280]',
    'C(element)[T.282]', 'C(element)[T.288]', 'C(element)[T.289]',
    'C(element)[T.291]', 'C(element)[T.295]', 'C(element)[T.297]',
    'C(element)[T.299]', 'C(element)[T.300]', 'C(element)[T.301]',
    'C(element)[T.302]', 'C(element)[T.305]', 'C(element)[T.306]',
    'C(element)[T.309]', 'C(element)[T.310]', 'C(element)[T.312]',
    'C(element)[T.315]', 'C(element)[T.317]', 'C(element)[T.318]',
    'C(element)[T.319]', 'C(element)[T.324]', 'C(element)[T.326]',
    'C(element)[T.327]', 'C(element)[T.330]', 'C(element)[T.332]',
    'C(element)[T.333]', 'C(element)[T.336]', 'C(element)[T.343]',
    'C(element)[T.346]', 'C(element)[T.351]', 'C(element)[T.352]',
    'C(element)[T.353]', 'C(element)[T.355]', 'C(element)[T.356]',
    'C(element)[T.357]', 'C(element)[T.359]', 'C(element)[T.363]',
    'C(element)[T.364]', 'C(element)[T.365]', 'C(element)[T.367]',
    'C(element)[T.370]', 'C(element)[T.380]', 'C(element)[T.382]',
    'C(element)[T.383]', 'C(element)[T.386]', 'C(element)[T.390]',
    'C(element)[T.391]', 'C(element)[T.393]', 'C(element)[T.396]',
    'C(element)[T.400]', 'C(element)[T.402]', 'C(element)[T.403]',
    'C(element)[T.408]', 'C(element)[T.411]', 'C(element)[T.415]',
    'C(element)[T.417]', 'C(element)[T.423]', 'C(element)[T.425]',
    'C(element)[T.432]', 'C(element)[T.433]', 'C(element)[T.435]',
    'C(element)[T.437]', 'C(element)[T.447]', 'C(element)[T.450]',
    'C(element)[T.451]', 'C(element)[T.455]', 'C(element)[T.459]',
    'C(element)[T.461]', 'C(element)[T.462]', 'C(element)[T.463]',
    'C(element)[T.464]', 'C(element)[T.465]', 'C(element)[T.467]',
    'C(element)[T.468]', 'C(element)[T.473]', 'C(element)[T.476]',
    'C(element)[T.480]', 'C(element)[T.483]', 'C(element)[T.484]',
    'C(element)[T.487]', 'C(element)[T.488]', 'C(element)[T.489]',
    'C(element)[T.493]', 'C(element)[T.503]', 'C(element)[T.504]',
    'C(element)[T.505]', 'C(element)[T.509]', 'C(element)[T.511]',
    'C(element)[T.516]', 'C(element)[T.536]', 'C(element)[T.549]',
    'C(element_type)[T.2]', 'C(element_type)[T.3]', 'C(element_type)[T.4]',
    'C(team)[T.3]', 'C(team)[T.4]', 'C(team)[T.6]', 'C(team)[T.7]',
    'C(team)[T.9]', 'C(team)[T.10]', 'C(team)[T.12]', 'C(team)[T.14]',
    'C(team)[T.15]', 'C(team)[T.16]', 'C(team)[T.17]', 'C(team)[T.19]',
    'C(team)[T.20]', 'C(opposition_team)[T.2]', 'C(opposition_team)[T.3]',
    'C(opposition_team)[T.4]', 'C(opposition_team)[T.5]',
    'C(opposition_team)[T.6]', 'C(opposition_team)[T.7]',
    'C(opposition_team)[T.9]', 'C(opposition_team)[T.10]',
    'C(opposition_team)[T.12]', 'C(opposition_team)[T.13]',
    'C(opposition_team)[T.15]', 'C(opposition_team)[T.16]',
    'C(opposition_team)[T.17]', 'C(opposition_team)[T.18]',
    'C(opposition_team)[T.19]', 'was_home', 'is_weekday', 'is_early',
    'is_late', 'value', 'rolling_avg_total_points_element',
    'rolling_max_total_points_element', 'has_high_points_ever_element',
    'rolling_avg_bps_element', 'rolling_avg_minutes_element',
    'rolling_avg_clean_sheets_element', 'rolling_avg_winning_goals_element',
    'rolling_avg_clearances_blocks_interceptions_element',
    'rolling_avg_offside_element', 'rolling_avg_big_chances_missed_element',
    'rolling_avg_goals_conceded_element', 'rolling_avg_minutes_element_p3',
    'rolling_avg_bps_element_p3', 'rolling_avg_total_points_own_team',
    'expected_total_points_element_home_away',
    'rolling_avg_total_points_opposition_team',
    'rolling_avg_total_points_against_opposition_team'
]

In [None]:
# Evaluate the lush ridge model over the default event range, predicting 10
# events ahead from each event with its selected feature subset.
lush_ridge_y_pred, lush_ridge_y_test, lush_ridge_df_test = predict_test_set(
    df=element_gameweek_df,
    model=lush_ridge_model,
    features=lush_ridge_features,
    standardise=True,
    prediction_events=10,
)

In [None]:
# (MAE, MSE, R^2) of the lush ridge predictions against the observed points
lush_ridge_test_loss = tuple(
    metric(lush_ridge_y_test, lush_ridge_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
lush_ridge_test_loss

In [None]:
# Overlaid, normalised histograms of predicted and observed points.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(lush_ridge_y_pred), bins=range(-5,20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(lush_ridge_y_test), bins=range(-5,20), alpha=0.75, density=True, label='observed')
ax.legend()
plt.show()

In [None]:
# Predicted against observed points, with the y = x line for reference.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(lush_ridge_y_test), pd.Series(lush_ridge_y_pred), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Ridge regression (stark)

In [None]:
# Ridge regression with regularisation strength alpha=0.36 (tuning not shown here).
stark_ridge_model = Ridge(alpha=0.36)

In [None]:
# Design-matrix column names (patsy naming) kept for the "stark" ridge model; passed
# as `features` to predict_test_set. A much smaller subset than the lush list.
stark_ridge_features = [
    'C(element)[T.12]', 'C(element)[T.14]', 'C(element)[T.40]',
    'C(element)[T.43]', 'C(element)[T.49]', 'C(element)[T.62]',
    'C(element)[T.69]', 'C(element)[T.71]', 'C(element)[T.73]',
    'C(element)[T.75]', 'C(element)[T.88]', 'C(element)[T.91]',
    'C(element)[T.101]', 'C(element)[T.115]', 'C(element)[T.116]',
    'C(element)[T.122]', 'C(element)[T.134]', 'C(element)[T.137]',
    'C(element)[T.147]', 'C(element)[T.150]', 'C(element)[T.156]',
    'C(element)[T.160]', 'C(element)[T.172]', 'C(element)[T.178]',
    'C(element)[T.211]', 'C(element)[T.221]', 'C(element)[T.234]',
    'C(element)[T.245]', 'C(element)[T.246]', 'C(element)[T.247]',
    'C(element)[T.256]', 'C(element)[T.262]', 'C(element)[T.269]',
    'C(element)[T.288]', 'C(element)[T.295]', 'C(element)[T.299]',
    'C(element)[T.300]', 'C(element)[T.301]', 'C(element)[T.302]',
    'C(element)[T.305]', 'C(element)[T.306]', 'C(element)[T.315]',
    'C(element)[T.333]', 'C(element)[T.352]', 'C(element)[T.356]',
    'C(element)[T.364]', 'C(element)[T.365]', 'C(element)[T.368]',
    'C(element)[T.380]', 'C(element)[T.390]', 'C(element)[T.391]',
    'C(element)[T.393]', 'C(element)[T.403]', 'C(element)[T.408]',
    'C(element)[T.411]', 'C(element)[T.415]', 'C(element)[T.423]',
    'C(element)[T.425]', 'C(element)[T.437]', 'C(element)[T.450]',
    'C(element)[T.462]', 'C(element)[T.463]', 'C(element)[T.465]',
    'C(element)[T.467]', 'C(element)[T.476]', 'C(element)[T.480]',
    'C(element)[T.484]', 'C(element)[T.504]', 'C(element)[T.536]',
    'C(element_type)[T.2]', 'C(element_type)[T.3]', 'C(element_type)[T.4]',
    'C(team)[T.14]', 'C(opposition_team)[T.9]', 'C(opposition_team)[T.12]',
    'C(opposition_team)[T.13]', 'value'
]

In [None]:
# Evaluate the stark ridge model over the default event range and horizon with its
# selected feature subset.
stark_ridge_y_pred, stark_ridge_y_test, stark_ridge_df_test = predict_test_set(
    df=element_gameweek_df,
    model=stark_ridge_model,
    standardise=True,
    features=stark_ridge_features,
)

In [None]:
# (MAE, MSE, R^2) of the stark ridge predictions against the observed points
stark_ridge_test_loss = tuple(
    metric(stark_ridge_y_test, stark_ridge_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
stark_ridge_test_loss

In [None]:
# Overlaid, normalised histograms of predicted and observed points.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(stark_ridge_y_pred), bins=range(-5,20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(stark_ridge_y_test), bins=range(-5,20), alpha=0.75, density=True, label='observed')
ax.legend()
plt.show()

In [None]:
# Predicted against observed points, with the y = x line for reference.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(stark_ridge_y_test), pd.Series(stark_ridge_y_pred), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Lasso regression

In [None]:
# Lasso with a near-zero penalty (alpha=1e-10), i.e. almost unregularised.
lasso_model = Lasso(alpha=1e-10)

In [None]:
# Design-matrix column names kept for the lasso model; passed as `features` to
# predict_test_set. Contains numeric columns only, with no categorical dummy terms.
lasso_features = [
    'value', 'rolling_avg_total_points_element',
    'rolling_max_total_points_element', 'has_high_points_ever_element',
    'rolling_avg_bps_element', 'rolling_avg_bonus_element',
    'rolling_avg_goals_scored_element', 'rolling_avg_minutes_element',
    'rolling_avg_clean_sheets_element', 'rolling_avg_assists_element',
    'rolling_avg_key_passes_element', 'rolling_avg_winning_goals_element',
    'rolling_avg_tackled_element',
    'rolling_avg_clearances_blocks_interceptions_element',
    'rolling_avg_big_chances_created_element', 'rolling_avg_offside_element',
    'rolling_avg_big_chances_missed_element', 'rolling_avg_saves_element',
    'rolling_avg_goals_conceded_element', 'rolling_avg_minutes_element_p3',
    'rolling_avg_total_points_element_p3', 'total_points_element_premium_p3',
    'rolling_avg_bps_element_p3', 'rolling_avg_total_points_own_team',
    'expected_total_points_element_home_away',
    'rolling_avg_total_points_element_type',
    'rolling_avg_total_points_opposition_team',
    'rolling_avg_total_points_against_opposition_team_element_type',
    'expected_total_points_against_opposition_team',
    'expected_total_points_against_opposition_team_home_away'
]

In [None]:
# Run the lasso model through predict_test_set on lasso_features, with
# standardisation switched on, and unpack the three results.
(lasso_y_pred,
 lasso_y_test,
 lasso_df_test) = predict_test_set(
    element_gameweek_df, lasso_model,
    features=lasso_features, standardise=True)

In [None]:
# Test-set (MAE, MSE, R^2) for the lasso model, in that order.
lasso_test_loss = tuple(
    metric(lasso_y_test, lasso_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
lasso_test_loss

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# lasso model on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(lasso_y_pred), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(lasso_y_test), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the lasso model; the straight line is
# y = x (perfect prediction) and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(lasso_y_test), pd.Series(lasso_y_pred), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Radial basis SVM regression

In [None]:
# Support vector regressor with a radial-basis-function kernel.
rbf_svr_model = SVR(
    kernel='rbf',
    gamma=0.004,
    C=2.9,
    epsilon=1.9,
)

In [None]:
# Columns passed to predict_test_set as `features` for the RBF SVR, in order:
# the intercept, one dummy per listed `element` level, the element_type / team /
# opposition_team dummies, fixture flags, and rolling / expected points columns.
# The 'C(...)[T.k]' strings follow patsy's treatment-coding column naming.
rbf_svr_features = (
    ['Intercept']
    + ['C(element)[T.%d]' % element_id for element_id in (
        2, 4, 5, 6, 7, 8, 9, 11, 12, 13,
        14, 15, 17, 18, 22, 23, 24, 26, 27, 28,
        29, 31, 33, 35, 36, 37, 39, 40, 42, 43,
        45, 47, 48, 49, 50, 51, 54, 55, 56, 57,
        58, 59, 60, 61, 62, 66, 69, 70, 71, 72,
        73, 74, 75, 76, 77, 78, 79, 80, 82, 83,
        84, 85, 87, 88, 90, 91, 92, 94, 95, 98,
        99, 100, 101, 102, 105, 106, 110, 113, 115, 116,
        118, 122, 124, 125, 126, 127, 133, 134, 135, 137,
        138, 140, 141, 142, 143, 145, 147, 149, 150, 151,
        152, 154, 156, 157, 160, 161, 162, 164, 167, 168,
        169, 172, 175, 177, 178, 180, 181, 182, 183, 184,
        189, 190, 191, 192, 193, 195, 197, 198, 199, 200,
        201, 202, 206, 208, 210, 211, 213, 215, 219, 220,
        221, 222, 223, 224, 225, 226, 227, 231, 233, 234,
        236, 239, 242, 243, 245, 246, 247, 249, 251, 252,
        253, 254, 255, 256, 257, 258, 260, 262, 263, 264,
        265, 267, 268, 269, 270, 271, 272, 274, 275, 276,
        280, 281, 282, 284, 285, 286, 288, 289, 291, 292,
        293, 294, 295, 296, 297, 298, 299, 300, 301, 302,
        304, 305, 306, 309, 310, 311, 312, 313, 315, 317,
        318, 319, 320, 324, 326, 327, 328, 330, 331, 332,
        333, 335, 336, 339, 340, 343, 344, 345, 346, 347,
        348, 349, 351, 352, 353, 354, 355, 356, 357, 359,
        360, 361, 362, 363, 364, 365, 367, 368, 369, 370,
        372, 376, 378, 380, 381, 382, 383, 386, 389, 390,
        391, 392, 393, 394, 395, 396, 400, 402, 403, 405,
        406, 408, 409, 410, 411, 412, 413, 415, 417, 419,
        423, 425, 426, 427, 430, 431, 432, 433, 435, 437,
        440, 445, 446, 447, 450, 451, 453, 454, 455, 456,
        458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
        468, 473, 474, 475, 476, 478, 479, 480, 481, 482,
        483, 484, 486, 487, 488, 489, 490, 492, 493, 494,
        498, 499, 503, 504, 505, 506, 507, 508, 509, 510,
        511, 512, 513, 516, 517, 518, 524, 526, 527, 534,
        536, 539, 549, 556,
    )]
    + ['C(element_type)[T.%d]' % type_id for type_id in (2, 3, 4)]
    # Every team / opposition level from 2 to 20 inclusive is present.
    + ['C(team)[T.%d]' % team_id for team_id in range(2, 21)]
    + ['C(opposition_team)[T.%d]' % team_id for team_id in range(2, 21)]
    + [
        'was_home',
        'is_sunday',
        'is_weekday',
        'is_early',
        'is_late',
        'value',
        'rolling_avg_total_points_element',
        'rolling_max_total_points_element',
        'rolling_avg_total_points_own_team',
        'expected_total_points_element_home_away',
        'rolling_avg_total_points_element_type',
        'rolling_avg_total_points_opposition_team',
        'rolling_avg_total_points_against_opposition_team',
        'rolling_avg_total_points_against_opposition_team_element_type',
        'expected_total_points_against_opposition_team',
        'expected_total_points_against_opposition_team_home_away',
    ]
)

In [None]:
# Run the RBF SVR through predict_test_set on rbf_svr_features, with
# standardisation switched on, and unpack the three results.
(rbf_svr_y_pred,
 rbf_svr_y_test,
 rbf_svr_df_test) = predict_test_set(
    element_gameweek_df, rbf_svr_model,
    features=rbf_svr_features, standardise=True)

In [None]:
# landers
# NOTE(review): the label presumably records which feature set this cell was
# last run with. The code is the same as the other rbf_svr_test_loss cells in
# this section, so a fresh top-to-bottom run reports identical numbers in each.
# Test-set (MAE, MSE, R^2) for the RBF SVR, in that order.
rbf_svr_test_loss = tuple(
    metric(rbf_svr_y_test, rbf_svr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rbf_svr_test_loss

In [None]:
# full
# NOTE(review): the label presumably records which feature set this cell was
# last run with. The code is the same as the other rbf_svr_test_loss cells in
# this section, so a fresh top-to-bottom run reports identical numbers in each.
# Test-set (MAE, MSE, R^2) for the RBF SVR, in that order.
rbf_svr_test_loss = tuple(
    metric(rbf_svr_y_test, rbf_svr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rbf_svr_test_loss

In [None]:
# refined
# NOTE(review): the label presumably records which feature set this cell was
# last run with. The code is the same as the other rbf_svr_test_loss cells in
# this section, so a fresh top-to-bottom run reports identical numbers in each.
# Test-set (MAE, MSE, R^2) for the RBF SVR, in that order.
rbf_svr_test_loss = tuple(
    metric(rbf_svr_y_test, rbf_svr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rbf_svr_test_loss

In [None]:
# full selected
# NOTE(review): the label presumably records which feature set this cell was
# last run with. The code is the same as the other rbf_svr_test_loss cells in
# this section, so a fresh top-to-bottom run reports identical numbers in each.
# Test-set (MAE, MSE, R^2) for the RBF SVR, in that order.
rbf_svr_test_loss = tuple(
    metric(rbf_svr_y_test, rbf_svr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rbf_svr_test_loss

In [None]:
# refined selected
# NOTE(review): the label presumably records which feature set this cell was
# last run with. The code is the same as the other rbf_svr_test_loss cells in
# this section, so a fresh top-to-bottom run reports identical numbers in each.
# Test-set (MAE, MSE, R^2) for the RBF SVR, in that order.
rbf_svr_test_loss = tuple(
    metric(rbf_svr_y_test, rbf_svr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rbf_svr_test_loss

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# RBF SVR on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(rbf_svr_y_pred), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(rbf_svr_y_test), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the RBF SVR; the straight line is
# y = x (perfect prediction) and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(rbf_svr_y_test), pd.Series(rbf_svr_y_pred), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Linear SVM regression

In [None]:
# Support vector regressor with a linear kernel.
linear_svr_model = SVR(
    kernel='linear',
    C=18,
    epsilon=1.97,
)

In [None]:
# Columns passed to predict_test_set as `features` for the linear SVR, in order:
# one dummy per listed `element` level, the element_type dummies, the listed
# team and opposition_team dummies, then fixture flags and numeric columns.
# The 'C(...)[T.k]' strings follow patsy's treatment-coding column naming.
linear_svr_features = (
    ['C(element)[T.%d]' % element_id for element_id in (
        8, 9, 12, 13, 14, 18, 26, 28, 31, 35,
        37, 40, 42, 43, 48, 49, 50, 54, 56, 58,
        60, 61, 62, 69, 71, 73, 75, 79, 80, 84,
        85, 88, 90, 91, 92, 95, 98, 99, 101, 102,
        110, 115, 116, 122, 126, 133, 134, 137, 138, 140,
        145, 147, 149, 151, 152, 156, 157, 160, 164, 168,
        169, 172, 175, 178, 183, 189, 191, 192, 195, 198,
        199, 206, 210, 211, 213, 215, 219, 221, 226, 233,
        242, 245, 246, 247, 249, 254, 255, 256, 257, 258,
        260, 262, 263, 265, 268, 269, 271, 274, 280, 281,
        288, 289, 292, 295, 299, 301, 304, 305, 306, 315,
        317, 318, 324, 331, 333, 336, 343, 344, 346, 351,
        352, 353, 357, 360, 363, 364, 368, 369, 370, 376,
        380, 390, 391, 393, 395, 400, 408, 411, 415, 423,
        425, 431, 432, 433, 437, 440, 446, 447, 450, 455,
        456, 459, 461, 462, 465, 467, 468, 473, 476, 478,
        480, 484, 486, 487, 488, 492, 493, 503, 504, 505,
        506, 509, 510, 513, 516, 526, 536,
    )]
    + ['C(element_type)[T.%d]' % type_id for type_id in (2, 3, 4)]
    + ['C(team)[T.%d]' % team_id for team_id in (
        3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 19, 20,
    )]
    + ['C(opposition_team)[T.%d]' % team_id for team_id in (
        3, 4, 5, 6, 7, 9, 10, 12, 13, 15, 16, 17, 18, 19,
    )]
    + [
        'is_sunday',
        'is_weekday',
        'value',
        'rolling_avg_minutes_element',
        'rolling_avg_goals_conceded_element',
        'rolling_avg_minutes_element_p3',
        'rolling_avg_total_points_own_team',
        'expected_total_points_element_home_away',
        'rolling_avg_total_points_opposition_team',
        'rolling_avg_total_points_against_opposition_team',
    ]
)

In [None]:
# Run the linear SVR through predict_test_set on linear_svr_features, with
# standardisation switched on, and unpack the three results.
(linear_svr_y_pred,
 linear_svr_y_test,
 linear_svr_df_test) = predict_test_set(
    element_gameweek_df, linear_svr_model,
    features=linear_svr_features, standardise=True)

In [None]:
# Test-set (MAE, MSE, R^2) for the linear SVR, in that order.
linear_svr_test_loss = tuple(
    metric(linear_svr_y_test, linear_svr_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
linear_svr_test_loss

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# linear SVR on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(linear_svr_y_pred), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(linear_svr_y_test), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the linear SVR; the straight line is
# y = x (perfect prediction) and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(linear_svr_y_test), pd.Series(linear_svr_y_pred), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Decision tree (lush)

In [None]:
# Fit and score the lush decision tree once per random seed, keeping each
# run's metrics, predictions, targets and test frame. No `features` argument is
# passed, so predict_test_set falls back to its default feature set.
lush_tree_test_loss_arr, lush_tree_y_pred_arr = [], []
lush_tree_y_test_arr, lush_tree_df_test_arr = [], []

for i in range(1):  # currently a single seed (0)
    lush_tree_model = DecisionTreeRegressor(min_samples_leaf=0.07, random_state=i)

    lush_tree_y_pred, lush_tree_y_test, lush_tree_df_test = predict_test_set(
        element_gameweek_df, lush_tree_model, standardise=False
    )

    # (MAE, MSE, R^2) for this seed.
    lush_tree_test_loss = tuple(
        metric(lush_tree_y_test, lush_tree_y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    lush_tree_test_loss_arr.append(lush_tree_test_loss)
    lush_tree_y_pred_arr.append(lush_tree_y_pred)
    lush_tree_y_test_arr.append(lush_tree_y_test)
    lush_tree_df_test_arr.append(lush_tree_df_test)

# Per-metric mean and standard deviation across seeds; with one seed the
# standard deviations are all zero.
lush_tree_test_loss_mean = tuple(
    np.mean([loss[k] for loss in lush_tree_test_loss_arr]) for k in range(3)
)
lush_tree_test_loss_std = tuple(
    np.std([loss[k] for loss in lush_tree_test_loss_arr]) for k in range(3)
)

print(lush_tree_test_loss_mean)
print(lush_tree_test_loss_std)

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# first lush decision tree run on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(lush_tree_y_pred_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(lush_tree_y_test_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the first lush decision tree run; the
# straight line is y = x and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(lush_tree_y_test_arr[0]), pd.Series(lush_tree_y_pred_arr[0]), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Decision tree (stark)

In [None]:
# Columns passed to predict_test_set as `features` for the stark decision tree.
stark_tree_features = [
    'value',
    'expected_total_points_against_opposition_team',
]

In [None]:
# Fit and score the stark decision tree (restricted to stark_tree_features)
# once per random seed, keeping each run's metrics, predictions, targets and
# test frame.
stark_tree_test_loss_arr, stark_tree_y_pred_arr = [], []
stark_tree_y_test_arr, stark_tree_df_test_arr = [], []

for i in range(1):  # currently a single seed (0)
    stark_tree_model = DecisionTreeRegressor(min_samples_leaf=0.027, random_state=i)

    stark_tree_y_pred, stark_tree_y_test, stark_tree_df_test = predict_test_set(
        element_gameweek_df,
        stark_tree_model,
        features=stark_tree_features,
        standardise=False,
    )

    # (MAE, MSE, R^2) for this seed.
    stark_tree_test_loss = tuple(
        metric(stark_tree_y_test, stark_tree_y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    stark_tree_test_loss_arr.append(stark_tree_test_loss)
    stark_tree_y_pred_arr.append(stark_tree_y_pred)
    stark_tree_y_test_arr.append(stark_tree_y_test)
    stark_tree_df_test_arr.append(stark_tree_df_test)

# Per-metric mean and standard deviation across seeds; with one seed the
# standard deviations are all zero.
stark_tree_test_loss_mean = tuple(
    np.mean([loss[k] for loss in stark_tree_test_loss_arr]) for k in range(3)
)
stark_tree_test_loss_std = tuple(
    np.std([loss[k] for loss in stark_tree_test_loss_arr]) for k in range(3)
)

print(stark_tree_test_loss_mean)
print(stark_tree_test_loss_std)

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# first stark decision tree run on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(stark_tree_y_pred_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(stark_tree_y_test_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the first stark decision tree run; the
# straight line is y = x and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(stark_tree_y_test_arr[0]), pd.Series(stark_tree_y_pred_arr[0]), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Random forest (lush)

In [None]:
# Fit and score the lush random forest once per random seed, keeping each
# run's metrics, predictions, targets and test frame. No `features` argument is
# passed, so predict_test_set falls back to its default feature set.
lush_forest_test_loss_arr, lush_forest_y_pred_arr = [], []
lush_forest_y_test_arr, lush_forest_df_test_arr = [], []

for i in range(1):  # currently a single seed (0)
    lush_forest_model = RandomForestRegressor(
        n_estimators=120,
        min_samples_leaf=0.015,
        max_features=0.84,
        random_state=i,
    )

    lush_forest_y_pred, lush_forest_y_test, lush_forest_df_test = predict_test_set(
        element_gameweek_df, lush_forest_model, standardise=False
    )

    # (MAE, MSE, R^2) for this seed.
    lush_forest_test_loss = tuple(
        metric(lush_forest_y_test, lush_forest_y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    lush_forest_test_loss_arr.append(lush_forest_test_loss)
    lush_forest_y_pred_arr.append(lush_forest_y_pred)
    lush_forest_y_test_arr.append(lush_forest_y_test)
    lush_forest_df_test_arr.append(lush_forest_df_test)

# Per-metric mean and standard deviation across seeds; with one seed the
# standard deviations are all zero.
lush_forest_test_loss_mean = tuple(
    np.mean([loss[k] for loss in lush_forest_test_loss_arr]) for k in range(3)
)
lush_forest_test_loss_std = tuple(
    np.std([loss[k] for loss in lush_forest_test_loss_arr]) for k in range(3)
)

print(lush_forest_test_loss_mean)
print(lush_forest_test_loss_std)

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# first lush random forest run on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(lush_forest_y_pred_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(lush_forest_y_test_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the first lush random forest run; the
# straight line is y = x and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(lush_forest_y_test_arr[0]), pd.Series(lush_forest_y_pred_arr[0]), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Random forest (stark)

In [None]:
# Columns passed to predict_test_set as `features` for the stark random forest.
# NOTE(review): 'rolling_var_total_points_element' and the two 'expected_avg_*'
# names are not used by the linear-model feature lists in this notebook, which
# use 'expected_total_points_*' instead - verify these columns exist in the
# loaded features file.
stark_forest_features = [
    'value',
    'is_late',
    'rolling_avg_total_points_element',
    'rolling_var_total_points_element',
    'rolling_avg_minutes_element_p3',
    'rolling_avg_bps_element',
    'rolling_avg_bonus_element',
    'rolling_avg_goals_scored_element',
    'rolling_avg_minutes_element',
    'rolling_avg_clean_sheets_element',
    'rolling_avg_completed_passes_element',
    'rolling_avg_recoveries_element',
    'rolling_avg_key_passes_element',
    'rolling_avg_tackled_element',
    'rolling_avg_dribbles_element',
    'rolling_avg_clearances_blocks_interceptions_element',
    'rolling_avg_target_missed_element',
    'rolling_avg_fouls_element',
    'rolling_avg_tackles_element',
    'rolling_avg_big_chances_missed_element',
    'rolling_avg_goals_conceded_element',
    'rolling_avg_total_points_element_type',
    'rolling_avg_total_points_own_team',
    'rolling_avg_total_points_opposition_team',
    'rolling_avg_total_points_against_opposition_team',
    'rolling_avg_total_points_against_opposition_team_element_type',
    'expected_avg_total_points_against_opposition_team',
    'expected_avg_total_points_at_ground',
]

In [None]:
# Fit and score the stark random forest (restricted to stark_forest_features)
# once per random seed, keeping each run's metrics, predictions, targets and
# test frame.
stark_forest_test_loss_arr, stark_forest_y_pred_arr = [], []
stark_forest_y_test_arr, stark_forest_df_test_arr = [], []

for i in range(1):  # currently a single seed (0)
    stark_forest_model = RandomForestRegressor(
        n_estimators=120,
        min_samples_leaf=0.0135,
        max_features=0.73,
        random_state=i,
    )

    stark_forest_y_pred, stark_forest_y_test, stark_forest_df_test = predict_test_set(
        element_gameweek_df,
        stark_forest_model,
        features=stark_forest_features,
        standardise=False,
    )

    # (MAE, MSE, R^2) for this seed.
    stark_forest_test_loss = tuple(
        metric(stark_forest_y_test, stark_forest_y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    stark_forest_test_loss_arr.append(stark_forest_test_loss)
    stark_forest_y_pred_arr.append(stark_forest_y_pred)
    stark_forest_y_test_arr.append(stark_forest_y_test)
    stark_forest_df_test_arr.append(stark_forest_df_test)

# Per-metric mean and standard deviation across seeds; with one seed the
# standard deviations are all zero.
stark_forest_test_loss_mean = tuple(
    np.mean([loss[k] for loss in stark_forest_test_loss_arr]) for k in range(3)
)
stark_forest_test_loss_std = tuple(
    np.std([loss[k] for loss in stark_forest_test_loss_arr]) for k in range(3)
)

print(stark_forest_test_loss_mean)
print(stark_forest_test_loss_std)

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# first stark random forest run on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(stark_forest_y_pred_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(stark_forest_y_test_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the first stark random forest run; the
# straight line is y = x and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(stark_forest_y_test_arr[0]), pd.Series(stark_forest_y_pred_arr[0]), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Gradient boosted trees (lush)

In [None]:
# Fit and score the lush gradient-boosted model once per random seed, keeping
# each run's metrics, predictions, targets and test frame. No `features`
# argument is passed, so predict_test_set falls back to its default feature set.
lush_boost_test_loss_arr, lush_boost_y_pred_arr = [], []
lush_boost_y_test_arr, lush_boost_df_test_arr = [], []

for i in range(1):  # currently a single seed (0)
    # NOTE(review): scikit-learn 1.0 renamed criterion='mse' to 'squared_error'
    # and 1.2 removed the old spelling; update this argument when running on a
    # newer scikit-learn.
    lush_boost_model = GradientBoostingRegressor(
        n_estimators=120,
        max_features=0.61,
        min_samples_leaf=0.015,
        learning_rate=0.035,
        criterion='mse',
        random_state=i,
    )

    lush_boost_y_pred, lush_boost_y_test, lush_boost_df_test = predict_test_set(
        element_gameweek_df, lush_boost_model, standardise=False
    )

    # (MAE, MSE, R^2) for this seed.
    lush_boost_test_loss = tuple(
        metric(lush_boost_y_test, lush_boost_y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    lush_boost_test_loss_arr.append(lush_boost_test_loss)
    lush_boost_y_pred_arr.append(lush_boost_y_pred)
    lush_boost_y_test_arr.append(lush_boost_y_test)
    lush_boost_df_test_arr.append(lush_boost_df_test)

# Per-metric mean and standard deviation across seeds; with one seed the
# standard deviations are all zero.
lush_boost_test_loss_mean = tuple(
    np.mean([loss[k] for loss in lush_boost_test_loss_arr]) for k in range(3)
)
lush_boost_test_loss_std = tuple(
    np.std([loss[k] for loss in lush_boost_test_loss_arr]) for k in range(3)
)

print(lush_boost_test_loss_mean)
print(lush_boost_test_loss_std)

In [None]:
# Overlay the normalised distributions of predicted and observed points for the
# first lush gradient-boosting run on shared one-point-wide bins.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(pd.Series(lush_boost_y_pred_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='predicted')
ax.hist(pd.Series(lush_boost_y_test_arr[0]), bins=range(-5, 20), alpha=0.75, density=True, label='observed')
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Total points')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the first lush gradient-boosting run; the
# straight line is y = x and the y-axis is clipped to (-3, 12).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(pd.Series(lush_boost_y_test_arr[0]), pd.Series(lush_boost_y_pred_arr[0]), alpha=0.1)
ax.plot(np.linspace(-5, 25), np.linspace(-5, 25))
ax.set_ylabel('Predicted total points')
ax.set_xlabel('Observed total points')
ax.set_ylim((-3, 12))
plt.show()

### Gradient boosted trees (stark)

In [None]:
# Columns passed to predict_test_set as `features` for the stark
# gradient-boosted model.
# NOTE(review): the two 'expected_avg_*' names are not used by the linear-model
# feature lists in this notebook, which use 'expected_total_points_*' instead -
# verify these columns exist in the loaded features file.
stark_boost_features = [
    'value',
    'rolling_avg_minutes_element_p3',
    'rolling_avg_minutes_element',
    'rolling_avg_key_passes_element',
    'rolling_avg_clearances_blocks_interceptions_element',
    'rolling_avg_big_chances_missed_element',
    'rolling_avg_total_points_own_team',
    'rolling_avg_total_points_opposition_team',
    'rolling_avg_total_points_against_opposition_team',
    'rolling_avg_total_points_against_opposition_team_element_type',
    'expected_avg_total_points_against_opposition_team',
    'expected_avg_total_points_at_ground',
]

In [None]:
# Fit and score the stark gradient-boosted model (restricted to
# stark_boost_features) once per random seed, keeping each run's metrics,
# predictions, targets and test frame.
stark_boost_test_loss_arr, stark_boost_y_pred_arr = [], []
stark_boost_y_test_arr, stark_boost_df_test_arr = [], []

for i in range(1):  # currently a single seed (0)
    # NOTE(review): scikit-learn 1.0 renamed criterion='mse' to 'squared_error'
    # and 1.2 removed the old spelling; update this argument when running on a
    # newer scikit-learn.
    stark_boost_model = GradientBoostingRegressor(
        n_estimators=120,
        max_features=0.8,
        min_samples_leaf=0.02,
        learning_rate=0.05,
        criterion='mse',
        random_state=i,
    )

    stark_boost_y_pred, stark_boost_y_test, stark_boost_df_test = predict_test_set(
        element_gameweek_df,
        stark_boost_model,
        features=stark_boost_features,
        standardise=False,
    )

    # (MAE, MSE, R^2) for this seed.
    stark_boost_test_loss = tuple(
        metric(stark_boost_y_test, stark_boost_y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    stark_boost_test_loss_arr.append(stark_boost_test_loss)
    stark_boost_y_pred_arr.append(stark_boost_y_pred)
    stark_boost_y_test_arr.append(stark_boost_y_test)
    stark_boost_df_test_arr.append(stark_boost_df_test)

# Per-metric mean and standard deviation across seeds; with one seed the
# standard deviations are all zero.
stark_boost_test_loss_mean = tuple(
    np.mean([loss[k] for loss in stark_boost_test_loss_arr]) for k in range(3)
)
stark_boost_test_loss_std = tuple(
    np.std([loss[k] for loss in stark_boost_test_loss_arr]) for k in range(3)
)

print(stark_boost_test_loss_mean)
print(stark_boost_test_loss_std)

In [None]:
# Overlay the predicted and observed point distributions for the first run.
plt.figure(figsize=(10, 8))
plt.hist(
    pd.Series(stark_boost_y_pred_arr[0]),
    bins=range(-5, 20),
    alpha=0.75,
    density=True,
    label='predicted',
)
plt.hist(
    pd.Series(stark_boost_y_test_arr[0]),
    bins=range(-5, 20),
    alpha=0.75,
    density=True,
    label='observed',
)
plt.legend()
plt.show()

In [None]:
# Observed vs. predicted points for the first run; the straight line is y = x.
plt.figure(figsize=(10, 8))
plt.scatter(
    pd.Series(stark_boost_y_test_arr[0]),
    pd.Series(stark_boost_y_pred_arr[0]),
    alpha=0.1,
)
plt.plot(np.linspace(-5, 25), np.linspace(-5, 25))
plt.xlabel('Observed total points')
plt.ylabel('Predicted total points')
# Restrict the y axis to (-3, 12).
plt.ylim((-3, 12))
plt.show()

## Constructing teams

### Linear regression

In [None]:
# Build teams from the linear-regression test frame. The six results are
# unpacked in the order construct_event_teams_from_existing returns them.
# NOTE(review): the other model sections call this helper without
# prediction_events -- confirm 1 is its default so the results are comparable.
(
    lr_first_team_arr,
    lr_bench_arr,
    lr_total_points_arr,
    lr_predicted_total_points_arr,
    lr_team_df_arr,
    lr_transfers_arr,
) = construct_event_teams_from_existing(
    lr_df_test,
    prediction_events=1,
    total_budget=1050,
    verbose=1,
)

In [None]:
# Add up the totals returned by construct_event_teams_from_existing for the
# linear-regression teams (presumably one entry per gameweek -- confirm).
lr_teams_total_points = sum(lr_total_points_arr)
lr_teams_total_points

### Ridge regression (lush)

In [None]:
# Build teams from the lush ridge-regression test frame. The six results are
# unpacked in the order construct_event_teams_from_existing returns them.
(
    lush_ridge_first_team_arr,
    lush_ridge_bench_arr,
    lush_ridge_total_points_arr,
    lush_ridge_predicted_total_points_arr,
    lush_ridge_team_df_arr,
    lush_ridge_transfers_arr,
) = construct_event_teams_from_existing(
    lush_ridge_df_test,
    total_budget=1050,
    verbose=1,
)

In [None]:
# Add up the totals returned by construct_event_teams_from_existing for the
# lush ridge teams (presumably one entry per gameweek -- confirm).
lush_ridge_teams_total_points = sum(lush_ridge_total_points_arr)
lush_ridge_teams_total_points

### Ridge regression (stark)

In [None]:
# Build teams from the stark ridge-regression test frame. The six results are
# unpacked in the order construct_event_teams_from_existing returns them.
(
    stark_ridge_first_team_arr,
    stark_ridge_bench_arr,
    stark_ridge_total_points_arr,
    stark_ridge_predicted_total_points_arr,
    stark_ridge_team_df_arr,
    stark_ridge_transfers_arr,
) = construct_event_teams_from_existing(
    stark_ridge_df_test,
    total_budget=1050,
    verbose=1,
)

In [None]:
# Add up the totals returned by construct_event_teams_from_existing for the
# stark ridge teams (presumably one entry per gameweek -- confirm).
stark_ridge_teams_total_points = sum(stark_ridge_total_points_arr)
stark_ridge_teams_total_points

### Lasso regression

In [None]:
# Build teams from the lasso-regression test frame. The six results are
# unpacked in the order construct_event_teams_from_existing returns them.
(
    lasso_first_team_arr,
    lasso_bench_arr,
    lasso_total_points_arr,
    lasso_predicted_total_points_arr,
    lasso_team_df_arr,
    lasso_transfers_arr,
) = construct_event_teams_from_existing(
    lasso_df_test,
    total_budget=1050,
    verbose=1,
)

In [None]:
# Add up the totals returned by construct_event_teams_from_existing for the
# lasso teams (presumably one entry per gameweek -- confirm).
lasso_teams_total_points = sum(lasso_total_points_arr)
lasso_teams_total_points

### Radial basis SVM regression

In [None]:
# Build teams from the RBF-kernel SVR test frame. The six results are
# unpacked in the order construct_event_teams_from_existing returns them.
(
    rbf_svr_first_team_arr,
    rbf_svr_bench_arr,
    rbf_svr_total_points_arr,
    rbf_svr_predicted_total_points_arr,
    rbf_svr_team_df_arr,
    rbf_svr_transfers_arr,
) = construct_event_teams_from_existing(
    rbf_svr_df_test,
    total_budget=1050,
    verbose=1,
)

In [None]:
# Add up the totals returned by construct_event_teams_from_existing for the
# RBF SVR teams (presumably one entry per gameweek -- confirm).
rbf_svr_teams_total_points = sum(rbf_svr_total_points_arr)
rbf_svr_teams_total_points

### Linear basis SVM regression

In [None]:
# Build teams from the linear-kernel SVR test frame. The six results are
# unpacked in the order construct_event_teams_from_existing returns them.
(
    linear_svr_first_team_arr,
    linear_svr_bench_arr,
    linear_svr_total_points_arr,
    linear_svr_predicted_total_points_arr,
    linear_svr_team_df_arr,
    linear_svr_transfers_arr,
) = construct_event_teams_from_existing(
    linear_svr_df_test,
    total_budget=1050,
    verbose=1,
)

In [None]:
# Add up the totals returned by construct_event_teams_from_existing for the
# linear SVR teams (presumably one entry per gameweek -- confirm).
linear_svr_teams_total_points = sum(linear_svr_total_points_arr)
linear_svr_teams_total_points

### Decision tree (lush)

In [None]:
# Total team points for each entry of lush_tree_df_test_arr
# (presumably one test frame per random seed -- confirm).
lush_tree_teams_total_points_arr = []

for i in range(len(lush_tree_df_test_arr)):
    (
        lush_tree_first_team_arr,
        lush_tree_bench_arr,
        lush_tree_total_points_arr,
        lush_tree_predicted_total_points_arr,
        lush_tree_team_df_arr,
        lush_tree_transfers_arr,
    ) = construct_event_teams_from_existing(
        lush_tree_df_test_arr[i],
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    lush_tree_teams_total_points = sum(lush_tree_total_points_arr)
    lush_tree_teams_total_points_arr.append(lush_tree_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across runs.
lush_tree_teams_total_points_mean = np.mean(lush_tree_teams_total_points_arr)
lush_tree_teams_total_points_std = np.std(lush_tree_teams_total_points_arr)

print(lush_tree_teams_total_points_mean)
print(lush_tree_teams_total_points_std)

### Decision tree (stark)

In [None]:
# Total team points for each entry of stark_tree_df_test_arr
# (presumably one test frame per random seed -- confirm).
stark_tree_teams_total_points_arr = []

for i in range(len(stark_tree_df_test_arr)):
    (
        stark_tree_first_team_arr,
        stark_tree_bench_arr,
        stark_tree_total_points_arr,
        stark_tree_predicted_total_points_arr,
        stark_tree_team_df_arr,
        stark_tree_transfers_arr,
    ) = construct_event_teams_from_existing(
        stark_tree_df_test_arr[i],
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    stark_tree_teams_total_points = sum(stark_tree_total_points_arr)
    stark_tree_teams_total_points_arr.append(stark_tree_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across runs.
stark_tree_teams_total_points_mean = np.mean(stark_tree_teams_total_points_arr)
stark_tree_teams_total_points_std = np.std(stark_tree_teams_total_points_arr)

print(stark_tree_teams_total_points_mean)
print(stark_tree_teams_total_points_std)

### Random forest (lush)

In [None]:
# Total team points for each entry of lush_forest_df_test_arr
# (presumably one test frame per random seed -- confirm).
lush_forest_teams_total_points_arr = []

for i in range(len(lush_forest_df_test_arr)):
    (
        lush_forest_first_team_arr,
        lush_forest_bench_arr,
        lush_forest_total_points_arr,
        lush_forest_predicted_total_points_arr,
        lush_forest_team_df_arr,
        lush_forest_transfers_arr,
    ) = construct_event_teams_from_existing(
        lush_forest_df_test_arr[i],
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    lush_forest_teams_total_points = sum(lush_forest_total_points_arr)
    lush_forest_teams_total_points_arr.append(lush_forest_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across runs.
lush_forest_teams_total_points_mean = np.mean(lush_forest_teams_total_points_arr)
lush_forest_teams_total_points_std = np.std(lush_forest_teams_total_points_arr)

print(lush_forest_teams_total_points_mean)
print(lush_forest_teams_total_points_std)

### Random forest (stark)

In [None]:
# Total team points for each entry of stark_forest_df_test_arr
# (presumably one test frame per random seed -- confirm).
stark_forest_teams_total_points_arr = []

for i in range(len(stark_forest_df_test_arr)):
    (
        stark_forest_first_team_arr,
        stark_forest_bench_arr,
        stark_forest_total_points_arr,
        stark_forest_predicted_total_points_arr,
        stark_forest_team_df_arr,
        stark_forest_transfers_arr,
    ) = construct_event_teams_from_existing(
        stark_forest_df_test_arr[i],
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    stark_forest_teams_total_points = sum(stark_forest_total_points_arr)
    stark_forest_teams_total_points_arr.append(stark_forest_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across runs.
stark_forest_teams_total_points_mean = np.mean(stark_forest_teams_total_points_arr)
stark_forest_teams_total_points_std = np.std(stark_forest_teams_total_points_arr)

print(stark_forest_teams_total_points_mean)
print(stark_forest_teams_total_points_std)

### Gradient boosted trees (lush)

In [None]:
# Total team points for each entry of lush_boost_df_test_arr
# (presumably one test frame per random seed -- confirm).
lush_boost_teams_total_points_arr = []

for i in range(len(lush_boost_df_test_arr)):
    (
        lush_boost_first_team_arr,
        lush_boost_bench_arr,
        lush_boost_total_points_arr,
        lush_boost_predicted_total_points_arr,
        lush_boost_team_df_arr,
        lush_boost_transfers_arr,
    ) = construct_event_teams_from_existing(
        lush_boost_df_test_arr[i],
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    lush_boost_teams_total_points = sum(lush_boost_total_points_arr)
    lush_boost_teams_total_points_arr.append(lush_boost_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across runs.
lush_boost_teams_total_points_mean = np.mean(lush_boost_teams_total_points_arr)
lush_boost_teams_total_points_std = np.std(lush_boost_teams_total_points_arr)

print(lush_boost_teams_total_points_mean)
print(lush_boost_teams_total_points_std)

### Gradient boosted trees (stark)

In [None]:
# Total team points for each test frame collected in stark_boost_df_test_arr
# (one frame per seed of the stark gradient-boosted model).
stark_boost_teams_total_points_arr = []

for i in range(len(stark_boost_df_test_arr)):
    (
        stark_boost_first_team_arr,
        stark_boost_bench_arr,
        stark_boost_total_points_arr,
        stark_boost_predicted_total_points_arr,
        stark_boost_team_df_arr,
        stark_boost_transfers_arr,
    ) = construct_event_teams_from_existing(
        stark_boost_df_test_arr[i],
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    stark_boost_teams_total_points = sum(stark_boost_total_points_arr)
    stark_boost_teams_total_points_arr.append(stark_boost_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across runs.
stark_boost_teams_total_points_mean = np.mean(stark_boost_teams_total_points_arr)
stark_boost_teams_total_points_std = np.std(stark_boost_teams_total_points_arr)

print(stark_boost_teams_total_points_mean)
print(stark_boost_teams_total_points_std)

### Random benchmark

In [None]:
# Start the benchmark from a copy of the test frame left by the last stark
# gradient-boosting run, so that frame itself is not modified.
random_benchmark_df_test = stark_boost_df_test.copy()

In [None]:
# Overwrite the predictions with uniform noise on [0, 7) so the benchmark
# carries no real signal. A fixed-seed RandomState keeps the draw
# reproducible across kernel restarts (the global NumPy RNG is never seeded
# in this section). The length is taken from the frame being written to.
random_benchmark_df_test['predicted_total_points'] = (
    np.random.RandomState(0).random_sample(len(random_benchmark_df_test)) * 7
)

In [None]:
# Score teams built from random predictions, averaged over 11 independent
# draws. Previously every iteration reused the same single draw, so the mean
# and std did not reflect variation across random draws.
# NOTE(review): this assumes construct_event_teams_from_existing is
# deterministic for a given frame -- confirm against its definition.
random_benchmark_teams_total_points_arr = []

for i in range(0, 11):
    # Fresh uniform noise on [0, 7) for this run; seeding with the loop index
    # keeps every run reproducible while still differing between runs.
    random_benchmark_df_test['predicted_total_points'] = (
        np.random.RandomState(i).random_sample(len(random_benchmark_df_test)) * 7
    )

    (
        random_benchmark_first_team_arr,
        random_benchmark_bench_arr,
        random_benchmark_total_points_arr,
        random_benchmark_predicted_total_points_arr,
        random_benchmark_team_df_arr,
        random_benchmark_transfers_arr,
    ) = construct_event_teams_from_existing(
        random_benchmark_df_test,
        total_budget=1050,
        verbose=0,
    )

    # Total for this run, summed over the totals the helper returned.
    random_benchmark_teams_total_points = sum(random_benchmark_total_points_arr)
    random_benchmark_teams_total_points_arr.append(random_benchmark_teams_total_points)

# Mean and standard deviation (np.std default, ddof=0) across the 11 runs.
random_benchmark_teams_total_points_mean = np.mean(random_benchmark_teams_total_points_arr)
random_benchmark_teams_total_points_std = np.std(random_benchmark_teams_total_points_arr)

print(random_benchmark_teams_total_points_mean)
print(random_benchmark_teams_total_points_std)

## Summary

In [None]:
# Test-set metrics per model. Each loss is indexed as (MAE, MSE, R^2);
# the decision-tree rows use the *_mean tuples.
# The RBF row is labelled 'Radial basis' to match its section heading.
pd.DataFrame([
    ('Linear regression', lr_test_loss[0], lr_test_loss[1], lr_test_loss[2]),
    ('Ridge regression (lush)', lush_ridge_test_loss[0], lush_ridge_test_loss[1], lush_ridge_test_loss[2]),
    ('Ridge regression (stark)', stark_ridge_test_loss[0], stark_ridge_test_loss[1], stark_ridge_test_loss[2]),
    ('Lasso regression', lasso_test_loss[0], lasso_test_loss[1], lasso_test_loss[2]),
    ('Radial basis SVM regression', rbf_svr_test_loss[0], rbf_svr_test_loss[1], rbf_svr_test_loss[2]),
    ('Linear basis SVM regression', linear_svr_test_loss[0], linear_svr_test_loss[1], linear_svr_test_loss[2]),
    ('Decision tree (lush)', lush_tree_test_loss_mean[0], lush_tree_test_loss_mean[1], lush_tree_test_loss_mean[2]),
    ('Decision tree (stark)', stark_tree_test_loss_mean[0], stark_tree_test_loss_mean[1], stark_tree_test_loss_mean[2]),
    # NOTE(review): the forest and boosted-tree rows are disabled; re-enable
    # them once those sections have been run.
#     ('Random forest (lush)', lush_forest_test_loss_mean[0], lush_forest_test_loss_mean[1], lush_forest_test_loss_mean[2]),
#     ('Random forest (stark)', stark_forest_test_loss_mean[0], stark_forest_test_loss_mean[1], stark_forest_test_loss_mean[2]),
#     ('Gradient boosted trees (lush)', lush_boost_test_loss_mean[0], lush_boost_test_loss_mean[1], lush_boost_test_loss_mean[2]),
#     ('Gradient boosted trees (stark)', stark_boost_test_loss_mean[0], stark_boost_test_loss_mean[1], stark_boost_test_loss_mean[2]),
], columns=['model', 'MAE', 'MSE', 'r2'])

In [None]:
# Total team points per model, rounded to one decimal place.
# Fixes relative to the earlier version of this table:
#   * the RBF and linear SVR labels were attached to each other's totals;
#   * 'Radius basis' is now 'Radial basis', matching the section heading;
#   * the random benchmark reports the mean over its repeated runs, like the
#     other multi-run rows, rather than only the last run.
pd.DataFrame([
    ('Random benchmark', random_benchmark_teams_total_points_mean),
    ('Linear regression', lr_teams_total_points),
    ('Ridge regression (lush)', lush_ridge_teams_total_points),
    ('Ridge regression (stark)', stark_ridge_teams_total_points),
    ('Lasso regression', lasso_teams_total_points),
    ('Radial basis SVM regression', rbf_svr_teams_total_points),
    ('Linear basis SVM regression', linear_svr_teams_total_points),
    ('Decision tree (lush)', lush_tree_teams_total_points_mean),
    ('Decision tree (stark)', stark_tree_teams_total_points_mean),
    # NOTE(review): the forest and boosted-tree rows are disabled; re-enable
    # them once those sections have been run.
#     ('Random forest (lush)', lush_forest_teams_total_points_mean),
#     ('Random forest (stark)', stark_forest_teams_total_points_mean),
#     ('Gradient boosted trees (lush)', lush_boost_teams_total_points_mean),
#     ('Gradient boosted trees (stark)', stark_boost_teams_total_points_mean),
], columns=['model', 'teams total points']).round(1)