In [None]:
# IPython shell escape: list the contents of the current working directory.
!ls

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

import json
import random
import matplotlib as pyplot
from tqdm import tqdm

import fpl21.data as data

### Generate base data set
<hr>

In [None]:
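# Earlier draft of the merge that is performed further below; kept for reference only.
# It refers to names (`player_attrs`, `team_data`) that are not defined in this notebook.
# df = df.merge(player_attrs, on='element')

# df = df.merge(team_data, left_on='opponent_team', right_index=True) \
#    .rename(columns={c: f"opp_{c}" for c in team_cols}) \
#    .merge(team_data, left_on='team', right_index=True)  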

#### Static player attributes

In [None]:
# Player attribute table from the project's data module; joined to the match
# history on the 'element' column further down.
player_attrs_df = data.create_player_attrs_df()

#### Historical match data

In [None]:
# Match history table from the project's data module; this is the left-hand
# base frame of the merged data set built further down.
history_df = data.create_history_df()

#### Team data

In [None]:
# Team statistics table; it is joined by its index against both the
# 'opponent_team' and the 'team' columns further down.
team_df = data.create_team_stats_df()
# Remember the team column names so the opponent's copy of each column can be
# renamed with an "opp_" prefix after the first join.
team_cols = list(team_df.columns)

In [None]:
# Assemble the modelling frame:
#   1. attach the player attributes to every history row (on 'element'),
#   2. attach the opponent's team stats and prefix those columns with "opp_",
#   3. attach the player's own team stats under the unprefixed names.
# Both team joins use `team_df`; the original second join referenced an
# undefined `team_data`, which raised NameError on a fresh kernel.
df = (
    history_df
    .merge(player_attrs_df, on='element')
    .merge(team_df, left_on='opponent_team', right_index=True)
    .rename(columns={c: f"opp_{c}" for c in team_cols})
    .merge(team_df, left_on='team', right_index=True)
)

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Identifier columns: kept for inspection and plotting, excluded from the predictors later on.
labels = ['element', 'web_name', 'fixture', 'round']

# Static player attributes
player_cols = ['element_type']
changing_player_cols = ['chance_of_playing_this_round', 'ep_this', 'ep_next']

# Attributes from fixture data: own-team stats plus the "opp_"-prefixed opponent stats
fixtures_cols = [
    'is_home', 'team', 'opponent_team',
    *team_cols,
    *(f"opp_{c}" for c in team_cols),
]

# Historical match data - contains outcomes and things that are correlated, e.g. number of minutes played
# Need to be lagged to use as predictors
history_cols = [
    'total_points', 'bonus', 'bps', 'minutes',
    'selected', 'transfers_in', 'transfers_out',
]

# Keep only the columns listed above, in this order.
df = df[[
    *labels,
    *player_cols,
    *changing_player_cols,
    *fixtures_cols,
    *history_cols,
]]

# Other history columns that are available but not selected:
# 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
# 'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
# 'red_cards', 'saves', 'influence', 'creativity',
# 'threat', 'ict_index', 'value', 'transfers_balance',


In [None]:
# Display the trimmed frame (display.max_rows was raised to 1000 in the setup cell).
df

In [None]:
# Distribution of the target variable.
df.total_points.hist(bins=50)

### Feature generation
<hr>

In [None]:
def rolling_avg(df, window, col, default):
    """Add a lagged per-player rolling mean of ``col`` as a new column.

    For each player (``element``) the rows are ordered by ``fixture``, the mean
    of ``col`` over the last ``window`` rows is computed, and the result is
    shifted down by one row so that a row only sees values from earlier
    fixtures. Rows without enough history get ``default``.

    Args:
        df: Frame with ``element``, ``fixture`` and ``col`` columns. The new
            column is joined back on the index, so index labels are expected
            to be unique.
        window: Number of consecutive fixtures in the rolling mean.
        col: Name of the column to average.
        default: Fill value where the lagged mean is undefined.

    Returns:
        A new frame (the input is not modified) with an extra column named
        ``avg_{col}_L{window}``.
    """
    # One groupby/transform replaces the per-player loop that grew a Series via
    # Series.append, which was quadratic and no longer exists in pandas 2.x.
    rolling = (
        df.sort_values('fixture')
        .groupby('element', sort=False)[col]
        .transform(lambda s: s.rolling(window).mean().shift(1))
        # Fill nas with default val
        .fillna(default)
        .rename(f"avg_{col}_L{window}")
    )
    # The transform keeps the original index labels, so an index join puts
    # every lagged value back on the row it was computed for.
    return df.merge(rolling, left_index=True, right_index=True)

In [None]:
# Previous outcomes can be used to predict the next values: add a one-fixture
# lag of every history column.
# Note that this cell rebinds `df`, so running it twice adds the columns twice.
for var in tqdm(history_cols):
    df = rolling_avg(df, window=1, col=var, default=0)

In [None]:
# Inspect the first rows in index order to check the newly added lag columns.
df.sort_index().head(10)

### Build model
<hr>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# only use data where rolling metrics are populated
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the unfiltered data
# (pandas SettingWithCopyWarning).
df = df[df['round'] > 1].copy()

In [None]:
# Cast the identifier-like columns to strings and fill missing availability.
# `assign` builds a new frame instead of setting attributes on a filtered
# slice, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
# NOTE(review): a missing chance_of_playing_this_round is filled with 0 -
# confirm that "missing" really means "will not play" in the source data.
df = df.assign(
    element_type=df.element_type.astype(str),
    team=df.team.astype(str),
    opponent_team=df.opponent_team.astype(str),
    chance_of_playing_this_round=df.chance_of_playing_this_round.fillna(0),
)

In [None]:
# Predictors are every column except the identifiers, the unlagged history
# columns (they contain the outcome itself) and the ep_this / ep_next columns.
# 'predicted_points' is excluded as well: a later cell adds it to `df`, and
# re-running this cell afterwards would otherwise feed the model its own
# predictions as a feature.
excluded_cols = set(labels + history_cols + ['ep_this', 'ep_next', 'predicted_points'])
covars = [name for name in df.columns if name not in excluded_cols]

X = df[covars]
y = df.total_points

In [None]:
# Check the dtype of every predictor column.
X.dtypes

In [None]:
# Show up to the first 1000 predictor rows (matches the display.max_rows setting).
X.head(1000)

In [None]:
# Custom train/test split to ensure all player observations in same set and prevent overstating test performance
# Loop as performance on test has high variance
# NOTE(review): neither `random` nor the forest is seeded in this cell, so the
# scores differ from run to run.
perf_results = []
elements = list(df.element.unique())  # identical for every repetition, so built once
for _ in tqdm(range(10)):
    # Hold out a random quarter of the players, with all of their rows.
    test_elements = random.sample(elements, len(elements) // 4)
    mask = df.element.isin(test_elements).to_numpy()

    X_train = X[~mask]
    y_train = y[~mask]
    X_test = X[mask]
    y_test = y[mask]

    train = (X_train, y_train)
    test = (X_test, y_test)
#     print(X_train.shape, X_test.shape)

    # The split criterion is left at its default (squared error). The explicit
    # criterion='mse' spelling is rejected by current scikit-learn releases.
    reg = RandomForestRegressor(min_samples_split=10)

    reg.fit(*train)
    perf_results.append((reg.score(*train), reg.score(*test)))

In [None]:
# Minimum, mean and maximum of the train and test scores over the repetitions.
pd.DataFrame(perf_results, columns=['Train', 'Test']).describe().loc[['min', 'mean', 'max']].T

In [None]:
# Feature importances of `reg`, i.e. the forest fitted in the last repetition
# of the loop above, drawn as a horizontal bar chart in ascending order.
importances = pd.DataFrame(reg.feature_importances_, index=covars, columns=['importance'])
importances.sort_values('importance').plot(kind='barh')

In [None]:
# Change Log (Test performance)

# 0.4000 - using ep_this, ep_next
# 0.1528 - baseline
# 0.1693 - more vars
# 0.1654 - new train test split to keep all player observations in one set and avoid overstating test performance
# 0.2757 - training/evaluation done on rows where rolling metrics are populated
# 0.2920 - teams labels and my team difficulty
# 0.3835 - add team goals for/against
# 0.3235 - average performance over several test sets

In [None]:
def scatter(df, x, y, noise=False):
    """Scatter plot of two columns, optionally jittered.

    Random noise makes it easier to compare continuous predictions with
    integer scores, because identical points no longer overlap.

    Args:
        df: Frame containing the columns ``x`` and ``y``. It is not modified.
        x: Name of the column on the horizontal axis.
        y: Name of the column on the vertical axis.
        noise: If True, add independent N(0, 0.2) noise to both columns.

    Returns:
        None. The plot is drawn as a side effect.
    """
    # Work on an explicit copy: adding noise to a plain column subset triggers
    # pandas' SettingWithCopyWarning.
    jitter = df[[x, y]].copy()

    if noise:
        nx = np.random.normal(0, 0.2, len(jitter))
        ny = np.random.normal(0, 0.2, len(jitter))
        jitter[x] = jitter[x] + nx
        jitter[y] = jitter[y] + ny

    jitter.plot.scatter(x, y, xlim=(-2, 20), ylim=(-2, 20), figsize=(6,6))

In [None]:
# Parameter tuning
# results = []
# for n in tqdm([2, 5, 10, 20, 50, 100, 200, 500, 1000]):
#     reg = RandomForestRegressor(n_estimators=n, min_samples_split=100)
#     reg.fit(X_train, y_train)
#     results.append((n, reg.score(*train), reg.score(*test)))

# pd.DataFrame(results, columns=['ntrees', 'train', 'test']).set_index('ntrees').plot()

In [None]:
# Score every row with `reg`, the forest fitted in the last repetition of the
# train/test loop. The rows it was trained on are scored too, so these are
# partly in-sample predictions.
df['predicted_points'] = reg.predict(df[covars])
# Distribution of the predictions.
df['predicted_points'].hist(bins=50)

In [None]:
# ep_this must be numeric before it can be plotted against the actual points.
df['ep_this'] = df['ep_this'].astype(float)

# Actual points against the model's prediction, then against ep_this.
for estimate_col in ('predicted_points', 'ep_this'):
    scatter(df, 'total_points', estimate_col, noise=True)

In [None]:
def plot_predicted_points(df, pid):
    """Bar chart of predicted vs. actual points per fixture for one player.

    Args:
        df: Frame with ``element``, ``fixture``, ``web_name``,
            ``predicted_points`` and ``total_points`` columns.
        pid: Value of ``element`` identifying the player to plot.

    Returns:
        None. The chart is drawn as a side effect. If ``df`` has no rows for
        ``pid``, a notice is printed and nothing is drawn.
    """
    player = df[df.element==pid]
    if player.empty:
        # Without this guard the title lookup below (`.iloc[0]`) raises
        # IndexError and aborts any loop over a list of player ids.
        print(f"No rows for player {pid}; nothing to plot")
        return
    player.set_index('fixture').sort_index()[['predicted_points', 'total_points']].plot(
        kind='bar', ylim=(-5, 25), title=f"{player.web_name.iloc[0]} ({pid})"
    )

# Plot the first ten distinct players, in order of first appearance in df.
first_ten_pids = df.element.unique()[:10]
for pid in first_ten_pids:
    plot_predicted_points(df, pid)

In [None]:
# Hand-picked player ids (values of `element`) to inspect individually.
selected_pids = [30, 80, 275, 262, 110, 245, 62, 272, 35, 144, 277, 359, 413, 337, 189]
for pid in selected_pids:
    plot_predicted_points(df, pid)

### Predict next fixture points