In [None]:
# IPython shell escape: list the current working directory's contents.
!ls

In [None]:
import json
import pandas as pd
import matplotlib as pyplot
from tqdm import tqdm
import random
import numpy as np

In [None]:
# Raise both pandas display limits to 1000 so that wide/long frames render untruncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [None]:
from fpl21.utils import pp

### Generate base data set
<hr>

In [None]:
# Load the players dump from the working directory.
# JSON text is UTF-8, so name the encoding explicitly instead of relying on
# the platform's locale default (which is not UTF-8 everywhere).
with open("players_list.json", "r", encoding="utf-8") as f:
    print("loading players list from file")
    players_list = json.load(f)


In [None]:
# Each entry of the players list holds player attributes, the fixtures list,
# historical fixtures and previous-season performance.
# Show the keys of the first entry to see what is available.
players_list[0].keys()

In [None]:
# 'history' holds previous fixtures with stats; 'fixtures' is forward looking.
first_player = players_list[0]
print(len(first_player['history']))
print(len(first_player['fixtures']))

In [None]:
# Lookup table of per-player attributes, keyed by the player's element id.
attr_rows = [
    (player['id'], player['web_name'], player['team'])
    for player in players_list
]
player_attrs = pd.DataFrame(attr_rows, columns=['element', 'name', 'team'])
#player_teams

In [None]:
# One history list per player, flattened into a single frame with one row
# per history entry.
hist = [p['history'] for p in players_list]
history_rows = []
for player_history in hist:
    history_rows.extend(player_history)
df = pd.DataFrame(history_rows)

In [None]:
# Attach name and team to every history row (inner join on the element id).
df = pd.merge(df, player_attrs, on='element')

In [None]:
# Inspect which columns are available on the history frame.
df.columns

In [None]:
# Preview the first rows of the history frame.
df.head()

In [None]:
# Naive version of team difficulty: average points per game, i.e. the
# team's summed total_points divided by its number of distinct fixtures.
by_team = df.groupby('team')
team_difficulty = (
    by_team.total_points.sum() / by_team.fixture.nunique()
).rename('team_difficulty')

In [None]:
#team_difficulty

In [None]:
# Look up the difficulty of the team each row was played against, then
# rename the joined column to make clear it describes the opponent.
with_difficulty = pd.merge(
    df, team_difficulty, left_on='opponent_team', right_index=True
)
df = with_difficulty.rename(columns={'team_difficulty': 'opponent_difficulty'})

In [None]:
# Preview the first rows again to check the opponent_difficulty column.
df.head()

In [None]:
# Restrict columns for initial development
base_columns = [
    'element',
    'name',
    'fixture',
    'team',
    'opponent_team',
    'was_home',
    'opponent_difficulty',
    'minutes',
    'total_points',
]
df = df[base_columns]

# Others
# 'opponent_team', 'kickoff_time', 'team_h_score', 'team_a_score', 'round', 'minutes',
# 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
# 'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
# 'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
# 'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
# 'transfers_in', 'transfers_out'

In [None]:
# Display the column-restricted frame.
df

In [None]:
# Distribution of the target variable.
df['total_points'].hist(bins=50)

### Feature generation
<hr>

In [None]:
def rolling_avg(df, window, col, default):
    """Add a lagged, per-player rolling mean of ``col`` as a new column.

    For every distinct ``element`` the rows are ordered by ``fixture``, the
    mean of ``col`` over ``window`` consecutive rows is computed, and the
    result is shifted by one row so that a row's feature is built only from
    rows sorted before it.

    Args:
        df: Frame with ``element``, ``fixture`` and ``col`` columns. The
            feature is joined back on the index, so index labels are
            expected to be unique.
        window: Number of consecutive rows averaged.
        col: Name of the column to average.
        default: Fill value for rows that have no complete preceding window.

    Returns:
        A new frame: ``df`` plus one column named ``avg_{col}_L{window}``.
    """
    # Series.append was removed in pandas 2.0 (and copied the accumulator on
    # every call), so gather the per-player pieces and concatenate them once.
    pieces = [
        player_rows.sort_values('fixture')[col].rolling(window).mean().shift(1)
        for _, player_rows in df.groupby('element', sort=False)
    ]
    # pd.concat rejects an empty list; fall back to an empty float series.
    rolling = pd.concat(pieces) if pieces else pd.Series(dtype='float64')

    # Fill nas with default val
    rolling = rolling.fillna(default)

    rolling.name = f"avg_{col}_L{window}"
    return df.merge(rolling, left_index=True, right_index=True)

In [None]:
# Lagged rolling means over 1 and 2 rows for each source column, paired with
# the fill value used where no full window of earlier rows exists.
for source_col, fill_value in (('total_points', 0), ('minutes', 45)):
    for window in range(1, 3):
        df = rolling_avg(df, window, source_col, fill_value)

In [None]:
# Show the first ten rows in index order to eyeball the new rolling-average columns.
df.sort_index().head(10)

### Build model
<hr>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs (covars) and the target to predict.
covars = [
    # 'team', 'opponent_team',  # Need to encode these
    'was_home',
    'opponent_difficulty',
    'avg_total_points_L1',
    'avg_total_points_L2',
    'avg_minutes_L1',
    'avg_minutes_L2',
]

X, y = df[covars], df.total_points

In [None]:
# Hold out 33% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# (features, target) pairs, convenient for star-unpacking into fit/score.
train, test = (X_train, y_train), (X_test, y_test)

In [None]:
# Seed the forest (same seed as the train/test split) so that the scores and
# feature importances are reproducible across kernel restarts.
reg = RandomForestRegressor(min_samples_split=100, random_state=42)

In [None]:
# Fit on the training split only.
reg.fit(X_train, y_train)

In [None]:
# Horizontal bar chart of the fitted feature importances, sorted ascending
# by importance.
importance = pd.DataFrame({'importance': reg.feature_importances_}, index=covars)
importance.sort_values('importance').plot(kind='barh')

In [None]:
# Report the regressor's score on the training and held-out splits.
train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)
print(f"Train {train_score}")
print(f" Test {test_score}")

#### Results

| Name | Params | Other | Test Performance |
| --- | --- | --- | --- |
| Baseline | min_samples_split=100 | | 0.1528 | 
| | add avg_minutes_L1 L2 | | 0.1693 |


| Name | Vars |
| --- | --- |
| Baseline | 'was_home', 'opponent_difficulty', 'avg_total_points_L1', 'avg_total_points_L2' |


In [None]:
# Parameter tuning
# results = []
# for n in tqdm([2, 5, 10, 20, 50, 100, 200, 500, 1000]):
#     reg = RandomForestRegressor(n_estimators=n, min_samples_split=100)
#     reg.fit(X_train, y_train)
#     results.append((n, reg.score(*train), reg.score(*test)))

# pd.DataFrame(results, columns=['ntrees', 'train', 'test']).set_index('ntrees').plot()

### Diagnostics

In [None]:
# Predict for every row of X (the training rows as well as the held-out
# ones) and store the result next to the actual points.
predicted = reg.predict(X[covars])
df['predicted_points'] = predicted

In [None]:
# Actual vs. predicted points. Gaussian noise is added to the actual values
# only to spread out overlapping markers on the scatter plot.
noise = np.random.normal(0, 0.2, len(df))
# Work on an explicit copy: modifying a column slice of df in place triggers
# pandas' SettingWithCopyWarning.
jitter = df[['total_points', 'predicted_points']].copy()
jitter['total_points'] += noise
jitter.plot.scatter('total_points', 'predicted_points')

In [None]:
def plot_predicted_points(df, pid):
    """Draw a bar chart of predicted vs. actual points per fixture for one player.

    Args:
        df: Frame with ``element``, ``name``, ``fixture``,
            ``predicted_points`` and ``total_points`` columns.
        pid: ``element`` value of the player to plot.
    """
    player = df[df.element == pid]
    # Title is the name from the player's first row, followed by the id.
    title = f"{player['name'].iloc[0]} ({pid})"
    per_fixture = player.set_index('fixture').sort_index()
    per_fixture[['predicted_points', 'total_points']].plot(
        kind='bar', ylim=(-5, 25), title=title
    )

# One chart for each of the first ten distinct player ids in the frame.
first_player_ids = df.element.unique()[:10]
for pid in first_player_ids:
    plot_predicted_points(df, pid)