# FPL Prices Prediction Model

## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge

'season_x', 'name', 'position', 'team_x', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'opp_team_name', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW'

In [2]:
# Columns kept from the merged-seasons CSV; every per-position subset used
# later in the notebook is drawn from this list.
common_features = ['season_x', 'name', 'position', 'assists',
       'clean_sheets', 'creativity', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'minutes',
       'own_goals', 
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'threat', 'total_points',
       'value',
       'yellow_cards']
# usecols: parse only the columns that are actually kept.
# low_memory=False: infer each dtype from the whole column instead of per
# chunk (the previous run of this cell emitted a warning from read_csv,
# presumably the mixed-dtype one - TODO confirm).
# The trailing [common_features] restores the list's column order, because
# usecols returns columns in file order.
cleaned_players = pd.read_csv(
    "./clean_data/cleaned_merged_seasons.csv",
    usecols=common_features,
    low_memory=False,
)[common_features]

  cleaned_players = pd.read_csv("./clean_data/cleaned_merged_seasons.csv")


In [3]:
# Columns retained for each position. Every list starts with the same three
# identifier columns, followed by the statistics kept for that position.
id_columns = ['season_x', 'name', 'position']

# Goalkeepers
gk_features = id_columns + [
    'clean_sheets',
    'goals_conceded',
    'influence',
    'minutes',
    'own_goals',
    'penalties_saved',
    'round',
    'saves',
    'selected',
    'total_points',
    'value',
]

# Defenders
defender_features = id_columns + [
    'assists',
    'clean_sheets',
    'ict_index',
    'goals_conceded',
    'goals_scored',
    'influence',
    'minutes',
    'own_goals',
    'red_cards',
    'round',
    'saves',
    'selected',
    'threat',
    'total_points',
    'value',
    'yellow_cards',
]

# Midfielders
mid_features = id_columns + [
    'assists',
    'creativity',
    'ict_index',
    'goals_scored',
    'influence',
    'minutes',
    'red_cards',
    'round',
    'selected',
    'threat',
    'total_points',
    'value',
    'yellow_cards',
]

# Forwards
fwd_features = id_columns + [
    'assists',
    'creativity',
    'ict_index',
    'goals_scored',
    'influence',
    'minutes',
    'round',
    'selected',
    'threat',
    'total_points',
    'value',
]

In [4]:
# Split the player rows into one frame per position, keeping only that
# position's columns. Each frame is an explicit copy: later cells add columns
# and sort/drop in place, and doing that on a slice of cleaned_players raises
# SettingWithCopyWarning and has ambiguous view-vs-copy semantics.
gks = cleaned_players.loc[cleaned_players['position'] == 'GK', gk_features].copy()
defenders = cleaned_players.loc[cleaned_players['position'] == 'DEF', defender_features].copy()
mids = cleaned_players.loc[cleaned_players['position'] == 'MID', mid_features].copy()
fwds = cleaned_players.loc[cleaned_players['position'] == 'FWD', fwd_features].copy()
# Order matters: later cells address the frames by index (0=GK, 1=DEF, 2=MID, 3=FWD).
ds_list = [gks, defenders, mids, fwds]

In [5]:
# Prepare every per-position frame in place (ds_list keeps the same objects).
for ds in ds_list:
    # Integer id per player name; codes are assigned independently within
    # each position frame, so ids are not comparable across frames.
    ds['player_id'] = ds['name'].astype('category').cat.codes
    # Numeric season = the part after the '-' in season_x
    # (presumably 'YYYY-YY' strings, giving e.g. 22 - TODO confirm format).
    ds['season'] = ds['season_x'].str.split('-').str[1].astype('int64')
    # Chronological order within each (season, player) so that the later
    # group-wise shift/rolling operations see consecutive rounds.
    ds.sort_values(by=['season', 'player_id', 'round'], inplace=True)
    # The string identifiers are replaced by the numeric columns above.
    ds.drop(columns=['name', 'season_x', 'position'], inplace=True)

In [6]:
def update_ds(df, n):
    """Add the price-change target and aggregate stats over an n-row horizon.

    The frame is modified IN PLACE and also returned, so callers may either
    ignore the return value or rebind it.

    Within each (season, player_id) group, using the existing row order:

    * ``next_game_value`` is ``value`` taken n rows ahead (NaN for the last
      n rows of the group).
    * ``value_change`` is ``next_game_value - value``.
    * every column other than 'round', 'player_id', 'season' and 'value'
      that exists when the function is called is replaced by the sum of
      that column over the current row and the following n - 1 rows of the
      same group. Near the end of a group the window is shorter and the sum
      covers the rows that remain. The replaced columns become float.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'season', 'player_id' and 'value'.
    n : int
        Horizon in rows; must be >= 1. With n == 1 the statistic columns
        keep their values (cast to float).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    group_keys = ['season', 'player_id']
    # Captured before the target columns are added so they are never summed.
    stats_columns = [col for col in df.columns if col not in ['round', 'player_id', 'season', 'value']]

    df['next_game_value'] = df.groupby(group_keys)['value'].shift(-n)
    df['value_change'] = df['next_game_value'] - df['value']

    def forward_sum(series):
        # A trailing rolling sum over the reversed series is a forward-looking
        # sum over the original one. Unlike shift(-n + 1).rolling(n), this
        # also includes the current row for the first n - 1 rows of a group.
        return series.iloc[::-1].rolling(window=n, min_periods=1).sum().iloc[::-1]

    for col in stats_columns:
        # transform() returns a result aligned to df's index on every pandas
        # version, so no version-dependent reset_index on group keys is needed.
        df[col] = df.groupby(group_keys)[col].transform(forward_sum)
    return df

In [None]:
# Build the one-game-ahead target for every position frame. update_ds mutates
# each frame in place and returns that same frame; rebuilding ds_list from the
# return values makes the data flow explicit instead of rebinding a loop
# variable (which has no effect on the list).
ds_list = [update_ds(ds, 1) for ds in ds_list]


In [11]:
# Summary statistics of the defenders frame (ds_list is [gks, defenders, mids, fwds]).
ds_list[1].describe()

Unnamed: 0,assists,clean_sheets,ict_index,goals_conceded,goals_scored,influence,minutes,own_goals,red_cards,round,saves,selected,threat,total_points,value,yellow_cards,player_id,season,next_game_value,value_change
count,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0,42151.0
mean,0.025385,0.103177,1.356338,0.533321,0.016607,7.455937,33.736471,0.003321,0.002538,20.085075,0.0,193987.3,2.732652,1.222343,45.809993,0.062798,266.709924,21.7034,45.782188,-0.027805
std,0.165668,0.304193,2.22198,1.020234,0.130186,11.544046,42.112434,0.057947,0.05032,10.71944,0.0,486134.2,7.611227,2.436594,6.054519,0.242602,155.312448,2.142303,6.06624,0.477573
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-7.0,37.0,0.0,0.0,17.0,37.0,-10.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,6596.0,0.0,0.0,42.0,0.0,133.0,21.0,42.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,29168.0,0.0,0.0,45.0,0.0,268.0,22.0,44.0,0.0
75%,0.0,0.0,2.2,1.0,0.0,14.0,90.0,0.0,0.0,29.0,0.0,153790.5,1.0,1.0,48.0,0.0,402.0,23.0,48.0,0.0
max,3.0,1.0,19.4,9.0,2.0,92.4,90.0,2.0,1.0,38.0,0.0,7793322.0,104.0,21.0,86.0,1.0,535.0,24.0,86.0,10.0


In [8]:
# Model input columns per position. These rebind the earlier *_features names:
# the identifier columns are gone and 'value' is deliberately left out of the
# inputs.
gk_features = [
    'clean_sheets', 'goals_conceded', 'influence', 'minutes', 'own_goals',
    'penalties_saved', 'round', 'saves', 'selected', 'total_points',
]

defender_features = [
    'assists', 'clean_sheets', 'ict_index', 'goals_conceded', 'goals_scored',
    'influence', 'minutes', 'own_goals', 'red_cards', 'round',
    'saves', 'selected', 'threat', 'total_points', 'yellow_cards',
]

mid_features = [
    'assists', 'creativity', 'ict_index', 'goals_scored', 'influence',
    'minutes', 'red_cards', 'round', 'selected', 'threat',
    'total_points', 'yellow_cards',
]

fwd_features = [
    'assists', 'creativity', 'ict_index', 'goals_scored', 'influence',
    'minutes', 'round', 'selected', 'threat', 'total_points',
]

In [9]:
pos_list = ['gk', 'def', 'mid', 'fwd']
ds_features = [gk_features, defender_features, mid_features, fwd_features]

# Fit one Ridge model per position on a random 75/25 split and report the
# mean absolute error of the predicted value_change on the held-out rows.
for i, (pos, features) in enumerate(zip(pos_list, ds_features)):
    ds = ds_list[i]
    # Drops rows with any NaN from the stored frame itself (e.g. rows with no
    # next-game value), so ds_list stays NaN-free after this cell.
    ds.dropna(inplace=True)
    X = ds[features]
    y = ds["value_change"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    model = Ridge()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(pos, " absolute error: ", mean_absolute_error(y_test, y_pred))

gk  absolute error:  0.081826114424121
def  absolute error:  0.1365800414430101
mid  absolute error:  0.14748202227653653
fwd  absolute error:  0.1991280779519458
