In [780]:
import pandas as pd # to read and manipulate datasets
from sklearn.ensemble import RandomForestRegressor # for random forest model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # I used mse, mae, and r2 score to evaluate my models
from pprint import pprint

# Datasets

Performing same operations on train and test set, would like to figure out how to do it once, then split them

In [781]:
df2 = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/datasets/20:21mergedGW.csv')
df3 = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/datasets/21:22mergedGW.csv')
df4 = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/datasets/22:23mergedGW.csv')
test_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/datasets/23:24mergedGW.csv')

df2 = df2.sort_values(by=['name', 'GW']).reset_index(drop=True)
df3 = df3.sort_values(by=['name', 'GW']).reset_index(drop=True)
df4 = df4.sort_values(by=['name', 'GW']).reset_index(drop=True)
test_df = test_df.sort_values(by=['name', 'GW']).reset_index(drop=True)

In [782]:
# Combine data from 20/21, 21/22, 22/23 seasons for the training set
dataframes = [ df2, df3, df4 ]
df = pd.concat( dataframes, ignore_index = True )

In [783]:
# Sort by positions for training set
gk_df = df[df['position'] == 'GK']
def_df = df[df['position'] == 'DEF']
mid_df = df[df['position'] == 'MID']
fwd_df = df[df['position'] == 'FWD']

In [784]:
# Sort by positions for test set
gk_test_df = test_df[test_df['position'] == 'GK']
def_test_df = test_df[test_df['position'] == 'DEF']
mid_test_df = test_df[test_df['position'] == 'MID']
fwd_test_df = test_df[test_df['position'] == 'FWD']

In [785]:
# Save the datasets to CSV files
gk_df.to_csv('goalkeepers.csv', index=False)
def_df.to_csv('defenders.csv', index=False)
mid_df.to_csv('midfielders.csv', index=False)
fwd_df.to_csv('forwards.csv', index=False)

In [786]:
# Save the test datasets to CSV files
gk_test_df.to_csv('goalkeepers_test.csv', index=False)
def_test_df.to_csv('defenders_test.csv', index=False)
mid_test_df.to_csv('midfielders_test.csv', index=False)
fwd_test_df.to_csv('forwards_test.csv', index=False)

# Goalkeepers

In [787]:
gk_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/goalkeepers.csv')
gk_test_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/goalkeepers_test.csv')

### Training data

In [788]:
gk_df['avg_saves'] = gk_df.groupby('name')['saves'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_ict'] = gk_df.groupby('name')['ict_index'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_pen_saves'] = gk_df.groupby('name')['penalties_saved'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_goals_conceded'] = gk_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_xP'] = gk_df.groupby('name')['xP'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_cs'] = gk_df.groupby('name')['clean_sheets'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_bps'] = gk_df.groupby('name')['bps'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_mins'] = gk_df.groupby('name')['minutes'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_df['avg_x_goals_conceded'] = gk_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).reset_index(level=0, drop=True)

gk_df['total_pen_saves'] = gk_df.groupby('name')['penalties_saved'].cumsum().shift(1).fillna(0)
gk_df['total_cs'] = gk_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
gk_df['total_starts'] = gk_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [789]:
# Create a mapping from player names to categorical codes
gk_df['name'] = gk_df['name'].astype('category')
gk_name_mapping = dict(enumerate(gk_df['name'].cat.categories))
gk_name_mapping_reverse = {v: k for k, v in gk_name_mapping.items()}
gk_df['name'] = gk_df['name'].cat.codes

gk_df['team'] = gk_df['team'].astype('category')
gk_df['team'] = gk_df['team'].cat.codes

gk_df['was_home'] = gk_df['was_home'].astype('category')
gk_df['was_home'] = gk_df['was_home'].cat.codes

gk_df = gk_df.fillna(0)

gk_df = gk_df.drop(['position','total_pen_saves','was_home','team','expected_goals_conceded','penalties_saved','bonus','own_goals','minutes','saves','kickoff_time','team_a_score','team_h_score','expected_assists','expected_goal_involvements','expected_goals','transfers_in','transfers_out','transfers_balance','fixture','assists','goals_scored','ict_index','influence','creativity','threat','penalties_missed','selected','value','goals_conceded','xP','clean_sheets','element','round','red_cards','yellow_cards','starts','bps','total_pen_saves','total_cs'],axis=1)


In [790]:
gk_df = pd.DataFrame(gk_df)
y_train = gk_df['total_points']
X_train = gk_df.drop(['total_points'],axis=1)

### Test Data

In [791]:
gk_test_df['avg_saves'] = gk_test_df.groupby('name')['saves'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_ict'] = gk_test_df.groupby('name')['ict_index'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_pen_saves'] = gk_test_df.groupby('name')['penalties_saved'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_goals_conceded'] = gk_test_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_xP'] = gk_test_df.groupby('name')['xP'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_cs'] = gk_test_df.groupby('name')['clean_sheets'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_bps'] = gk_test_df.groupby('name')['bps'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_mins'] = gk_test_df.groupby('name')['minutes'].expanding().mean().shift(1).reset_index(level=0, drop=True)
gk_test_df['avg_x_goals_conceded'] = gk_test_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).reset_index(level=0, drop=True)

gk_test_df['total_pen_saves'] = gk_test_df.groupby('name')['penalties_saved'].cumsum().shift(1).fillna(0)
gk_test_df['total_cs'] = gk_test_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
gk_test_df['total_starts'] = gk_test_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [792]:
# Create a mapping from player names to categorical codes
gk_test_df['name'] = gk_test_df['name'].astype('category')
gk_test_name_mapping = dict(enumerate(gk_test_df['name'].cat.categories))
gk_test_name_mapping_reverse = {v: k for k, v in gk_test_name_mapping.items()}
gk_test_df['name'] = gk_test_df['name'].cat.codes

gk_test_df['team'] = gk_test_df['team'].astype('category')
gk_test_df['team'] = gk_test_df['team'].cat.codes

gk_test_df['was_home'] = gk_test_df['was_home'].astype('category')
gk_test_df['was_home'] = gk_test_df['was_home'].cat.codes

gk_test_df = gk_test_df.fillna(0)

gk_test_df = gk_test_df.drop(['position','total_pen_saves','was_home','team','expected_goals_conceded','penalties_saved','bonus','own_goals','minutes','saves','kickoff_time','team_a_score','team_h_score','expected_assists','expected_goal_involvements','expected_goals','transfers_in','transfers_out','transfers_balance','fixture','assists','goals_scored','ict_index','influence','creativity','threat','penalties_missed','selected','value','goals_conceded','xP','clean_sheets','element','round','red_cards','yellow_cards','starts','bps','total_pen_saves','total_cs'],axis=1)


In [793]:
gk_test_df = pd.DataFrame(gk_test_df)
gk_y_test = gk_test_df['total_points']
gk_X_test = gk_test_df.drop(['total_points'],axis=1)

### Random Forest

In [794]:
rf = RandomForestRegressor(max_depth=4,max_features= 15,n_estimators= 500,random_state=13) # max depth helps avoid overfitting, max features helps accuracy
rf.fit(X_train, y_train)

In [795]:
train_pred = rf.predict(X_train)

gk_pred = rf.predict(gk_X_test)

In [796]:
print( 'mean squared error: ', mean_squared_error( gk_y_test, gk_pred ) )
print( 'mean absolute error: ', mean_absolute_error( gk_y_test, gk_pred ) )
print( 'r2 score: ', r2_score( gk_y_test, gk_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  2.34475070658149
mean absolute error:  0.7366842719843028
r2 score:  0.31369154189193393
-------------------------------------
train mean_squared_error :  2.9695762814882865
train mean_absolute_error :  0.8660860878311397
r2 score:  0.39691026293275833


# Defenders

In [797]:
def_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/defenders.csv')
def_test_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/defenders_test.csv')

### Training Data

In [798]:
def_df['avg_ict'] = def_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_bps'] = def_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xP'] = def_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xA'] = def_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
# def_df['avg_xG'] = def_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xGI'] = def_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xGC'] = def_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_GC'] = def_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_mins'] = def_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [799]:
def_df['total_assists'] = def_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
def_df['total_goals'] = def_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
def_df['total_cs'] = def_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
def_df['total_starts'] = def_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [800]:
def_df = def_df.drop(['position','team','starts','minutes','goals_conceded','red_cards','team_a_score','team_h_score','yellow_cards','element','assists','goals_scored','clean_sheets','penalties_missed','penalties_saved','influence','threat','round','saves','selected','threat','kickoff_time','own_goals','fixture','creativity','transfers_balance','transfers_in','transfers_out','value','ict_index','bps','bonus','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded'],axis=1)

In [801]:
# Create a mapping from player names to categorical codes
def_df['name'] = def_df['name'].astype('category')
def_name_mapping = dict(enumerate(def_df['name'].cat.categories))
def_name_mapping_reverse = {v: k for k, v in def_name_mapping.items()}
def_df['name'] = def_df['name'].cat.codes


def_df['was_home'] = def_df['was_home'].astype('category')
def_df['was_home'] = def_df['was_home'].cat.codes

In [802]:
y_train = def_df['total_points']
X_train = def_df.drop(['total_points'],axis=1)

### Test Data

In [803]:
def_test_df['avg_ict'] = def_test_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_bps'] = def_test_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_xP'] = def_test_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_xA'] = def_test_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
# def_df['avg_xG'] = def_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_xGI'] = def_test_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_xGC'] = def_test_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_GC'] = def_test_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_test_df['avg_mins'] = def_test_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [804]:
def_test_df['total_assists'] = def_test_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
def_test_df['total_goals'] = def_test_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
def_test_df['total_cs'] = def_test_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
def_test_df['total_starts'] = def_test_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [805]:
def_test_df = def_test_df.drop(['position','team','starts','minutes','goals_conceded','red_cards','team_a_score','team_h_score','yellow_cards','element','assists','goals_scored','clean_sheets','penalties_missed','penalties_saved','influence','threat','round','saves','selected','threat','kickoff_time','own_goals','fixture','creativity','transfers_balance','transfers_in','transfers_out','value','ict_index','bps','bonus','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded'],axis=1)

In [806]:
# Create a mapping from player names to categorical codes
def_test_df['name'] = def_test_df['name'].astype('category')
def_test_name_mapping = dict(enumerate(def_test_df['name'].cat.categories))
def_test_name_mapping_reverse = {v: k for k, v in def_test_name_mapping.items()}
def_test_df['name'] = def_test_df['name'].cat.codes


def_test_df['was_home'] = def_test_df['was_home'].astype('category')
def_test_df['was_home'] = def_test_df['was_home'].cat.codes

In [807]:
def_y_test = def_test_df['total_points']
def_X_test = def_test_df.drop(['total_points'],axis=1)

### Random Forest

In [808]:
def_rf = RandomForestRegressor(max_depth=4,max_features= 5,n_estimators= 500,random_state=13)
def_rf.fit(X_train,y_train)

In [809]:
train_pred = def_rf.predict( X_train )
def_pred = def_rf.predict( def_X_test )

In [810]:
print( 'mean squared error: ', mean_squared_error( def_y_test, def_pred ) )
print( 'mean absolute error: ', mean_absolute_error( def_y_test, def_pred ) )
print( 'r2 score: ', r2_score( def_y_test, def_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  4.1324775021960285
mean absolute error:  1.1564108306614356
r2 score:  0.15202940589179925
-------------------------------------
train mean_squared_error :  4.8251751575192054
train mean_absolute_error :  1.3269828788411524
r2 score:  0.16373000140743765


# Midfielders

In [811]:
mid_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/midfielders.csv')
mid_test_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/midfielders_test.csv')

### Training Data

In [812]:
mid_df['avg_ict'] = mid_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_assists'] = mid_df.groupby('name')['assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_goals'] = mid_df.groupby('name')['goals_scored'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_bps'] = mid_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xP'] = mid_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xA'] = mid_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xG'] = mid_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xGI'] = mid_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xGC'] = mid_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_GC'] = mid_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_mins'] = mid_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [813]:
mid_df['total_assists'] = mid_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
mid_df['total_goals'] = mid_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
mid_df['total_cs'] = mid_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
mid_df['total_starts'] = mid_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [814]:
# Create a mapping from player names to categorical codes
mid_df['name'] = mid_df['name'].astype('category')
mid_name_mapping = dict(enumerate(mid_df['name'].cat.categories))
mmid_name_mapping_reverse = {v: k for k, v in mid_name_mapping.items()}
mid_df['name'] = mid_df['name'].cat.codes


mid_df['was_home'] = mid_df['was_home'].astype('category')
mid_df['was_home'] = mid_df['was_home'].cat.codes

In [815]:
mid_df = mid_df.drop(['position','team','value','transfers_balance','bonus','own_goals','saves','team_a_score','team_h_score','red_cards','yellow_cards','transfers_in','transfers_out','penalties_missed','penalties_saved','clean_sheets','creativity','influence','threat','kickoff_time','fixture','round','element','starts','selected','ict_index','assists','avg_assists','goals_scored','avg_goals','bps','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded','goals_conceded','minutes'],axis=1)

In [816]:
y_train = mid_df['total_points']
X_train = mid_df.drop(['total_points'],axis=1)

### Test Data

In [817]:
mid_test_df['avg_ict'] = mid_test_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_assists'] = mid_test_df.groupby('name')['assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_goals'] = mid_test_df.groupby('name')['goals_scored'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_bps'] = mid_test_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_xP'] = mid_test_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_xA'] = mid_test_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_xG'] = mid_test_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_xGI'] = mid_test_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_xGC'] = mid_test_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_GC'] = mid_test_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_test_df['avg_mins'] = mid_test_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [818]:
mid_test_df['total_assists'] = mid_test_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
mid_test_df['total_goals'] = mid_test_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
mid_test_df['total_cs'] = mid_test_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
mid_test_df['total_starts'] = mid_test_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [819]:
# Create a mapping from player names to categorical codes
mid_test_df['name'] = mid_test_df['name'].astype('category')
mid_test_name_mapping = dict(enumerate(mid_test_df['name'].cat.categories))
mid_test_name_mapping_reverse = {v: k for k, v in mid_test_name_mapping.items()}
mid_test_df['name'] = mid_test_df['name'].cat.codes


mid_test_df['was_home'] = mid_test_df['was_home'].astype('category')
mid_test_df['was_home'] = mid_test_df['was_home'].cat.codes

In [820]:
mid_test_df = mid_test_df.drop(['position','team','value','transfers_balance','bonus','own_goals','saves','team_a_score','team_h_score','red_cards','yellow_cards','transfers_in','transfers_out','penalties_missed','penalties_saved','clean_sheets','creativity','influence','threat','kickoff_time','fixture','round','element','starts','selected','ict_index','assists','avg_assists','goals_scored','avg_goals','bps','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded','goals_conceded','minutes'],axis=1)

In [821]:
mid_y_test = mid_test_df['total_points']
mid_X_test = mid_test_df.drop(['total_points'],axis=1)

### Random Forest

In [822]:
mid_rf = RandomForestRegressor(max_depth=4,max_features= 10,n_estimators= 100,random_state=13)
mid_rf.fit( X_train, y_train )

In [823]:
train_pred = mid_rf.predict( X_train )
mid_pred = mid_rf.predict( mid_X_test )

In [824]:
print( 'mean squared error: ', mean_squared_error( mid_pred, mid_y_test ) )
print( 'mean absolute error: ', mean_absolute_error( mid_pred, mid_y_test ) )
print( 'r2 score: ', r2_score( mid_y_test, mid_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  4.101862088424853
mean absolute error:  1.0356376471041664
r2 score:  0.268079842385448
-------------------------------------
train mean_squared_error :  4.440217403033684
train mean_absolute_error :  1.1719371058354284
r2 score:  0.24069940045275862


# Forwards

In [825]:
fwd_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/forwards.csv')
fwd_test_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/forwards_test.csv')

### Training Data

In [826]:
fwd_df['avg_ict'] = fwd_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_assists'] = fwd_df.groupby('name')['assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_goals'] = fwd_df.groupby('name')['goals_scored'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_bps'] = fwd_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xP'] = fwd_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xA'] = fwd_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xG'] = fwd_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xGI'] = fwd_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_mins'] = fwd_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [827]:
fwd_df['total_assists'] = fwd_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
fwd_df['total_goals'] = fwd_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
fwd_df['total_starts'] = fwd_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [828]:
# Create a mapping from player names to categorical codes
fwd_df['name'] = fwd_df['name'].astype('category')
fwd_name_mapping = dict(enumerate(fwd_df['name'].cat.categories))
fwd_name_mapping_reverse = {v: k for k, v in fwd_name_mapping.items()}
fwd_df['name'] = fwd_df['name'].cat.codes


fwd_df['was_home'] = fwd_df['was_home'].astype('category')
fwd_df['was_home'] = fwd_df['was_home'].cat.codes

In [829]:
fwd_df = fwd_df.drop(['position','team','value','transfers_balance','bonus','own_goals','saves','team_a_score','team_h_score','red_cards','yellow_cards','transfers_in','transfers_out','penalties_missed','penalties_saved','clean_sheets','creativity','influence','threat','kickoff_time','fixture','round','element','starts','selected','ict_index','assists','avg_assists','goals_scored','avg_goals','bps','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded','goals_conceded','minutes'],axis=1)

In [830]:
y_train = fwd_df['total_points']
X_train = fwd_df.drop(['total_points'],axis=1)

### Test Data

In [831]:
fwd_test_df['avg_ict'] = fwd_test_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_assists'] = fwd_test_df.groupby('name')['assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_goals'] = fwd_test_df.groupby('name')['goals_scored'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_bps'] = fwd_test_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_xP'] = fwd_test_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_xA'] = fwd_test_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_xG'] = fwd_test_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_xGI'] = fwd_test_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_test_df['avg_mins'] = fwd_test_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [832]:
fwd_test_df['total_assists'] = fwd_test_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
fwd_test_df['total_goals'] = fwd_test_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
fwd_test_df['total_starts'] = fwd_test_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [833]:
# Create a mapping from player names to categorical codes
fwd_test_df['name'] = fwd_test_df['name'].astype('category')
fwd_test_name_mapping = dict(enumerate(fwd_test_df['name'].cat.categories))
fwd_test_name_mapping_reverse = {v: k for k, v in fwd_test_name_mapping.items()}
fwd_test_df['name'] = fwd_test_df['name'].cat.codes


fwd_test_df['was_home'] = fwd_test_df['was_home'].astype('category')
fwd_test_df['was_home'] = fwd_test_df['was_home'].cat.codes

In [834]:
fwd_test_df = fwd_test_df.drop(['position','team','value','transfers_balance','bonus','own_goals','saves','team_a_score','team_h_score','red_cards','yellow_cards','transfers_in','transfers_out','penalties_missed','penalties_saved','clean_sheets','creativity','influence','threat','kickoff_time','fixture','round','element','starts','selected','ict_index','assists','avg_assists','goals_scored','avg_goals','bps','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded','goals_conceded','minutes'],axis=1)

In [835]:
fwd_y_test = fwd_test_df['total_points']
fwd_X_test = fwd_test_df.drop(['total_points'],axis=1)

### Random Forest

In [836]:
fwd_rf = RandomForestRegressor(max_depth=4,max_features= 10,n_estimators= 100,random_state=13)
fwd_rf.fit( X_train, y_train )

In [837]:
train_pred = fwd_rf.predict(X_train)
fwd_pred = fwd_rf.predict(fwd_X_test)

In [838]:
print( 'mean squared error: ', mean_squared_error( fwd_pred, fwd_y_test ) )
print( 'mean absolute error: ', mean_absolute_error( fwd_pred, fwd_y_test ) )
print( 'r2 score: ', r2_score( fwd_y_test, fwd_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  4.758184806516996
mean absolute error:  1.1241664235702808
r2 score:  0.2546661027177164
-------------------------------------
train mean_squared_error :  5.25805922761095
train mean_absolute_error :  1.2845260645206846
r2 score:  0.25712588529842295


# Lookup Table

In [844]:
pos = input("Select a position( 'gk', 'def', 'mid', 'fwd' ): ")
if pos == 'gk':
    pprint( gk_test_name_mapping )
    id = int(input("Select an id for a player: "))
    gw = int(input("Enter a game week 1-38: "))
    index = gk_X_test.loc[(gk_X_test['name'] == id) & (gk_X_test['GW'] == gw)].index
    print( gk_test_name_mapping[id], "predicted points for gameweek ", gw, ": ", gk_pred[index] )
    print( gk_test_name_mapping[id], "actual points for gameweek ", gw, ": ", gk_y_test[index] )
elif pos == 'def':
    pprint( def_test_name_mapping )
    id = int(input("Select an id for a player: "))
    gw = int(input("Enter a game week 1-38: "))
    index = def_X_test.loc[(def_X_test['name'] == id) & (def_X_test['GW'] == gw)].index
    print( def_test_name_mapping[id], "predicted points for gameweek ", gw, ": ", def_pred[index] )
    print( def_test_name_mapping[id], "actual points for gameweek ", gw, ": ", def_y_test[index] )

elif pos == 'mid':
    pprint( mid_test_name_mapping )
    id = int(input("Select an id for a player: "))
    gw = int(input("Enter a game week 1-38: "))
    index = mid_X_test.loc[(mid_X_test['name'] == id) & (mid_X_test['GW'] == gw)].index
    print( mid_test_name_mapping[id], "predicted points for gameweek ", gw, ": ", mid_pred[index] )
    print( mid_test_name_mapping[id], "actual points for gameweek ", gw, ": ", mid_y_test.iloc[index] )

elif pos == 'fwd':
    pprint( fwd_test_name_mapping )
    id = int(input("Select an id for a player: "))
    gw = int(input("Enter a game week 1-38: "))
    index = fwd_X_test.loc[(mid_X_test['name'] == id) & (fwd_X_test['GW'] == gw)].index
    print( fwd_test_name_mapping[id], "predicted points for gameweek ", gw, ": ", fwd_pred[index] )
    print( fwd_test_name_mapping[id], "actual points for gameweek ", gw, ": ", fwd_y_test.iloc[index] )


Select a position( 'gk', 'def', 'mid', 'fwd' ):  fwd


{0: 'Aaron Connolly',
 1: 'Ademola Ola-Adebomi',
 2: 'Admiral Muskwe',
 3: 'Alejo Véliz',
 4: 'Aleksandar Mitrović',
 5: 'Alexander Isak',
 6: 'Andi Zeqiri',
 7: 'Anthony Martial',
 8: 'Antoine Semenyo',
 9: 'Antwoine Hackford',
 10: 'Aribim Pepple',
 11: 'Armando Broja',
 12: 'Ben Parkinson',
 13: 'Billy Blacker',
 14: 'Bénie Traoré',
 15: 'Callum Marshall',
 16: 'Callum Wilson',
 17: 'Cameron Archer',
 18: 'Carlos Vinícius Alves Morais',
 19: 'Carlton Morris',
 20: 'Cauley Woodrow',
 21: 'Chiedozie Ogbene',
 22: 'Chris Wood',
 23: 'Christopher Nkunku',
 24: 'Cody Gakpo',
 25: 'Dane Scarlett',
 26: 'Daniel Jebbison',
 27: 'Danny Ings',
 28: 'Danny Welbeck',
 29: 'Darwin Núñez Ribeiro',
 30: 'David Datro Fofana',
 31: 'Deivid Washington de Souza Eugênio',
 32: 'Deniz Undav',
 33: 'Detlef Esapa Osong',
 34: 'Divin Mubama',
 35: 'Divock Origi',
 36: 'Dominic Calvert-Lewin',
 37: 'Dominic Solanke',
 38: 'Eddie Nketiah',
 39: 'Elijah Adebayo',
 40: 'Ellis Simms',
 41: 'Emmanuel Dennis',
 4

Select an id for a player:  43
Enter a game week 1-38:  10


Erling Haaland predicted points for gameweek  10 :  [4.91842507]
Erling Haaland actual points for gameweek  10 :  1492    16
Name: total_points, dtype: int64
