In [113]:
import pandas as pd # to read and manipulate datasets
import matplotlib.pyplot as plt # to plot graphs for results
from sklearn.model_selection import train_test_split # to split data
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor # for random forest model
from sklearn.neural_network import MLPRegressor # for neural network model
from sklearn.model_selection import GridSearchCV # used grid search to find the best hyperparameters
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # I used mse, mae, and r2 score to evaluate my models

In [114]:
df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/datasets/23:24mergedGW.csv')

In [115]:
gk_df = df[df['position'] == 'GK']
def_df = df[df['position'] == 'DEF']
mid_df = df[df['position'] == 'MID']
fwd_df = df[df['position'] == 'FWD']


# Sort each DataFrame by 'name' and 'gameweek'
gk_df = gk_df.sort_values(by=['name', 'GW']).reset_index(drop=True)
def_df = def_df.sort_values(by=['name', 'GW']).reset_index(drop=True)
mid_df = mid_df.sort_values(by=['name', 'GW']).reset_index(drop=True)
fwd_df = fwd_df.sort_values(by=['name', 'GW']).reset_index(drop=True)

In [116]:
# Save the datasets to CSV files
gk_df.to_csv('goalkeepers.csv', index=False)
def_df.to_csv('defenders.csv', index=False)
mid_df.to_csv('midfielders.csv', index=False)
fwd_df.to_csv('forwards.csv', index=False)

# Goalkeepers

In [117]:
df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/goalkeepers.csv')

In [118]:
df['avg_saves'] = df.groupby('name')['saves'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_ict'] = df.groupby('name')['ict_index'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_pen_saves'] = df.groupby('name')['penalties_saved'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_goals_conceded'] = df.groupby('name')['goals_conceded'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_xP'] = df.groupby('name')['xP'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_cs'] = df.groupby('name')['clean_sheets'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_bps'] = df.groupby('name')['bps'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_mins'] = df.groupby('name')['minutes'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df['avg_x_goals_conceded'] = df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).reset_index(level=0, drop=True)

df['total_pen_saves'] = df.groupby('name')['penalties_saved'].cumsum().shift(1).fillna(0)
df['total_cs'] = df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
df['total_starts'] = df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [119]:
# Create a mapping from player names to categorical codes
df['name'] = df['name'].astype('category')
name_mapping = dict(enumerate(df['name'].cat.categories))
name_mapping_reverse = {v: k for k, v in name_mapping.items()}
df['name'] = df['name'].cat.codes

df['team'] = df['team'].astype('category')
df['team'] = df['team'].cat.codes

df['was_home'] = df['was_home'].astype('category')
df['was_home'] = df['was_home'].cat.codes

df = df.fillna(0)

df = df.drop(['position','total_pen_saves','was_home','team','expected_goals_conceded','penalties_saved','bonus','own_goals','minutes','saves','kickoff_time','team_a_score','team_h_score','expected_assists','expected_goal_involvements','expected_goals','transfers_in','transfers_out','transfers_balance','fixture','assists','goals_scored','ict_index','influence','creativity','threat','penalties_missed','selected','value','goals_conceded','xP','clean_sheets','element','round','red_cards','yellow_cards','starts','bps','total_pen_saves','total_cs'],axis=1)


In [120]:
df = pd.DataFrame(df)
y = df['total_points']
X = df.drop(['total_points'],axis=1)
print(X.columns)

Index(['name', 'opponent_team', 'GW', 'avg_saves', 'avg_ict', 'avg_pen_saves',
       'avg_goals_conceded', 'avg_xP', 'avg_cs', 'avg_bps', 'avg_mins',
       'avg_x_goals_conceded', 'total_starts'],
      dtype='object')


In [121]:
X_train, X_temp, y_train, y_temp = train_test_split( X, y, test_size = 0.2, random_state = 0 ) # 80/10/10 split
X_test, X_val, y_test, y_val = train_test_split( X_temp, y_temp, test_size = 0.5, random_state = 0 )

In [122]:
rf = RandomForestRegressor(max_depth=4,max_features= 15,n_estimators= 500,random_state=13) # max depth helps avoid overfitting, max features helps accuracy
rf.fit(X_train, y_train)

In [123]:
train_pred = rf.predict(X_train)

rf_pred = rf.predict(X_test)

In [124]:
# # Define the hyperparameters and their possible values
# param_grid = {
#     'n_estimators': [50, 100, 500,1000,2000],
#     'max_depth': [None, 5, 10, 20, 30],
#     'max_features': [2, 5, 10, 15],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# # Set up the GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
#                            cv=5, n_jobs=-1, verbose=2)

# # Fit the model to the data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters
# print(f"Best Hyperparameters: {grid_search.best_params_}")


In [125]:
# print("Best Hyperparameters: ", grid_search.best_params_)

In [126]:
print( 'mean squared error: ', mean_squared_error( rf_pred, y_test ) )
print( 'mean absolute error: ', mean_absolute_error( rf_pred, y_test ) )
print( 'r2 score: ', r2_score( y_test, rf_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  2.703535366959296
mean absolute error:  0.7406197867559018
r2 score:  0.32084625748181195
-------------------------------------
train mean_squared_error :  1.8555332769838273
train mean_absolute_error :  0.5946861156524287
r2 score:  0.45930477394137503


In [127]:
# # Create a DataFrame for the test set with predictions
# test_results = X_test.copy()
# test_results['predicted_points'] = rf_pred
# test_results['actual_points'] = y_test

# # Convert categorical codes back to player names
# test_results['name'] = test_results['name'].map(name_mapping)

# # Select relevant columns and rename them
# final_df = test_results[['name', 'GW', 'predicted_points', 'actual_points']]
# final_df.columns = ['Player Name', 'Gameweek', 'Predicted Points', 'Actual Points']

# # Set display options to show all rows and columns
# pd.set_option('display.max_rows', None)  # show all rows
# pd.set_option('display.max_columns', None)  # show all columns

# # Print the DataFrame
# # print(final_df)

# # Reset display options to default (if needed)
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

# Defenders

In [128]:
def_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/defenders.csv')

In [129]:
def_df['avg_ict'] = def_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_bps'] = def_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xP'] = def_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xA'] = def_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
# def_df['avg_xG'] = def_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xGI'] = def_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_xGC'] = def_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_GC'] = def_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
def_df['avg_mins'] = def_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [130]:
def_df['total_assists'] = def_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
def_df['total_goals'] = def_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
def_df['total_cs'] = def_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
def_df['total_starts'] = def_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [131]:
def_df = def_df.drop(['position','team','starts','minutes','goals_conceded','red_cards','team_a_score','team_h_score','yellow_cards','element','assists','goals_scored','clean_sheets','penalties_missed','penalties_saved','influence','threat','round','saves','selected','threat','kickoff_time','own_goals','fixture','creativity','transfers_balance','transfers_in','transfers_out','value','ict_index','bps','bonus','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded'],axis=1)

In [132]:
# Create a mapping from player names to categorical codes
def_df['name'] = def_df['name'].astype('category')
name_mapping = dict(enumerate(def_df['name'].cat.categories))
name_mapping_reverse = {v: k for k, v in name_mapping.items()}
def_df['name'] = def_df['name'].cat.codes


def_df['was_home'] = def_df['was_home'].astype('category')
def_df['was_home'] = def_df['was_home'].cat.codes

In [134]:
y = def_df['total_points']
X = def_df.drop(['total_points'],axis=1)

Index(['name', 'opponent_team', 'was_home', 'GW', 'avg_ict', 'avg_bps',
       'avg_xP', 'avg_xA', 'avg_xGI', 'avg_xGC', 'avg_GC', 'avg_mins',
       'total_assists', 'total_goals', 'total_cs', 'total_starts'],
      dtype='object')


In [135]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 0 ) # 80/20

In [136]:
def_rf = RandomForestRegressor(max_depth=4,max_features= 5,n_estimators= 500,random_state=13)

In [137]:
def_rf.fit(X_train,y_train)

In [138]:
train_pred = def_rf.predict( X_train )
test_pred = def_rf.predict( X_test )

In [139]:
print( 'mean squared error: ', mean_squared_error( test_pred, y_test ) )
print( 'mean absolute error: ', mean_absolute_error( test_pred, y_test ) )
print( 'r2 score: ', r2_score( y_test, test_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  4.7918830396090595
mean absolute error:  1.1354472946173524
r2 score:  0.1620507169619163
-------------------------------------
train mean_squared_error :  3.7271100026075716
train mean_absolute_error :  1.0516883115482882
r2 score:  0.20032452977495296


In [140]:
# # Get feature importances
# feature_importances = def_rf.feature_importances_

# feature_names = [ 'name','opponent_team','was_home','GW','avg_ict','avg_bps','avg_xP','avg_xA','avg_xGI','avg_xGC','avg_GC','avg_mins','total_goals','total_assists','total_cs','total_starts']


# # Plot feature importances
# plt.figure(figsize=(25, 10))
# plt.bar(range(len(feature_importances)), feature_importances)
# plt.xticks(range(len(feature_importances)), feature_names)  # Set feature names as x-axis ticksplt.xlabel('Feature Index')
# plt.ylabel('Feature Importance')
# plt.xlabel('Feature')
# plt.title('Random Forest Feature Importance Plot')
# plt.show()

# Midfielders

In [141]:
mid_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/midfielders.csv')

In [142]:
mid_df['avg_ict'] = mid_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_assists'] = mid_df.groupby('name')['assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_goals'] = mid_df.groupby('name')['goals_scored'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_bps'] = mid_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xP'] = mid_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xA'] = mid_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xG'] = mid_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xGI'] = mid_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_xGC'] = mid_df.groupby('name')['expected_goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_GC'] = mid_df.groupby('name')['goals_conceded'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
mid_df['avg_mins'] = mid_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [143]:
mid_df['total_assists'] = mid_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
mid_df['total_goals'] = mid_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
mid_df['total_cs'] = mid_df.groupby('name')['clean_sheets'].cumsum().shift(1).fillna(0)
mid_df['total_starts'] = mid_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [144]:
# Create a mapping from player names to categorical codes
mid_df['name'] = mid_df['name'].astype('category')
name_mapping = dict(enumerate(mid_df['name'].cat.categories))
name_mapping_reverse = {v: k for k, v in name_mapping.items()}
mid_df['name'] = mid_df['name'].cat.codes


mid_df['was_home'] = mid_df['was_home'].astype('category')
mid_df['was_home'] = mid_df['was_home'].cat.codes

In [145]:
mid_df = mid_df.drop(['position','team','value','transfers_balance','bonus','own_goals','saves','team_a_score','team_h_score','red_cards','yellow_cards','transfers_in','transfers_out','penalties_missed','penalties_saved','clean_sheets','creativity','influence','threat','kickoff_time','fixture','round','element','starts','selected','ict_index','assists','avg_assists','goals_scored','avg_goals','bps','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded','goals_conceded','minutes'],axis=1)

In [147]:
y = mid_df['total_points']
X = mid_df.drop(['total_points'],axis=1)
print(X.columns)

Index(['name', 'opponent_team', 'was_home', 'GW', 'avg_ict', 'avg_bps',
       'avg_xP', 'avg_xA', 'avg_xG', 'avg_xGI', 'avg_xGC', 'avg_GC',
       'avg_mins', 'total_assists', 'total_goals', 'total_cs', 'total_starts'],
      dtype='object')


In [148]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 0 ) # 80/20

In [149]:
mid_rf = RandomForestRegressor(max_depth=4,max_features= 10,n_estimators= 100,random_state=13)

In [150]:
mid_rf.fit( X_train, y_train )

In [151]:
train_pred = mid_rf.predict( X_train )
test_pred = mid_rf.predict( X_test )

In [152]:
print( 'mean squared error: ', mean_squared_error( test_pred, y_test ) )
print( 'mean absolute error: ', mean_absolute_error( test_pred, y_test ) )
print( 'r2 score: ', r2_score( y_test, test_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  4.268144904763499
mean absolute error:  1.040382480189319
r2 score:  0.26010350533551363
-------------------------------------
train mean_squared_error :  3.8672928718282336
train mean_absolute_error :  1.0196222038833729
r2 score:  0.30483825092288175


In [153]:
# # Get feature importances
# feature_importances = mid_rf.feature_importances_

# feature_names = [ 'name','opponent_team','was_home','GW','avg_ict','avg_bps','avg_xP','avg_xA','avg_XG','avg_xGI','avg_xGC','avg_GC','avg_mins','total_assists','total_goals','total_cs','total_starts']

# # Plot feature importances
# plt.figure(figsize=(25, 10))
# plt.bar(range(len(feature_importances)), feature_importances)
# plt.xticks(range(len(feature_importances)), feature_names)  # Set feature names as x-axis ticksplt.xlabel('Feature Index')
# plt.ylabel('Feature Importance')
# plt.xlabel('Feature')
# plt.title('Random Forest Feature Importance Plot')
# plt.show()

# Forwards

In [154]:
fwd_df = pd.read_csv('/Users/zacharylai/Desktop/FPL_ML/fpl_points_predictor/forwards.csv')

In [155]:
fwd_df['avg_ict'] = fwd_df.groupby('name')['ict_index'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_assists'] = fwd_df.groupby('name')['assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_goals'] = fwd_df.groupby('name')['goals_scored'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_bps'] = fwd_df.groupby('name')['bps'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xP'] = fwd_df.groupby('name')['xP'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xA'] = fwd_df.groupby('name')['expected_assists'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xG'] = fwd_df.groupby('name')['expected_goals'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_xGI'] = fwd_df.groupby('name')['expected_goal_involvements'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)
fwd_df['avg_mins'] = fwd_df.groupby('name')['minutes'].expanding().mean().shift(1).fillna(0).reset_index(level=0, drop=True)

In [156]:
fwd_df['total_assists'] = fwd_df.groupby('name')['assists'].cumsum().shift(1).fillna(0)
fwd_df['total_goals'] = fwd_df.groupby('name')['goals_scored'].cumsum().shift(1).fillna(0)
fwd_df['total_starts'] = fwd_df.groupby('name')['starts'].cumsum().shift(1).fillna(0)

In [157]:
fwd_df

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,avg_goals,avg_bps,avg_xP,avg_xA,avg_xG,avg_xGI,avg_mins,total_assists,total_goals,total_starts
0,Aaron Connolly,FWD,Brighton,0.0,0,0,0,0,0.0,127,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,Aaron Connolly,FWD,Brighton,0.0,0,0,0,0,0.0,127,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,Aaron Connolly,FWD,Brighton,0.0,0,0,0,0,0.0,127,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,Aaron Connolly,FWD,Brighton,0.0,0,0,0,0,0.0,127,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,Aaron Connolly,FWD,Brighton,0.0,0,0,0,0,0.0,127,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3736,Zeki Amdouni,FWD,Burnley,0.0,0,0,0,0,0.0,594,...,0.125000,6.531250,1.975000,0.036875,0.129062,0.165937,57.312500,1.0,4.0,26.0
3737,Zeki Amdouni,FWD,Burnley,0.6,0,0,3,0,1.3,594,...,0.121212,6.333333,1.915152,0.035758,0.125152,0.160909,55.575758,1.0,4.0,26.0
3738,Zeki Amdouni,FWD,Burnley,1.7,0,2,28,0,0.3,594,...,0.117647,6.235294,1.876471,0.034706,0.121471,0.156176,54.088235,1.0,4.0,26.0
3739,Zeki Amdouni,FWD,Burnley,1.7,0,0,2,0,1.3,594,...,0.142857,6.857143,1.871429,0.033714,0.144571,0.178286,53.485714,1.0,5.0,26.0


In [158]:
# Create a mapping from player names to categorical codes
fwd_df['name'] = fwd_df['name'].astype('category')
name_mapping = dict(enumerate(fwd_df['name'].cat.categories))
name_mapping_reverse = {v: k for k, v in name_mapping.items()}
fwd_df['name'] = fwd_df['name'].cat.codes


fwd_df['was_home'] = fwd_df['was_home'].astype('category')
fwd_df['was_home'] = fwd_df['was_home'].cat.codes

In [159]:
fwd_df = fwd_df.drop(['position','team','value','transfers_balance','bonus','own_goals','saves','team_a_score','team_h_score','red_cards','yellow_cards','transfers_in','transfers_out','penalties_missed','penalties_saved','clean_sheets','creativity','influence','threat','kickoff_time','fixture','round','element','starts','selected','ict_index','assists','avg_assists','goals_scored','avg_goals','bps','xP','expected_assists','expected_goals','expected_goal_involvements','expected_goals_conceded','goals_conceded','minutes'],axis=1)

In [160]:
fwd_df

Unnamed: 0,name,opponent_team,total_points,was_home,GW,avg_ict,avg_bps,avg_xP,avg_xA,avg_xG,avg_xGI,avg_mins,total_assists,total_goals,total_starts
0,0,12,0,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,0,20,0,0,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,0,19,0,1,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,0,15,0,1,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,0,14,0,0,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3736,112,5,0,1,33,3.112500,6.531250,1.975000,0.036875,0.129062,0.165937,57.312500,1.0,4.0,26.0
3737,112,17,1,0,34,3.018182,6.333333,1.915152,0.035758,0.125152,0.160909,55.575758,1.0,4.0,26.0
3738,112,14,7,0,35,2.935294,6.235294,1.876471,0.034706,0.121471,0.156176,54.088235,1.0,4.0,26.0
3739,112,15,1,1,36,3.040000,6.857143,1.871429,0.033714,0.144571,0.178286,53.485714,1.0,5.0,26.0


In [161]:
y = fwd_df['total_points']
X = fwd_df.drop(['total_points'],axis=1)
print(X.columns)

Index(['name', 'opponent_team', 'was_home', 'GW', 'avg_ict', 'avg_bps',
       'avg_xP', 'avg_xA', 'avg_xG', 'avg_xGI', 'avg_mins', 'total_assists',
       'total_goals', 'total_starts'],
      dtype='object')


In [162]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 0 ) # 80/20fwd_rf = Radn

In [163]:
fwd_rf = RandomForestRegressor(max_depth=4,max_features= 10,n_estimators= 100,random_state=13)

In [164]:
fwd_rf.fit( X_train, y_train )

In [165]:
train_pred = fwd_rf.predict(X_train)
test_pred = fwd_rf.predict(X_test)

In [166]:
X_test

Unnamed: 0,name,opponent_team,was_home,GW,avg_ict,avg_bps,avg_xP,avg_xA,avg_xG,avg_xGI,avg_mins,total_assists,total_goals,total_starts
825,23,16,0,37,0.885714,1.742857,0.377143,0.007429,0.055143,0.062571,9.457143,0.0,2.0,2.0
1982,58,7,0,35,0.011111,0.111111,0.192593,0.000000,0.000000,0.000000,0.111111,0.0,0.0,0.0
2506,75,15,0,35,0.000000,0.000000,-0.058824,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
668,19,8,0,28,4.484615,13.076923,4.246154,0.046923,0.301923,0.348846,68.423077,6.0,8.0,20.0
598,17,7,1,32,2.440000,6.866667,1.446667,0.025667,0.124000,0.149667,48.300000,2.0,4.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,65,7,0,31,0.000000,0.000000,0.247059,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1652,49,19,1,32,0.563333,0.233333,0.276667,0.003000,0.037333,0.040333,8.633333,0.0,0.0,3.0
3399,101,18,1,21,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2588,78,1,0,17,0.000000,0.000000,0.440000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [167]:
print( 'mean squared error: ', mean_squared_error( test_pred, y_test ) )
print( 'mean absolute error: ', mean_absolute_error( test_pred, y_test ) )
print( 'r2 score: ', r2_score( y_test, test_pred ) )

print( '-------------------------------------' )

print( 'train mean_squared_error : ', mean_squared_error( y_train, train_pred ) ) 
print( 'train mean_absolute_error : ', mean_absolute_error( y_train, train_pred ) ) 
print( 'r2 score: ', r2_score( y_train, train_pred ) )

mean squared error:  5.112895883045341
mean absolute error:  1.1956999997169284
r2 score:  0.25168254871462237
-------------------------------------
train mean_squared_error :  3.9599376576630836
train mean_absolute_error :  1.0281346428746863
r2 score:  0.36823031661592587


In [168]:
# # Get feature importances
# feature_importances = fwd_rf.feature_importances_

# feature_names = [ 'name','opponent_team','was_home','GW','avg_ict','avg_bps','avg_xP','avg_xA','avg_XG','avg_xGI','avg_mins','total_assists','total_goals','total_starts']

# # Plot feature importances
# plt.figure(figsize=(25, 10))
# plt.bar(range(len(feature_importances)), feature_importances)
# plt.xticks(range(len(feature_importances)), feature_names)  # Set feature names as x-axis ticksplt.xlabel('Feature Index')
# plt.ylabel('Feature Importance')
# plt.xlabel('Feature')
# plt.title('Random Forest Feature Importance Plot')
# plt.show()

# Lookup Table

In [169]:
name_mapping
fwd_name = input("Enter a players name: ")
gw = input("Enter a gameweek: ")
# name_mapping
# name_mapping_reverse[fwd_name]


Enter a players name:  Danny Ings
Enter a gameweek:  4
