In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# NOTE(review): with no category/module argument this ignores every warning for
# the rest of the session, so library deprecation or data-handling warnings
# raised by later cells will not be displayed.
warnings.filterwarnings('ignore')

### import data

In [3]:
# Read the raw player table; if the CSV carries a saved index column
# ("Unnamed: 0") it is removed, otherwise the drop is a no-op.
player_df = (
    pd.read_csv('epl-players-2010-2023-stats.csv')
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
player_df.shape

(8112, 269)

### features with >70% null records

In [4]:
# Share of null entries per column, expressed on a 0-100 scale and indexed by
# column name.
null_counts = player_df.isnull().sum()
missing = null_counts * 100 / player_df.shape[0]

# How many columns have more than 70% of their values missing.
len(player_df.columns[missing > 70])

222

### removing the 222 features with >70% nulls + keeping players with >1000 minutes in seasons 2010/2011-2019/2020

In [5]:
# Keep every column that is not in the `missing > 70` group. Using `<=` (rather
# than `<`) makes this the exact complement of that group, so a column sitting
# at exactly 70% nulls is retained instead of being dropped without being counted.
full_df = player_df.loc[:, missing <= 70]

# Keep player-seasons with more than 1000 minutes played and exclude the three
# listed seasons. `.copy()` yields an independent frame, so later column
# assignments on `final_df` do not act on a slice of `full_df`.
excluded_seasons = ['2020/2021', '2021/2022', '2022/2023']
keep_rows = (full_df['minutes_played_overall'] > 1000) & ~full_df['season'].isin(excluded_seasons)
final_df = full_df.loc[keep_rows].copy()

# Sanity check: number of retained players per season.
final_df.groupby('season').full_name.count()

season
2010/2011    308
2011/2012    312
2012/2013    318
2013/2014    320
2014/2015    304
2015/2016    311
2016/2017    308
2017/2018    319
2018/2019    305
2019/2020    307
Name: full_name, dtype: int64

### correcting age & removing redundant columns

In [6]:
# Set `age` to (first year of the season string) - (birth year), then discard
# the two birthday columns and the `league` column.
birth_year = pd.to_datetime(final_df['birthday_GMT']).dt.year
season_start_year = pd.to_datetime(final_df['season'].str.slice(0, 4)).dt.year
final_df = (
    final_df
    .assign(age=season_start_year - birth_year)
    .drop(columns=['birthday', 'birthday_GMT', 'league'])
)

### identifying highly correlated features 

In [7]:
# Pairwise correlations between the numeric columns only.
corr_df = final_df.select_dtypes(include=['number']).corr()

# Keep the strict upper triangle (k=1) so each feature pair appears once and
# the diagonal self-correlations are excluded.
mask = np.triu(np.ones(corr_df.shape), k=1).astype('bool')
tri_df = corr_df.where(mask)

# Long-format table with one row per (feature1, feature2) pair, strongest
# positive correlation first.
corr_table = tri_df.stack().reset_index()
corr_table.columns = ['feature1', 'feature2', 'correlation']
corr_table = corr_table.sort_values('correlation', ascending=False)

# Select pairs by correlation magnitude: a strongly negative pair (r < -0.8) is
# just as collinear as a strongly positive one, so filter on |r| rather than
# on the signed value. The signed value is kept in the `correlation` column.
hi_corr_table = corr_table[corr_table['correlation'].abs() > 0.8]

### dropping highly correlated and otherwise redundant features

In [8]:
# Hand-picked columns to remove from the modelling frame.
redundant_features = [
    'minutes_played_overall',
    'appearances_overall',
    'goals_overall',
    'goals_per_90_home',
    'goals_per_90_away',
    'assists_overall',
    'rank_in_league_top_defenders',
    'rank_in_club_top_scorer',
    'rank_in_league_top_attackers',
    'rank_in_league_top_midfielders',
    'conceded_overall',
    'clean_sheets_overall',
    'cards_per_90_overall',
]
filtered_df = final_df.drop(columns=redundant_features)

### creating 2011-2020 TOTY winners list

In [10]:
# Team-of-the-Year squads keyed by integer year (2011-2020); each year maps to
# a list of eleven player names.
# NOTE(review): names are presumably spelled to match `full_name` in the player
# table - confirm before normalising any of them.
toty_squads = {
    2011: ['Carlos Alberto Tevez', 'Dimitar Ivanov Berbatov', 'Gareth Bale', 'Samir Nasri',
           'Jack Wilshere', 'Nani', 'Ashley Cole', 'Nemanja Vidić', 'Vincent Kompany',
           'Bacary Sagna', 'Edwin van der Sar'],
    2012: ['Robin van Persie', 'Wayne Rooney', 'Gareth Bale', 'Yaya Touré',
           'Scott Parker', 'David Silva', 'Leighton Baines', 'Vincent Kompany',
           'Fabricio Coloccini', 'Kyle Walker', 'Joe Hart'],
    2013: ['Robin van Persie', 'Luis Suárez', 'Gareth Bale', 'Juan Mata',
           'Michael Carrick', 'Eden Hazard', 'Leighton Baines', 'Jan Vertonghen',
           'Rio Ferdinand', 'Pablo Zabaleta', 'David de Gea'],
    2014: ['Luis Suárez', 'Daniel Sturridge', 'Eden Hazard', 'Yaya Touré',
           'Steven Gerrard', 'Adam Lallana', 'Luke Shaw', 'Vincent Kompany',
           'Gary Cahill', 'Séamus Coleman', 'Peter Čech'],
    2015: ['Diego Costa', 'Harry Kane', 'Alexis Sanchez', 'Nemanja Matić',
           'Philippe Coutinho', 'Eden Hazard', 'Ryan Bertrand', 'John Terry',
           'Gary Cahill', 'Branislav Ivanović', 'David de Gea'],
    2016: ['Jamie Vardy', 'Harry Kane', 'Dimitri Payet', "N'Golo Kanté",
           'Dele Alli', 'Riyad Mahrez', 'Danny Rose', 'Toby Alderweireld',
           'Wes Morgan', 'Hector Bellerin', 'David de Gea'],
    2017: ['Harry Kane', 'Romelu Lukaku', 'Sadio Mané', 'Dele Alli',
           "N'Golo Kanté", 'Eden Hazard', 'Danny Rose', 'David Luiz',
           'Gary Cahill', 'Kyle Walker', 'David de Gea'],
    2018: ['Sergio Aguero', 'Harry Kane', 'Mohamed Salah', 'David Silva',
           'Kevin De Bruyne', 'Christian Eriksen', 'Marcus Alonso', 'Jan Vertonghen',
           'Nicolas Otamendi', 'Kyle Walker', 'David de Gea'],
    2019: ['Sadio Mané', 'Sergio Aguero', 'Raheem Sterling', 'Paul Pogba',
           'Fernandinho', 'Bernardo Silva', 'Andrew Robertson', 'Virgil van Dijk',
           'Aymeric Laporte', 'Trent Alexander-Arnold', 'Ederson Moraes'],
    2020: ['Sadio Mané', 'Jamie Vardy', 'Pierre-Emerick Aubameyang', 'David Silva',
           'Jordan Henderson', 'Kevin De Bruyne', 'Andrew Robertson', 'Virgil van Dijk',
           'Caglar Söyüncü', 'Trent Alexander-Arnold', 'Nick Pope'],
}

# One column per year, one row per squad slot (rows 0-10).
toty_df = pd.DataFrame(data=toty_squads)