In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Installs a blanket 'ignore' filter: warnings of every category are hidden from here on.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn — consider narrowing it.
warnings.filterwarnings('ignore')

### import data

In [76]:
# Read the per-player season statistics; the first CSV column is used as the index.
players_csv = 'epl-players-2010-2023-stats.csv'
player_df = pd.read_csv(players_csv, index_col=0)

# Display (rows, columns) of the raw frame.
player_df.shape

(8112, 269)

### name corrections

In [77]:
# Flag names that contain the literal text 'u00' (missing names count as "no match"),
# then list each affected name once.
has_u00 = player_df['full_name'].str.contains('u00', na=False)
wrong_names = player_df.loc[has_u00, 'full_name'].unique()
print(wrong_names)

['Abdoulaye Mu00e9u00eftu00e9' 'Cheik Ismau00ebl Tiotu00e9'
 'Didier Yves Drogba Tu00e9bily' 'Emmanuel Ebouu00e9'
 'Gnu00e9gnu00e9ri Yaya Touru00e9' 'Kolo Habib Touru00e9'
 'Lohoru00e9 Steve Ulrich Gohouri' 'Arouna Konu00e9' 'Lacina Traoru00e9'
 'Lamine Konu00e9' 'Jean Michau00ebl Seri']


In [78]:
# Mangled spelling -> corrected spelling, listed in the order the mangled names
# were printed above. Two entries also shorten the name
# ('Yaya Touré', 'Didier Drogba') rather than only restoring the accents.
name_corrections = dict([
    ('Abdoulaye Mu00e9u00eftu00e9', 'Abdoulaye Méïté'),
    ('Cheik Ismau00ebl Tiotu00e9', 'Cheik Ismaël Tioté'),
    ('Didier Yves Drogba Tu00e9bily', 'Didier Drogba'),
    ('Emmanuel Ebouu00e9', 'Emmanuel Eboué'),
    ('Gnu00e9gnu00e9ri Yaya Touru00e9', 'Yaya Touré'),
    ('Kolo Habib Touru00e9', 'Kolo Habib Touré'),
    ('Lohoru00e9 Steve Ulrich Gohouri', 'Lohoré Steve Ulrich Gohouri'),
    ('Arouna Konu00e9', 'Arouna Koné'),
    ('Lacina Traoru00e9', 'Lacina Traoré'),
    ('Lamine Konu00e9', 'Lamine Koné'),
    ('Jean Michau00ebl Seri', 'Jean Michaël Seri'),
])

# Whole-value replacement: only names that exactly equal a key are changed.
fixed_names = player_df['full_name'].replace(to_replace=name_corrections)
player_df['full_name'] = fixed_names

### correcting age & removing redundant columns

In [79]:
# Age is the calendar-year difference between the season's first year
# (first four characters of 'season') and the birth year; month and day are ignored.
birth_year = pd.to_datetime(player_df['birthday_GMT']).dt.year
season_start_year = pd.to_datetime(player_df['season'].str.slice(0, 4)).dt.year

# Write the age, remove the birthday/league columns, and rename the club column to 'team'.
player_df = (
    player_df
    .assign(age=season_start_year - birth_year)
    .drop(columns=['birthday', 'birthday_GMT', 'league'])
    .rename(columns={'Current Club': 'team'})
)

### features with >70% null records

In [80]:
# Percentage of null values in each column.
null_counts = player_df.isna().sum()
missing = null_counts * 100 / len(player_df)

# Number of columns that are more than 70% null.
int((missing > 70).sum())

222

### removing 222 features + filtering for players playing >1000 min

In [81]:
# Keep every column that is NOT in the ">70% null" group counted in the previous cell.
# The complement of `missing > 70` is `missing <= 70`; a `< 70` test would also
# discard a column sitting exactly on the threshold without it being counted.
full_df = player_df.loc[:, missing <= 70]

# Restrict to player-seasons with more than 1000 minutes played overall.
filtered_df = full_df[full_df['minutes_played_overall'] > 1000]

# Number of remaining players per season.
filtered_df.groupby('season')['full_name'].count()

season
2010/2011    308
2011/2012    312
2012/2013    318
2013/2014    320
2014/2015    304
2015/2016    311
2016/2017    308
2017/2018    319
2018/2019    305
2019/2020    307
2020/2021    317
2021/2022    318
2022/2023    309
Name: full_name, dtype: int64

### identifying highly correlated features 

In [82]:
# Pairwise correlation (pandas default method) between all numeric features.
corr_df = filtered_df.select_dtypes(include=['number']).corr()

# Strict upper triangle only: each feature pair appears once and the
# diagonal (self-correlation) is excluded.
mask = np.triu(np.ones(corr_df.shape, dtype=bool), k=1)
tri_df = corr_df.where(mask)

# Long format, one row per feature pair, highest signed correlation first.
corr_table = tri_df.stack().reset_index()
corr_table.columns = ['feature1', 'feature2', 'correlation']
corr_table = corr_table.sort_values('correlation', ascending=False)

# Redundancy depends on the strength of the relationship, not its sign, so pairs
# with correlation below -0.8 are flagged as well as pairs above 0.8.
# (Filtering on the signed value alone would miss strongly negative pairs.)
hi_corr_table = corr_table[corr_table['correlation'].abs() > 0.8]

### dropping highly correlated & more redundant features

In [103]:
# Hand-picked columns to remove (highly correlated with, or redundant to, retained features).
redundant_features = [
    'minutes_played_home',
    'minutes_played_away',
    'appearances_home',
    'appearances_away',
    'goals_overall',
    'goals_per_90_home',
    'goals_per_90_away',
    'assists_overall',
    'rank_in_league_top_defenders',
    'rank_in_club_top_scorer',
    'rank_in_league_top_attackers',
    'rank_in_league_top_midfielders',
    'conceded_overall',
    'clean_sheets_overall',
    'cards_per_90_overall',
]

final_df = filtered_df.drop(columns=redundant_features)

### creating 2011-2023 TOTY winners list

In [104]:
# Team-of-the-year selections keyed by season; every season lists eleven names,
# spelled the way they are matched against 'full_name' later on.
toty_by_season = {
    '2010/2011': ['Carlos Alberto Tevez','Dimitar Ivanov Berbatov','Gareth Bale','Samir Nasri','Jack Wilshere','Nani','Ashley Cole','Nemanja Vidić','Vincent Kompany','Bacary Sagna','Edwin van der Sar'],
    '2011/2012': ['Robin van Persie','Wayne Rooney','Gareth Bale','Yaya Touré','Scott Parker','David Silva','Leighton Baines','Vincent Kompany','Fabricio Coloccini','Kyle Walker','Joe Hart'],
    '2012/2013': ['Robin van Persie','Luis Suárez','Gareth Bale','Juan Mata','Michael Carrick','Eden Hazard','Leighton Baines','Jan Vertonghen','Rio Ferdinand','Pablo Zabaleta','David de Gea'],
    '2013/2014': ['Luis Suárez','Daniel Sturridge','Eden Hazard','Yaya Touré','Steven Gerrard','Adam Lallana','Luke Shaw','Vincent Kompany','Gary Cahill','Séamus Coleman','Petr Čech'],
    '2014/2015': ['Diego Costa','Harry Kane','Alexis Sanchez','Nemanja Matić','Philippe Coutinho','Eden Hazard','Ryan Bertrand','John Terry','Gary Cahill','Branislav Ivanović','David de Gea'],
    '2015/2016': ['Jamie Vardy','Harry Kane','Dimitri Payet',"N'Golo Kanté",'Dele Alli','Riyad Mahrez','Danny Rose','Toby Alderweireld','Wes Morgan','Hector Bellerin','David de Gea'],
    '2016/2017': ['Harry Kane','Romelu Lukaku','Sadio Mané','Dele Alli',"N'Golo Kanté",'Eden Hazard','Danny Rose','David Luiz','Gary Cahill','Kyle Walker','David de Gea'],
    '2017/2018': ['Sergio Aguero','Harry Kane','Mohamed Salah','David Silva','Kevin De Bruyne','Christian Eriksen','Marcos Alonso','Jan Vertonghen','Nicolas Otamendi','Kyle Walker','David de Gea'],
    '2018/2019': ['Sadio Mané','Sergio Aguero','Raheem Sterling','Paul Pogba','Fernandinho','Bernardo Silva','Andrew Robertson','Virgil van Dijk','Aymeric Laporte','Trent Alexander-Arnold','Ederson'],
    '2019/2020': ['Sadio Mané','Jamie Vardy','Pierre-Emerick Aubameyang','David Silva','Jordan Henderson','Kevin De Bruyne','Andrew Robertson','Virgil van Dijk','Caglar Söyüncü','Trent Alexander-Arnold','Nick Pope'],
    '2020/2021': ['Heung-Min Son','Harry Kane','Mohamed Salah','Kevin De Bruyne','İlkay Gündoğan','Bruno Fernandes','Luke Shaw','John Stones','Rúben Dias','João Cancelo','Ederson'],
    '2021/2022': ['Sadio Mané','Cristiano Ronaldo','Mohamed Salah','Bernardo Silva','Thiago Alcantara','Kevin De Bruyne','João Cancelo','Virgil van Dijk','Antonio Rüdiger','Trent Alexander-Arnold','Alisson Becker'],
    '2022/2023': ['Harry Kane','Erling Haaland','Bukayo Saka','Martin Ødegaard','Rodri','Kevin De Bruyne','William Saliba','John Stones','Rúben Dias','Kieran Trippier','Aaron Ramsdale'],
}

# Wide frame: one column per season, one row per squad slot.
toty = pd.DataFrame(toty_by_season)

# Long frame: one (season, full_name) row per selected player.
toty_df = pd.melt(toty, var_name='season', value_name='full_name')


### creating target feature

In [105]:
# Every row of the TOTY list is a positive example.
toty_df['toty_player'] = 1

# Left join keeps all players; those without a TOTY row get NaN in 'toty_player'.
toty_flags = toty_df[['season', 'full_name', 'toty_player']]
final_df1 = pd.merge(final_df, toty_flags, how='left', on=['season', 'full_name'])

# Turn the NaN/1 flag into an integer 0/1 target.
toty_target = final_df1['toty_player'].fillna(0)
final_df1['toty_player'] = toty_target.astype(int)

### import team data to get league position

In [106]:
# Read the per-team season statistics and align the club column name with the
# player frame ('common_name' -> 'team').
teams_csv = 'epl-teams-2010-2023-stats.csv'
teams_df = pd.read_csv(teams_csv, index_col=0).rename(columns={'common_name': 'team'})

# Only the join keys and the league position are needed downstream.
league_pos_df = teams_df.loc[:, ['team', 'season', 'league_position']]

In [107]:
# Attach the league position of each player's team for that season.
# validate='many_to_one' makes pandas raise MergeError if a (team, season) pair
# occurs more than once in league_pos_df, instead of silently duplicating
# player rows in the left join.
final_df2 = final_df1.merge(
    league_pos_df,
    on=['team', 'season'],
    how='left',
    validate='many_to_one',
)

In [108]:
# Display (rows, columns) of the merged player frame.
final_df2.shape

(4056, 31)

In [109]:
# Count of positive (toty_player == 1) rows in each season.
final_df2.groupby('season')['toty_player'].sum()

season
2010/2011    11
2011/2012    11
2012/2013    11
2013/2014    11
2014/2015    11
2015/2016    11
2016/2017    11
2017/2018    11
2018/2019    11
2019/2020    11
2020/2021    11
2021/2022    11
2022/2023    11
Name: toty_player, dtype: int32

In [112]:
# Show the rows flagged as TOTY players in the 2010/2011 season.
# Note: this rebinds `mask`, which the correlation cell used for its triangle mask.
in_2010_11 = final_df2['season'].eq('2010/2011')
is_toty = final_df2['toty_player'].eq(1)
mask = in_2010_11 & is_toty
final_df2.loc[mask]

Unnamed: 0,full_name,age,season,position,team,minutes_played_overall,nationality,appearances_overall,goals_home,goals_away,...,assists_per_90_overall,goals_per_90_overall,min_per_goal_overall,conceded_per_90_overall,min_per_conceded_overall,min_per_match,min_per_card_overall,min_per_assist_overall,toty_player,league_position
20,Ashley Cole,30,2010/2011,Defender,Chelsea,3386,England,38,0,0,...,0.11,0.0,0,0.8,113,89,847,847,1,2
23,Bacary Sagna,27,2010/2011,Defender,Arsenal,2970,France,33,0,1,...,0.09,0.03,2970,1.12,80,90,594,990,1,4
34,Carlos Alberto Tevez,26,2010/2011,Forward,Manchester City,2529,Argentina,31,13,7,...,0.21,0.71,126,0.6,149,82,422,422,1,3
71,Dimitar Ivanov Berbatov,29,2010/2011,Forward,Manchester United,2210,Bulgaria,32,16,4,...,0.16,0.81,111,1.02,88,69,2210,553,1,1
74,Edwin van der Sar,40,2010/2011,Goalkeeper,Manchester United,2970,Netherlands,33,0,0,...,0.03,0.0,0,0.88,102,90,2970,2970,1,1
88,Gareth Bale,21,2010/2011,Forward,Tottenham Hotspur,2452,Wales,30,4,3,...,0.04,0.26,350,1.17,77,82,2452,2452,1,5
105,Jack Wilshere,18,2010/2011,Midfielder,Arsenal,2653,England,35,0,1,...,0.1,0.03,2653,0.98,91,76,295,884,1,4
205,Nani,24,2010/2011,Midfielder,Manchester United,2668,Portugal,33,7,2,...,0.57,0.3,296,0.91,99,81,2668,157,1,1
207,Nemanja Vidić,29,2010/2011,Defender,Manchester United,3118,Serbia,35,3,2,...,0.03,0.14,624,0.87,104,89,390,3118,1,1
254,Samir Nasri,23,2010/2011,Midfielder,Arsenal,2390,France,30,6,4,...,0.04,0.38,239,0.94,96,80,598,2390,1,4
