In [1]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
pd.set_option("display.precision", 3)
pd.set_option('future.no_silent_downcasting', True)

def getMinutesPerConceded(row: pd.Series) -> pd.Series:
  if row["goals_conceded"] == 0:
    return np.nan
  return row['minutes'] / row['goals_conceded']

def gks_for_year(year: int) -> pd.DataFrame:
  folderlabel = foldername_from_year(year)
  df = pd.read_csv(f"data/{folderlabel}/cleaned_players.csv", encoding = "iso-8859-1")
  return df[df["element_type"]=="GK"]

def affordable_gks_for_year(year: int) -> pd.DataFrame:
  gks = gks_for_year(year)
  played_gks = gks.loc[gks['minutes'] != 0]

  played_gks[year] = played_gks.apply(getMinutesPerConceded, axis=1)
  played_gks.dropna(subset=[year], inplace=True)
  best_gks = played_gks.sort_values(by=year, ascending=False)
  return best_gks[['first_name', 'second_name', year]]

def foldername_from_year(year: int) -> str:
  next_year = year + 1 - 2000
  return f"{year}-{next_year}"

def year_from_foldername(foldername: str) -> int:
  return int(foldername.split("-")[0])

In [2]:
start_year = 2020
current_year = 2024

affordable_gks = affordable_gks_for_year(start_year)

for x in range(start_year+1, current_year+1):
  second_df = affordable_gks_for_year(x)
  affordable_gks = pd.merge(
      left=affordable_gks,
      right=second_df,
      how='outer',
      left_on=['first_name', 'second_name'],
      right_on=['first_name', 'second_name'],
  )

# Drop players unavailable for this round
current_year_folderlabel = foldername_from_year(current_year)
raw_players = pd.read_csv(f"data/{current_year_folderlabel}/players_raw.csv", encoding = "iso-8859-1")

# Drop NA in any of two columns
# filters out unavailable now
# filter out newcomers with outlying results
last_year = current_year - 1
affordable_gks.dropna(subset=[last_year, current_year], inplace=True)

# Drop players unavailable for this round

def chance_of_playing_this_round(player: pd.Series) -> int:
  found_players = raw_players[
    (raw_players["first_name"]==player["first_name"]) & 
    (raw_players["second_name"]==player["second_name"])
  ]
  if found_players.empty:
    player["chance_to_play"] = 0
    return player
    result
  first_row = found_players.fillna(100).iloc[0]["chance_of_playing_this_round"]
  player["chance_to_play"] = first_row.astype(int)
  return player

affordable_gks = affordable_gks.apply(chance_of_playing_this_round, axis=1)
affordable_gks.dropna(subset=["chance_to_play"], inplace=True)

unavailable_gks = affordable_gks[affordable_gks["chance_to_play"] < 75]
affordable_gks.drop(unavailable_gks.index, inplace=True)

# # Show top 10 GKs w/ precision set to 1
affordable_gks.sort_values(by=last_year, ascending=False, inplace=True)
affordable_gks

Unnamed: 0,first_name,second_name,2020,2021,2022,2023,2024,chance_to_play
22,David,Raya Martin,,80.0,74.348,120.0,108.0,75
0,Aaron,Ramsdale,54.286,78.462,79.535,108.0,36.0,100
25,Ederson,Santana de Moraes,115.714,128.077,98.438,103.148,90.0,100
62,Nick,Pope,77.838,68.936,101.906,84.062,77.143,100
3,Alisson,Ramses Becker,92.812,135.0,77.442,84.0,225.0,75
57,Mark,Travers,,,32.344,72.0,90.0,100
10,Bart,Verbruggen,,,,67.5,51.429,100
45,Jordan,Pickford,70.308,54.31,58.421,67.059,36.0,100
69,Sam,Johnstone,45.0,,81.0,66.519,45.0,100
28,Emiliano,MartÃ­nez Romero,,,82.605,62.812,60.0,100


In [3]:
# Predict stat using previous data
from sklearn.linear_model import LinearRegression

def build_test_data_from_series(player: pd.Series) -> ([], [], [], []):
  X_train = []
  y_train = []
  for x in range(start_year, current_year):
    # Skip nans
    if np.isnan(player[x]):
      continue
    X_train.append([x])
    y_train.append(player[x])

  X_test = X_train + [[current_year]]
  y_test = y_train + [player[current_year]]
  return (X_train, y_train, X_test, y_test)

def predict_this_year_linear(player: pd.Series) -> pd.Series:
  X_train, y_train, X_test, y_test = build_test_data_from_series(player)
  model = LinearRegression()
  model.fit(X_train, y_train)
  player["prediction_linear"] = model.predict([[current_year]])[0]
  player["prediction_linear_score"] = model.score(X_test, y_test)
  return player

predicted_gks = affordable_gks.copy().apply(predict_this_year_linear, axis=1)

# Drop if prediction score is lower than 0.5
# predicted_gks.drop(predicted_gks[predicted_gks["prediction_linear_score"] < 0.5].index, inplace=True)
# Sort by prediction
predicted_gks.sort_values(by="prediction_linear", ascending=False, inplace=True)
predicted_gks.head()

Unnamed: 0,first_name,second_name,2020,2021,2022,2023,2024,chance_to_play,prediction_linear,prediction_linear_score
22,David,Raya Martin,,80.0,74.348,120.0,108.0,75,131.449,0.315
0,Aaron,Ramsdale,54.286,78.462,79.535,108.0,36.0,100,120.625,-1.43
57,Mark,Travers,,,32.344,72.0,90.0,100,111.656,0.731
62,Nick,Pope,77.838,68.936,101.906,84.062,77.143,100,96.097,-0.321
25,Ederson,Santana de Moraes,115.714,128.077,98.438,103.148,90.0,100,94.51,0.636


In [4]:
# Predict stat using previous data
from sklearn.svm import SVR

def predict_this_year_svr(player: pd.Series) -> pd.Series:
  X_train, y_train, X_test, y_test = build_test_data_from_series(player)
  model = SVR(kernel='rbf')
  model.fit(X_train, y_train)
  player["prediction_svr"] = model.predict([[current_year]])[0]
  player["prediction_svr_score"] = model.score(X_test, y_test)
  return player

predicted_gks_svr = predicted_gks.apply(predict_this_year_svr, axis=1)

# Drop if prediction score is lower than 0.5
# predicted_gks.drop(predicted_gks[predicted_gks["prediction_score_svr"] < 0.5].index, inplace=True)
# Sort by prediction
predicted_gks_svr.sort_values(by="prediction_svr", ascending=False, inplace=True)
predicted_gks_svr.head()

Unnamed: 0,first_name,second_name,2020,2021,2022,2023,2024,chance_to_play,prediction_linear,prediction_linear_score,prediction_svr,prediction_svr_score
25,Ederson,Santana de Moraes,115.714,128.077,98.438,103.148,90.0,100,94.51,0.636,108.942,0.086
3,Alisson,Ramses Becker,92.812,135.0,77.442,84.0,225.0,75,76.315,-0.579,87.917,-0.394
62,Nick,Pope,77.838,68.936,101.906,84.062,77.143,100,96.097,-0.321,81.44,0.107
22,David,Raya Martin,,80.0,74.348,120.0,108.0,75,131.449,0.315,80.441,-0.597
0,Aaron,Ramsdale,54.286,78.462,79.535,108.0,36.0,100,120.625,-1.43,79.45,-0.077


In [20]:
# Get current season stats to calculate performance
def get_player_id(first_name: str, second_name: str) -> int:
  found_players = raw_players[
    (raw_players["first_name"]==first_name) & 
    (raw_players["second_name"]==second_name)
  ]
  if found_players.empty:
    return 0
  first_row = found_players.fillna(100).iloc[0]["id"]
  return first_row.astype(int)

def find_player_gw_this_year(first_name: str, second_name: str) -> pd.DataFrame:
  player_id = get_player_id(first_name, second_name)
  path = f"data/{current_year_folderlabel}/players/{first_name}_{second_name}_{player_id}/gw.csv"
  df = pd.read_csv(path, encoding = "iso-8859-1")
  return df

gk_data = predicted_gks_svr.copy()
def expected_vs_actual_goals_conceded(player: pd.Series) -> pd.Series:
  expected_vs_actual_goals_conceded = np.nan
  player_gw = None
  try:
    player_gw = find_player_gw_this_year(player["first_name"], player["second_name"])
  except:
    # TODO: fucking unicode
    return expected_vs_actual_goals_conceded
  for i in range(len(player_gw)):
    match = player_gw.iloc[-i]
    if match['minutes'] != 0:
      expected_vs_actual_goals_conceded = match['expected_goals_conceded'] - match['goals_conceded']
      break
  player["expected_vs_actual_goals_conceded"] = expected_vs_actual_goals_conceded
  return player

gk_data = gk_data.apply(expected_vs_actual_goals_conceded, axis=1)

# Drop if performes much worse
gk_data.drop(gk_data[gk_data["expected_vs_actual_goals_conceded"] < -1].index, inplace=True)
gk_data.dropna(how='all', inplace=True)
gk_data

Unnamed: 0,first_name,second_name,2020,2021,2022,2023,2024,chance_to_play,prediction_linear,prediction_linear_score,prediction_svr,prediction_svr_score,expected_vs_actual_goals_conceded
25,Ederson,Santana de Moraes,115.714,128.077,98.438,103.148,90.0,100.0,94.51,0.636,108.942,0.086,1.01
3,Alisson,Ramses Becker,92.812,135.0,77.442,84.0,225.0,75.0,76.315,-0.579,87.917,-0.394,0.45
62,Nick,Pope,77.838,68.936,101.906,84.062,77.143,100.0,96.097,-0.321,81.44,0.107,1.77
22,David,Raya Martin,,80.0,74.348,120.0,108.0,75.0,131.449,0.315,80.441,-0.597,0.47
10,Bart,Verbruggen,,,,67.5,51.429,100.0,67.5,-1.0,67.5,-1.0,0.16
69,Sam,Johnstone,45.0,,81.0,66.519,45.0,100.0,84.519,-0.999,66.072,-0.122,0.54
12,Bernd,Leno,84.622,40.0,63.529,56.066,108.0,100.0,45.519,-0.697,59.612,-0.194,1.43
4,Alphonse,Areola,67.5,90.0,43.857,50.943,55.0,100.0,39.122,0.197,58.732,0.082,0.03
63,Norberto,Murara Neto,,,61.154,52.364,90.0,100.0,43.573,-1.78,56.74,-0.457,0.3
33,Guglielmo,Vicario,,,,56.066,108.0,100.0,56.066,-1.0,56.066,-1.0,0.02


In [21]:
# mix in player stats
gks = gks_for_year(current_year)
player_info = pd.merge(
      left=gk_data,
      right=gks,
      how='left',
      left_on=['first_name', 'second_name'],
      right_on=['first_name', 'second_name'],
  )
player_custom_info = player_info[['first_name', 'second_name',last_year, current_year, 'prediction_linear', 'prediction_svr', 'expected_vs_actual_goals_conceded', 'now_cost', 'clean_sheets','goals_conceded', 'minutes', 'assists', 'total_points',  'influence', 'creativity', 'bonus', 'bps','selected_by_percent', ]]
player_custom_info.head()

Unnamed: 0,first_name,second_name,2023,2024,prediction_linear,prediction_svr,expected_vs_actual_goals_conceded,now_cost,clean_sheets,goals_conceded,minutes,assists,total_points,influence,creativity,bonus,bps,selected_by_percent
0,Ederson,Santana de Moraes,103.148,90.0,94.51,108.942,1.01,55,1,6,540,1,21,131.8,10.0,2,75,8.4
1,Alisson,Ramses Becker,84.0,225.0,76.315,87.917,0.45,55,3,2,450,0,25,101.2,0.0,2,99,9.5
2,Nick,Pope,84.062,77.143,96.097,81.44,1.77,50,1,7,540,0,25,207.6,0.0,2,92,7.8
3,David,Raya Martin,120.0,108.0,131.449,80.441,0.47,56,3,5,540,0,34,204.6,0.0,6,118,30.9
4,Bart,Verbruggen,67.5,51.429,67.5,67.5,0.16,45,1,7,360,0,10,98.4,0.0,0,47,2.0
