In [47]:
import pandas as pd
import numpy as np

def getMinutesPerConceded(row: pd.Series) -> pd.Series:
  if row["goals_conceded"] == 0:
    return np.nan
  return row['minutes'] / row['goals_conceded']

def affordable_gks_for_year(folderlabel: str) -> pd.DataFrame:
  df = pd.read_csv(f"data/{folderlabel}/cleaned_players.csv", encoding = "iso-8859-1")
  gks = df[df["element_type"]=="GK"]
  played_gks = gks.loc[df['minutes'] != 0]

  year = year_from_foldername(folderlabel)
  played_gks[year] = played_gks.apply(getMinutesPerConceded, axis=1)
  played_gks.dropna(subset=[year], inplace=True)
  best_gks = played_gks.sort_values(by=year, ascending=False)
  return best_gks[['first_name', 'second_name', year]]

def foldername_from_year(year: int) -> str:
  next_year = year + 1 - 2000
  return f"{year}-{next_year}"

def year_from_foldername(foldername: str) -> int:
  return int(foldername.split("-")[0])

In [49]:
start_year = 2020
last_year = 2024

df = affordable_gks_for_year(foldername_from_year(start_year))

for x in range(start_year+1, last_year+1):
  foldername = foldername_from_year(x)
  second_df = affordable_gks_for_year(foldername)
  df = pd.merge(
      left=df,
      right=second_df,
      how='left',
      left_on=['first_name', 'second_name'],
      right_on=['first_name', 'second_name'],
  )

# Drop NA in last column
df.dropna(subset=[last_year], inplace=True)
# Replace other NA with 0
df.fillna(0, inplace=True)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice

Unnamed: 0,first_name,second_name,2020,2021,2022,2023,2024
2,Ederson,Santana de Moraes,115.714286,128.076923,98.4375,103.148148,90.0
4,Dean,Henderson,93.75,0.0,52.258065,54.0,60.0
5,Alisson,Ramses Becker,92.8125,135.0,77.44186,84.0,225.0
9,Robert,SÃ¡nchez,90.0,79.285714,69.0,57.32,77.142857
10,Bernd,Leno,84.621622,40.0,63.529412,56.065574,108.0
11,Nick,Pope,77.837838,68.93617,101.90625,84.0625,77.142857
14,Kepa,Arrizabalaga,73.125,180.0,77.727273,0.0,45.0
17,Jordan,Pickford,70.307692,54.310345,58.421053,67.058824,36.0
20,Alphonse,Areola,67.5,90.0,43.857143,50.943396,55.0
26,Aaron,Ramsdale,54.285714,78.461538,79.534884,108.0,36.0


In [51]:
import plotly.graph_objects as go
fig = go.Figure()

for _, row in df.iterrows():
    fig.add_scatter(
        x=df.columns[2:],
        y=row[2:],
        name=f"{row['first_name']} {row['second_name']}",
        mode='markers+lines')
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.show()