In [41]:
import pandas as pd
from pandas import DataFrame

In [42]:
def calculate_contract_years_left(pdf: DataFrame, max_years: float = 6) -> DataFrame:
    """
    Add the number of contract years remaining at the end of each season.

    The season end is taken to be June 30th of the row's 'year'. The remaining
    time is the day difference between 'contract_expiration_date' and that
    season end, divided by 365.25 and rounded to one decimal place.

    Rows with a missing expiration date get 0, and the result is clipped to
    the range [0, max_years].

    Note:
        The input frame is modified in place: 'contract_expiration_date' is
        converted to datetime and 'contract_years_left' is added. No helper
        column is written to the frame, so an existing 'season_end_date'
        column (if any) is left untouched.

    Args:
        pdf (pd.DataFrame): A DataFrame with columns 'contract_expiration_date' and 'year'.
        max_years (float): Upper bound applied to the result. Defaults to 6.

    Returns:
        pd.DataFrame: The same DataFrame object with 'contract_years_left' added.
    """
    pdf["contract_expiration_date"] = pd.to_datetime(pdf["contract_expiration_date"])

    # Kept as a local Series rather than a temporary column so the caller's
    # frame never gains (or loses) a 'season_end_date' column.
    season_end = pd.to_datetime(pdf["year"].astype(str) + "-06-30")

    days_left = (pdf["contract_expiration_date"] - season_end).dt.days
    years_left = (days_left / 365.25).round(1)

    # Missing expiration dates produce NaN above; treat them as no time left.
    pdf["contract_years_left"] = years_left.fillna(0).clip(lower=0, upper=max_years)

    return pdf

In [43]:
# Columns kept from each raw input file; all other columns are discarded after loading.
valuation_columns = ["player_id", "date", "market_value_in_eur"]
player_columns = [
    "player_id",
    "name",
    "date_of_birth",
    "position",
    "sub_position",
    "contract_expiration_date",
    "current_club_name",
    "current_club_domestic_competition_id",
]

pdf_valuations = pd.read_csv("../data/input/player_valuations.csv")[valuation_columns]
pdf_players = pd.read_csv("../data/input/players.csv")[player_columns]

In [44]:

# Attach every valuation record to its player and keep only players whose
# current club's domestic competition id is 'GB1'.
pdf_joined = pdf_players.merge(pdf_valuations, on="player_id").query(
    "current_club_domestic_competition_id == 'GB1'"
)

# Parse each date column once and reuse it for all derived columns.
valuation_dates = pd.to_datetime(pdf_joined["date"])
birth_dates = pd.to_datetime(pdf_joined["date_of_birth"])

# Age in completed years at the valuation date. Dividing the day difference by
# 365 ignores leap days and rolls the age over a few days before the actual
# birthday, so compare calendar components instead: take the year difference
# and subtract one when the birthday has not yet occurred in the valuation year.
birthday_pending = (valuation_dates.dt.month < birth_dates.dt.month) | (
    (valuation_dates.dt.month == birth_dates.dt.month)
    & (valuation_dates.dt.day < birth_dates.dt.day)
)
pdf_joined["age"] = (
    valuation_dates.dt.year - birth_dates.dt.year
) - birthday_pending.astype(int)
pdf_joined["year"] = valuation_dates.dt.year
pdf_joined["month"] = valuation_dates.dt.month

In [45]:
# Add the 'contract_years_left' column to every valuation row.
pdf_joined = pdf_joined.pipe(calculate_contract_years_left)

In [46]:
# Express market value in millions of EUR.
pdf_joined["market_value_in_million_eur"] = pdf_joined["market_value_in_eur"].div(1_000_000)

# Collapse to one row per player and year; first() keeps the first non-null
# value of each selected column within a (player_id, name, year) group.
yearly_columns = [
    "date_of_birth",
    "market_value_in_million_eur",
    "age",
    "position",
    "sub_position",
    "contract_years_left",
]
pdf_mvp = (
    pdf_joined
    .groupby(["player_id", "name", "year"])[yearly_columns]
    .first()
    .reset_index()
    .sort_values(by=["player_id", "year"])
)

# Lag features: value and age from the player's previous row.
# NOTE(review): shift(1) uses the previous *available* year for the player,
# which is not necessarily year - 1 if a year is missing — confirm intended.
previous_by_player = pdf_mvp.groupby("player_id")[["market_value_in_million_eur", "age"]].shift(1)
pdf_mvp["value_last_year"] = previous_by_player["market_value_in_million_eur"]
pdf_mvp["age_last_year"] = previous_by_player["age"]

# A player's first row has no previous row to lag from, so it is dropped.
pdf_mvp = pdf_mvp.dropna(subset=["value_last_year", "age_last_year"])

In [47]:

# One-hot encode the position and the sub-position. Both sets of indicator
# columns are appended to the right of the existing columns, position first.
position_dummies = pd.get_dummies(pdf_mvp["position"], prefix="pos")
subpos_dummies = pd.get_dummies(pdf_mvp["sub_position"], prefix="subpos")
pdf_mvp = pd.concat([pdf_mvp, position_dummies, subpos_dummies], axis=1)


In [48]:
# Squared distance of the player's age from 25
# (presumably the assumed peak age, per the column name — confirm).
pdf_mvp["age_from_peak"] = pdf_mvp["age"].sub(25).pow(2)

In [49]:
# Write the prepared frame to CSV without the row index.
pdf_mvp.to_csv(
    path_or_buf="../data/intermediate/time_series_model_data_prep.csv",
    index=False,
)