In [27]:
import pandas as pd
from pandas import DataFrame

In [28]:
def calculate_contract_years_left(pdf: DataFrame) -> DataFrame:
    """
    Add the remaining contract length, in years, measured at each season's end.

    The season end is taken to be June 30th of the row's 'year'. The gap between
    'contract_expiration_date' and that date is converted from days to years
    (365.25 days per year) and rounded to one decimal place. Missing values become
    0 and the result is limited to the range [0, 6].

    Note that ``pdf`` is modified in place: 'contract_expiration_date' is converted
    to datetime, 'contract_years_left' is added, and any 'season_end_date' column
    is removed.

    Args:
        pdf (pd.DataFrame): A DataFrame with columns 'contract_expiration_date' and 'year'.

    Returns:
        pd.DataFrame: The same DataFrame object, with 'contract_years_left' added.
    """
    days_per_year = 365.25

    pdf["contract_expiration_date"] = pd.to_datetime(pdf["contract_expiration_date"])
    # Temporary helper column; removed again before returning.
    pdf["season_end_date"] = pd.to_datetime(pdf["year"].astype(str) + "-06-30")

    time_remaining = pdf["contract_expiration_date"] - pdf["season_end_date"]
    years_remaining = (time_remaining.dt.days / days_per_year).round(1)

    # Unknown expiry dates count as no time left; expired contracts floor at 0.
    pdf["contract_years_left"] = years_remaining.fillna(0).clip(lower=0, upper=6)

    del pdf["season_end_date"]
    return pdf

In [29]:
def generate_team_strength(data_dir: str = "../data/input") -> DataFrame:
    """
    Calculate seasonal team strength statistics for Premier League clubs.

    Reads 'club_games.csv', 'clubs.csv', 'games.csv' and 'competitions.csv' from
    ``data_dir``, joins them, keeps the rows whose competition name is
    "premier-league", and aggregates per club name and season.

    Points per game awards 3 points per win and 1 per draw. A draw is a row with a
    known result ('is_win' not null) where 'own_goals' equals 'opponent_goals'.

    Args:
        data_dir (str): Directory containing the four input CSV files. The default
            reproduces the previously hard-coded location.

    Returns:
        pd.DataFrame: One row per (club, season) with the following columns:
            - 'name': Club name (the 'name' column after merging clubs.csv)
            - 'season': Season identifier from the merged game metadata
            - 'team_ppg' (float): Points per game
            - 'team_goal_difference': Goals scored minus goals conceded
            - 'team_goals_scored': Total goals scored
            - 'team_goals_conceded': Total goals conceded
    """
    # Load CSVs
    club_games = pd.read_csv(f"{data_dir}/club_games.csv")
    clubs = pd.read_csv(f"{data_dir}/clubs.csv")
    games = pd.read_csv(f"{data_dir}/games.csv")
    competitions = pd.read_csv(f"{data_dir}/competitions.csv")

    # Merge club names and game metadata. Colliding right-hand columns receive a
    # suffix, so the competition's name ends up in 'name_comp'.
    club_games = club_games.merge(clubs, on="club_id", suffixes=("", "_club"))
    club_games = club_games.merge(games, on="game_id", suffixes=("", "_game"))
    club_games = club_games.merge(competitions, on="competition_id", suffixes=("", "_comp"))

    # Filter to Premier League games only
    league_games = club_games[club_games["name_comp"] == "premier-league"]

    # Flag draws row by row so they are aggregated in the same groupby as the
    # other counters. The earlier approach subtracted the non-win count from
    # (games - wins), which always produced zero draws.
    league_games = league_games.assign(
        is_draw=(league_games["own_goals"] == league_games["opponent_goals"])
        & league_games["is_win"].notna()
    )

    # Aggregate team stats by club name and season
    team_stats = (
        league_games.groupby(["name", "season"])
        .agg(
            games_played=("is_win", "count"),
            wins=("is_win", "sum"),
            draws=("is_draw", "sum"),
            team_goals_scored=("own_goals", "sum"),
            team_goals_conceded=("opponent_goals", "sum"),
        )
        .reset_index()
    )

    # Derived metrics
    team_stats["losses"] = team_stats["games_played"] - team_stats["wins"] - team_stats["draws"]
    team_stats["points"] = 3 * team_stats["wins"] + team_stats["draws"]
    team_stats["team_ppg"] = team_stats["points"] / team_stats["games_played"]
    team_stats["team_goal_difference"] = (
        team_stats["team_goals_scored"] - team_stats["team_goals_conceded"]
    )

    return team_stats[
        [
            "name",
            "season",
            "team_ppg",
            "team_goal_difference",
            "team_goals_scored",
            "team_goals_conceded",
        ]
    ]


In [30]:
# Load the raw valuation and player tables, keeping only the columns used below.
valuation_columns = ["player_id", "date", "market_value_in_eur"]
player_columns = [
    "player_id",
    "name",
    "date_of_birth",
    "position",
    "sub_position",
    "contract_expiration_date",
    "current_club_name",
    "current_club_domestic_competition_id",
]
pdf_valuations = pd.read_csv("../data/input/player_valuations.csv")[valuation_columns]
pdf_players = pd.read_csv("../data/input/players.csv")[player_columns]

In [31]:

# Pair every valuation record with its player, then keep only players whose
# current club plays in domestic competition 'GB1'.
pdf_joined = (
    pdf_players
    .merge(pdf_valuations, on="player_id")
    .query("current_club_domestic_competition_id == 'GB1'")
)

# Parse the date columns once and derive the time-based features from them.
valuation_dates = pd.to_datetime(pdf_joined["date"])
birth_dates = pd.to_datetime(pdf_joined["date_of_birth"])

# Age in whole years at valuation time, approximated as elapsed days // 365.
pdf_joined["age"] = (valuation_dates - birth_dates).dt.days // 365
pdf_joined["year"] = valuation_dates.dt.year
pdf_joined["month"] = valuation_dates.dt.month

In [32]:
# Add 'contract_years_left' (the helper also converts the expiration date column in place).
pdf_joined = pdf_joined.pipe(calculate_contract_years_left)

In [33]:
# Express market value in millions of euros.
pdf_joined["market_value_in_million_eur"] = pdf_joined["market_value_in_eur"] / 1_000_000

# Collapse to one row per player and calendar year, taking the first non-null
# value of each attribute within the group, ordered by player and year.
yearly_attributes = [
    "date_of_birth",
    "market_value_in_million_eur",
    "age",
    "position",
    "sub_position",
    "contract_years_left",
    "current_club_name",
]
pdf_mvp = (
    pdf_joined
    .groupby(["player_id", "name", "year"])[yearly_attributes]
    .first()
    .reset_index()
    .sort_values(by=["player_id", "year"])
)

# Lag features: the value and age from the player's previous row (the prior
# available year in the sorted frame).
pdf_mvp["value_last_year"] = pdf_mvp.groupby("player_id")["market_value_in_million_eur"].shift(1)
pdf_mvp["age_last_year"] = pdf_mvp.groupby("player_id")["age"].shift(1)

# A player's first row has no predecessor, so it cannot be used.
pdf_mvp = pdf_mvp.dropna(subset=["value_last_year", "age_last_year"])

In [34]:

# One-hot encode the position and sub-position, then append both sets of
# indicator columns (position first, sub-position second) in a single concat.
position_dummies = pd.get_dummies(pdf_mvp["position"], prefix="pos")
subpos_dummies = pd.get_dummies(pdf_mvp["sub_position"], prefix="subpos")
pdf_mvp = pd.concat([pdf_mvp, position_dummies, subpos_dummies], axis=1)


In [35]:
# Squared distance of the player's age from 25 (presumably the assumed peak age).
peak_age = 25
pdf_mvp["age_from_peak"] = (pdf_mvp["age"] - peak_age).pow(2)

In [36]:
# Attach the club's season-level strength metrics. The left join keeps player
# rows that have no matching club/season; colliding team columns get a '_team' suffix.
team_strength = generate_team_strength()
pdf_mvp = pdf_mvp.merge(
    team_strength,
    how="left",
    left_on=["current_club_name", "year"],
    right_on=["name", "season"],
    suffixes=("", "_team"),
)

In [38]:
# Persist the prepared modelling table without the DataFrame index.
output_path = "../data/intermediate/time_series_model_data_prep.csv"
pdf_mvp.to_csv(output_path, index=False)