In [150]:
import pandas as pd
from pandas import DataFrame

In [151]:
def calculate_contract_years_left(pdf: DataFrame) -> DataFrame:
    """
    Add the remaining contract length, measured at each row's season end.

    The season end is taken to be June 30th of the row's 'year'. The gap
    between that date and 'contract_expiration_date' is converted from days
    to years (365.25 days per year), rounded to one decimal, and then:
    - missing values (e.g. no expiration date) become 0,
    - the result is limited to the range 0 to 6.

    The frame is modified in place: 'contract_expiration_date' is converted
    to datetime, 'contract_years_left' is added, and the scratch column
    'season_end_date' is removed again before returning.

    Args:
        pdf (pd.DataFrame): Frame with 'contract_expiration_date' and 'year' columns.

    Returns:
        pd.DataFrame: The same frame object, now carrying 'contract_years_left'.
    """
    expiry = pd.to_datetime(pdf["contract_expiration_date"])
    pdf["contract_expiration_date"] = expiry

    # Scratch column: June 30th of the row's year.
    pdf["season_end_date"] = pd.to_datetime(pdf["year"].astype(str) + "-06-30")

    days_remaining = (pdf["contract_expiration_date"] - pdf["season_end_date"]).dt.days
    years_remaining = (days_remaining / 365.25).round(1)
    pdf["contract_years_left"] = years_remaining.fillna(0).clip(lower=0, upper=6)

    del pdf["season_end_date"]
    return pdf

In [152]:
def generate_team_strength(data_dir: str = "../data/input") -> DataFrame:
    """
    Calculate seasonal team strength statistics for Premier League clubs.

    Reads 'club_games.csv', 'clubs.csv', 'games.csv' and 'competitions.csv'
    from ``data_dir``, joins them, keeps only rows whose competition name is
    'premier-league', and aggregates per club name and season.

    Results are derived as follows:
    - a win is a row with a truthy 'is_win',
    - a draw is a row where 'own_goals' equals 'opponent_goals',
    - points are 3 per win and 1 per draw,
    - points per game is points divided by the number of rows with a
      non-missing 'is_win'.

    Args:
        data_dir (str): Directory holding the four input CSV files. Defaults
            to '../data/input'.

    Returns:
        pd.DataFrame: One row per (club, season) with the columns:
            - 'name': Club name (from clubs.csv)
            - 'season': Season identifier (from games.csv)
            - 'team_ppg': Points per game
            - 'team_goal_difference': Goals scored minus goals conceded
            - 'team_goals_scored': Total goals scored
            - 'team_goals_conceded': Total goals conceded
    """
    # Load CSVs
    club_games = pd.read_csv(f"{data_dir}/club_games.csv")
    clubs = pd.read_csv(f"{data_dir}/clubs.csv")
    games = pd.read_csv(f"{data_dir}/games.csv")
    competitions = pd.read_csv(f"{data_dir}/competitions.csv")

    # Merge club names and game metadata. The club's 'name' keeps its plain
    # label; the competition's clashing 'name' column becomes 'name_comp'.
    club_games = club_games.merge(clubs, on="club_id", suffixes=("", "_club"))
    club_games = club_games.merge(games, on="game_id", suffixes=("", "_game"))
    club_games = club_games.merge(competitions, on="competition_id", suffixes=("", "_comp"))

    # Filter to Premier League games only
    club_games = club_games[club_games["name_comp"] == "premier-league"]

    # Flag draws from the scoreline. Rows without a recorded result are not
    # counted as games played, so they must not count as draws either.
    level_score = club_games["own_goals"] == club_games["opponent_goals"]
    club_games = club_games.assign(_is_draw=level_score & club_games["is_win"].notna())

    # Aggregate team stats by club name and season
    team_stats = (
        club_games.groupby(["name", "season"])
        .agg(
            games_played=("is_win", "count"),
            wins=("is_win", "sum"),
            draws=("_is_draw", "sum"),
            team_goals_scored=("own_goals", "sum"),
            team_goals_conceded=("opponent_goals", "sum"),
        )
        .reset_index()
    )

    # Derived metrics
    team_stats["points"] = 3 * team_stats["wins"] + team_stats["draws"]
    team_stats["team_ppg"] = team_stats["points"] / team_stats["games_played"]
    team_stats["team_goal_difference"] = (
        team_stats["team_goals_scored"] - team_stats["team_goals_conceded"]
    )

    return team_stats[
        [
            "name",
            "season",
            "team_ppg",
            "team_goal_difference",
            "team_goals_scored",
            "team_goals_conceded",
        ]
    ]


In [153]:
def generate_player_stats(data_dir: str = "../data/input") -> DataFrame:
    """
    Compute per-season Premier League statistics for each player.

    Reads 'appearances.csv', 'games.csv' and 'competitions.csv' from
    ``data_dir``, attaches the competition name and the season to every
    appearance, keeps only appearances whose competition name is
    'premier-league', and aggregates per player and season. Players whose
    summed minutes are 0 are dropped so the per-90 rates are well defined.

    The club is not part of the grouping keys, so the result carries no club
    column: a player who appeared for two clubs in one season gets one row.

    Args:
        data_dir (str): Directory holding the input CSV files. Defaults to
            '../data/input'.

    Returns:
        pd.DataFrame: One row per (player, season) with the columns:
            - player_id
            - player_name
            - season
            - games_played: number of appearances
            - total_minutes: summed 'minutes_played'
            - goals
            - assists
            - goal_contributions: goals + assists
            - goals_per_90
            - assists_per_90
            - contrib_per_90
    """
    # Load CSVs
    appearances = pd.read_csv(f"{data_dir}/appearances.csv")
    games = pd.read_csv(f"{data_dir}/games.csv")
    competitions = pd.read_csv(f"{data_dir}/competitions.csv")

    # Merge competition name
    competitions = competitions[["competition_id", "name"]].rename(
        columns={"name": "competition_name"}
    )
    appearances = appearances.merge(competitions, on="competition_id", how="left")

    # Merge season (from games table)
    appearances = appearances.merge(games[["game_id", "season"]], on="game_id", how="left")

    # Filter for Premier League matches
    df_prem = appearances[appearances["competition_name"] == "premier-league"]

    # Aggregate player stats
    player_stats = (
        df_prem.groupby(["player_id", "player_name", "season"])
        .agg(
            games_played=("appearance_id", "count"),
            total_minutes=("minutes_played", "sum"),
            goals=("goals", "sum"),
            assists=("assists", "sum"),
        )
        .reset_index()
    )

    # Remove players with no minutes played. Copy so the new columns below
    # are written to an independent frame rather than a filtered slice.
    player_stats = player_stats[player_stats["total_minutes"] > 0].copy()

    # Calculate per-90 metrics
    nineties_played = player_stats["total_minutes"] / 90
    player_stats["goal_contributions"] = player_stats["goals"] + player_stats["assists"]
    player_stats["goals_per_90"] = player_stats["goals"] / nineties_played
    player_stats["assists_per_90"] = player_stats["assists"] / nineties_played
    player_stats["contrib_per_90"] = player_stats["goal_contributions"] / nineties_played

    return player_stats


In [154]:

def add_synthetic_retirement_data(pdf: pd.DataFrame) -> pd.DataFrame:
    """
    Append synthetic, declining-value rows for players who look retired.

    A player is treated as retired when their last entry (in the frame's
    current row order) has age >= 32. For each such player one synthetic row
    is added for every age from that last age + 1 up to and including 39,
    skipping ages the player already has a row for. The market value decays
    by 25% per synthetic year, compounding from the last known value:

        value[n] = value[n - 1] * 0.75

    Synthetic rows populate only 'player_id', 'name', 'year', 'age',
    'market_value_in_million_eur', 'value_last_year' and 'synthetic_flag';
    every other column is NaN for them.

    Args:
        pdf (pd.DataFrame): Frame with at least the columns 'player_id',
            'name', 'year', 'age' and 'market_value_in_million_eur'. It is
            not modified.

    Returns:
        pd.DataFrame: A new frame with a fresh 0..n-1 index holding the
        original rows ('synthetic_flag' False) followed by the synthetic rows
        ('synthetic_flag' True).
    """
    # assign() returns a new frame, so the caller's frame is left untouched.
    flagged = pdf.assign(synthetic_flag=False)

    # GroupBy.last() yields, per player, the last non-null value of each column.
    last_entries = flagged.groupby("player_id").last().reset_index()
    retired_players = last_entries[last_entries["age"] >= 32]

    # Collect each retired player's known ages once instead of re-filtering
    # the whole frame inside the loop.
    retired_rows = flagged[flagged["player_id"].isin(retired_players["player_id"])]
    known_ages = {
        player_id: set(ages)
        for player_id, ages in retired_rows.groupby("player_id")["age"]
    }

    synthetic_rows = []
    for player_id, name, last_year, raw_age, last_value_million in zip(
        retired_players["player_id"],
        retired_players["name"],
        retired_players["year"],
        retired_players["age"],
        retired_players["market_value_in_million_eur"],
    ):
        # 'age' may be stored as float; range() needs an int.
        last_age = int(raw_age)
        existing_ages = known_ages.get(player_id, set())

        for age in range(last_age + 1, 40):
            if age in existing_ages:
                continue

            # 25% drop relative to the previous (real or synthetic) value.
            synthetic_value_million = last_value_million * 0.75

            synthetic_rows.append({
                "player_id": player_id,
                "name": name,
                "year": last_year + (age - last_age),
                "age": age,
                "market_value_in_million_eur": synthetic_value_million,
                "value_last_year": last_value_million,
                "synthetic_flag": True,
            })

            # Carry the decayed value forward so the decline compounds.
            last_value_million = synthetic_value_million

    if not synthetic_rows:
        # Nothing to add: skip concatenating an empty frame.
        return flagged.reset_index(drop=True)

    return pd.concat([flagged, pd.DataFrame(synthetic_rows)], ignore_index=True)


In [155]:
# Load the raw valuation history and player master data, keeping only the
# columns used further down.
pdf_valuations = pd.read_csv("../data/input/player_valuations.csv").loc[
    :, ["player_id", "date", "market_value_in_eur"]
]
pdf_players = pd.read_csv("../data/input/players.csv").loc[
    :,
    [
        "player_id",
        "name",
        "date_of_birth",
        "position",
        "sub_position",
        "contract_expiration_date",
        "current_club_name",
        "current_club_domestic_competition_id",
    ],
]

In [156]:

# One row per (player, valuation), limited to players whose current club's
# domestic competition id is 'GB1'.
pdf_joined = pdf_players.merge(pdf_valuations, on="player_id")
pdf_joined = pdf_joined.query("current_club_domestic_competition_id == 'GB1'")

# Parse the date columns once and derive age / calendar fields from them.
valuation_dates = pd.to_datetime(pdf_joined["date"])
birth_dates = pd.to_datetime(pdf_joined["date_of_birth"])
pdf_joined["age"] = (valuation_dates - birth_dates).dt.days // 365
pdf_joined["year"] = valuation_dates.dt.year
pdf_joined["month"] = valuation_dates.dt.month

In [157]:
# Add 'contract_years_left': years from June 30th of each row's 'year' to the
# contract expiration date, with missing values set to 0 and clipped to 0-6.
pdf_joined = calculate_contract_years_left(pdf_joined)

In [158]:
# Express market value in millions of EUR.
pdf_joined["market_value_in_million_eur"] = pdf_joined["market_value_in_eur"].div(1000000)

# Collapse to one row per player and calendar year, taking the first
# non-null value of each attribute within the year.
yearly_attributes = [
    "date_of_birth",
    "market_value_in_million_eur",
    "age",
    "position",
    "sub_position",
    "contract_years_left",
    "current_club_name",
]
pdf_mvp = (
    pdf_joined.groupby(["player_id", "name", "year"])[yearly_attributes]
    .first()
    .reset_index()
    .sort_values(by=["player_id", "year"])
)

# Lag value and age by one row within each player, then drop each player's
# first row, which has no previous row to lag from.
previous_row = pdf_mvp.groupby("player_id")[["market_value_in_million_eur", "age"]].shift(1)
pdf_mvp["value_last_year"] = previous_row["market_value_in_million_eur"]
pdf_mvp["age_last_year"] = previous_row["age"]
pdf_mvp = pdf_mvp.dropna(subset=["value_last_year", "age_last_year"])

In [159]:

# One-hot encode position ('pos_*') and sub-position ('subpos_*') and append
# both sets of indicator columns to the right of the existing columns.
position_dummies = pd.get_dummies(pdf_mvp["position"], prefix="pos")
subpos_dummies = pd.get_dummies(pdf_mvp["sub_position"], prefix="subpos")
pdf_mvp = pd.concat([pdf_mvp, position_dummies, subpos_dummies], axis=1)


In [160]:
# Squared distance of the player's age from 25.
pdf_mvp["age_from_peak"] = pdf_mvp["age"].sub(25).pow(2)

In [161]:
# Attach the club's seasonal strength metrics, matching the player's current
# club name and the valuation year against the club name and season.
# Clashing right-hand columns get the '_team' suffix.
team_strength = generate_team_strength()
pdf_mvp = pdf_mvp.merge(
    team_strength,
    how="left",
    left_on=["current_club_name", "year"],
    right_on=["name", "season"],
    suffixes=("", "_team"),
)

In [162]:
# Attach per-season player statistics, matching player id and the valuation
# year against the stats season. Clashing right-hand columns get the
# '_stats' suffix.
player_season_stats = generate_player_stats()
pdf_mvp = pdf_mvp.merge(
    player_season_stats,
    how="left",
    left_on=["player_id", "year"],
    right_on=["player_id", "season"],
    suffixes=("", "_stats"),
)

In [163]:
# Player-season rows without a match in the stats table come back as NaN from
# the left merge; treat them as zero output for every statistic.
stat_cols = [
    "games_played", "total_minutes",
    "goals", "assists", "goal_contributions",
    "goals_per_90", "assists_per_90", "contrib_per_90",
]

pdf_mvp = pdf_mvp.fillna(value=dict.fromkeys(stat_cols, 0))

In [164]:
# Append synthetic rows (flagged via 'synthetic_flag') up to age 39 for players
# whose last entry is at age 32 or older, with value decaying 25% per year.
pdf_mvp = add_synthetic_retirement_data(pdf_mvp)

In [165]:
# Write the prepared frame to the intermediate data folder, without the index.
pdf_mvp.to_csv("../data/intermediate/time_series_model_data_prep.csv", index=False)