In [79]:
import pandas as pd
from utils.output_analysis import plot_player_value_trends, save_output_tables

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as root_mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from typing import List
import plotly.express as px

import uuid
import datetime 
import time

In [80]:
start = time.time()

In [81]:
pdf_mvp = pd.read_csv("data/intermediate/time_series_model_data_prep.csv")

In [None]:


def predict_future_values(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    prediction_years: List[int],
    features: List[str],
    max_depth: int = None  # Not used
) -> tuple[List[pd.DataFrame], LinearRegression, pd.Series]:
    """
    Predicts future player values using Linear Regression with standardized features.

    Returns:
        - List of yearly predicted DataFrames
        - Trained LinearRegression model
        - Actual values from base test year (for evaluation)
    """
    target = "market_value_in_million_eur"

    # Scale features
    scaler = StandardScaler()
    X_train_raw = train_df[features]
    X_train_scaled = scaler.fit_transform(X_train_raw)
    y_train = train_df[target]

    # Fit regression model
    regressor = LinearRegression()
    regressor.fit(X_train_scaled, y_train)

    predictions = []
    current_df = test_df.copy()

    for i, year in enumerate(prediction_years):
        # Handle age-based features
        current_df["age"] = current_df["age_last_year"] + 1
        current_df["age_from_peak"] = (current_df["age"] - 25) ** 2

        # Decrease contract years left by 1, not below 0
        if "contract_years_left" in current_df.columns:
            current_df["contract_years_left"] = (current_df["contract_years_left"] - 1).clip(lower=0)

        X_test_raw = current_df[features]
        X_test_scaled = scaler.transform(X_test_raw)

        current_df["predicted_value"] = regressor.predict(X_test_scaled)
        current_df["year"] = year

        if i == 0:
            y_test = current_df[target]
            rmse = root_mean_squared_error(y_test, current_df["predicted_value"], squared=False)
            r2 = r2_score(y_test, current_df["predicted_value"])
            print(f"📅 {year} RMSE: {rmse:.2f}")
            print(f"📈 {year} R²: {r2:.3f}")

            fig = px.scatter(current_df, x="predicted_value", y=target, hover_data=["name", "age"])
            fig.show()

        # Prepare next year’s data
        current_df["age_last_year"] = current_df["age"]
        current_df["value_last_year"] = current_df["predicted_value"]

        # Save results (including contract years if available)
        cols_to_keep = ["player_id", "year", "age", "predicted_value", "age_from_peak"]
        if "contract_years_left" in current_df.columns:
            cols_to_keep.append("contract_years_left")

        predictions.append(current_df[cols_to_keep].copy())

        if i < len(prediction_years) - 1:
            pos_cols = [col for col in current_df.columns if col.startswith("pos_")]
            subpos_cols = [col for col in current_df.columns if col.startswith("subpos_")]
            static_cols = pos_cols + subpos_cols

            carry_cols = ["player_id", "value_last_year", "age", "age_from_peak", *static_cols]
            if "contract_years_left" in current_df.columns:
                carry_cols.append("contract_years_left")

            current_df = current_df[carry_cols].copy()
            current_df.rename(columns={"age": "age_last_year"}, inplace=True)


    return predictions, regressor, y_test


In [83]:
test_start = 2023

version  = "v0.0.1"

features = [
    "value_last_year",
    #"age_last_year",
    "age_from_peak",
    "pos_Attack",
    "pos_Defender",
    "pos_Goalkeeper",
    "pos_Midfield",
    "contract_years_left",
]

subpos_features = [col for col in pdf_mvp.columns if col.startswith("subpos_")]
features += subpos_features


In [84]:
train_df = pdf_mvp[pdf_mvp["year"] < test_start]
test_df = pdf_mvp[pdf_mvp["year"] == test_start]

predicted_dfs, regressor, y_test = predict_future_values(
    train_df,
    test_df,
    prediction_years=[2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037, 2038],
    features=features
)

forecast_df = pd.concat(predicted_dfs, ignore_index=True)

📅 2023 RMSE: 6.91
📈 2023 R²: 0.811



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.




🔎 Linear Regression Coefficients (Standardized Features):
                  Feature   Coefficient
        subpos_Goalkeeper  2.657357e+13
  subpos_Central Midfield  2.560071e+13
             pos_Midfield -2.253251e+13
subpos_Defensive Midfield  1.930775e+13
           pos_Goalkeeper -1.864093e+13
subpos_Attacking Midfield  1.555294e+13
       subpos_Centre-Back  1.549684e+13
               pos_Attack  1.474708e+13
        subpos_Right-Back  1.123943e+13
         subpos_Left-Back  9.924079e+12
     subpos_Left Midfield  8.787226e+12
             pos_Defender -7.822293e+12
    subpos_Right Midfield  6.183129e+12
    subpos_Centre-Forward -3.771988e+12
      subpos_Right Winger -2.421730e+12
       subpos_Left Winger -2.181998e+12
    subpos_Second Striker -7.425499e+11
          value_last_year  1.005927e+01
      contract_years_left  8.647266e-01
            age_from_peak -3.201499e-01


In [85]:
# Print coefficients
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": regressor.coef_
}).sort_values(by="Coefficient", key=abs, ascending=False)

print("\n🔎 Linear Regression Coefficients:")
print(coef_df.to_string(index=False))


🔎 Linear Regression Coefficients:
                  Feature   Coefficient
        subpos_Goalkeeper  2.657357e+13
  subpos_Central Midfield  2.560071e+13
             pos_Midfield -2.253251e+13
subpos_Defensive Midfield  1.930775e+13
           pos_Goalkeeper -1.864093e+13
subpos_Attacking Midfield  1.555294e+13
       subpos_Centre-Back  1.549684e+13
               pos_Attack  1.474708e+13
        subpos_Right-Back  1.123943e+13
         subpos_Left-Back  9.924079e+12
     subpos_Left Midfield  8.787226e+12
             pos_Defender -7.822293e+12
    subpos_Right Midfield  6.183129e+12
    subpos_Centre-Forward -3.771988e+12
      subpos_Right Winger -2.421730e+12
       subpos_Left Winger -2.181998e+12
    subpos_Second Striker -7.425499e+11
          value_last_year  1.005927e+01
      contract_years_left  8.647266e-01
            age_from_peak -3.201499e-01


In [86]:
# Merge with actual market values for evaluation

merged = forecast_df.merge(test_df[["player_id", "name"]], on=["player_id"], how="left")
end = time.time()
diff = end - start

In [87]:
players = [418560, 357662, 565822, 859951, 148455, 274839, 542586]
plot_player_value_trends(train_df=train_df, merged_df=merged, player_ids=players)

In [88]:
output_df = merged.query("year == 2023")
output_df["actual_value"] = y_test.values




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [89]:
output_df["model_output_id"] = uuid.uuid4()
output_df["model_run_date"] = datetime.datetime.now()
output_df["time_taken_seconds"] = diff
output_df["features_used"] = str(features)
output_df["model_type"] = "Regression"
output_df["split_year"] = test_start
output_df["version"] = version



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [90]:
save_output_tables(output_df)