In [4]:
import pandas as pd
from utils.output_analysis import  plot_player_value_trends
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as root_mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from typing import List
import plotly.express as px

In [5]:
# Load the prepared time-series modelling table (path is relative to the notebook's working directory).
pdf_mvp = pd.read_csv(
    filepath_or_buffer="data/intermediate/time_series_model_data_prep.csv",
)

In [6]:


def predict_future_values(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    prediction_years: List[int],
    features: List[str],
    max_depth: int = None  # Not used
) -> tuple[List[pd.DataFrame], LinearRegression, pd.Series]:
    """
    Forecast player market values year by year with a standardized Linear Regression.

    The scaler and the regression are fitted once on ``train_df``. The forecast is
    then rolled forward recursively from ``test_df``: for each entry in
    ``prediction_years`` the age columns are advanced by one year, a value is
    predicted, and that prediction becomes ``value_last_year`` for the next step.

    Args:
        train_df: Training rows. Must contain every column in ``features`` and the
            target column ``market_value_in_million_eur``.
        test_df: Rows for the base test year. Must contain ``player_id``,
            ``age_last_year``, ``value_last_year``, the target column, and ``name``
            (used as hover text in the diagnostic plot). A copy is used, so the
            caller's frame is not modified.
        prediction_years: Year labels for each forecast step, in order. The first
            entry is the step that is evaluated against the actual target values.
        features: Model input columns. After the first step only ``player_id``,
            ``value_last_year``, ``age_last_year``, ``age_from_peak`` and columns
            prefixed ``pos_`` / ``subpos_`` are carried forward (``age`` and
            ``age_from_peak`` are recomputed each step), so any other feature
            raises ``KeyError`` on the second step.
        max_depth: Unused; kept so existing callers that pass it keep working.

    Returns:
        A tuple of:
        - list with one DataFrame per forecast year (columns ``player_id``,
          ``year``, ``age``, ``predicted_value``, ``age_from_peak``),
        - the fitted ``LinearRegression`` model,
        - the actual target values of ``test_df`` (for evaluation).

    Side effects:
        Prints RMSE and R² for the first forecast year, shows a plotly scatter of
        predicted vs. actual values, and prints the coefficient table.
    """
    target = "market_value_in_million_eur"

    # The scaler is fitted on the training rows only and merely applied to later data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_df[features])
    y_train = train_df[target]

    regressor = LinearRegression()
    regressor.fit(X_train_scaled, y_train)

    predictions = []
    current_df = test_df.copy()

    # Bound before the loop so the return value exists even when
    # prediction_years is empty (previously an UnboundLocalError).
    y_test = current_df[target].copy()

    for i, year in enumerate(prediction_years):
        # Advance the age-derived inputs by one year.
        current_df["age"] = current_df["age_last_year"] + 1
        # 25 is a hard-coded peak age; presumably it has to match the value used to
        # build age_from_peak in the training data — verify against the data prep.
        current_df["age_from_peak"] = (current_df["age"] - 25) ** 2

        X_test_scaled = scaler.transform(current_df[features])

        current_df["predicted_value"] = regressor.predict(X_test_scaled)
        current_df["year"] = year

        if i == 0:
            # Only the first step is compared with actual values; later steps are
            # driven by the model's own previous predictions.
            y_pred = current_df["predicted_value"]
            # RMSE is computed directly from the residuals. The previous call relied
            # on mean_squared_error(..., squared=False); that keyword was deprecated
            # in scikit-learn 1.4 and removed in 1.6.
            rmse = float(((y_test - y_pred) ** 2).mean() ** 0.5)
            r2 = r2_score(y_test, y_pred)
            print(f"📅 {year} RMSE: {rmse:.2f}")
            print(f"📈 {year} R²: {r2:.3f}")

            fig = px.scatter(current_df, x="predicted_value", y=target, hover_data=["name", "age"])
            fig.show()

        # This year's outputs become next year's lagged inputs.
        current_df["age_last_year"] = current_df["age"]
        current_df["value_last_year"] = current_df["predicted_value"]

        predictions.append(current_df[["player_id", "year", "age", "predicted_value", "age_from_peak"]].copy())

        if i < len(prediction_years) - 1:
            # Keep only the identifier, the lagged inputs and the position dummies
            # for the next step; everything else is dropped.
            pos_cols = [col for col in current_df.columns if col.startswith("pos_")]
            subpos_cols = [col for col in current_df.columns if col.startswith("subpos_")]
            carry_cols = [
                "player_id",
                "value_last_year",
                "age_last_year",
                "age_from_peak",
                *pos_cols,
                *subpos_cols,
            ]
            current_df = current_df[carry_cols].copy()

    # Coefficients are on the standardized scale, ordered by absolute magnitude.
    coef_df = pd.DataFrame({
        "Feature": features,
        "Coefficient": regressor.coef_
    }).sort_values(by="Coefficient", key=abs, ascending=False)

    print("\n🔎 Linear Regression Coefficients (Standardized Features):")
    print(coef_df.to_string(index=False))

    return predictions, regressor, y_test


In [7]:
# Season used as the held-out test year.
test_start = 2023

# Active model inputs.
# Currently disabled candidate features:
#   age_last_year, pos_Attack, pos_Defender, pos_Goalkeeper, pos_Midfield
features = ["value_last_year", "age_from_peak"]


In [8]:
# Chronological split: seasons before test_start train the model, the test_start
# season is the base year that the forecast is rolled forward from.
train_df = pdf_mvp[pdf_mvp["year"] < test_start]
test_df = pdf_mvp[pdf_mvp["year"] == test_start]

# The forecast horizon is derived from test_start (through 2038 inclusive) so the
# first forecast year — the one scored against actual values — always matches the
# season selected for test_df instead of a separately hard-coded 2023.
predicted_dfs, regressor, y_test = predict_future_values(
    train_df,
    test_df,
    prediction_years=list(range(test_start, 2039)),
    features=features,
)

# Stack the per-year frames into one long-format forecast table.
forecast_df = pd.concat(predicted_dfs, ignore_index=True)



📅 2023 RMSE: 7.03
📈 2023 R²: 0.804



🔎 Linear Regression Coefficients (Standardized Features):
        Feature  Coefficient
value_last_year    10.158277
  age_from_peak    -0.531060


In [9]:
# Tabulate the fitted coefficients, largest absolute value first, and print them.
coef_df = pd.DataFrame(
    data={"Feature": features, "Coefficient": regressor.coef_}
).sort_values(by="Coefficient", ascending=False, key=abs)

print("\n🔎 Linear Regression Coefficients:", coef_df.to_string(index=False), sep="\n")


🔎 Linear Regression Coefficients:
        Feature  Coefficient
value_last_year    10.158277
  age_from_peak    -0.531060


In [10]:
# Attach each player's name from the test-year rows to the forecast rows.
# Left join: every forecast row is kept even when no name is found.
player_names = test_df.loc[:, ["player_id", "name"]]
merged = forecast_df.merge(right=player_names, how="left", on="player_id")


In [11]:
# player_id values to plot.
players = [
    418560,
    357662,
    565822,
    859951,
    148455,
    274839,
    542586,
]
plot_player_value_trends(
    train_df=train_df,
    merged_df=merged,
    player_ids=players,
)