In [56]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import optuna

In [67]:
df = pd.read_csv("../data/cleaned_data_train.csv")
print(df.columns.tolist())
len(df)

['playerId', 'season', 'name', 'position', 'team', 'games_played', 'xGoals', 'pAssists', 'sAssists', 'sog', 'points', 'goals', 'on_ice_chances', 'on_ice_goals', 'icetime_per_game', 'shot_percentage', 'points_per_60', 'ixG-goals', 'ppg', 'apg', 'gpg', 'points_lag_1', 'points_lag_2', 'points_lag_3', 'points_lag_4', 'points_lag_5', 'ppg_lag_1', 'ppg_lag_2', 'ppg_lag_3', 'ppg_lag_4', 'ppg_lag_5', 'pAssists_lag_1', 'pAssists_lag_2', 'pAssists_lag_3', 'pAssists_lag_4', 'pAssists_lag_5', 'sAssists_lag_1', 'sAssists_lag_2', 'sAssists_lag_3', 'sAssists_lag_4', 'sAssists_lag_5', 'goals_lag_1', 'goals_lag_2', 'goals_lag_3', 'goals_lag_4', 'goals_lag_5', 'gpg_lag_1', 'gpg_lag_2', 'gpg_lag_3', 'gpg_lag_4', 'gpg_lag_5', 'apg_lag_1', 'apg_lag_2', 'apg_lag_3', 'apg_lag_4', 'apg_lag_5', 'icetime_per_game_lag_1', 'next_goals_per_game', 'next_assists_per_game', 'age', 'games_played_team', 'xGoalsFor_team', 'goalsFor_team', 'highDangerShotsFor_team', 'highDangerxGoalsFor_team', 'highDangerGoalsFor', 'xGoa

3596

In [96]:
features = ["games_played", "icetime_per_game", "icetime_per_game_lag_1", "games_played_per", "games_played_per_lag_1",
            "games_played_per_lag_2", "games_played_per_lag_3", "games_played_per_lag_4", "games_played_per_lag_5",
            "games_played_lag_1", "points_per_60", "age", "age2", 'pos_C', 'pos_D', 'pos_L', 'pos_R', 'points', 'on_ice_chances']
target = "next_games_played_per"

df_model = df


In [88]:
X = df_model[features]
Y = df_model[target]

X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size = 0.2, random_state = 0)
X_train.shape, X_valid.shape, Y_train.shape, Y_valid.shape

((2876, 20), (720, 20), (2876,), (720,))

In [89]:
def estimate(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 800),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 5.0),
        "random_state": 42
    }

    model = XGBRegressor(**params)
    model.fit(X_train, Y_train)

    preds = model.predict(X_valid)
    return r2_score(Y_valid, preds)

In [97]:
study = optuna.create_study(direction = "maximize")
study.optimize(estimate, n_trials = 150)

print(study.best_trial.params)

[I 2025-05-08 19:27:29,376] A new study created in memory with name: no-name-d13efbc1-1927-4656-a34e-e5ec3acab24a
[I 2025-05-08 19:27:29,901] Trial 0 finished with value: 0.410648046721918 and parameters: {'n_estimators': 780, 'max_depth': 2, 'learning_rate': 0.012806536589921447, 'subsample': 0.88312739883319, 'colsample_bytree': 0.6276724837661366, 'reg_alpha': 0.12190565675852805, 'reg_lambda': 3.911740025050046}. Best is trial 0 with value: 0.410648046721918.
[I 2025-05-08 19:27:31,095] Trial 1 finished with value: 0.40851007892130886 and parameters: {'n_estimators': 475, 'max_depth': 6, 'learning_rate': 0.012384990816349611, 'subsample': 0.6685399879465592, 'colsample_bytree': 0.8538234759461272, 'reg_alpha': 0.23158141643660668, 'reg_lambda': 2.304411241215356}. Best is trial 0 with value: 0.410648046721918.
[I 2025-05-08 19:27:32,076] Trial 2 finished with value: 0.3614029841881118 and parameters: {'n_estimators': 498, 'max_depth': 6, 'learning_rate': 0.05030588198467021, 'subsa

{'n_estimators': 207, 'max_depth': 4, 'learning_rate': 0.027146413522537303, 'subsample': 0.6716216754413658, 'colsample_bytree': 0.9325212514878091, 'reg_alpha': 0.8997921125572959, 'reg_lambda': 3.5593512319837}


In [98]:
best_model = XGBRegressor(**study.best_trial.params)
best_model.fit(X_train, Y_train)

In [99]:
y_pred = best_model.predict(X_valid)

In [100]:
mean_absolute_error(Y_valid, y_pred)

0.1842969658764738

In [101]:
mean_squared_error(Y_valid, y_pred)

0.05672256231114045

In [102]:
r2_score(Y_valid, y_pred)

0.41309898372033393