In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [9]:
# Read the training CSV, print its column names, and show the row count
# as the cell's output.
df = pd.read_csv("../data/cleaned_data_train.csv")
print(list(df.columns))
df.shape[0]

['playerId', 'season', 'name', 'position', 'team', 'games_played', 'xGoals', 'pAssists', 'sAssists', 'sog', 'points', 'goals', 'on_ice_chances', 'on_ice_goals', 'icetime_per_game', 'shot_percentage', 'points_per_60', 'ixG-goals', 'ppg', 'apg', 'gpg', 'points_lag_1', 'points_lag_2', 'points_lag_3', 'points_lag_4', 'points_lag_5', 'ppg_lag_1', 'ppg_lag_2', 'ppg_lag_3', 'ppg_lag_4', 'ppg_lag_5', 'pAssists_lag_1', 'pAssists_lag_2', 'pAssists_lag_3', 'pAssists_lag_4', 'pAssists_lag_5', 'sAssists_lag_1', 'sAssists_lag_2', 'sAssists_lag_3', 'sAssists_lag_4', 'sAssists_lag_5', 'goals_lag_1', 'goals_lag_2', 'goals_lag_3', 'goals_lag_4', 'goals_lag_5', 'gpg_lag_1', 'gpg_lag_2', 'gpg_lag_3', 'gpg_lag_4', 'gpg_lag_5', 'apg_lag_1', 'apg_lag_2', 'apg_lag_3', 'apg_lag_4', 'apg_lag_5', 'icetime_per_game_lag_1', 'next_goals_per_game', 'next_assists_per_game', 'age', 'games_played_team', 'xGoalsFor_team', 'goalsFor_team', 'highDangerShotsFor_team', 'highDangerxGoalsFor_team', 'highDangerGoalsFor', 'xGoa

3596

In [11]:
# Input columns for the model, in the order they are passed to the estimator.
features = [
    "games_played",
    "icetime_per_game",
    "position",
    "icetime_per_game_lag_1",
    "games_played_per",
    # Lags 1 through 5 of `games_played_per`, in ascending order.
    *(f"games_played_per_lag_{lag}" for lag in range(1, 6)),
    "games_played_lag_1",
    "points_per_60",
    "age",
]

# Kept as a one-element list, so selecting it from a DataFrame yields a
# single-column DataFrame rather than a Series.
target = ["next_games_played_per"]

# Take an independent copy for modelling: a plain `df_model = df` would only
# alias the loaded frame, so any later in-place edit to `df_model` would
# silently change `df` as well.
df_model = df.copy()


In [12]:
# Separate inputs from the target, then hold out 20% of the rows for
# validation. The fixed random_state keeps the split reproducible.
X, Y = df_model[features], df_model[target]

X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=13,
)

# Show the shape of each split as the cell's output.
tuple(part.shape for part in (X_train, X_valid, Y_train, Y_valid))

((2876, 13), (720, 13), (2876, 1), (720, 1))

In [13]:
# Base regressor handed to the grid search; seeded for reproducibility and
# with XGBoost's own logging silenced.
xgb = XGBRegressor(
    verbosity=0,
    random_state=13,
)

In [14]:
# Integer-encode the categorical `position` feature. The encoder is fitted on
# the training split only and reused on the validation split, so both splits
# share one mapping.
# NOTE(review): LabelEncoder.transform raises ValueError for labels it did not
# see during fit — confirm every validation position also occurs in training.
le = LabelEncoder()

# `.assign` builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning on those
# slices. Overwriting an existing column this way keeps the column order.
X_train = X_train.assign(position=le.fit_transform(X_train["position"]))
X_valid = X_valid.assign(position=le.transform(X_valid["position"]))

In [23]:
# Hyper-parameter values to search exhaustively:
# 2 * 2 * 3 * 2 * 2 * 2 * 3 = 288 combinations.
param_grid = dict(
    max_depth=[1, 3],
    learning_rate=[0.0001, 0.01],
    n_estimators=[300, 500, 700],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0, 0.1],
    reg_lambda=[1, 2, 5],
)

In [24]:
# Exhaustive search over `param_grid`, scored by R^2 with 5-fold
# cross-validation; n_jobs=-1 runs the fits in parallel.
xgb_grid = GridSearchCV(
    xgb,
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [25]:
# Run the grid search on the training split and keep the best estimator.
# `fit` returns the search object itself, so the attribute can be read
# directly off the call.
best_model = xgb_grid.fit(X_train, Y_train).best_estimator_

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [26]:
# Show the hyper-parameter combination that the grid search selected.
print(xgb_grid.best_params_)

{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0, 'reg_lambda': 5, 'subsample': 0.8}


In [27]:
# Predict the target for the held-out validation rows with the best
# estimator from the grid search.
y_pred = best_model.predict(X_valid)

In [28]:
# Mean absolute error of the validation predictions.
mean_absolute_error(y_true=Y_valid, y_pred=y_pred)

0.18217943608760834

In [29]:
# Mean squared error of the validation predictions.
mean_squared_error(y_true=Y_valid, y_pred=y_pred)

0.05692705139517784

In [30]:
# Coefficient of determination (R^2) of the validation predictions.
r2_score(y_true=Y_valid, y_pred=y_pred)

0.33405518531799316