In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [5]:
# Load the cleaned training table that every later cell reads as `df_model`.
df_model = pd.read_csv("../data/cleaned_data_train.csv")

# The original cell inspected `df`, a name no cell in this notebook defines,
# so it only worked with leftover kernel state and raised NameError on a
# fresh run. Inspect the frame that was actually loaded instead. The bare
# `len(...)` expression was dropped: it was not the last expression of the
# cell, so its value was never displayed.
print(df_model.columns.tolist())

['playerId', 'season', 'name', 'position', 'team', 'games_played', 'xGoals', 'pAssists', 'sAssists', 'sog', 'points', 'goals', 'on_ice_chances', 'on_ice_goals', 'icetime_per_game', 'shot_percentage', 'points_per_60', 'ixG-goals', 'ppg', 'apg', 'gpg', 'points_lag_1', 'points_lag_2', 'points_lag_3', 'points_lag_4', 'points_lag_5', 'ppg_lag_1', 'ppg_lag_2', 'ppg_lag_3', 'ppg_lag_4', 'ppg_lag_5', 'pAssists_lag_1', 'pAssists_lag_2', 'pAssists_lag_3', 'pAssists_lag_4', 'pAssists_lag_5', 'sAssists_lag_1', 'sAssists_lag_2', 'sAssists_lag_3', 'sAssists_lag_4', 'sAssists_lag_5', 'goals_lag_1', 'goals_lag_2', 'goals_lag_3', 'goals_lag_4', 'goals_lag_5', 'gpg_lag_1', 'gpg_lag_2', 'gpg_lag_3', 'gpg_lag_4', 'gpg_lag_5', 'apg_lag_1', 'apg_lag_2', 'apg_lag_3', 'apg_lag_4', 'apg_lag_5', 'icetime_per_game_lag_1', 'next_goals_per_game', 'next_assists_per_game', 'age', 'games_played_team', 'xGoalsFor_team', 'goalsFor_team', 'highDangerShotsFor_team', 'highDangerxGoalsFor_team', 'highDangerGoalsFor', 'xGoa

In [17]:
# Model inputs, assembled from themed groups. The concatenation order below
# is the column order of the design matrix, so keep it stable.

# Player attributes (pos_* look like one-hot position flags and age2 like a
# squared-age term -- TODO confirm against the data-prep notebook).
player_cols = ["age", "age2", "pos_C", "pos_D", "pos_L", "pos_R"]

# Current-season individual production and usage.
season_cols = [
    "games_played", "xGoals", "pAssists", "sAssists", "sog", "points", "goals",
    "on_ice_chances", "on_ice_goals", "icetime_per_game", "shot_percentage",
    "points_per_60", "ixG-goals", "ppg", "apg", "gpg",
]

# Lagged statistics (presumably values from earlier seasons -- verify upstream).
points_lag_cols = [
    "points_lag_1", "points_lag_2", "points_lag_3", "points_lag_4", "points_lag_5",
]
ppg_lag_cols = ["ppg_lag_1", "ppg_lag_2", "ppg_lag_3", "ppg_lag_4", "ppg_lag_5"]
assist_lag_cols = ["pAssists_lag_1", "sAssists_lag_1"]
goals_lag_cols = [
    "goals_lag_1", "goals_lag_2", "goals_lag_3", "goals_lag_4", "goals_lag_5",
]
gpg_lag_cols = ["gpg_lag_1", "gpg_lag_2", "gpg_lag_3", "gpg_lag_4", "gpg_lag_5"]
apg_lag_cols = ["apg_lag_1"]

# Team-level context columns.
team_cols = [
    "xGoalsFor_team", "goalsFor_team", "highDangerShotsFor_team",
    "highDangerxGoalsFor_team", "highDangerGoalsFor", "xGoalsFor - goalsFor",
    "games_played_per",
]

features = [
    *player_cols,
    *season_cols,
    *points_lag_cols,
    *ppg_lag_cols,
    *assist_lag_cols,
    *goals_lag_cols,
    *gpg_lag_cols,
    *apg_lag_cols,
    *team_cols,
]

# Kept as a one-element list, so selecting it from a DataFrame yields a
# single-column frame rather than a Series.
target = ["next_goals_per_game"]

In [18]:
# Design matrix and target, selected by the column lists defined earlier.
X, Y = df_model[features], df_model[target]

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_valid, Y_train, Y_valid = split_parts

# Cell output: shapes of the four pieces, in the order they were unpacked.
tuple(part.shape for part in split_parts)

((2876, 57), (720, 57), (2876, 1), (720, 1))

In [19]:
# Base regressor handed to the grid search: seeded, with library logging
# turned off via verbosity=0.
xgb = XGBRegressor(verbosity=0, random_state=0)

In [20]:
# Hyper-parameter search space, grouped by what each setting controls.
# 3 * 2 * 3 * 2 * 2 * 2 * 3 = 432 candidate combinations in total.
tree_and_boosting = {
    "max_depth": [1, 3, 5],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [300, 500, 700],
}
row_and_column_sampling = {
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}
regularization = {
    "reg_alpha": [0, 0.1],
    "reg_lambda": [1, 2, 5],
}

param_grid = {**tree_and_boosting, **row_and_column_sampling, **regularization}

In [21]:
# Exhaustive search over `param_grid`: 5-fold cross-validation, candidates
# ranked by R^2, progress messages on, all available cores used.
search_options = {
    "scoring": "r2",
    "cv": 5,
    "verbose": 1,
    "n_jobs": -1,
}
xgb_grid = GridSearchCV(xgb, param_grid, **search_options)

In [22]:
# Run the grid search on the training split only, then keep the estimator
# refit with the winning parameter combination. `fit` returns the search
# object itself, so the attribute can be read off the same expression.
best_model = xgb_grid.fit(X_train, Y_train).best_estimator_

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [23]:
# Report the parameter combination selected by the grid search.
best_params = xgb_grid.best_params_
print(best_params)

{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 700, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}


In [24]:
# Predict the target for the validation rows with the tuned model; these
# predictions feed the error metrics computed in the following cells.
y_pred = best_model.predict(X_valid)

In [25]:
# Mean absolute error of the validation predictions.
mean_absolute_error(y_true=Y_valid, y_pred=y_pred)

0.05500803887844086

In [26]:
# Mean squared error of the validation predictions.
mean_squared_error(y_true=Y_valid, y_pred=y_pred)

0.00562277901917696

In [27]:
# Coefficient of determination (R^2) of the validation predictions.
r2_score(y_true=Y_valid, y_pred=y_pred)

0.6860904693603516