In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [4]:
# Load the cleaned training table, then show its column names and row count.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_data_train.csv")
print(list(df.columns))
df.shape[0]

['team', 'season', 'games_played_team', 'xGoalsFor_team', 'goalsFor_team', 'highDangerShotsFor_team', 'highDangerxGoalsFor_team', 'highDangerGoalsFor', 'xGoalsFor - goalsFor', 'playerId', 'name', 'position', 'games_played', 'xGoals', 'pAssists', 'sAssists', 'sog', 'points', 'goals', 'on_ice_chances', 'on_ice_goals', 'icetime_per_game', 'shot_percentage', 'points_per_60', 'ixG-goals', 'ppg', 'apg', 'gpg', 'points_lag_1', 'points_lag_1_missing', 'points_lag_2', 'points_lag_2_missing', 'points_lag_3', 'points_lag_3_missing', 'points_lag_4', 'points_lag_4_missing', 'points_lag_5', 'points_lag_5_missing', 'ppg_lag_1', 'ppg_lag_1_missing', 'ppg_lag_2', 'ppg_lag_2_missing', 'ppg_lag_3', 'ppg_lag_3_missing', 'ppg_lag_4', 'ppg_lag_4_missing', 'ppg_lag_5', 'ppg_lag_5_missing', 'pAssists_lag_1', 'pAssists_lag_1_missing', 'pAssists_lag_2', 'pAssists_lag_2_missing', 'pAssists_lag_3', 'pAssists_lag_3_missing', 'pAssists_lag_4', 'pAssists_lag_4_missing', 'pAssists_lag_5', 'pAssists_lag_5_missing', 's

2071

In [5]:
# Model inputs, in the exact column order used to build X below.
features = [
    # current-season usage
    "games_played",
    "icetime_per_game",
    "position",
    # previous-season ice time (+ missing-value flag)
    "icetime_per_game_lag_1",
    "icetime_per_game_lag_1_missing",
    # games-played share: current season and lag 1 (+ flag)
    "games_played_per",
    "games_played_per_lag_1",
    "games_played_per_lag_1_missing",
    # games-played share: lags 2-5, values first ...
    "games_played_per_lag_2",
    "games_played_per_lag_3",
    "games_played_per_lag_4",
    "games_played_per_lag_5",
    # ... then their missing-value flags
    "games_played_per_lag_2_missing",
    "games_played_per_lag_3_missing",
    "games_played_per_lag_4_missing",
    "games_played_per_lag_5_missing",
    # raw games played, lag 1 (+ flag)
    "games_played_lag_1",
    "games_played_lag_1_missing",
    # remaining inputs
    "points_per_60",
    "age",
]

# Kept as a one-element list so that it can be concatenated with `features`
# and so that selecting it from a DataFrame yields a single-column DataFrame.
target = ["next_games_played_per"]

# Keep only the rows in which every model input and the target are present,
# then show how many rows remain.
df_model = df.dropna(subset=[*features, *target])
df_model.shape[0]

2071

In [6]:
# Separate inputs from the target. `target` is a list, so Y is a single-column
# DataFrame rather than a Series.
X, Y = df_model[features], df_model[target]

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): rows are split at random. If one player contributes several
# seasons, the same player can land in both sets - confirm that is acceptable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.2, random_state=13
)
tuple(part.shape for part in (X_train, X_valid, Y_train, Y_valid))

((1656, 20), (415, 20), (1656, 1), (415, 1))

In [7]:
# Base regressor handed to the grid search: squared-error objective, fixed seed,
# and XGBoost's own logging switched off (verbosity=0).
xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=13,
    verbosity=0,
)

In [8]:
# Replace the `position` column with integer codes.
# The encoder is fitted on the training rows only and then reused for the
# validation rows, so both splits share one mapping; `le.transform` raises
# ValueError if validation contains a position that was not seen in training.
#
# X_train / X_valid are frames returned by train_test_split, and assigning a
# column on them in place can trigger pandas' SettingWithCopyWarning.
# `.assign` returns a new frame with the column replaced (column order is
# unchanged), so the names are rebound instead of mutated.
le = LabelEncoder()
X_train = X_train.assign(position=le.fit_transform(X_train["position"]))
X_valid = X_valid.assign(position=le.transform(X_valid["position"]))

In [20]:
# Hyper-parameter grid for the XGBoost search:
# 2 * 3 * 4 * 2 * 2 * 2 * 3 = 576 combinations.
#
# learning_rate: an earlier version of this grid tried 1e-6, 1e-4 and 1e-2.
# With at most 900 trees, rates of 1e-6 / 1e-4 shrink every tree so heavily
# that predictions barely move away from the initial base score, so two thirds
# of the fits went to candidates that could not compete, and the winning value
# (0.01) sat on the upper edge of the grid. The grid therefore starts at 0.01,
# which keeps the earlier winner searchable, and extends upward.
param_grid = {
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [300, 500, 700, 900],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "reg_alpha": [0, 0.1],
    "reg_lambda": [1, 2, 5],
}

In [21]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, ranking
# candidates by R^2. n_jobs=-1 runs the fits on all available cores and
# verbose=1 prints a one-line summary of the number of fits.
xgb_grid = GridSearchCV(xgb, param_grid, scoring="r2", cv=5, n_jobs=-1, verbose=1)

In [22]:
# Run the search on the training split. `fit` returns the search object itself,
# so the best estimator can be read straight off the call; with GridSearchCV's
# default refit behaviour it has been refitted on all of X_train.
best_model = xgb_grid.fit(X_train, Y_train).best_estimator_

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


In [24]:
# Show the hyper-parameter combination that achieved the best mean CV score.
print(xgb_grid.best_params_)

{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 5, 'subsample': 0.8}


In [25]:
# Predict the target for the held-out validation rows with the tuned model.
y_pred = best_model.predict(X_valid)

In [26]:
# Validation MAE: average absolute gap between prediction and target.
mean_absolute_error(y_true=Y_valid, y_pred=y_pred)

0.18887943029403687

In [27]:
# Validation MSE: average squared gap between prediction and target.
mean_squared_error(y_true=Y_valid, y_pred=y_pred)

0.05667976289987564

In [28]:
# Validation R^2: share of the target's variance explained by the predictions.
r2_score(y_true=Y_valid, y_pred=y_pred)

0.32079434394836426