In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('../data/lstm_dataset_final.csv')

df = df.sort_values(by=['playerId', 'season'])

features = [
    'xGoals', 'pAssists', 'sAssists', 'sog', 'points', 'goals',
    'on_ice_chances', 'on_ice_goals', 'hits', 'penality_mins', 'pen_drawn',
    'corsi', 'xGoalsForAfterShifts', 'corsiForAfterShifts',
    'icetime_per_game', 'shot_percentage', 'points_per_60',
    'ixG-goals', 'ppg', 'apg', 'gpg',
    'xGoalsFor-goalsFor_team', 'age', 'age2', 'games_played_per',
    'pos_C', 'pos_D', 'pos_L', 'pos_R'
]
targets = ['next_games_played_per', 'next_goals_per_game', 'next_assists_per_game']

df = df.dropna(subset=features + targets)

for col in ['pos_C', 'pos_D', 'pos_L', 'pos_R']:
    df[col] = df[col].astype(int)
    
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,playerId,season,name,position,team,games_played,xGoals,pAssists,...,xGoalsFor-goalsFor_team,age2,games_played_per,pos_C,pos_D,pos_L,pos_R,next_games_played_per,next_goals_per_game,next_assists_per_game
27,27,725,8448208,2015,Jaromir Jagr,R,FLA,79,19.76,21.0,...,-8.23,1849,0.963415,0,0,0,1,0.987805,0.197531,0.37037
1516,1516,1652,8448208,2016,Jaromir Jagr,R,FLA,81,23.04,20.0,...,20.95,1936,0.987805,0,0,0,1,0.268293,0.045455,0.272727
840,840,137,8462038,2015,Shane Doan,R,ARI,72,21.16,12.0,...,-6.89,1521,0.878049,0,0,0,1,0.890244,0.068493,0.287671
129,129,484,8462042,2015,Jarome Iginla,R,COL,82,18.66,16.0,...,-10.79,1444,1.0,0,0,0,1,0.963415,0.177215,0.164557
437,437,173,8464989,2015,Matt Cullen,C,PIT,82,11.16,11.0,...,-0.07,1521,1.0,1,0,0,0,0.878049,0.180556,0.25


In [5]:
from collections import defaultdict

In [6]:
seq = 3

X = []
y = []

player_groups = df.groupby('playerId')

for player_id, group in player_groups:
    group = group.sort_values('season')
    
    for i in range(1, len(group)):
        start = max(0, i - seq)
        sequence = group.iloc[start:i][features].values
        target = group.iloc[i][targets].values
        
        padding_flag = np.zeros((seq, 1)) 
        
        if sequence.shape[0] == 1:
            sequence = np.vstack([sequence[0], sequence[0], sequence[0]])
            padding_flag[:2] = 1
        elif sequence.shape[0] == 2:
            sequence = np.vstack([sequence[0], sequence[0], sequence[1]])
            padding_flag[:1] = 1
        elif sequence.shape[0] > 3:
            sequence = sequence[-3:]
            
        sequence = np.concatenate([sequence, padding_flag], axis=1)
        
        X.append(sequence)
        y.append(target)

X = np.array(X)
y = np.array(y)

In [7]:
from sklearn.preprocessing import StandardScaler

In [9]:
X_scaler = StandardScaler()
y_scaler = StandardScaler()

original_shape = X.shape

X_flat = X.reshape(-1, X.shape[2])
X_flat_scaled = X_scaler.fit_transform(X_flat)
X_seq_scaled = X_flat_scaled.reshape(original_shape)

y_seq_scaled = y_scaler.fit_transform(y)

In [10]:
from torch.utils.data import Dataset

In [11]:
class NHLDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [12]:
import torch.nn as nn

In [13]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(GRU, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_size, 3)

    def forward(self, x):
        _, hn = self.gru(x)
        out = self.fc(hn[-1])
        return out

In [14]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_seq_scaled, y_seq_scaled, test_size=0.2, random_state=42)

train_loader = DataLoader(NHLDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(NHLDataset(X_val, y_val), batch_size=64)

In [16]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
def objective(trial):
    hidden_size = trial.suggest_int("hidden_size", 64, 256)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = 64

    model = GRU(input_size=X_train.shape[2], hidden_size=hidden_size,
                      num_layers=num_layers, dropout=dropout)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(10):
        model.train()
        for xb, yb in train_loader:
            pred = model(xb)
            loss = loss_fn(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            out = model(xb)
            preds.append(out.numpy())
            targets.append(yb.numpy())

    preds = np.concatenate(preds, axis=0)
    targets = np.concatenate(targets, axis=0)

    preds_orig = y_scaler.inverse_transform(preds)
    targets_orig = y_scaler.inverse_transform(targets)

    return r2_score(targets_orig, preds_orig)

In [18]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2025-06-24 00:27:44,444] A new study created in memory with name: no-name-fa19ff81-9c7f-49e4-9cdb-566c99a2fd69
[I 2025-06-24 00:27:55,462] Trial 0 finished with value: 0.4929819107055664 and parameters: {'hidden_size': 213, 'num_layers': 2, 'dropout': 0.12654290048571187, 'lr': 0.0006950325555687561}. Best is trial 0 with value: 0.4929819107055664.
[I 2025-06-24 00:27:57,964] Trial 1 finished with value: 0.48078015446662903 and parameters: {'hidden_size': 118, 'num_layers': 1, 'dropout': 0.37473928667842654, 'lr': 0.00011310841341702959}. Best is trial 0 with value: 0.4929819107055664.
[I 2025-06-24 00:28:04,568] Trial 2 finished with value: 0.437284380197525 and parameters: {'hidden_size': 128, 'num_layers': 3, 'dropout': 0.13387864372716612, 'lr': 0.009352066396765522}. Best is trial 0 with value: 0.4929819107055664.
[I 2025-06-24 00:28:15,193] Trial 3 finished with value: 0.4961094856262207 and parameters: {'hidden_size': 236, 'num_layers': 2, 'dropout': 0.36416389989879505, 'lr'

In [19]:
best_params = study.best_trial.params
print(best_params)

{'hidden_size': 220, 'num_layers': 2, 'dropout': 0.10551929610710889, 'lr': 0.0005994359774606638}


In [20]:
best_model = GRU(
    input_size=X_train.shape[2],
    hidden_size=best_params["hidden_size"],
    num_layers=best_params["num_layers"],
    dropout=best_params["dropout"] if best_params["num_layers"] > 1 else 0.0
)

optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params["lr"])
loss_fn = nn.MSELoss()

for epoch in range(31):
    best_model.train()
    for xb, yb in train_loader:
        pred = best_model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 0.4448
Epoch 5, Loss: 0.5620
Epoch 10, Loss: 0.4582
Epoch 15, Loss: 0.3648
Epoch 20, Loss: 0.4203
Epoch 25, Loss: 0.3403
Epoch 30, Loss: 0.3413


In [21]:
best_model.eval()
preds, targets = [], []

with torch.no_grad():
    for xb, yb in val_loader:
        out = best_model(xb)
        preds.append(out.numpy())
        targets.append(yb.numpy())

preds = np.concatenate(preds, axis=0)
targets = np.concatenate(targets, axis=0)

preds_orig = y_scaler.inverse_transform(preds)
targets_orig = y_scaler.inverse_transform(targets)

In [22]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

print("R² Score:", r2_score(targets_orig, preds_orig))
print("MAE:", mean_absolute_error(targets_orig, preds_orig))
print("MSE:", mean_squared_error(targets_orig, preds_orig))
print("RMSE:", np.sqrt(mean_squared_error(targets_orig, preds_orig)))

R² Score: 0.4694139063358307
MAE: 0.11816316843032837
MSE: 0.031387802213430405
RMSE: 0.17716603007752474


In [23]:
target_names = ['next_games_played_per', 'next_goals_per_game', 'next_assists_per_game']
for i, name in enumerate(target_names):
    print(f"{name}")
    print("R²:", r2_score(targets_orig[:, i], preds_orig[:, i]))
    print("MAE:", mean_absolute_error(targets_orig[:, i], preds_orig[:, i]))
    print("MSE:", mean_squared_error(targets_orig[:, i], preds_orig[:, i]))
    print("RMSE:", np.sqrt(mean_squared_error(targets_orig[:, i], preds_orig[:, i])))

next_games_played_per
R²: 0.24836891889572144
MAE: 0.19956953823566437
MSE: 0.07025319337844849
RMSE: 0.2650531897156653
next_goals_per_game
R²: 0.5640271902084351
MAE: 0.06312374770641327
MSE: 0.008871781639754772
RMSE: 0.09419013557562582
next_assists_per_game
R²: 0.5958454608917236
MAE: 0.09179605543613434
MSE: 0.015038395300507545
RMSE: 0.12263113511872727


In [24]:
import joblib

In [25]:
torch.save(best_model.state_dict(), "../models/gru_nhl_model.pt")
joblib.dump(X_scaler, "../models/X_scaler_gru.pkl")
joblib.dump(y_scaler, "../models/y_scaler_gru.pkl")

['../models/y_scaler_gru.pkl']