In [154]:
import pandas as pd

In [155]:
df = pd.read_csv('../data/lstm_dataset_final.csv')

df = df.sort_values(by=['playerId', 'season'])

features = [
    'xGoals', 'pAssists', 'sAssists', 'sog', 'points', 'goals',
    'on_ice_chances', 'on_ice_goals', 'hits', 'penality_mins', 'pen_drawn',
    'corsi', 'xGoalsForAfterShifts', 'corsiForAfterShifts',
    'icetime_per_game', 'shot_percentage', 'points_per_60',
    'ixG-goals', 'ppg', 'apg', 'gpg',
    'xGoalsFor-goalsFor_team', 'age', 'age2', 'games_played_per',
    'pos_C', 'pos_D', 'pos_L', 'pos_R'
]
targets = ['next_games_played_per', 'next_goals_per_game', 'next_assists_per_game']

df = df.dropna(subset=features + targets)

for col in ['pos_C', 'pos_D', 'pos_L', 'pos_R']:
    df[col] = df[col].astype(int)
    
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,playerId,season,name,position,team,games_played,xGoals,pAssists,...,xGoalsFor-goalsFor_team,age2,games_played_per,pos_C,pos_D,pos_L,pos_R,next_games_played_per,next_goals_per_game,next_assists_per_game
27,27,725,8448208,2015,Jaromir Jagr,R,FLA,79,19.76,21.0,...,-8.23,1849,0.963415,0,0,0,1,0.987805,0.197531,0.37037
1516,1516,1652,8448208,2016,Jaromir Jagr,R,FLA,81,23.04,20.0,...,20.95,1936,0.987805,0,0,0,1,0.268293,0.045455,0.272727
840,840,137,8462038,2015,Shane Doan,R,ARI,72,21.16,12.0,...,-6.89,1521,0.878049,0,0,0,1,0.890244,0.068493,0.287671
129,129,484,8462042,2015,Jarome Iginla,R,COL,82,18.66,16.0,...,-10.79,1444,1.0,0,0,0,1,0.963415,0.177215,0.164557
437,437,173,8464989,2015,Matt Cullen,C,PIT,82,11.16,11.0,...,-0.07,1521,1.0,1,0,0,0,0.878049,0.180556,0.25


In [156]:
from collections import defaultdict
import numpy as np

In [157]:
seq = 3

X = []
y = []

player_groups = df.groupby('playerId')

for player_id, group in player_groups:
    group = group.sort_values('season')
    
    for i in range(1, len(group)):
        start = max(0, i - seq)
        sequence = group.iloc[start:i][features].values
        target = group.iloc[i][targets].values
        
        padding_flag = np.zeros((seq, 1)) 
        
        if sequence.shape[0] == 1:
            sequence = np.vstack([sequence[0], sequence[0], sequence[0]])
            padding_flag[:2] = 1
        elif sequence.shape[0] == 2:
            sequence = np.vstack([sequence[0], sequence[0], sequence[1]])
            padding_flag[:1] = 1
        elif sequence.shape[0] > 3:
            sequence = sequence[-3:]
            
        sequence = np.concatenate([sequence, padding_flag], axis=1)
        
        X.append(sequence)
        y.append(target)

X = np.array(X)
y = np.array(y)

In [158]:
from sklearn.preprocessing import StandardScaler

In [159]:
X_scaler = StandardScaler()
y_scaler = StandardScaler()

original_shape = X.shape

X_flat = X.reshape(-1, X.shape[2])
X_flat_scaled = X_scaler.fit_transform(X_flat)
X_seq_scaled = X_flat_scaled.reshape(original_shape)

y_seq_scaled = y_scaler.fit_transform(y)

In [161]:
from torch.utils.data import Dataset

In [162]:
class NHLDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [163]:
import torch.nn as nn

In [164]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_size, 3)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return out

In [165]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch

In [166]:
X_train, X_val, y_train, y_val = train_test_split(X_seq_scaled, y_seq_scaled, test_size=0.2, random_state=42)

train_loader = DataLoader(NHLDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(NHLDataset(X_val, y_val), batch_size=64)

In [167]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import optuna

In [168]:
def objective(trial):
    hidden_size = trial.suggest_int("hidden_size", 64, 256)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = 64

    model = LSTM(input_size=X_train.shape[2], hidden_size=hidden_size,
                      num_layers=num_layers, dropout=dropout)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(10):
        model.train()
        for xb, yb in train_loader:
            pred = model(xb)
            loss = loss_fn(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            out = model(xb)
            preds.append(out.numpy())
            targets.append(yb.numpy())

    preds = np.concatenate(preds, axis=0)
    targets = np.concatenate(targets, axis=0)

    preds_orig = y_scaler.inverse_transform(preds)
    targets_orig = y_scaler.inverse_transform(targets)

    return r2_score(targets_orig, preds_orig)

In [169]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2025-06-23 22:24:09,489] A new study created in memory with name: no-name-36aa34eb-b3b6-4d79-8ebb-aa857cc31eaa
[I 2025-06-23 22:24:14,910] Trial 0 finished with value: 0.4511529505252838 and parameters: {'hidden_size': 137, 'num_layers': 2, 'dropout': 0.08107218019487555, 'lr': 0.004233968561589297}. Best is trial 0 with value: 0.4511529505252838.
[I 2025-06-23 22:24:19,959] Trial 1 finished with value: 0.49558091163635254 and parameters: {'hidden_size': 135, 'num_layers': 2, 'dropout': 0.39858891382688816, 'lr': 0.0007002531794920246}. Best is trial 1 with value: 0.49558091163635254.
[I 2025-06-23 22:24:23,753] Trial 2 finished with value: 0.49230265617370605 and parameters: {'hidden_size': 103, 'num_layers': 2, 'dropout': 0.13412897619271247, 'lr': 0.00028966749887620973}. Best is trial 1 with value: 0.49558091163635254.
[I 2025-06-23 22:24:32,317] Trial 3 finished with value: 0.44730380177497864 and parameters: {'hidden_size': 166, 'num_layers': 3, 'dropout': 0.35774347436763904,

In [170]:
best_params = study.best_trial.params
print(best_params)

{'hidden_size': 176, 'num_layers': 2, 'dropout': 0.33538865521448935, 'lr': 0.001118455827194482}


In [179]:
best_model = LSTM(
    input_size=30,
    hidden_size=best_params["hidden_size"],
    num_layers=best_params["num_layers"],
    dropout=best_params["dropout"] if best_params["num_layers"] > 1 else 0.0
)

optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params["lr"])
loss_fn = nn.MSELoss()

for epoch in range(31):
    best_model.train()
    for xb, yb in train_loader:
        pred = best_model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 0.5620
Epoch 5, Loss: 0.3845
Epoch 10, Loss: 0.4680
Epoch 15, Loss: 0.4007
Epoch 20, Loss: 0.5174
Epoch 25, Loss: 0.4665
Epoch 30, Loss: 0.3513


In [180]:
best_model.eval()
preds, targets = [], []

with torch.no_grad():
    for xb, yb in val_loader:
        out = best_model(xb)
        preds.append(out.numpy())
        targets.append(yb.numpy())

preds = np.concatenate(preds, axis=0)
targets = np.concatenate(targets, axis=0)

preds_orig = y_scaler.inverse_transform(preds)
targets_orig = y_scaler.inverse_transform(targets)

In [181]:
print("R² Score:", r2_score(targets_orig, preds_orig))
print("MAE:", mean_absolute_error(targets_orig, preds_orig))
print("MSE:", mean_squared_error(targets_orig, preds_orig))
print("RMSE:", np.sqrt(mean_squared_error(targets_orig, preds_orig)))

R² Score: 0.45383429527282715
MAE: 0.11962541937828064
MSE: 0.03243095800280571
RMSE: 0.18008597392025208


In [182]:
target_names = ['next_games_played_per', 'next_goals_per_game', 'next_assists_per_game']
for i, name in enumerate(target_names):
    print(f"{name}")
    print("R²:", r2_score(targets_orig[:, i], preds_orig[:, i]))
    print("MAE:", mean_absolute_error(targets_orig[:, i], preds_orig[:, i]))
    print("MSE:", mean_squared_error(targets_orig[:, i], preds_orig[:, i]))
    print("RMSE:", np.sqrt(mean_squared_error(targets_orig[:, i], preds_orig[:, i])))

next_games_played_per
R²: 0.22282147407531738
MAE: 0.20151984691619873
MSE: 0.0726410523056984
RMSE: 0.26952004063835105
next_goals_per_game
R²: 0.5612466335296631
MAE: 0.063809335231781
MSE: 0.008928364142775536
RMSE: 0.09449002139260809
next_assists_per_game
R²: 0.5774340629577637
MAE: 0.09354712069034576
MSE: 0.015723474323749542
RMSE: 0.1253932786226979


In [183]:
import joblib

In [184]:
torch.save(best_model.state_dict(), "../models/lstm_nhl_model.pt")
joblib.dump(X_scaler, "../models/X_scaler.pkl")
joblib.dump(y_scaler, "../models/y_scaler.pkl")

['../models/y_scaler.pkl']