In [24]:
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
%matplotlib inline

In [25]:
%load_ext autoreload
%autoreload 2
from data import ApiFetcher, DataPreparation
from model.gru_team_embbedings import EmbeddingsTrain
from model.gru_team_embbedings import TeamEmbeddings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
api = ApiFetcher(starting_year=2019, ending_year=2025)
print(api.get_dataframe('leaguegamelog').head())

      game_id  home_fga  away_fga  home_fg_pct  away_fg_pct  home_fg3a  \
0  0021900001       103       102        0.408        0.422         40   
1  0021900002        81        85        0.519        0.435         31   
2  0021900005        86        88        0.430        0.375         30   
3  0021900009        76        93        0.461        0.398         36   
4  0021900003        88       105        0.511        0.467         44   

   away_fg3a  home_fg3_pct  away_fg3_pct  home_oreb  ...  away_blk  home_tov  \
0         45         0.350         0.422         16  ...         9        17   
1         33         0.355         0.394         11  ...         7        14   
2         34         0.300         0.265          7  ...         2        13   
3         41         0.306         0.268          5  ...         3        20   
4         30         0.523         0.300          9  ...         4        19   

   away_tov  home_pf  away_pf  home_pts  away_pts  home_team_id  away_team

In [None]:
dataPrepeparation = DataPreparation('leaguegamelog', starting_year=2000, ending_year=2025)
train_ds, val_ds, test_ds = dataPrepeparation.data

In [28]:
x, home_ids, away_ids, y = train_ds[0]
print(x.shape, y.shape)

torch.Size([10, 17]) torch.Size([2])


In [29]:
n_teams = int(max(np.max(train_ds.home_ids), np.max(train_ds.away_ids)) + 1)
print(n_teams)

30


In [30]:
model = TeamEmbeddings(
    input_num_features=x.shape[1],
    n_teams=n_teams,
    emb_dim=32,
    hidden_size=128,
    num_layers=3,
    output_size=2,
)
trainer = EmbeddingsTrain(
    model,
    learning_rate=1e-3,
    weight_decay=1e-4,
)

In [31]:
import torch
from torch.utils.data import DataLoader

# inicjalizacja trenera
trainer = EmbeddingsTrain(model, learning_rate=1e-3)

# online learning - batch_size = 1
train_loader = DataLoader(train_ds, batch_size=1, shuffle=False)  # zachowujemy chronologię
val_loader = DataLoader(val_ds, batch_size=1, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False)

# ------------------------
# TRAINING (online)
# ------------------------
train_losses = []
for batch in train_loader:
    loss = trainer.step(batch)
    train_losses.append(loss)

print(f"Średni loss na train_ds: {sum(train_losses)/len(train_losses):.3f}")

# ------------------------
# VALIDATION / TEST
# ------------------------
def evaluate(loader):
    preds_list = []
    targets_list = []
    for batch in loader:
        preds = trainer.predict(batch)
        if len(batch) == 2:
            _, yb = batch
        else:
            _, _, _, yb = batch
        targets_list.append(yb)
        preds_list.append(preds)
    return torch.cat(preds_list), torch.cat(targets_list)

val_preds, val_targets = evaluate(val_loader)
test_preds, test_targets = evaluate(test_loader)

# przykładowe metryki
mae = torch.mean(torch.abs(val_preds - val_targets.float()))
rmse = torch.sqrt(torch.mean((val_preds - val_targets.float())**2))
print(f"VAL MAE: {mae:.3f}, VAL RMSE: {rmse:.3f}")


Średni loss na train_ds: 17.021
VAL MAE: 10.356, VAL RMSE: 12.878


In [38]:
import torch
from torch.utils.data import DataLoader

criterion = torch.nn.HuberLoss(reduction="sum")
l1 = torch.nn.L1Loss(reduction="sum")
mse = torch.nn.MSELoss(reduction="sum")

device = next(trainer.model.parameters()).device
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, drop_last=False)

trainer.model.eval()
device = next(trainer.model.parameters()).device

ss_res = torch.zeros(2, device=device)
ss_tot = torch.zeros(2, device=device)
y_tot_sum = torch.zeros(2, device=device)
y_count = 0

# najpierw sumujemy wartości, żeby policzyć mean
for batch in test_loader:
    if len(batch) == 2:
        _, yb = batch
    else:
        _, _, _, yb = batch
    yb = yb.to(device)
    if yb.ndim == 3:
        yb = yb.squeeze(1)
    y_tot_sum += torch.sum(yb, dim=0)
    y_count += yb.size(0)

y_mean = y_tot_sum / y_count

# teraz jeden przebieg: ss_res i ss_tot
sum_loss = sum_mae = sum_mse = n = 0
for batch in test_loader:
    if len(batch) == 2:
        xb, yb = batch
        xb, yb = xb.to(device), yb.to(device)
    else:
        xb, home_ids, away_ids, yb = batch
        xb, yb = xb.to(device), yb.to(device)
        home_ids, away_ids = home_ids.to(device), away_ids.to(device)

    if yb.ndim == 3:
        yb = yb.squeeze(1)

    preds = trainer.model(xb) if len(batch) == 2 else trainer.model(xb, home_ids, away_ids)
    
    bs = yb.size(0)
    sum_loss += criterion(preds, yb).item() * bs
    sum_mae  += l1(preds, yb).item()
    sum_mse  += mse(preds, yb).item()
    
    ss_res += torch.sum((yb - preds)**2, dim=0)
    ss_tot += torch.sum((yb - y_mean)**2, dim=0)
    n += bs * yb.shape[1]

r2 = 1 - ss_res / ss_tot
test_loss = sum_loss / n
test_mae  = sum_mae / n
test_rmse = (sum_mse / n) ** 0.5

print("Test loss:", test_loss)
print("Test MAE:", test_mae)
print("Test RMSE:", test_rmse)
print("R² per target:", r2.tolist())


Test loss: 9.803990972457406
Test MAE: 10.288990082572596
Test RMSE: 12.874862479097734
R² per target: [-0.025451064109802246, -0.004592180252075195]
