In [19]:
import sys
import os
import pandas as pd
import numpy as np
import torch
import copy
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [20]:
%load_ext autoreload
%autoreload 2
from data.api_fetcher import ApiFetcher

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
%reload_ext autoreload
api = ApiFetcher(starting_year=2015, ending_year=2025)
df = api.get_dataframe(numeric=False, ids=True, date=True, time_coeff=False, season_id=True)
print(df.columns)

Index(['home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct', 'home_fg3a',
       'away_fg3a', 'home_fg3_pct', 'away_fg3_pct', 'home_oreb', 'away_oreb',
       'home_dreb', 'away_dreb', 'home_ast', 'away_ast', 'home_stl',
       'away_stl', 'home_blk', 'away_blk', 'home_tov', 'away_tov', 'home_pf',
       'away_pf', 'home_pts', 'away_pts', 'home_team', 'away_team', 'date',
       'home_team_id', 'away_team_id', 'home_team_season_id',
       'away_team_season_id'],
      dtype='object')


In [22]:
def prepare_embedding_dataset(df, 
                               home_id_col='home_team_season_id', 
                               away_id_col='away_team_season_id',
                               target_cols=('home_pts', 'away_pts'),
                               date_col='date'):
    
    #prep target
    y = df[target_cols[0]] + df[target_cols[1]] 
    y = y.values.astype(np.float32)

    #extract ids, leave only numeric data
    exclude_cols = [home_id_col, away_id_col] + list(target_cols) + [date_col]
    numeric_cols = []
    for col in df.columns:
            if col not in exclude_cols:
                # Check if column is numeric and can be converted to float
                if pd.api.types.is_numeric_dtype(df[col]):
                    numeric_cols.append(col)
    
    print(f"Using numeric columns: {numeric_cols}")

    #extract and normalise num features (must be  numpy tensors)

    X_numeric = df[numeric_cols].values.astype(np.float32)
    scaler = StandardScaler()
    X_numeric = scaler.fit_transform(X_numeric)

    # Extract team IDs
    X_home_id = df[home_id_col].values.astype(np.int32)
    X_away_id = df[away_id_col].values.astype(np.int32)
    
    return X_numeric, X_home_id, X_away_id, y, scaler




In [23]:
#X_numeric, X_home_id, X_away_id, y, scaler = prepare_embedding_dataset(df)

#print("X_numeric shape:", X_numeric.shape)
#print("Home IDs shape:", X_home_id.shape)
#print("Away IDs shape:", X_away_id.shape)
#print("y shape:", y.shape)


In [24]:
class NBAEmbeddingModel(nn.Module):
    def __init__(self, num_numeric_features, num_teams, embedding_dim=8):
        super(NBAEmbeddingModel, self).__init__()
        
        # Embedding layers for team IDs
        self.home_embedding = nn.Embedding(num_teams, embedding_dim)
        self.away_embedding = nn.Embedding(num_teams, embedding_dim)
        
        # First hidden layer: (numeric features + 2 embeddings) → 64 neurons
        self.fc1 = nn.Linear(num_numeric_features + embedding_dim * 2, 64)
        
        # Second hidden layer: 64 → 32 neurons
        self.fc2 = nn.Linear(64, 32)
        
        # Output layer: 32 → 1 (predict total points)
        self.output = nn.Linear(32, 1)
        
        # Activation
        self.relu = nn.ReLU()

    def forward(self, numeric_features, home_ids, away_ids):
        # Get embeddings
        home_emb = self.home_embedding(home_ids)   # shape: (batch_size, embedding_dim)
        away_emb = self.away_embedding(away_ids)   # shape: (batch_size, embedding_dim)
        
        # Concatenate numeric + embeddings
        x = torch.cat([numeric_features, home_emb, away_emb], dim=1)
        
        # Pass through hidden layers
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        
        # Output (linear, no activation for regression)
        return self.output(x)


In [25]:
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# 1. Prepare dataset
X_numeric, X_home_id, X_away_id, y, scaler = prepare_embedding_dataset(df)

# Convert date column to datetime
dates = pd.to_datetime(df['date'])

# Sort by date
sorted_idx = dates.argsort()
X_numeric = X_numeric[sorted_idx]
X_home_id = X_home_id[sorted_idx]
X_away_id = X_away_id[sorted_idx]
y = y[sorted_idx]

# Split by date: last 20% as test
split_idx = int(len(df) * 0.8)

X_numeric_train, X_numeric_test = X_numeric[:split_idx], X_numeric[split_idx:]
X_home_train, X_home_test = X_home_id[:split_idx], X_home_id[split_idx:]
X_away_train, X_away_test = X_away_id[:split_idx], X_away_id[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Convert to PyTorch tensors
X_numeric_train = torch.tensor(X_numeric_train, dtype=torch.float32)
X_numeric_test = torch.tensor(X_numeric_test, dtype=torch.float32)
X_home_train = torch.tensor(X_home_train, dtype=torch.long)
X_home_test = torch.tensor(X_home_test, dtype=torch.long)
X_away_train = torch.tensor(X_away_train, dtype=torch.long)
X_away_test = torch.tensor(X_away_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# Create DataLoader
train_dataset = TensorDataset(X_numeric_train, X_home_train, X_away_train, y_train)
test_dataset = TensorDataset(X_numeric_test, X_home_test, X_away_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Using numeric columns: ['home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct', 'home_fg3a', 'away_fg3a', 'home_fg3_pct', 'away_fg3_pct', 'home_oreb', 'away_oreb', 'home_dreb', 'away_dreb', 'home_ast', 'away_ast', 'home_stl', 'away_stl', 'home_blk', 'away_blk', 'home_tov', 'away_tov', 'home_pf', 'away_pf', 'home_team_id', 'away_team_id']


In [26]:
num_numeric_features = X_numeric_train.shape[1]
num_teams = max(X_home_id.max(), X_away_id.max()) + 1  # IDs start at 0
model = NBAEmbeddingModel(num_numeric_features, num_teams, embedding_dim=8)

# Loss and optimizer
criterion = nn.MSELoss()

In [27]:
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-7, weight_decay=1e-4)

# Walidacja
val_split_idx = int(len(X_numeric_train) * 0.8)
X_num_val, X_home_val, X_away_val, y_val = (
    X_numeric_train[val_split_idx:],
    X_home_train[val_split_idx:],
    X_away_train[val_split_idx:],
    y_train[val_split_idx:]
)
X_num_train_only, X_home_train_only, X_away_train_only, y_train_only = (
    X_numeric_train[:val_split_idx],
    X_home_train[:val_split_idx],
    X_away_train[:val_split_idx],
    y_train[:val_split_idx]
)

train_dataset = TensorDataset(X_num_train_only, X_home_train_only, X_away_train_only, y_train_only)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# OneCycleLR
num_epochs = 1000
total_steps = len(train_loader) * num_epochs
max_lr = 1.0
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=max_lr,
    total_steps=total_steps,
    pct_start=0.2,
    anneal_strategy='cos',
    div_factor=25.0,
    final_div_factor=1e4
)

# Early stopping
best_val_loss = float('inf')
patience = 100
patience_counter = 0
best_model_state = None

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    for X_num_batch, X_home_batch, X_away_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_num_batch, X_home_batch, X_away_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item() * X_num_batch.size(0)
    train_loss /= len(train_dataset)
    
    # Walidacja
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_num_val, X_home_val, X_away_val)
        val_loss = criterion(val_outputs, y_val).item()
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_model_state = copy.deepcopy(model.state_dict())
    else:
        patience_counter += 1
    
    if epoch % 50 == 0:
        current_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch}: Train: {train_loss:.6f}, Val: {val_loss:.6f}, LR: {current_lr:.6f}')
    
    if patience_counter >= patience:
        print(f'Early stopping at epoch {epoch}. Best val loss: {best_val_loss:.6f}')
        break

# Załaduj najlepszy model
if best_model_state:
    model.load_state_dict(best_model_state)

Epoch 0: Train: 4721.411394, Val: 1013.225891, LR: 0.040059
Epoch 50: Train: 124.790477, Val: 100.078415, LR: 0.185973
Epoch 100: Train: 443.257559, Val: 536.980835, LR: 0.527571
Epoch 150: Train: 444.624412, Val: 522.890930, LR: 0.864734
Early stopping at epoch 160. Best val loss: 57.476025


In [None]:
model.eval()
criterion = nn.MSELoss()

mse = 0
with torch.no_grad():
    for X_num, X_home, X_away, y in test_loader:
        outputs = model(X_num, X_home, X_away)
        mse += criterion(outputs, y).item() * X_num.size(0)

mse /= len(test_dataset)
print(f"MSE: {mse:.6f}")
