In [13]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Make the sibling ``src`` directory (one level above the working directory)
# importable by appending its absolute path to the module search path.
sys.path.append(os.path.abspath(os.path.join(os.pardir, 'src')))

In [10]:
# Load IPython's autoreload extension and switch it to mode 2 so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project-local import; presumably resolved through the '../src' entry that an
# earlier cell appends to sys.path — TODO confirm the package location.
from data.api_fetcher import ApiFetcher

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Re-load the autoreload extension (safe to call when it is already loaded).
%reload_ext autoreload
# Build the fetcher for the 2015–2025 range passed below.
api = ApiFetcher(starting_year=2015, ending_year=2025)
# NOTE(review): presumably returns a pandas DataFrame holding numeric features
# plus the team id and points columns that prepare_embedding_dataset reads —
# confirm against data.api_fetcher.
df = api.create_numeric_with_team_ids()

In [47]:
def prepare_embedding_dataset(df,
                              home_id_col='home_team_season_id',
                              away_id_col='away_team_season_id',
                              target_cols=('home_pts', 'away_pts'),
                              date_col='game_date'):
    """Split a game-level frame into model inputs for the embedding network.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game. Must contain ``home_id_col``, ``away_id_col`` and
        the first two entries of ``target_cols``.
    home_id_col, away_id_col : str
        Columns holding the integer team identifiers.
    target_cols : sequence of str
        The first two entries are summed row-wise to form the target.
    date_col : str or None
        Column to keep out of the numeric features. It is optional: when the
        column is absent from ``df`` (or ``None`` is passed) it is skipped
        instead of raising ``KeyError``.

    Returns
    -------
    tuple
        ``(X_numeric, X_home_id, X_away_id, y, scaler)`` where ``X_numeric``
        is the standardised numeric feature matrix, the id arrays are
        ``int32``, ``y`` is a ``float32`` vector and ``scaler`` is the
        ``StandardScaler`` fitted on ``X_numeric``.

    Raises
    ------
    KeyError
        If an id column or one of the two target columns is missing.

    Notes
    -----
    The scaler is fitted on every row of ``df``. If the caller splits the data
    afterwards, the held-out rows have contributed to the scaling statistics.
    """
    # Target: sum of the two point columns.
    y = (df[target_cols[0]] + df[target_cols[1]]).values.astype(np.float32)

    # Ids, targets and the date are not model features. ``errors='ignore'``
    # keeps feature selection working when an excluded column (typically the
    # date) is not present in ``df``; required columns are still validated by
    # the direct lookups above and below.
    exclude_cols = [home_id_col, away_id_col] + list(target_cols) + [date_col]
    numeric_cols = (
        df.drop(columns=exclude_cols, errors='ignore')
          .select_dtypes(include=[np.number])
          .columns
          .tolist()
    )

    # Numeric features as a float32 matrix, then standardised.
    X_numeric = df[numeric_cols].values.astype(np.float32)
    scaler = StandardScaler()
    X_numeric = scaler.fit_transform(X_numeric)

    # Team ids used as embedding indices.
    X_home_id = df[home_id_col].values.astype(np.int32)
    X_away_id = df[away_id_col].values.astype(np.int32)

    return X_numeric, X_home_id, X_away_id, y, scaler




In [None]:
#X_numeric, X_home_id, X_away_id, y, scaler = prepare_embedding_dataset(df)

#print("X_numeric shape:", X_numeric.shape)
#print("Home IDs shape:", X_home_id.shape)
#print("Away IDs shape:", X_away_id.shape)
#print("y shape:", y.shape)


X_numeric shape: (11973, 22)
Home IDs shape: (11973,)
Away IDs shape: (11973,)
y shape: (11973,)


In [42]:
import torch
import torch.nn as nn

class NBAEmbeddingModel(nn.Module):
    """Regression network combining numeric features with two team embeddings.

    The home and away ids are looked up in separate embedding tables, the two
    vectors are concatenated with the numeric features, and the result passes
    through two ReLU-activated hidden layers (64 and 32 units) into a single
    linear output unit.

    Args:
        num_numeric_features: Width of the numeric feature vector per row.
        num_teams: Number of rows in each embedding table; ids passed to
            ``forward`` are used as indices into these tables.
        embedding_dim: Size of each team embedding vector.
    """

    def __init__(self, num_numeric_features, num_teams, embedding_dim=8):
        super().__init__()

        # One lookup table per side, so a team can be represented differently
        # as home and as away.
        self.home_embedding = nn.Embedding(num_teams, embedding_dim)
        self.away_embedding = nn.Embedding(num_teams, embedding_dim)

        # Width of the concatenated input: numeric block + both embeddings.
        concat_width = num_numeric_features + 2 * embedding_dim

        # Fully connected stack: concat_width -> 64 -> 32 -> 1.
        self.fc1 = nn.Linear(concat_width, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)

        # Shared non-linearity for the hidden layers.
        self.relu = nn.ReLU()

    def forward(self, numeric_features, home_ids, away_ids):
        """Return one unactivated regression value per input row."""
        pieces = (
            numeric_features,
            self.home_embedding(home_ids),
            self.away_embedding(away_ids),
        )
        hidden = torch.cat(pieces, dim=1)

        # Hidden layers, each followed by ReLU.
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))

        # Linear head: no activation on the regression output.
        return self.output(hidden)


In [49]:
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Build the feature arrays, id arrays, target and fitted scaler from the frame.
# NOTE(review): the scaler is fitted inside prepare_embedding_dataset on all
# rows, i.e. before the train/test split below — confirm that is intended.
X_numeric, X_home_id, X_away_id, y, scaler = prepare_embedding_dataset(df)

# Order every array chronologically.
# NOTE(review): this requires df to contain a 'game_date' column.
dates = pd.to_datetime(df['game_date'])
sorted_idx = dates.argsort()
X_numeric, X_home_id, X_away_id, y = [
    arr[sorted_idx] for arr in (X_numeric, X_home_id, X_away_id, y)
]

# Time-based hold-out: the earliest 80% of rows train, the latest 20% test.
split_idx = int(len(df) * 0.8)

# Slice at the split point and convert to tensors in one step.
# Numeric features -> float32.
X_numeric_train, X_numeric_test = (
    torch.tensor(X_numeric[:split_idx], dtype=torch.float32),
    torch.tensor(X_numeric[split_idx:], dtype=torch.float32),
)
# Team ids -> long, the index dtype used for the embedding lookups.
X_home_train, X_home_test = (
    torch.tensor(X_home_id[:split_idx], dtype=torch.long),
    torch.tensor(X_home_id[split_idx:], dtype=torch.long),
)
X_away_train, X_away_test = (
    torch.tensor(X_away_id[:split_idx], dtype=torch.long),
    torch.tensor(X_away_id[split_idx:], dtype=torch.long),
)
# Targets -> float32 column vectors of shape (n, 1).
y_train, y_test = (
    torch.tensor(y[:split_idx], dtype=torch.float32).unsqueeze(-1),
    torch.tensor(y[split_idx:], dtype=torch.float32).unsqueeze(-1),
)

# Wrap the aligned tensors into datasets.
train_dataset = TensorDataset(X_numeric_train, X_home_train, X_away_train, y_train)
test_dataset = TensorDataset(X_numeric_test, X_home_test, X_away_test, y_test)

# Mini-batch loaders: shuffle the training rows, keep test order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

KeyError: "['game_date'] not found in axis"