In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [None]:
%load_ext autoreload
%autoreload 2
from data.api_fetcher import ApiFetcher
from model.team_embeddings import EmbeddingFetcher
from model.team_embeddings import TeamEmbeddingsModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Build the game-level DataFrame through the project's ApiFetcher, configured
# with starting_year=2019 and ending_year=2025.
# NOTE(review): date=True / ids=True presumably keep the 'date', 'home_team_id'
# and 'away_team_id' columns that prep_df and train_model below rely on —
# confirm against ApiFetcher.get_dataframe.
api = ApiFetcher(starting_year=2019, ending_year=2025)
df = api.get_dataframe(numeric=False, date=True, time_coeff=False, ids=True)

In [43]:
# Fit the project's team-embedding model on the fetched frame.
# train() is unpacked here as (test MSE, trained torch model); the model's
# embedding tables are extracted in the next cell.
trainer = TeamEmbeddingsModel(df)
test_mse, trained_model = trainer.train()
print("Test MSE:", test_mse)

  return bound(*args, **kwds)


Test MSE: 19.605910043409956


In [44]:
# Pull the learned embedding tables out of the trained model as NumPy arrays:
# detach from autograd, move to CPU, then convert.
home_embeddings = trained_model.home_embedding.weight.detach().cpu().numpy()
away_embeddings = trained_model.away_embedding.weight.detach().cpu().numpy()
# Wrap both tables in the project's EmbeddingFetcher; NBAEmbeddingDataset below
# queries it through get_home_embedding / get_away_embedding.
embeddings_fetcher = EmbeddingFetcher(home_embeddings, away_embeddings)


In [45]:
def prep_df(df1, target_cols=('home_pts', 'away_pts'), scaler=None):
    """Split a game DataFrame into scaled features, team ids and the target.

    The target is the row-wise sum of ``target_cols`` (with the defaults:
    home points + away points).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Must contain 'home_team_id', 'away_team_id' and every column named in
        ``target_cols``. The columns 'date', 'home_team' and 'away_team' are
        excluded from the features when present; every other remaining column
        is used as a feature and handed to the scaler.
    target_cols : sequence of str, optional
        Columns summed to form the target. Any sequence (list, tuple, ...) is
        accepted.
    scaler : object with ``transform``, optional
        When ``None`` a new ``StandardScaler`` is fitted on ``df1`` (intended
        for the training split). Otherwise the given, already fitted scaler is
        applied without refitting (validation / test splits).

    Returns
    -------
    tuple
        ``(X_numeric, X_team_ids, y, scaler, numeric_cols)`` where
        ``X_numeric`` is the scaled feature matrix, ``X_team_ids`` the integer
        ``[home_team_id, away_team_id]`` pairs, ``y`` the summed target,
        ``scaler`` the scaler that was used, and ``numeric_cols`` the feature
        column names in order.
    """
    # Normalise to a list: a tuple cannot be concatenated with the lists below,
    # and pandas treats a tuple key as a single (MultiIndex) label.
    target_cols = list(target_cols)
    team_id_cols = ['home_team_id', 'away_team_id']
    exclude_cols = set(target_cols + team_id_cols + ['date', 'home_team', 'away_team'])
    numeric_cols = [col for col in df1.columns if col not in exclude_cols]

    X_numeric_raw = df1[numeric_cols].values
    X_team_ids = df1[team_id_cols].astype(int).values
    y = df1[target_cols].sum(axis=1).values

    if scaler is None:
        # Fit only when no scaler is supplied, so held-out splits can reuse
        # the statistics learned on the training split.
        scaler = StandardScaler()
        X_numeric = scaler.fit_transform(X_numeric_raw)
    else:
        X_numeric = scaler.transform(X_numeric_raw)

    return X_numeric, X_team_ids, y, scaler, numeric_cols

In [46]:
class NBAEmbeddingDataset(Dataset):
    """Torch dataset pairing numeric game features with looked-up team embeddings.

    Every item is the tuple ``(numeric_features, home_emb, away_emb, target)``.
    The two embeddings are fetched from ``fetcher`` on each access and
    converted to float32 tensors.
    """

    def __init__(self, X_numeric, X_team_ids, y, fetcher):
        self.fetcher = fetcher
        # Copy the inputs into tensors once, up front.
        self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
        self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    @staticmethod
    def _as_float32(values):
        """Convert a fetched embedding into a float32 tensor."""
        return torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        numeric_row = self.X_numeric[idx]
        # Each id row unpacks into (home id, away id).
        # NOTE(review): the ids reach the fetcher as 0-dim long tensors, not
        # Python ints — confirm the fetcher's lookup handles that.
        home_id, away_id = self.X_team_ids[idx]
        home_vec = self._as_float32(self.fetcher.get_home_embedding(home_id))
        away_vec = self._as_float32(self.fetcher.get_away_embedding(away_id))
        return numeric_row, home_vec, away_vec, self.y[idx]

In [57]:
class MiniNN(nn.Module):
    """Small feed-forward regressor over numeric features plus two team embeddings.

    The three inputs are concatenated along dim 1 and passed through two ReLU
    hidden layers (64 and 32 units) into a single linear output unit.
    """

    def __init__(self, num_numeric_features, embedding_dim):
        super().__init__()
        # One embedding per side (home and away) is appended to the numeric block.
        concat_width = num_numeric_features + 2 * embedding_dim

        self.fc1 = nn.Linear(concat_width, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)

    def forward(self, numeric_features, home_emb, away_emb):
        """Return the prediction for each row with the size-1 output dim removed."""
        hidden = torch.cat((numeric_features, home_emb, away_emb), dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # The output layer yields one value per row in dim 1; squeeze it away.
        return self.out(hidden).squeeze(1)


In [63]:
def _evaluate_mse(model, loader, criterion):
    """Return the sample-weighted mean of `criterion` for `model` over `loader`."""
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_num, home_emb, away_emb, y in loader:
            pred = model(X_num, home_emb, away_emb)
            # Weight by batch size so a short final batch does not skew the mean.
            total_loss += criterion(pred, y).item() * X_num.size(0)
    return total_loss / len(loader.dataset)


def train_model(df, fetcher, embedding_dim, num_epochs, lr, batch_size=64, seed=None):
    """Train a MiniNN on a chronological 70/15/15 split of `df`.

    The frame is sorted by its 'date' column and cut into train / validation /
    test segments in that order. The feature scaler is fitted on the training
    segment only and reused for the other two. Train and validation MSE are
    printed every 10 epochs and the test MSE once at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Game rows; must contain 'date' plus the columns required by prep_df.
    fetcher : object
        Provides get_home_embedding / get_away_embedding for the dataset.
    embedding_dim : int
        Width of a single team embedding.
    num_epochs : int
        Number of passes over the training split.
    lr : float
        Adam learning rate.
    batch_size : int, optional
        Mini-batch size for all three loaders.
    seed : int, optional
        When given, seeds torch's global RNG before the model is created so
        weight initialisation and batch shuffling are repeatable. ``None``
        (default) leaves the RNG state untouched.

    Returns
    -------
    tuple
        ``(model, scaler, numeric_cols)``: the trained model (left in eval
        mode), the scaler fitted on the training split, and the feature
        column names.

    Raises
    ------
    ValueError
        If `df` is too small for every split to contain at least one row.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Chronological split. Positional slicing replaces np.split(DataFrame),
    # which depends on the deprecated DataFrame.swapaxes and emits a
    # FutureWarning on current pandas.
    df_sorted = df.sort_values("date")
    n_rows = len(df_sorted)
    train_end = int(0.7 * n_rows)
    val_end = int(0.85 * n_rows)
    train_df = df_sorted.iloc[:train_end]
    val_df = df_sorted.iloc[train_end:val_end]
    test_df = df_sorted.iloc[val_end:]

    if min(len(train_df), len(val_df), len(test_df)) == 0:
        raise ValueError(
            f"Not enough rows ({n_rows}) for a non-empty 70/15/15 train/val/test split"
        )

    # Fit the scaler on the training split only, then reuse it.
    X_train_num, X_train_ids, y_train, scaler, numeric_cols = prep_df(train_df)
    X_val_num, X_val_ids, y_val, _, _ = prep_df(val_df, scaler=scaler)
    X_test_num, X_test_ids, y_test, _, _ = prep_df(test_df, scaler=scaler)

    # Datasets
    train_dataset = NBAEmbeddingDataset(X_train_num, X_train_ids, y_train, fetcher)
    val_dataset = NBAEmbeddingDataset(X_val_num, X_val_ids, y_val, fetcher)
    test_dataset = NBAEmbeddingDataset(X_test_num, X_test_ids, y_test, fetcher)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model
    num_numeric_features = X_train_num.shape[1]
    model = MiniNN(num_numeric_features, embedding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for X_num, home_emb, away_emb, y in train_loader:
            optimizer.zero_grad()
            pred = model(X_num, home_emb, away_emb)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_num.size(0)
        train_loss /= len(train_loader.dataset)

        # Validation
        val_loss = _evaluate_mse(model, val_loader, criterion)

        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")

    # Test MSE
    test_loss = _evaluate_mse(model, test_loader, criterion)
    print(f"Test MSE: {test_loss:.4f}")

    return model, scaler, numeric_cols
    

In [68]:
# Embedding width = number of columns of the home embedding matrix extracted earlier.
embedding_dim = home_embeddings.shape[1]
# NOTE(review): in the recorded run below the validation MSE swings between
# roughly 46 and 68 while the train MSE keeps falling — lr=0.01 may be too high
# and/or the network is overfitting; worth confirming before relying on the test MSE.
model, scaler, numeric_cols = train_model(df, embeddings_fetcher, embedding_dim, num_epochs=100, lr=0.01)

  return bound(*args, **kwds)


Epoch 10/100 | Train MSE: 47.3264 | Val MSE: 51.9393
Epoch 20/100 | Train MSE: 39.2278 | Val MSE: 56.6664
Epoch 30/100 | Train MSE: 37.8138 | Val MSE: 47.1676
Epoch 40/100 | Train MSE: 33.8047 | Val MSE: 47.3127
Epoch 50/100 | Train MSE: 34.7763 | Val MSE: 49.1492
Epoch 60/100 | Train MSE: 34.0830 | Val MSE: 45.6843
Epoch 70/100 | Train MSE: 31.5679 | Val MSE: 47.8563
Epoch 80/100 | Train MSE: 34.1696 | Val MSE: 48.3356
Epoch 90/100 | Train MSE: 33.3236 | Val MSE: 67.5184
Epoch 100/100 | Train MSE: 28.8428 | Val MSE: 51.8205
Test MSE: 59.0572
