In [2]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [3]:
%load_ext autoreload
%autoreload 2
from data.api_fetcher import ApiFetcher
from model.team_embeddings import EmbeddingFetcher
from model.team_embeddings import TeamEmbeddingsModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Build the project's ApiFetcher for starting_year=2019 .. ending_year=2025 and
# ask it for the dataframe that every later cell trains on.
# NOTE(review): the numeric/date/time_coeff/ids flags are defined in
# data.api_fetcher (not visible here) — presumably date=True and ids=True keep
# the `date` and team-id columns that prep_df/train_model read; confirm.
api = ApiFetcher(starting_year=2019, ending_year=2025)
df = api.get_dataframe(numeric=False, date=True, time_coeff=False, ids=True)

In [5]:
# Fit the team-embedding model on `df`. train() returns a test MSE and the
# trained model; the model's home/away embedding tables are extracted below.
trainer = TeamEmbeddingsModel(df)
test_mse, trained_model = trainer.train()
print("Test MSE:", test_mse)

  return bound(*args, **kwds)


Test MSE: 18.45200906043242


In [6]:
# Pull the learned home/away embedding weight tables out of the trained model
# as NumPy arrays (detached from autograd, moved to CPU) and wrap them in an
# EmbeddingFetcher, which the datasets below query per team id.
home_embeddings = trained_model.home_embedding.weight.detach().cpu().numpy()
away_embeddings = trained_model.away_embedding.weight.detach().cpu().numpy()
embeddings_fetcher = EmbeddingFetcher(home_embeddings, away_embeddings)


In [7]:
def prep_df(df1, target_cols=['home_pts', 'away_pts'], scaler=None):
    """Split a games frame into scaled features, team ids and a summed target.

    Every column that is not a target, a team-id column, or one of
    'date' / 'home_team' / 'away_team' is treated as a numeric feature.

    Args:
        df1: DataFrame holding the feature, team-id and target columns.
        target_cols: Columns whose row-wise sum becomes the regression target.
        scaler: Already-fitted scaler exposing ``transform``. When None, a new
            StandardScaler is fitted on this frame's features.

    Returns:
        Tuple ``(X_numeric, X_team_ids, y, scaler, numeric_cols)``: the scaled
        feature matrix, the integer (home, away) id matrix, the summed target
        vector, the scaler that was used, and the feature column names.
    """
    id_columns = ['home_team_id', 'away_team_id']
    skipped = target_cols + id_columns + ['date', 'home_team', 'away_team']
    feature_names = [name for name in df1.columns if name not in skipped]

    raw_features = df1[feature_names].values
    team_ids = df1[id_columns].astype(int).values
    summed_target = df1[target_cols].sum(axis=1).values

    # Reuse the caller's scaler when given so val/test share train statistics.
    if scaler is not None:
        return scaler.transform(raw_features), team_ids, summed_target, scaler, feature_names

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(raw_features)
    return scaled_features, team_ids, summed_target, scaler, feature_names

In [8]:
class NBAEmbeddingDataset(Dataset):
    """Dataset yielding (numeric features, home embedding, away embedding, target).

    Numeric features, team ids and targets are converted to tensors once at
    construction; the team embeddings are looked up through ``fetcher`` on
    every item access.
    """

    def __init__(self, X_numeric, X_team_ids, y, fetcher):
        """Store the inputs as tensors and keep a handle on the fetcher.

        Args:
            X_numeric: Array-like of numeric features, one row per sample.
            X_team_ids: Array-like of (home_id, away_id) pairs, one per sample.
            y: Array-like of targets, one entry per sample.
            fetcher: Object exposing ``get_home_embedding(id)`` and
                ``get_away_embedding(id)``.
        """
        self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
        self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.fetcher = fetcher

    def __len__(self):
        """Number of samples (length of the target tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` with both team embeddings resolved."""
        row_features = self.X_numeric[idx]
        # Unpacking the id row hands the fetcher 0-d long tensors.
        home_key, away_key = self.X_team_ids[idx]
        home_vec = torch.tensor(
            self.fetcher.get_home_embedding(home_key), dtype=torch.float32
        )
        away_vec = torch.tensor(
            self.fetcher.get_away_embedding(away_key), dtype=torch.float32
        )
        return row_features, home_vec, away_vec, self.y[idx]

In [9]:
class MiniNN(nn.Module):
    """Small MLP regressor over numeric features plus two team embeddings.

    The input is the concatenation of the numeric features with the home and
    away embeddings; three ReLU hidden layers (128, 64, 32 units) feed a
    single-unit output head.
    """

    def __init__(self, num_numeric_features, embedding_dim):
        """Create the layers.

        Args:
            num_numeric_features: Width of the numeric feature vector.
            embedding_dim: Width of each (home / away) team embedding.
        """
        super().__init__()
        # Two embeddings (home + away) are appended to the numeric features.
        concat_width = num_numeric_features + 2 * embedding_dim
        self.fc1 = nn.Linear(concat_width, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)

    def forward(self, numeric_features, home_emb, away_emb):
        """Return one prediction per row as a 1-D tensor of length batch."""
        hidden = torch.cat((numeric_features, home_emb, away_emb), dim=1)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        scores = self.out(hidden)
        # Drop the trailing singleton dim so the output lines up with 1-D targets.
        return scores.squeeze(1)

In [10]:
def train_model(df, fetcher, embedding_dim, num_epochs, lr, batch_size=64):
    """Train MiniNN on a chronological 70/15/15 split and report test MSE.

    The frame is sorted by its "date" column and cut by position so that
    validation and test games come strictly after the training games. The
    feature scaler is fitted on the training slice only and reused for the
    other two slices.

    Args:
        df: Games DataFrame accepted by ``prep_df`` (needs a "date" column).
        fetcher: Embedding fetcher handed to ``NBAEmbeddingDataset``.
        embedding_dim: Width of each team embedding returned by ``fetcher``.
        num_epochs: Number of passes over the training set.
        lr: Adam learning rate.
        batch_size: Mini-batch size for all three loaders.

    Returns:
        Tuple ``(model, scaler, numeric_cols)``: the trained model (left in
        eval mode), the scaler fitted on the training slice, and the names of
        the numeric feature columns.
    """
    # Sort and split. Positional slicing yields the same three frames as
    # np.split(df_sorted, [train_end, val_end]) but avoids passing a DataFrame
    # through NumPy, which emits a pandas FutureWarning (DataFrame.swapaxes
    # deprecation) on recent pandas versions.
    df_sorted = df.sort_values("date")
    n_rows = len(df_sorted)
    train_end, val_end = int(0.7 * n_rows), int(0.85 * n_rows)
    train_df = df_sorted.iloc[:train_end]
    val_df = df_sorted.iloc[train_end:val_end]
    test_df = df_sorted.iloc[val_end:]

    X_train_num, X_train_ids, y_train, scaler, numeric_cols = prep_df(train_df)
    X_val_num, X_val_ids, y_val, _, _ = prep_df(val_df, scaler=scaler)
    X_test_num, X_test_ids, y_test, _, _ = prep_df(test_df, scaler=scaler)

    # Datasets
    train_dataset = NBAEmbeddingDataset(X_train_num, X_train_ids, y_train, fetcher)
    val_dataset = NBAEmbeddingDataset(X_val_num, X_val_ids, y_val, fetcher)
    test_dataset = NBAEmbeddingDataset(X_test_num, X_test_ids, y_test, fetcher)

    # Only the training loader shuffles; evaluation order does not matter.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model
    num_numeric_features = X_train_num.shape[1]
    model = MiniNN(num_numeric_features, embedding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def _dataset_mse(loader):
        """Return the sample-weighted mean MSE of ``model`` over ``loader``."""
        model.eval()
        n_samples = len(loader.dataset)
        if n_samples == 0:
            # Very small frames can leave a split empty; report NaN rather
            # than dividing by zero.
            return float("nan")
        running = 0.0
        with torch.no_grad():
            for X_num, home_emb, away_emb, y in loader:
                pred = model(X_num, home_emb, away_emb)
                # Weight each batch mean by its size so a short last batch
                # does not skew the dataset-level average.
                running += criterion(pred, y).item() * X_num.size(0)
        return running / n_samples

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for X_num, home_emb, away_emb, y in train_loader:
            optimizer.zero_grad()
            pred = model(X_num, home_emb, away_emb)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_num.size(0)
        train_loss /= len(train_loader.dataset)

        # Validation
        val_loss = _dataset_mse(val_loader)

        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")

    # Test MSE
    test_loss = _dataset_mse(test_loader)
    print(f"Test MSE: {test_loss:.4f}")

    return model, scaler, numeric_cols
    

In [27]:
# The embedding width is the second dimension of the home embedding table;
# train_model forwards it to MiniNN to size the input layer.
# NOTE(review): num_epochs=270 and lr=0.0008991 look like tuned values —
# TODO record where they came from.
embedding_dim = home_embeddings.shape[1]
model, scaler, numeric_cols = train_model(df, embeddings_fetcher, embedding_dim, num_epochs=270, lr=0.0008991)

  return bound(*args, **kwds)


Epoch 10/270 | Train MSE: 115.4103 | Val MSE: 124.0954
Epoch 20/270 | Train MSE: 52.8090 | Val MSE: 59.0087
Epoch 30/270 | Train MSE: 38.8426 | Val MSE: 46.7050
Epoch 40/270 | Train MSE: 35.5091 | Val MSE: 42.1989
Epoch 50/270 | Train MSE: 32.7131 | Val MSE: 43.6055
Epoch 60/270 | Train MSE: 30.1163 | Val MSE: 42.5996
Epoch 70/270 | Train MSE: 28.2149 | Val MSE: 41.6736
Epoch 80/270 | Train MSE: 27.9489 | Val MSE: 48.2381
Epoch 90/270 | Train MSE: 26.3216 | Val MSE: 43.0820
Epoch 100/270 | Train MSE: 25.6309 | Val MSE: 42.2009
Epoch 110/270 | Train MSE: 24.9162 | Val MSE: 45.8874
Epoch 120/270 | Train MSE: 25.3220 | Val MSE: 42.7017
Epoch 130/270 | Train MSE: 23.9628 | Val MSE: 45.2780
Epoch 140/270 | Train MSE: 23.6932 | Val MSE: 43.7720
Epoch 150/270 | Train MSE: 23.3272 | Val MSE: 48.8769
Epoch 160/270 | Train MSE: 22.7800 | Val MSE: 44.9829
Epoch 170/270 | Train MSE: 21.9843 | Val MSE: 43.7482
Epoch 180/270 | Train MSE: 23.2504 | Val MSE: 43.8190
Epoch 190/270 | Train MSE: 21.6494 

In [9]:
    import sys
    import os
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    import torch
    from torch.utils.data import Dataset, DataLoader
    import torch.nn.functional as F
    import torch.optim as optim
    import torch.nn as nn
    import matplotlib.pyplot as plt

    sys.path.append(os.path.abspath(os.path.join('..', 'src')))
    %load_ext autoreload
    %autoreload 2

    from data.api_fetcher import ApiFetcher
    from model.team_embeddings import EmbeddingFetcher
    from model.team_embeddings import TeamEmbeddingsModel

    # Rebuild the games dataframe for starting_year=2019 .. ending_year=2025.
    # NOTE(review): flag semantics (numeric/date/time_coeff/ids) live in
    # data.api_fetcher and are not visible here — confirm against that module.
    api = ApiFetcher(starting_year=2019, ending_year=2025)
    df = api.get_dataframe(numeric=False, date=True, time_coeff=False, ids=True)

    # Extract the learned home/away embedding tables as NumPy arrays and wrap
    # them in an EmbeddingFetcher for per-team lookup.
    # NOTE(review): `trained_model` is not created in this cell; it must already
    # exist from the earlier cell that runs TeamEmbeddingsModel(df).train().
    home_embeddings = trained_model.home_embedding.weight.detach().cpu().numpy()
    away_embeddings = trained_model.away_embedding.weight.detach().cpu().numpy()
    embeddings_fetcher = EmbeddingFetcher(home_embeddings, away_embeddings)

    def prep_df(df1, target_cols=['home_pts', 'away_pts'], scaler=None):
        """Split a games frame into scaled features, team ids and per-team targets.

        Columns other than the targets, the two team-id columns and
        'date' / 'home_team' / 'away_team' are used as numeric features.

        Args:
            df1: DataFrame holding the feature, team-id and target columns.
            target_cols: Columns kept, unsummed, as the multi-output target.
            scaler: Already-fitted scaler exposing ``transform``. When None, a
                new StandardScaler is fitted on this frame's features.

        Returns:
            Tuple ``(X_numeric, X_team_ids, y, scaler, numeric_cols)`` where
            ``y`` has one column per entry of ``target_cols``.
        """
        id_columns = ['home_team_id', 'away_team_id']
        non_features = target_cols + id_columns + ['date', 'home_team', 'away_team']
        feature_names = [label for label in df1.columns if label not in non_features]

        raw_features = df1[feature_names].values
        team_ids = df1[id_columns].astype(int).values
        # Targets stay separate: one column per entry in target_cols.
        targets = df1[target_cols].values

        fit_new_scaler = scaler is None
        if fit_new_scaler:
            scaler = StandardScaler()
            scaled_features = scaler.fit_transform(raw_features)
        else:
            # Reuse the supplied (train-fitted) statistics for val/test frames.
            scaled_features = scaler.transform(raw_features)

        return scaled_features, team_ids, targets, scaler, feature_names

    class NBAEmbeddingDataset(Dataset):
        """Dataset yielding (numeric features, home embedding, away embedding, targets).

        Inputs are tensorised once in the constructor; team embeddings are
        resolved through ``fetcher`` each time an item is requested.
        """

        def __init__(self, X_numeric, X_team_ids, y, fetcher):
            """Convert the arrays to tensors and remember the fetcher.

            Args:
                X_numeric: Array-like of numeric features, one row per sample.
                X_team_ids: Array-like of (home_id, away_id) pairs.
                y: Array-like of targets; callers in this cell pass two
                    columns (home_pts, away_pts) per sample.
                fetcher: Object exposing ``get_home_embedding(id)`` and
                    ``get_away_embedding(id)``.
            """
            self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
            self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
            self.y = torch.tensor(y, dtype=torch.float32)
            self.fetcher = fetcher

        def __len__(self):
            """Number of samples (first dimension of the target tensor)."""
            return len(self.y)

        def __getitem__(self, idx):
            """Return the sample at ``idx`` with both team embeddings attached."""
            row_features = self.X_numeric[idx]
            # Unpacking the id row hands the fetcher 0-d long tensors.
            home_key, away_key = self.X_team_ids[idx]
            home_vec = torch.tensor(
                self.fetcher.get_home_embedding(home_key), dtype=torch.float32
            )
            away_vec = torch.tensor(
                self.fetcher.get_away_embedding(away_key), dtype=torch.float32
            )
            return row_features, home_vec, away_vec, self.y[idx]

    class MiniNN(nn.Module):
        """Small MLP predicting two values per game (home_pts, away_pts).

        The numeric features are concatenated with the home and away
        embeddings, passed through three ReLU hidden layers (128, 64, 32
        units) and projected to a two-unit output head.
        """

        def __init__(self, num_numeric_features, embedding_dim):
            """Create the layers.

            Args:
                num_numeric_features: Width of the numeric feature vector.
                embedding_dim: Width of each (home / away) team embedding.
            """
            super().__init__()
            # Two embeddings (home + away) are appended to the numeric features.
            concat_width = num_numeric_features + 2 * embedding_dim
            self.fc1 = nn.Linear(concat_width, 128)
            self.fc2 = nn.Linear(128, 64)
            self.fc3 = nn.Linear(64, 32)
            # Two outputs: column 0 is home points, column 1 is away points.
            self.out = nn.Linear(32, 2)

        def forward(self, numeric_features, home_emb, away_emb):
            """Return a (batch_size, 2) tensor of predictions."""
            hidden = torch.cat((numeric_features, home_emb, away_emb), dim=1)
            for layer in (self.fc1, self.fc2, self.fc3):
                hidden = F.relu(layer(hidden))
            return self.out(hidden)

    def train_model(df, fetcher, embedding_dim, num_epochs, lr, batch_size=64):
        """Train the two-output MiniNN on a chronological 70/15/15 split.

        The frame is sorted by its "date" column and cut by position so that
        validation and test games come after the training games. The scaler is
        fitted on the training slice only. After training, the test slice is
        scored on four metrics: joint MSE over both outputs, home-only MSE,
        away-only MSE, and the MSE of predicted vs. actual combined points.

        Args:
            df: Games DataFrame accepted by ``prep_df`` (needs a "date" column).
            fetcher: Embedding fetcher handed to ``NBAEmbeddingDataset``.
            embedding_dim: Width of each team embedding returned by ``fetcher``.
            num_epochs: Number of passes over the training set.
            lr: Adam learning rate.
            batch_size: Mini-batch size for all three loaders.

        Returns:
            Tuple ``(model, scaler, numeric_cols, results)`` where ``results``
            maps 'overall_mse', 'home_mse', 'away_mse' and 'total_mse' to the
            test-set values (NaN when the test slice is empty).
        """
        # Sort and split. Positional slicing yields the same three frames as
        # np.split(df_sorted, [train_end, val_end]) but avoids passing a
        # DataFrame through NumPy, which emits a pandas FutureWarning
        # (DataFrame.swapaxes deprecation) on recent pandas versions.
        df_sorted = df.sort_values("date")
        n_rows = len(df_sorted)
        train_end, val_end = int(0.7 * n_rows), int(0.85 * n_rows)
        train_df = df_sorted.iloc[:train_end]
        val_df = df_sorted.iloc[train_end:val_end]
        test_df = df_sorted.iloc[val_end:]

        X_train_num, X_train_ids, y_train, scaler, numeric_cols = prep_df(train_df)
        X_val_num, X_val_ids, y_val, _, _ = prep_df(val_df, scaler=scaler)
        X_test_num, X_test_ids, y_test, _, _ = prep_df(test_df, scaler=scaler)

        # Datasets
        train_dataset = NBAEmbeddingDataset(X_train_num, X_train_ids, y_train, fetcher)
        val_dataset = NBAEmbeddingDataset(X_val_num, X_val_ids, y_val, fetcher)
        test_dataset = NBAEmbeddingDataset(X_test_num, X_test_ids, y_test, fetcher)

        # Only the training loader shuffles; evaluation order does not matter.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Model
        num_numeric_features = X_train_num.shape[1]
        model = MiniNN(num_numeric_features, embedding_dim)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        def _dataset_mse(loader):
            """Return the sample-weighted mean joint MSE of ``model`` over ``loader``."""
            model.eval()
            n_samples = len(loader.dataset)
            if n_samples == 0:
                # Very small frames can leave a split empty; report NaN
                # rather than dividing by zero.
                return float("nan")
            running = 0.0
            with torch.no_grad():
                for X_num, home_emb, away_emb, y in loader:
                    pred = model(X_num, home_emb, away_emb)
                    running += criterion(pred, y).item() * X_num.size(0)
            return running / n_samples

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            train_loss = 0.0
            for X_num, home_emb, away_emb, y in train_loader:
                optimizer.zero_grad()
                pred = model(X_num, home_emb, away_emb)  # Shape: (batch_size, 2)
                loss = criterion(pred, y)  # y has shape (batch_size, 2)
                loss.backward()
                optimizer.step()
                # Weight each batch mean by its size so a short last batch
                # does not skew the epoch average.
                train_loss += loss.item() * X_num.size(0)
            train_loss /= len(train_loader.dataset)

            # Validation
            val_loss = _dataset_mse(val_loader)

            if (epoch+1) % 10 == 0:
                print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")

        # Test evaluation with detailed metrics
        model.eval()
        sums = {'overall_mse': 0.0, 'home_mse': 0.0, 'away_mse': 0.0, 'total_mse': 0.0}

        with torch.no_grad():
            for X_num, home_emb, away_emb, y in test_loader:
                pred = model(X_num, home_emb, away_emb)  # Shape: (batch_size, 2)
                n_batch = X_num.size(0)

                home_pred, away_pred = pred[:, 0], pred[:, 1]
                home_true, away_true = y[:, 0], y[:, 1]

                # Joint MSE over both outputs (mean of the home and away errors).
                sums['overall_mse'] += criterion(pred, y).item() * n_batch
                # Per-team errors.
                sums['home_mse'] += F.mse_loss(home_pred, home_true).item() * n_batch
                sums['away_mse'] += F.mse_loss(away_pred, away_true).item() * n_batch
                # Error on combined points: sum of predictions vs. sum of actuals.
                sums['total_mse'] += F.mse_loss(
                    home_pred + away_pred, home_true + away_true
                ).item() * n_batch

        test_size = len(test_loader.dataset)
        results = {
            name: (value / test_size if test_size else float("nan"))
            for name, value in sums.items()
        }

        print("Test Results:")
        print(f"  Overall MSE (avg of home/away): {results['overall_mse']:.4f}")
        print(f"  Home Points MSE: {results['home_mse']:.4f}")
        print(f"  Away Points MSE: {results['away_mse']:.4f}")
        print(f"  Total Points MSE (sum): {results['total_mse']:.4f}")

        return model, scaler, numeric_cols, results

    # Run the training
    # Run the training. The embedding width is the second dimension of the
    # home embedding table; train_model forwards it to MiniNN.
    # NOTE(review): num_epochs=270 and lr=0.0008991 look like tuned values —
    # TODO record where they came from.
    embedding_dim = home_embeddings.shape[1]
    model, scaler, numeric_cols, results = train_model(
        df, embeddings_fetcher, embedding_dim, 
        num_epochs=270, lr=0.0008991
    )

    # Function to get predictions for meta-model
    # Function to get predictions for meta-model
    def get_predictions(model, df, fetcher, scaler, numeric_cols, batch_size=64):
        """Get home, away, and total predictions from the trained model.

        Features are selected with the ``numeric_cols`` recorded at training
        time, so ``df`` does not need the target columns and may carry extra
        columns without changing the feature layout seen by the scaler and
        the model.

        Args:
            model: Trained two-output model (column 0 home, column 1 away).
            df: Games DataFrame containing ``numeric_cols`` plus
                'home_team_id' and 'away_team_id'.
            fetcher: Embedding fetcher handed to ``NBAEmbeddingDataset``.
            scaler: Fitted scaler exposing ``transform`` (as returned by
                ``train_model``).
            numeric_cols: Feature column names, in training order.
            batch_size: Inference batch size.

        Returns:
            Tuple ``(home_preds, away_preds, total_preds)`` of 1-D NumPy
            arrays with one entry per row of ``df``; ``total_preds`` is the
            element-wise sum of the other two.
        """
        if len(df) == 0:
            # Nothing to score; np.vstack would raise on an empty batch list.
            return (
                np.empty(0, dtype=np.float32),
                np.empty(0, dtype=np.float32),
                np.empty(0, dtype=np.float32),
            )

        X_numeric = scaler.transform(df[numeric_cols].values)
        X_team_ids = df[['home_team_id', 'away_team_id']].astype(int).values
        # The dataset requires targets; zeros act as a placeholder and are ignored.
        placeholder_targets = np.zeros((len(df), 2))
        dataset = NBAEmbeddingDataset(X_numeric, X_team_ids, placeholder_targets, fetcher)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        model.eval()
        batches = []

        with torch.no_grad():
            for X_num, home_emb, away_emb, _ in loader:
                pred = model(X_num, home_emb, away_emb)
                # .cpu() is a no-op on CPU and keeps .numpy() valid if the
                # model's outputs ever live on another device.
                batches.append(pred.cpu().numpy())

        predictions = np.vstack(batches)
        home_preds = predictions[:, 0]
        away_preds = predictions[:, 1]
        total_preds = home_preds + away_preds

        return home_preds, away_preds, total_preds

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  return bound(*args, **kwds)


Epoch 10/270 | Train MSE: 34.6675 | Val MSE: 36.3457
Epoch 20/270 | Train MSE: 16.2516 | Val MSE: 18.7962
Epoch 30/270 | Train MSE: 13.0286 | Val MSE: 16.6666
Epoch 40/270 | Train MSE: 11.7982 | Val MSE: 15.0682
Epoch 50/270 | Train MSE: 10.9088 | Val MSE: 15.5365
Epoch 60/270 | Train MSE: 10.6084 | Val MSE: 15.0979
Epoch 70/270 | Train MSE: 10.1883 | Val MSE: 15.5976
Epoch 80/270 | Train MSE: 10.0737 | Val MSE: 15.2523
Epoch 90/270 | Train MSE: 9.4923 | Val MSE: 14.9745
Epoch 100/270 | Train MSE: 9.0354 | Val MSE: 15.1892
Epoch 110/270 | Train MSE: 8.7994 | Val MSE: 14.4293
Epoch 120/270 | Train MSE: 8.9456 | Val MSE: 18.7509
Epoch 130/270 | Train MSE: 8.6737 | Val MSE: 16.7638
Epoch 140/270 | Train MSE: 8.1362 | Val MSE: 14.8187
Epoch 150/270 | Train MSE: 8.0911 | Val MSE: 15.0890
Epoch 160/270 | Train MSE: 7.9714 | Val MSE: 14.7745
Epoch 170/270 | Train MSE: 7.7384 | Val MSE: 14.5448
Epoch 180/270 | Train MSE: 7.5792 | Val MSE: 15.3816
Epoch 190/270 | Train MSE: 7.4428 | Val MSE: 15