In [3]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [4]:
%load_ext autoreload
%autoreload 2
from data.api_fetcher import ApiFetcher

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:

api = ApiFetcher(starting_year=2014, ending_year=2025)
df = api.get_dataframe(numeric=False, date=False, time_coeff=False, ids=True)


Index(['home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct', 'home_fg3a',
       'away_fg3a', 'home_fg3_pct', 'away_fg3_pct', 'home_oreb', 'away_oreb',
       'home_dreb', 'away_dreb', 'home_ast', 'away_ast', 'home_stl',
       'away_stl', 'home_blk', 'away_blk', 'home_tov', 'away_tov', 'home_pf',
       'away_pf', 'home_pts', 'away_pts', 'home_team', 'away_team',
       'home_team_id', 'away_team_id'],
      dtype='object')


In [42]:
df1 = df.copy()


In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

def prep_df(df, test_size=0.2, val_size=0.2, random_state=42):
    """
    Prepare basketball game data for neural network training.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with basketball game statistics
    test_size : float, default=0.2
        Proportion of data to use for testing
    val_size : float, default=0.2
        Proportion of remaining data to use for validation (after test split)
    random_state : int, default=42
        Random state for reproducible splits
    
    Returns:
    --------
    dict : Dictionary containing:
        - 'X_train': Training features (scaled)
        - 'X_val': Validation features (scaled)
        - 'X_test': Test features (scaled)
        - 'y_train': Training labels
        - 'y_val': Validation labels
        - 'y_test': Test labels
        - 'feature_names': List of feature names
        - 'scaler': Fitted StandardScaler object
        - 'label_encoder': Fitted LabelEncoder object (if applicable)
        - 'n_features': Number of input features
    """
    
    # Create a copy to avoid modifying original data
    data = df.copy()
    
    # Feature Engineering
    # Calculate differential stats (home - away)
    feature_pairs = [
        ('fga', 'Field Goal Attempts'),
        ('fg_pct', 'Field Goal Percentage'),
        ('fg3a', '3-Point Attempts'),
        ('fg3_pct', '3-Point Percentage'),
        ('oreb', 'Offensive Rebounds'),
        ('dreb', 'Defensive Rebounds'),
        ('ast', 'Assists'),
        ('stl', 'Steals'),
        ('blk', 'Blocks'),
        ('tov', 'Turnovers'),
        ('pf', 'Personal Fouls'),
        ('pts', 'Points')
    ]
    
    # Create differential features
    for stat, desc in feature_pairs:
        home_col = f'home_{stat}'
        away_col = f'away_{stat}'
        diff_col = f'{stat}_diff'
        data[diff_col] = data[home_col] - data[away_col]
    
    # Create efficiency metrics
    data['home_total_reb'] = data['home_oreb'] + data['home_dreb']
    data['away_total_reb'] = data['away_oreb'] + data['away_dreb']
    data['reb_diff'] = data['home_total_reb'] - data['away_total_reb']
    
    # Create possession estimates (simplified)
    data['home_poss_est'] = data['home_fga'] + 0.44 * data['home_fga'] - data['home_oreb'] + data['home_tov']
    data['away_poss_est'] = data['away_fga'] + 0.44 * data['away_fga'] - data['away_oreb'] + data['away_tov']
    data['poss_diff'] = data['home_poss_est'] - data['away_poss_est']
    
    # Create efficiency ratings
    data['home_off_eff'] = (data['home_pts'] / data['home_poss_est']) * 100
    data['away_off_eff'] = (data['away_pts'] / data['away_poss_est']) * 100
    data['off_eff_diff'] = data['home_off_eff'] - data['away_off_eff']
    
    # Create target variable (1 if home team wins, 0 if away team wins)
    data['home_win'] = (data['home_pts'] > data['away_pts']).astype(int)
    
    # Define features to use
    base_features = [
        'home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct',
        'home_fg3a', 'away_fg3a', 'home_fg3_pct', 'away_fg3_pct',
        'home_oreb', 'away_oreb', 'home_dreb', 'away_dreb',
        'home_ast', 'away_ast', 'home_stl', 'away_stl',
        'home_blk', 'away_blk', 'home_tov', 'away_tov',
        'home_pf', 'away_pf'
    ]
    
    differential_features = [f'{stat}_diff' for stat, _ in feature_pairs]
    
    engineered_features = [
        'reb_diff', 'poss_diff', 'off_eff_diff',
        'home_total_reb', 'away_total_reb'
    ]
    
    # Combine all features
    feature_columns = base_features + differential_features + engineered_features
    
    # Handle any infinite or NaN values
    data = data.replace([np.inf, -np.inf], np.nan)
    
    # For any remaining NaN values, fill with median
    for col in feature_columns:
        if col in data.columns:
            median_val = data[col].median()
            data[col] = data[col].fillna(median_val)
    
    # Prepare features and target
    X = data[feature_columns].values
    y = data['home_win'].values
    
    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Second split: separate train and validation from remaining data
    val_size_adjusted = val_size / (1 - test_size)  # Adjust val_size for remaining data
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size_adjusted, random_state=random_state, stratify=y_temp
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Prepare return dictionary
    result = {
        'X_train': X_train_scaled,
        'X_val': X_val_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'feature_names': feature_columns,
        'scaler': scaler,
        'n_features': len(feature_columns),
        'train_size': len(X_train_scaled),
        'val_size': len(X_val_scaled),
        'test_size': len(X_test_scaled)
    }
    
    # Print summary
    print(f"Data preparation complete!")
    print(f"Total samples: {len(data)}")
    print(f"Features: {len(feature_columns)}")
    print(f"Training samples: {len(X_train_scaled)}")
    print(f"Validation samples: {len(X_val_scaled)}")
    print(f"Test samples: {len(X_test_scaled)}")
    print(f"Home team win rate: {y.mean():.3f}")
    
    return result

# Example usage:
#prepared_data = prep_df(df1)
#X_train, y_train = prepared_data['X_train'], prepared_data['y_train']
#X_val, y_val = prepared_data['X_val'], prepared_data['y_val']
#X_test, y_test = prepared_data['X_test'], prepared_data['y_test']

In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

class BasketballEmbeddingNet(nn.Module):
    """
    Neural Network that learns 8-dimensional embeddings for basketball game data
    and uses them for binary classification (home team win/loss prediction).
    """
    
    def __init__(self, input_dim, embedding_dim=8, hidden_dim1=64, hidden_dim2=32, dropout_rate=0.3):
        super(BasketballEmbeddingNet, self).__init__()
        
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        
        # Embedding layers
        self.embedding_layer = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim2, embedding_dim)  # 8-dimensional embedding
        )
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim, 16),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        # Get embeddings
        embeddings = self.embedding_layer(x)
        
        # Classification
        output = self.classifier(embeddings)
        
        return output, embeddings
    
    def get_embeddings(self, x):
        """Extract embeddings without classification"""
        with torch.no_grad():
            embeddings = self.embedding_layer(x)
        return embeddings

def train_model(model, train_loader, val_loader, epochs=100, learning_rate=0.001, device='cpu'):
    """
    Train the basketball embedding model
    """
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.5)
    
    train_losses = []
    val_losses = []
    val_accuracies = []
    
    best_val_loss = float('inf')
    patience_counter = 0
    early_stopping_patience = 20
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            outputs, _ = model(batch_x)
            loss = criterion(outputs.squeeze(), batch_y.float())
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0.0
        val_predictions = []
        val_targets = []
        
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                
                outputs, _ = model(batch_x)
                loss = criterion(outputs.squeeze(), batch_y.float())
                val_loss += loss.item()
                
                predictions = (outputs.squeeze() > 0.5).float()
                val_predictions.extend(predictions.cpu().numpy())
                val_targets.extend(batch_y.cpu().numpy())
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(val_targets, val_predictions)
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        
        scheduler.step(val_loss)
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Save best model
            torch.save(model.state_dict(), 'best_basketball_model.pth')
        else:
            patience_counter += 1
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch:3d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}')
        
        if patience_counter >= early_stopping_patience:
            print(f'Early stopping at epoch {epoch}')
            break
    
    # Load best model
    model.load_state_dict(torch.load('best_basketball_model.pth'))
    
    return {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'val_accuracies': val_accuracies
    }

def evaluate_model(model, test_loader, device='cpu'):
    """
    Evaluate the model on test data
    """
    model.eval()
    test_predictions = []
    test_targets = []
    test_embeddings = []
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            outputs, embeddings = model(batch_x)
            predictions = (outputs.squeeze() > 0.5).float()
            
            test_predictions.extend(predictions.cpu().numpy())
            test_targets.extend(batch_y.cpu().numpy())
            test_embeddings.extend(embeddings.cpu().numpy())
    
    test_accuracy = accuracy_score(test_targets, test_predictions)
    
    print(f"\nTest Results:")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(test_targets, test_predictions, 
                              target_names=['Away Win', 'Home Win']))
    
    return {
        'predictions': test_predictions,
        'targets': test_targets,
        'embeddings': np.array(test_embeddings),
        'accuracy': test_accuracy
    }

def create_model_and_train(prepared_data, batch_size=64, epochs=100, learning_rate=0.001):
    """
    Complete pipeline to create and train the model
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Get data
    X_train = torch.FloatTensor(prepared_data['X_train'])
    y_train = torch.LongTensor(prepared_data['y_train'])
    X_val = torch.FloatTensor(prepared_data['X_val'])
    y_val = torch.LongTensor(prepared_data['y_val'])
    X_test = torch.FloatTensor(prepared_data['X_test'])
    y_test = torch.LongTensor(prepared_data['y_test'])
    
    # Create data loaders
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    test_dataset = TensorDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Create model
    input_dim = prepared_data['n_features']
    model = BasketballEmbeddingNet(input_dim=input_dim)
    
    print(f"Model created with {input_dim} input features")
    print(f"Embedding dimension: 8")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")
    
    # Train model
    history = train_model(model, train_loader, val_loader, epochs=epochs, 
                         learning_rate=learning_rate, device=device)
    
    # Evaluate model
    test_results = evaluate_model(model, test_loader, device=device)
    
    return {
        'model': model,
        'history': history,
        'test_results': test_results,
        'device': device
    }

# Example usage:
prepared_data = prep_df(df1)  # From previous function
results = create_model_and_train(prepared_data)
model = results['model']
 
# Extract embeddings for new data
# new_embeddings = model.get_embeddings(torch.FloatTensor(new_data))

Data preparation complete!
Total samples: 13203
Features: 39
Training samples: 7921
Validation samples: 2641
Test samples: 2641
Home team win rate: 0.566
Using device: cuda
Model created with 39 input features
Embedding dimension: 8
Model parameters: 5065
Epoch   0 | Train Loss: 0.4086 | Val Loss: 0.1343 | Val Acc: 0.9512
Epoch  10 | Train Loss: 0.0158 | Val Loss: 0.0158 | Val Acc: 0.9939
Epoch  20 | Train Loss: 0.0052 | Val Loss: 0.0076 | Val Acc: 0.9973
Epoch  30 | Train Loss: 0.0035 | Val Loss: 0.0136 | Val Acc: 0.9955
Epoch  40 | Train Loss: 0.0022 | Val Loss: 0.0004 | Val Acc: 1.0000
Epoch  50 | Train Loss: 0.0057 | Val Loss: 0.0145 | Val Acc: 0.9962
Epoch  60 | Train Loss: 0.0003 | Val Loss: 0.0004 | Val Acc: 1.0000
Epoch  70 | Train Loss: 0.0009 | Val Loss: 0.0001 | Val Acc: 1.0000
Epoch  80 | Train Loss: 0.0009 | Val Loss: 0.0011 | Val Acc: 0.9996
Epoch  90 | Train Loss: 0.0005 | Val Loss: 0.0000 | Val Acc: 1.0000

Test Results:
Test Accuracy: 1.0000

Classification Report:
   