In [1]:
# In this code I enhanced the age difference between mentor and mentee to predict the match length
# The result turned out to be much worse than the previous model
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from torch.utils.data import Dataset, DataLoader

# ----------------------
# 1. Data Validation & Processing
# ----------------------
def load_and_validate_data(csv_path='/Users/gautam/Downloads/Training.csv',
                           reference_date='2025-02-27'):
    """Load the match dataset, derive age features and drop invalid rows.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the location the
            script originally hard-coded, so existing zero-argument calls
            behave as before.
        reference_date: Date (anything ``pd.to_datetime`` accepts) against
            which both ages are computed. Defaults to the previously
            hard-coded 2025-02-27.

    Returns:
        A ``(df, features, target)`` tuple where ``df`` is the cleaned frame
        (including the derived ``Big Age``, ``Little Age`` and
        ``Age Difference`` columns), ``features`` is the sub-frame holding
        only the model input columns, and ``target`` is the name of the
        target column (``'Match Length'``).
    """
    target = 'Match Length'
    df = pd.read_csv(csv_path, low_memory=False)

    # Rows lacking the target or any of the date fields cannot be used.
    required_columns = [
        target, 'Match Activation Date', 'Big Birthdate', 'Little Birthdate'
    ]
    df = df.dropna(subset=required_columns)

    # Keep only matches with a strictly positive length.
    df = df[df[target] > 0].copy()

    # Ages are whole years (days // 365) as of the reference date; parse the
    # reference timestamp once instead of once per column.
    # NOTE(review): ages are relative to `reference_date`, not to
    # 'Match Activation Date' -- confirm this is the intended definition.
    as_of = pd.to_datetime(reference_date)
    df['Big Age'] = (as_of - pd.to_datetime(df['Big Birthdate'])).dt.days // 365
    df['Little Age'] = (as_of - pd.to_datetime(df['Little Birthdate'])).dt.days // 365

    # Derived feature: age gap between mentor ("Big") and mentee ("Little").
    df['Age Difference'] = df['Big Age'] - df['Little Age']

    # Domain rules: Bigs must be at least 18, Littles at most 18.
    # The copy gives callers an independent frame they can safely modify.
    df = df[(df['Big Age'] >= 18) & (df['Little Age'] <= 18)].copy()

    # Columns used as model inputs (three numerical, three categorical).
    features = df[[
        'Big Age', 'Little Age', 'Age Difference',
        'Big County', 'Big Occupation', 'Program Type',
    ]]

    return df, features, target

# ----------------------
# 2. Simplified Preprocessing
# ----------------------
def create_preprocessor():
    """Build the (unfitted) feature preprocessor.

    Returns:
        A ``ColumnTransformer`` with two branches:

        * ``'num'`` -- median imputation followed by standard scaling of
          the three age columns.
        * ``'cat'`` -- most-frequent imputation followed by dense one-hot
          encoding of the three categorical columns; categories unseen
          during fitting are ignored at transform time.
    """
    numeric_columns = ['Big Age', 'Little Age', 'Age Difference']
    categorical_columns = ['Big County', 'Big Occupation', 'Program Type']

    # Numerical branch: fill gaps with the column median, then standardize.
    numeric_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ],
        verbose=True,
    )

    # Categorical branch: fill gaps with the mode, then one-hot encode into
    # a dense array.
    categorical_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ],
        verbose=True,
    )

    # sparse_threshold=0 forces a dense combined output, which is what the
    # tensor conversion downstream needs.
    return ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_columns),
            ('cat', categorical_pipeline, categorical_columns),
        ],
        sparse_threshold=0,
        verbose=True,
    )

# ----------------------
# 3. Simplified Model Architecture
# ----------------------
class MatchPredictor(nn.Module):
    """Feed-forward regressor producing one scalar prediction per sample.

    Architecture: ``input_size -> 64 -> 32 -> 1`` with ReLU activations and
    20% dropout after each hidden layer.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            # Input -> first hidden layer
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),  # active only in train() mode

            # First hidden -> second hidden layer
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),

            # Second hidden -> single regression output
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return predictions with the trailing singleton dimension removed.

        Args:
            x: Input tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor shaped like ``x`` without its last dimension, e.g.
            ``[batch_size, input_size] -> [batch_size]``.
        """
        # Squeeze only the last dimension. A bare squeeze() would also drop
        # the batch dimension for a batch of one ([1, 1] -> scalar), so the
        # prediction would no longer match a target of shape [1].
        return self.net(x).squeeze(-1)

# ----------------------
# 4. Enhanced Training Loop
# ----------------------
def _validation_loss(model, loader, criterion):
    """Return the sample-weighted mean of ``criterion`` over ``loader``."""
    model.eval()  # disables dropout
    loss_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            batch_size = len(y_batch)
            # criterion averages within the batch; re-weight by batch size so
            # a short final batch does not skew the overall mean.
            loss_sum += criterion(model(X_batch), y_batch).item() * batch_size
            sample_count += batch_size
    # An empty loader has no defined loss; NaN never counts as an improvement.
    return loss_sum / sample_count if sample_count else float('nan')


def train_model(model, train_loader, test_loader, *, epochs=100, lr=0.01,
                lr_patience=5, early_stop_patience=10,
                checkpoint_path='best_model.pth'):
    """Train ``model`` with Huber loss, LR scheduling and early stopping.

    Args:
        model: Module mapping a feature batch to one prediction per sample.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        test_loader: Iterable of ``(X_batch, y_batch)`` batches used as the
            validation set. NOTE: this data drives the LR scheduler and the
            early-stopping/model-selection decision, so metrics computed on
            the same data afterwards are optimistically biased.
        epochs: Maximum number of epochs.
        lr: Initial Adam learning rate.
        lr_patience: Epochs without improvement before the scheduler lowers
            the learning rate.
        early_stop_patience: Epochs without improvement before training
            stops.
        checkpoint_path: File that receives the best ``state_dict`` each
            time validation loss improves; ``None`` disables writing.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch
        with the lowest validation loss. If no epoch ever improved on
        ``inf`` (for example the loss was always NaN), the weights are left
        as they were after the last epoch.
    """
    # HuberLoss is quadratic for small errors and linear for large ones,
    # which makes it less outlier-sensitive than MSE.
    criterion = nn.HuberLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=lr_patience
    )

    best_loss = float('inf')
    best_state = None        # in-memory snapshot of the best weights
    patience_counter = 0     # epochs since the last improvement

    for epoch in range(epochs):
        # --- TRAINING PHASE ---
        model.train()  # enables dropout
        total_loss = 0.0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            # Cap the gradient norm at 1.0 to guard against exploding
            # gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()

        # --- VALIDATION PHASE ---
        test_loss = _validation_loss(model, test_loader, criterion)
        scheduler.step(test_loss)

        # --- EARLY STOPPING LOGIC ---
        if test_loss < best_loss:
            best_loss = test_loss
            patience_counter = 0
            # Keep the snapshot in memory so restoring the best weights never
            # depends on a file that may be missing or left over from an
            # earlier run.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            if checkpoint_path is not None:
                torch.save(best_state, checkpoint_path)
        else:
            patience_counter += 1

        # max(..., 1) avoids a ZeroDivisionError for an empty train_loader.
        print(f'Epoch {epoch+1:03d} | Train Loss: {total_loss/max(len(train_loader), 1):.4f} | Test Loss: {test_loss:.4f}')

        if patience_counter >= early_stop_patience:
            print("Early stopping triggered")
            break

    # Restore the best weights seen during this run, if any.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# ----------------------
# 5. Execution Flow
# ----------------------
if __name__ == "__main__":
    # --- DATA PREPARATION ---
    # Cleaned frame, the feature sub-frame and the target column name.
    df, features, target = load_and_validate_data()
    feature_columns = features.columns

    # 80/20 split: sample the training rows with a fixed seed, then use
    # every remaining row for testing.
    train_df = df.sample(frac=0.8, random_state=42)
    test_df = df.drop(train_df.index)

    # --- FEATURE PREPROCESSING ---
    # Statistics and category vocabularies are learned from the training
    # rows only; the test rows are merely transformed.
    preprocessor = create_preprocessor()
    X_train = preprocessor.fit_transform(train_df[feature_columns])
    X_test = preprocessor.transform(test_df[feature_columns])

    # --- DATASET CREATION ---
    class MatchDataset(Dataset):
        """Holds a feature matrix and target vector as float32 tensors."""

        def __init__(self, X, y):
            self.X = torch.tensor(X, dtype=torch.float32)
            self.y = torch.tensor(y, dtype=torch.float32)

        def __len__(self):
            # Number of samples = rows of the feature matrix.
            return self.X.shape[0]

        def __getitem__(self, idx):
            # One (features, label) pair.
            sample, label = self.X[idx], self.y[idx]
            return sample, label

    train_dataset = MatchDataset(X_train, train_df[target].values)
    test_dataset = MatchDataset(X_test, test_df[target].values)

    # Mini-batches of 32; only the training data is shuffled.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # --- MODEL TRAINING ---
    # The input width equals the number of columns after preprocessing
    # (numeric columns plus one-hot columns).
    input_size = X_train.shape[1]
    model = MatchPredictor(input_size)
    model = train_model(model, train_loader, test_loader)

    # --- FINAL EVALUATION ---
    model.eval()  # make sure dropout is off

    with torch.no_grad():
        test_preds = model(test_dataset.X)

        # RMSE = sqrt(MSE), reported in the target's own units. It is
        # expected to exceed the Huber loss printed during training because
        # Huber dampens large errors while RMSE does not.
        rmse = nn.functional.mse_loss(test_preds, test_dataset.y).sqrt()
        print(f'\nFinal RMSE: {rmse.item():.4f} months')

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing num, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing cat, total=   0.0s
Epoch 001 | Train Loss: 16.6439 | Test Loss: 15.2298
Epoch 002 | Train Loss: 15.5236 | Test Loss: 14.8614
Epoch 003 | Train Loss: 15.1532 | Test Loss: 14.3653
Epoch 004 | Train Loss: 14.7662 | Test Loss: 13.7937
Epoch 005 | Train Loss: 14.4474 | Test Loss: 13.4641
Epoch 006 | Train Loss: 14.0511 | Test Loss: 13.2218
Epoch 007 | Train Loss: 13.7014 | Test Loss: 12.9365
Epoch 008 | Train Loss: 13.3682 | Test Loss: 12.5982
Epoch 009 | Train Loss: 13.1385 | Test Loss: 12.0920
Epoch 010 | Train Loss: 12.7686 | Test Loss: 11.5753
Epoch 011 | Train Loss: 12.5859 | Test Loss: