In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from torch.utils.data import Dataset, DataLoader

# ----------------------
# 1. Enhanced Interest Similarity Calculation
# ----------------------
def calculate_interest_similarity(df):
    """Improved similarity calculation with NaN handling"""
    # This dictionary maps each interest category to a pair of column names
    # representing interests for 'Big Contact' (mentor) and 'Little Contact' (mentee)
    interest_pairs = {
        'Sports': ('Big Contact: Interest Finder - Sports', 
                  'Little Contact: Interest Finder - Sports'),
        'Hobbies': ('Big Contact: Interest Finder - Hobbies',
                   'Little Contact: Interest Finder - Hobbies'),
        'Entertainment': ('Big Contact: Interest Finder - Entertainment',
                         'Little Contact: Interest Finder - Entertainment'),
        'Places': ('Big Contact: Interest Finder - Places To Go',
                  'Little Contact: Interest Finder - Places To Go')
    }
    
    for category, (big_col, little_col) in interest_pairs.items():
        # Handle missing values and empty strings to avoid issues in similarity calculation
        df[big_col] = df[big_col].fillna('').astype(str)
        df[little_col] = df[little_col].fillna('').astype(str)
        
        # Calculate Jaccard similarity: ratio of intersection to union of interests
        # This measures how similar the interests are between mentors and mentees
        # Value ranges from 0 (no overlap) to 1 (identical interests)
        df[f'{category}_Similarity'] = df.apply(
            lambda row: (
                len(set(row[big_col].split(';')) & set(row[little_col].split(';'))) 
                / max(len(set(row[big_col].split(';')) | set(row[little_col].split(';'))), 1)
            ), 
            axis=1
        )
    return df

# ----------------------
# 2. Data Processing with Validation
# ----------------------
def load_and_preprocess():
    # Load the dataset from CSV
    df = pd.read_csv('/Users/gautam/Downloads/Training.csv', low_memory=False)
    target = 'Match Length'
    
    # Filter out invalid target values - ensure all target values are present and positive
    # This is important as we can't predict missing or invalid lengths
    df = df[(df[target].notna()) & (df[target] > 0)].copy()
    
    # Calculate interest similarities between mentors and mentees
    df = calculate_interest_similarity(df)
    
    # Select the features we'll use for prediction
    # These include similarity metrics and categorical variables about participants
    features = df[[
        'Sports_Similarity', 'Hobbies_Similarity',
        'Entertainment_Similarity', 'Places_Similarity',
        'Program Type', 'Big County', 'Big Occupation',
        'Big Languages', 'Little Contact: Language(s) Spoken'
    ]]
    
    return df, features, target

# ----------------------
# 3. Advanced Preprocessing
# ----------------------
def create_preprocessor():
    # Create a preprocessing pipeline with different strategies for different feature types
    return ColumnTransformer([
        # For similarity features: fill missing values with 0 and apply robust scaling
        # Robust scaling is less affected by outliers than standard scaling
        ('similarities', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('scaler', RobustScaler())
        ]), ['Sports_Similarity', 'Hobbies_Similarity',
             'Entertainment_Similarity', 'Places_Similarity']),
        
        # For categorical features: fill missing values with most frequent value and apply one-hot encoding
        # Limited to 20 most frequent categories to avoid dimensionality explosion
        ('categories', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False,
                                     max_categories=20))
        ]), ['Program Type', 'Big County', 'Big Occupation']),
        
        # For language features: fill missing values with empty string and apply one-hot encoding
        ('languages', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), ['Big Languages', 'Little Contact: Language(s) Spoken'])
    ], sparse_threshold=0)  # Ensure dense output, not sparse matrices

# ----------------------
# 4. Enhanced Neural Architecture
# ----------------------
class InterestMatchPredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        # Feature attention layer - learns which features are most important
        # This allows the model to focus on the most relevant predictors
        self.attention = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, input_size),
            nn.Sigmoid()  # Outputs values between 0-1 as attention weights
        )
        
        # Main network architecture: a deep neural network with decreasing layer sizes
        # Includes multiple regularization techniques to prevent overfitting
        self.net = nn.Sequential(
            # First hidden layer
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),  # Stabilizes learning by normalizing layer inputs
            nn.ReLU(),            # Non-linear activation
            nn.Dropout(0.4),      # Randomly drops 40% of neurons during training to prevent overfitting
            
            # Second hidden layer
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),      # Slightly less dropout in deeper layers
            
            # Third hidden layer
            nn.Linear(128, 64),
            nn.LayerNorm(64),     # Layer normalization as an alternative to batch norm
            nn.ReLU(),
            
            # Output layer - single neuron for regression output
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        # Apply attention mechanism: multiply each feature by its learned importance weight
        attn_weights = self.attention(x)
        x = x * attn_weights  # Element-wise multiplication for feature-wise attention
        
        # Pass weighted features through the main network
        return self.net(x).squeeze()  # Squeeze removes singleton dimension from output

# ----------------------
# 5. Optimized Training Loop
# ----------------------
def train_model(model, train_loader, test_loader):
    # Huber loss is more tolerant to outliers than MSE - good for duration predictions
    # which may have some extreme values
    criterion = nn.HuberLoss(delta=2.0)  
    
    # AdamW optimizer: Adam with decoupled weight decay for better regularization
    optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-5)
    
    # Learning rate scheduler: reduces learning rate according to cosine schedule with restarts
    # Helps escape local minima and converge to better solutions
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10)
    
    # Early stopping parameters
    best_loss = float('inf')
    patience = 0
    max_patience = 20  # Stop if no improvement for 20 epochs
    
    # Main training loop
    for epoch in range(100):  # Maximum of 100 epochs
        # Training phase
        model.train()
        total_loss = 0
        
        for X_batch, y_batch in train_loader:
            # Standard training step
            optimizer.zero_grad()      # Clear previous gradients
            outputs = model(X_batch)   # Forward pass
            loss = criterion(outputs, y_batch)  # Calculate loss
            loss.backward()            # Backward pass
            
            # Gradient clipping prevents exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            
            optimizer.step()           # Update weights
            total_loss += loss.item()  # Accumulate loss for reporting
        
        # Validation phase
        model.eval()  # Set model to evaluation mode (disables dropout, etc.)
        with torch.no_grad():  # No need to track gradients during validation
            test_preds = model(test_loader.dataset.X)
            test_loss = criterion(test_preds, test_loader.dataset.y)
        
        # Update learning rate based on validation loss
        scheduler.step(test_loss)
        
        # Calculate and report average training loss
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1:03d} | Train Loss: {avg_loss:.4f} | Test Loss: {test_loss:.4f}')
        
        # Early stopping logic
        if test_loss < best_loss:
            best_loss = test_loss
            patience = 0
            # Save the best model so far
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience += 1
            
        if patience >= max_patience:
            print(f"Early stopping triggered after {max_patience} epochs without improvement")
            break
            
    # Load the best model state before returning
    model.load_state_dict(torch.load('best_model.pth'))
    return model

# ----------------------
# 6. Execution Flow with Monitoring
# ----------------------
if __name__ == "__main__":
    # Load and process data
    df, features, target = load_and_preprocess()
    
    # Temporal split with validation - sort by date for more realistic evaluation
    # This is better than random splitting for time-dependent data
    df = df.sort_values('Match Activation Date')
    
    # Split into train (70%), validation (15%), and test (15%) sets
    train_size = int(len(df) * 0.7)
    val_size = int(len(df) * 0.15)
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:train_size+val_size]
    test_df = df.iloc[train_size+val_size:]
    
    # Apply preprocessing pipeline to each dataset split
    preprocessor = create_preprocessor()
    X_train = preprocessor.fit_transform(train_df[features.columns])  # Fit on training set only
    X_val = preprocessor.transform(val_df[features.columns])          # Apply to validation set
    X_test = preprocessor.transform(test_df[features.columns])        # Apply to test set
    
    # Create PyTorch datasets for efficient batching and iteration
    class MatchDataset(Dataset):
        def __init__(self, X, y):
            # Convert numpy arrays to PyTorch tensors
            self.X = torch.tensor(X, dtype=torch.float32)
            self.y = torch.tensor(y, dtype=torch.float32)
            
        def __len__(self):
            return len(self.X)
        
        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]
    
    # Create datasets for each split
    train_dataset = MatchDataset(X_train, train_df[target].values)
    val_dataset = MatchDataset(X_val, val_df[target].values)
    test_dataset = MatchDataset(X_test, test_df[target].values)
    
    # Create data loaders for batched processing
    batch_size = 128  # Process 128 samples at a time
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # Shuffle training data
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Initialize model with appropriate input size
    model = InterestMatchPredictor(X_train.shape[1])
    
    # Train the model
    model = train_model(model, train_loader, val_loader)
    
    # Final evaluation on test set
    model.eval()
    with torch.no_grad():
        test_preds = model(test_dataset.X)
        # Calculate RMSE: root mean squared error in months
        rmse = torch.sqrt(nn.MSELoss()(test_preds, test_dataset.y))
        print(f'\nFinal Test RMSE: {rmse.item():.4f} months')

Epoch 001 | Train Loss: 68.7488 | Test Loss: 50.1660
Epoch 002 | Train Loss: 58.7098 | Test Loss: 39.5681
Epoch 003 | Train Loss: 53.7264 | Test Loss: 39.5211
Epoch 004 | Train Loss: 53.7060 | Test Loss: 39.4651
Epoch 005 | Train Loss: 53.6207 | Test Loss: 39.3960
Epoch 006 | Train Loss: 53.5934 | Test Loss: 39.3087
Epoch 007 | Train Loss: 53.5164 | Test Loss: 39.1951
Epoch 008 | Train Loss: 53.3234 | Test Loss: 39.0420
Epoch 009 | Train Loss: 53.2184 | Test Loss: 38.8272
Epoch 010 | Train Loss: 52.9417 | Test Loss: 38.5077
Epoch 011 | Train Loss: 52.5855 | Test Loss: 37.9989
Epoch 012 | Train Loss: 52.0614 | Test Loss: 37.1099
Epoch 013 | Train Loss: 50.6841 | Test Loss: 35.3503
Epoch 014 | Train Loss: 48.1798 | Test Loss: 31.5579
Epoch 015 | Train Loss: 43.3979 | Test Loss: 25.0727
Epoch 016 | Train Loss: 39.2301 | Test Loss: 22.5335
Epoch 017 | Train Loss: 36.5017 | Test Loss: 21.5331
Epoch 018 | Train Loss: 34.2303 | Test Loss: 23.6422
Epoch 019 | Train Loss: 33.1441 | Test Loss: 2