In [15]:
# I have trained this data only using the file called training.xlx 
# I am not sure what how to use the truncated data or novice data aloing with this data to get a combined model
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from torch.utils.data import Dataset, DataLoader

# ----------------------
# 1. Data Loading & Prep
# ----------------------
def load_and_preprocess():
    """
    This function handles loading the data file and initial preprocessing.
    It removes invalid data points and selects relevant features.
    """
    # Load data from CSV file with low_memory=False to prevent mixed-type inference
    # Here you should insert your own path to the CSV file.
    df = pd.read_csv('/Users/gautam/Downloads/Training.csv', low_memory=False)
    
    # Define the target variable (what we want to predict - match duration)
    # Here I was only trying to predict the match length. I don't remeber if the questiosn in the required us to predict anything else 
    target = 'Match Length'
    
    # Data cleaning: Remove rows where the target is missing or negative
    # This is critical since we can't train on invalid target values
    # I don't think there are any negative values, but jsut to be sure 
    df = df[df[target].notna() & (df[target] > 0)]
    
    # Feature selection: Drop columns that wouldn't be useful for prediction:
    # - Unique identifiers (Match ID)
    # - Date fields that might leak future information (Completion Date)
    # - Text fields that are hard to process (Notes, Closure Details)
    # - Redundant or irrelevant features
    features = df.drop(columns=[
        target, 'Match ID 18Char', 'Completion Date', 
        'Match Support Contact Notes', 'Closure Details',
        'Rationale for Match', 'Big Contact: Preferred Communication Type',
        'Big Contact: Former Big/Little', 'Big Contact: Interest Finder - Sports',
        'Big Contact: Interest Finder - Places To Go', 'Big Contact: Interest Finder - Hobbies',
        'Big Contact: Interest Finder - Entertainment', 'Big Contact: Created Date'
    ])
    
    return df, features, target

# Execute the data loading function
df, features, target = load_and_preprocess()

# ----------------------------
# 2. Feature Preprocessing
# ----------------------------
def get_preprocessor(features):
    """
    Creates a preprocessing pipeline that handles both numerical and categorical features.
    This ensures consistent transformation of features for both training and inference.
    """
    # Automatically identify categorical and numerical columns based on their data types
    cat_cols = features.select_dtypes(include=['object', 'category']).columns
    num_cols = features.select_dtypes(include=['int64', 'float64']).columns
    
    # Remove constant numerical features that don't provide any information
    # Features with zero variance (same value in all rows) are useless for prediction
    constant_filter = VarianceThreshold(threshold=0)
    constant_filter.fit(features[num_cols])
    non_constant_num = num_cols[constant_filter.get_support()]
    
    # Create a composite preprocessing pipeline using ColumnTransformer
    # This applies different transformations to different column types
    preprocessor = ColumnTransformer(
        transformers=[
            # Numerical features pipeline:
            ('num', Pipeline([
                # Replace missing values with median (robust to outliers)
                ('imputer', SimpleImputer(strategy='median')),
                # Standardize features to zero mean and unit variance
                # This helps neural networks converge faster
                ('scaler', StandardScaler())
            ]), non_constant_num),
            
            # Categorical features pipeline:
            ('cat', Pipeline([
                # Replace missing values with a constant string 'missing'
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                # One-hot encode categorical variables
                # handle_unknown='ignore' prevents errors with new categories at inference time
                # max_categories=20 limits one-hot encoding to top 20 categories to prevent dimension explosion
                ('encoder', OneHotEncoder(handle_unknown='ignore', max_categories=20))
            ]), cat_cols)
        ],
        # Drop any remaining columns not explicitly handled
        remainder='drop'
    )
    
    return preprocessor

# Create the preprocessor
preprocessor = get_preprocessor(features)

# ----------------------------
# 3. Data Preparation
# ----------------------------
# Apply the preprocessing transformations to our features
X = preprocessor.fit_transform(features)

# Convert target to float32 (PyTorch's preferred float type) numpy array
y = df[target].values.astype(np.float32)

# Validation checks to ensure no NaN values remain
# NaNs would cause training errors
assert not np.isnan(X.data).any(), "X contains NaN values after preprocessing"
assert not np.isnan(y).any(), "y contains NaN values"

# Split data into training (80%) and testing (20%) sets
# random_state ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Helper function to convert data to PyTorch tensors
# Handles both dense numpy arrays and sparse matrices from one-hot encoding
def convert_to_tensor(sparse_matrix):
    if isinstance(sparse_matrix, np.ndarray):
        return torch.tensor(sparse_matrix, dtype=torch.float32)
    # If sparse, convert to dense before creating tensor
    return torch.tensor(sparse_matrix.todense(), dtype=torch.float32)

# Convert our data to PyTorch tensors
X_train_tensor = convert_to_tensor(X_train)
X_test_tensor = convert_to_tensor(X_test)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# ----------------------------
# 4. Dataset & DataLoader
# ----------------------------
class MatchDataset(Dataset):
    """
    Custom PyTorch Dataset that wraps our feature and target tensors.
    This allows efficient batching, shuffling, and parallel data loading.
    """
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        """Return the total number of samples in the dataset"""
        return len(self.X)
    
    def __getitem__(self, idx):
        """Return a specific sample by index"""
        return self.X[idx], self.y[idx]

# Create dataset objects
train_dataset = MatchDataset(X_train_tensor, y_train_tensor)
test_dataset = MatchDataset(X_test_tensor, y_test_tensor)

# Define batch size (number of samples processed together)
# Larger batch sizes are more efficient but use more memory
batch_size = 64

# Create DataLoader objects that handle batching, shuffling, and more
# Shuffling training data helps prevent the model from learning the order of samples
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ----------------------------
# 5. Neural Network
# ----------------------------
class MatchPredictor(nn.Module):
    """
    Neural network architecture for predicting match length.
    A feed-forward network with two hidden layers.
    """
    def __init__(self, input_size):
        """Initialize the network with given input dimension"""
        super().__init__()
        self.layers = nn.Sequential(
            # First layer: input_size -> 128 neurons
            nn.Linear(input_size, 128),
            # ReLU activation introduces non-linearity
            nn.ReLU(),
            # Dropout randomly zeros 30% of neurons during training to prevent overfitting
            nn.Dropout(0.3),
            
            # Second layer: 128 -> 64 neurons
            nn.Linear(128, 64),
            nn.ReLU(),
            # Less dropout in deeper layers
            nn.Dropout(0.2),
            
            # Output layer: 64 -> 1 (single regression output)
            nn.Linear(64, 1)
        )
        # Initialize weights using optimal strategy
        self._init_weights()
        
    def _init_weights(self):
        """
        Initialize weights using Kaiming initialization.
        This helps with training deep networks by preventing vanishing/exploding gradients.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # Kaiming/He initialization - ideal for ReLU activations
                nn.init.kaiming_normal_(m.weight)
                # Initialize biases to zero
                nn.init.constant_(m.bias, 0)
        
    def forward(self, x):
        """Forward pass through the network"""
        # squeeze() removes the unnecessary singleton dimension from output
        return self.layers(x).squeeze()

# Initialize the model with the correct input dimension
input_size = X_train_tensor.shape[1]
model = MatchPredictor(input_size)

# ----------------------------
# 6. Training Configuration
# ----------------------------
# HuberLoss is less sensitive to outliers than MSE
# Better choice for real-world data that might have outliers
criterion = nn.HuberLoss()

# AdamW optimizer: Adam with weight decay (L2 regularization)
# lr: learning rate - how quickly parameters are updated
# weight_decay: penalizes large weights to prevent overfitting
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# Learning rate scheduler: reduces learning rate when validation loss plateaus
# This helps fine-tune the model when it's close to convergence
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)

# ----------------------------
# 7. Training Loop
# ----------------------------
def train_model(model, epochs=100):
    """
    Train the model for a specified number of epochs.
    Implements early stopping to prevent overfitting.
    """
    best_loss = float('inf')  # Initialize best loss to infinity
    
    for epoch in range(epochs):
        # Set model to training mode (enables dropout)
        model.train()
        epoch_loss = 0
        
        # Loop through batches of training data
        for batch_X, batch_y in train_loader:
            # Zero the gradients before each batch
            # This prevents gradient accumulation from previous batches
            optimizer.zero_grad()
            
            # Forward pass: compute predictions
            outputs = model(batch_X)
            
            # Compute loss between predictions and true values
            loss = criterion(outputs, batch_y)
            
            # Backward pass: compute gradients
            loss.backward()
            
            # Gradient clipping prevents exploding gradients
            # Ensures stable training by limiting gradient magnitude
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            # Update model parameters based on gradients
            optimizer.step()
            
            # Accumulate batch loss
            epoch_loss += loss.item()
        
        # Validation phase - no gradient updates
        model.eval()  # Set model to evaluation mode (disables dropout)
        with torch.no_grad():  # No gradient computation for efficiency
            # Compute predictions on test set
            test_preds = model(X_test_tensor)
            # Calculate validation loss
            test_loss = criterion(test_preds, y_test_tensor)
            # Adjust learning rate based on validation loss
            scheduler.step(test_loss)
        
        # Calculate average training loss for reporting
        avg_train_loss = epoch_loss / len(train_loader)
        print(f'Epoch {epoch+1:03d} | Train Loss: {avg_train_loss:.4f} | Test Loss: {test_loss:.4f}')
        
        # Early stopping logic
        if test_loss < best_loss:
            # If we found a better model, save it
            best_loss = test_loss
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            # If validation loss hasn't improved and we're past warm-up
            if epoch > 10:
                print("Early stopping triggered")
                break  # Stop training

    # Load the best model found during training
    model.load_state_dict(torch.load('best_model.pth'))
    return model

# ----------------------------
# 8. Training & Evaluation
# ----------------------------
# Train the model with early stopping
model = train_model(model, epochs=100)

# Final evaluation on test set
model.eval()  # Set to evaluation mode
with torch.no_grad():  # No gradients needed for inference
    # Generate predictions
    test_preds = model(X_test_tensor)
    # Calculate final loss
    final_loss = criterion(test_preds, y_test_tensor)
    # Convert loss to RMSE (Root Mean Squared Error) for interpretability
    # RMSE is in the same units as the target (months)
    rmse = torch.sqrt(final_loss)
    print(f'\nFinal RMSE: {rmse.item():.4f} months')

Epoch 001 | Train Loss: 12.0233 | Test Loss: 7.6606
Epoch 002 | Train Loss: 8.1615 | Test Loss: 6.7657
Epoch 003 | Train Loss: 7.4450 | Test Loss: 6.2259
Epoch 004 | Train Loss: 6.8427 | Test Loss: 5.4380
Epoch 005 | Train Loss: 6.3984 | Test Loss: 5.1703
Epoch 006 | Train Loss: 5.9572 | Test Loss: 4.6681
Epoch 007 | Train Loss: 5.6964 | Test Loss: 5.0643
Epoch 008 | Train Loss: 5.4826 | Test Loss: 3.8405
Epoch 009 | Train Loss: 5.2652 | Test Loss: 3.5757
Epoch 010 | Train Loss: 5.0809 | Test Loss: 3.3433
Epoch 011 | Train Loss: 4.9621 | Test Loss: 3.3695
Epoch 012 | Train Loss: 4.7723 | Test Loss: 3.2634
Epoch 013 | Train Loss: 4.6646 | Test Loss: 2.8319
Epoch 014 | Train Loss: 4.5641 | Test Loss: 2.9317
Early stopping triggered

Final RMSE: 1.6828 months


In [13]:
def calculate_average_match_length():
    
        # Load the data
        df = pd.read_csv('/Users/gautam/Downloads/Training.csv', low_memory=False)
            
        # Clean the data and calculate average
        avg_length = df['Match Length'].dropna().mean()
        
        # Print formatted result
        print(f"\nAverage Match Length: {avg_length:.2f} months")
        print(f"Based on {len(df['Match Length'].dropna())} valid records")
        

# Run the calculation
calculate_average_match_length()


Average Match Length: 35.20 months
Based on 39345 valid records
