# Neural Probabilistic Language Model (Bengio et al., 2003) - Replica

This notebook implements a replica of the Neural Probabilistic Language Model with MLflow integration for experiment tracking.

## MLflow Persistence Setup
We store MLflow results on Google Drive so that they persist across Colab sessions. When the notebook runs outside Colab, results are written to a local `./mlruns` directory instead.

**Advantages of this approach:**
- Simple to implement (just mount Drive and set tracking URI)
- No external server required
- Data persists across Colab sessions
- Easy to share results with team members via Drive sharing

In [None]:
# Cell 1: Mount Google Drive and setup paths
import sys
import os

# The Colab-only import doubles as environment detection: when it fails we
# are running locally and fall back to paths relative to the working dir.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    IN_COLAB = False
    print('Not running in Google Colab. Using local paths.')
else:
    IN_COLAB = True

# Project root; the MLflow store always lives in its `mlruns` subdirectory.
project_path = (
    '/content/drive/MyDrive/Deep_learning_papers/Neural_probabilistic_laguage_model'
    if IN_COLAB
    else '.'
)
mlflow_tracking_path = f'{project_path}/mlruns'

# Make sure the MLflow store exists before anything tries to write to it.
os.makedirs(mlflow_tracking_path, exist_ok=True)

# Put the project root first on the import path so `utils` resolves to the
# project's own package.
if project_path not in sys.path:
    sys.path.insert(0, project_path)

from utils.data_preparator import *

In [None]:
# Cell 2: Install and setup MLflow
try:
    import mlflow
except ImportError:
    !pip install mlflow -q
    import mlflow

import mlflow.pytorch

# Set the tracking URI to Google Drive path for persistence
mlflow.set_tracking_uri(f'file://{mlflow_tracking_path}')

# Create or get experiment
experiment_name = 'bengio_language_model'
mlflow.set_experiment(experiment_name)

print(f'MLflow tracking URI: {mlflow.get_tracking_uri()}')
print(f'MLflow experiment: {experiment_name}')

In [None]:
# Cell 3: Import dependencies
import nltk
nltk.download('brown', quiet=True)
from nltk.corpus import brown
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import random
import math

In [None]:
# Cell 4: Device setup (automatic CUDA/CPU detection)
# Drop cached GPU memory that earlier cell executions may have left behind.
torch.cuda.empty_cache()

# Pick the GPU when one is usable; wait for pending GPU work before going on.
if torch.cuda.is_available():
    torch.cuda.synchronize()
    device = 'cuda'
else:
    device = 'cpu'

print(f'Using device: {device}')
if device == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# Cell 5: Model layers (Andrej Karpathy style)
class Linear:
    """Affine layer ``x @ weight + bias`` built on plain tensors.

    Args:
        fan_in: Number of input features (rows of ``weight``).
        fan_out: Number of output features (columns of ``weight``).
        bias: When False the layer has no bias tensor.
        device: Forwarded to ``Tensor.to`` for every tensor the layer owns.
    """

    def __init__(self, fan_in, fan_out, bias=True, device=None):
        # Standard-normal weights scaled down by sqrt(fan_in).
        scale = fan_in**0.5
        self.weight = (torch.randn((fan_in, fan_out)) / scale).to(device)
        self.bias = None
        if bias:
            self.bias = torch.zeros(fan_out).to(device)

    def __call__(self, x):
        """Apply the layer to ``x``; the result is also kept on ``self.out``."""
        result = x @ self.weight
        if self.bias is not None:
            result += self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the layer's tensors: the weight, then the bias if present."""
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]


class Tanh:
    """Element-wise tanh activation; owns no tensors of its own."""

    def __call__(self, x):
        """Return ``tanh(x)``; the result is also kept on ``self.out``."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list, since there is nothing to train."""
        return list()


class Embeddings:
    """Lookup table that returns the selected rows flattened per example.

    Args:
        num_embeddings: Number of rows in the table.
        embedding_dim: Length of each row.
        device: Forwarded to ``Tensor.to`` for the table.
    """

    def __init__(self, num_embeddings, embedding_dim, device=None):
        table = torch.randn((num_embeddings, embedding_dim))
        self.weight = table.to(device)

    def __call__(self, IX):
        """Index the table with ``IX`` and merge all trailing dimensions.

        The first dimension of the lookup result is kept and everything after
        it is flattened, e.g.
        [Batch, context_window, embedding_dim] -> [Batch, context_window * embedding_dim].
        The result is also kept on ``self.out``.
        """
        looked_up = self.weight[IX]
        n_rows = looked_up.shape[0]
        self.out = looked_up.view(n_rows, -1)
        return self.out

    def parameters(self):
        """Return the table as the layer's only tensor."""
        return [self.weight]


class Sequential:
    """Chain of layers applied one after another.

    Args:
        layers: List of callables that each also expose ``parameters()``.
    """

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the final output.

        The final output is also kept on ``self.out``.
        """
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        self.out = activation
        return activation

    def parameters(self):
        """Return the tensors of all layers, concatenated in layer order."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [None]:
# Cell 6: Model factory function
def create_model(vocab_size, n_emb, n_hidden, context_window, device):
    """Assemble the embedding -> hidden tanh layer -> output network.

    Args:
        vocab_size: Size of the vocabulary (embedding rows and output width)
        n_emb: Embedding dimension
        n_hidden: Hidden layer dimension
        context_window: Number of context words (block_size)
        device: 'cuda' or 'cpu'

    Returns:
        model: Sequential model with gradients enabled on every tensor
        n_params: Total number of elements across the model's tensors
    """
    # Layers are created in network order so the random initialisation draws
    # happen in the same sequence as the forward pass.
    embedding = Embeddings(vocab_size, n_emb, device=device)
    hidden = Linear(n_emb * context_window, n_hidden, bias=True, device=device)
    output = Linear(n_hidden, vocab_size, bias=True, device=device)
    model = Sequential([embedding, hidden, Tanh(), output])

    # Count the elements and switch on gradient tracking in a single pass.
    n_params = 0
    for tensor in model.parameters():
        n_params += tensor.nelement()
        tensor.requires_grad = True

    return model, n_params

In [None]:
# Cell 7: Evaluation function
@torch.no_grad()
def evaluate_loss(model, X, Y, batch_size=256):
    """Return the mean per-example cross-entropy of ``model`` on ``(X, Y)``.

    The data is processed in chunks of ``batch_size`` rows. Losses are summed
    over all examples and divided by the example count, so a final chunk that
    is shorter than ``batch_size`` is weighted by its actual size.

    Args:
        model: Callable mapping an input batch to logits
        X: Input tensor, indexed along its first dimension
        Y: Target tensor aligned with ``X``
        batch_size: Number of rows evaluated per forward pass

    Returns:
        loss: Mean cross-entropy loss over all examples, as a Python float

    Raises:
        ZeroDivisionError: If ``X`` has no rows.
    """
    n_samples = X.shape[0]
    total_loss = 0.0
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        logits = model(X[start:stop])
        # reduction='sum' keeps every example's contribution equal regardless
        # of which chunk it falls into.
        total_loss += F.cross_entropy(logits, Y[start:stop], reduction='sum').item()
    return total_loss / n_samples

In [None]:
# Cell 8: Main training function with MLflow logging
def train_experiment(params):
    """Train one model configuration and record it as an MLflow run.

    The dataset is built from the Brown corpus with ``build_dataset`` and the
    model is trained with plain mini-batch SGD. Train/validation loss and
    perplexity are logged every ``eval_interval`` steps and on the last step;
    the test loss is computed once after training. The function relies on the
    notebook-level names ``device``, ``brown`` and ``build_dataset``.

    Args:
        params: Dictionary containing (every key is optional):
            - embedding_dim: Dimension of word embeddings (default 60)
            - hidden_dim: Dimension of hidden layer (default 100)
            - learning_rate: Learning rate (default 0.1)
            - batch_size: Training batch size (default 64)
            - context_window: Number of context words (default 3)
            - max_steps: Maximum training steps (default 10000)
            - eval_interval: Evaluate every N steps (default 1000)
            - lr_decay_step: Step from which the learning rate is multiplied
              by 0.1 (default 150000)
            - seed: Seed passed to ``torch.manual_seed`` (default 42)

    Returns:
        dict: ``run_id``, final ``train_loss`` / ``val_loss`` / ``test_loss``,
        the matching ``*_perplexity`` values, ``n_params``, the trained
        ``model`` and the ``itos`` / ``stoi`` vocabulary mappings.

    Raises:
        ValueError: If ``max_steps`` or ``eval_interval`` is smaller than 1.
    """
    # Extract parameters with defaults
    n_emb = params.get('embedding_dim', 60)
    n_hidden = params.get('hidden_dim', 100)
    learning_rate = params.get('learning_rate', 0.1)
    batch_size = params.get('batch_size', 64)
    context_window = params.get('context_window', 3)
    max_steps = params.get('max_steps', 10000)
    eval_interval = params.get('eval_interval', 1000)
    lr_decay_step = params.get('lr_decay_step', 150000)
    seed = params.get('seed', 42)

    # Fail before the dataset is built and before an MLflow run is opened:
    # without at least one step there are no final metrics to report, and a
    # zero interval cannot be used in the `step % eval_interval` check.
    if max_steps < 1:
        raise ValueError(f'max_steps must be >= 1, got {max_steps}')
    if eval_interval < 1:
        raise ValueError(f'eval_interval must be >= 1, got {eval_interval}')

    # Set random seed for reproducibility
    torch.manual_seed(seed)

    # Build dataset
    Xtr, Xval, Xte, Ytr, Yval, Yte, vocab_size, stoi, itos = build_dataset(
        brown.words(), context_window, device=device
    )

    # Create model
    model, n_params = create_model(vocab_size, n_emb, n_hidden, context_window, device)
    parameters = model.parameters()

    print(f'\n{"="*60}')
    print(f'Starting experiment: n_emb={n_emb}, n_hidden={n_hidden}')
    print(f'Model parameters: {n_params:,}')
    print(f'Vocab size: {vocab_size}')
    print(f'{"="*60}\n')

    # Start MLflow run
    with mlflow.start_run() as run:
        # Log every setting that influences training, including the schedule
        # and evaluation cadence, so a run can be reproduced from its params.
        mlflow.log_params({
            'embedding_dim': n_emb,
            'hidden_dim': n_hidden,
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'context_window': context_window,
            'max_steps': max_steps,
            'eval_interval': eval_interval,
            'lr_decay_step': lr_decay_step,
            'vocab_size': vocab_size,
            'n_params': n_params,
            'seed': seed,
            'device': device
        })

        # Training loop
        last_step = max_steps - 1
        for step in range(max_steps):
            # Sample batch (uniformly, with replacement)
            ix = torch.randint(0, Xtr.shape[0], (batch_size,))
            Xb, Yb = Xtr[ix], Ytr[ix]

            # Forward pass
            logits = model(Xb)
            loss = F.cross_entropy(logits, Yb)

            # Backward pass
            for p in parameters:
                p.grad = None
            loss.backward()

            # Learning rate schedule: one 10x drop from lr_decay_step onward
            lr = learning_rate if step < lr_decay_step else learning_rate * 0.1

            # SGD update; no_grad keeps the in-place step out of autograd
            with torch.no_grad():
                for p in parameters:
                    p += -lr * p.grad

            # Log and evaluate. Step 0 and the last step always qualify, so
            # the values below are defined and current once the loop ends.
            if step % eval_interval == 0 or step == last_step:
                train_loss = evaluate_loss(model, Xtr, Ytr)
                val_loss = evaluate_loss(model, Xval, Yval)
                train_perplexity = math.exp(train_loss)
                val_perplexity = math.exp(val_loss)

                # Log metrics to MLflow
                mlflow.log_metrics({
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                    'train_perplexity': train_perplexity,
                    'val_perplexity': val_perplexity
                }, step=step)

                print(f'Step {step:6d}/{max_steps}: '
                      f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, '
                      f'train_ppl={train_perplexity:.2f}, val_ppl={val_perplexity:.2f}')

        # Final evaluation: reuse the last-step train/val numbers and score
        # the test split once.
        final_train_loss = train_loss
        final_val_loss = val_loss
        final_train_ppl = math.exp(final_train_loss)
        final_val_ppl = math.exp(final_val_loss)
        test_loss = evaluate_loss(model, Xte, Yte)
        test_perplexity = math.exp(test_loss)

        # Log final metrics
        mlflow.log_metrics({
            'final_train_loss': final_train_loss,
            'final_val_loss': final_val_loss,
            'final_test_loss': test_loss,
            'final_train_perplexity': final_train_ppl,
            'final_val_perplexity': final_val_ppl,
            'final_test_perplexity': test_perplexity
        })

        print(f'\nFinal Results:')
        print(f'  Train Loss: {final_train_loss:.4f}, Perplexity: {final_train_ppl:.2f}')
        print(f'  Val Loss:   {final_val_loss:.4f}, Perplexity: {final_val_ppl:.2f}')
        print(f'  Test Loss:  {test_loss:.4f}, Perplexity: {test_perplexity:.2f}')

        # Log number of parameters as metric for easy comparison
        mlflow.log_metric('n_params', n_params)

        run_id = run.info.run_id

    return {
        'run_id': run_id,
        'train_loss': final_train_loss,
        'val_loss': final_val_loss,
        'test_loss': test_loss,
        'train_perplexity': final_train_ppl,
        'val_perplexity': final_val_ppl,
        'test_perplexity': test_perplexity,
        'n_params': n_params,
        'model': model,
        'itos': itos,
        'stoi': stoi
    }

In [None]:
# Cell 9: Define experiment configurations
# Based on Bengio et al. (2003), we test smaller configurations to reduce overfitting
# The original paper used embedding dimensions around 30-100 and hidden layer sizes of 50-200

# Settings common to every run; only the two layer widths differ.
_shared_settings = {
    'learning_rate': 0.1,
    'batch_size': 64,
    'context_window': 3,
    'max_steps': 10000,
    'eval_interval': 1000,
}

# One independent dict per (embedding_dim, hidden_dim) pair.
experiments = [
    {'embedding_dim': emb_width, 'hidden_dim': hidden_width, **_shared_settings}
    for emb_width, hidden_width in (
        (30, 50),    # Small configuration (from Bengio paper)
        (60, 100),   # Medium configuration (from Bengio paper)
        (150, 400),  # Large configuration (overfitting baseline)
    )
]

print(f'Configured {len(experiments)} experiments:')
for number, cfg in enumerate(experiments, start=1):
    print(f'  {number}. n_emb={cfg["embedding_dim"]}, n_hidden={cfg["hidden_dim"]}')

In [None]:
# Cell 10: Run all experiments
results = []
total_runs = len(experiments)
hash_rule = '#' * 60
equals_rule = '=' * 60

for run_number, params in enumerate(experiments, start=1):
    print(f'\n{hash_rule}')
    print(f'Running Experiment {run_number}/{total_runs}')
    print(hash_rule)

    # Keep the configuration next to its outcome for the summary cells.
    result = train_experiment(params)
    result['config'] = params
    results.append(result)

    # Clear CUDA cache between experiments
    if device == 'cuda':
        torch.cuda.empty_cache()

print(f'\n{equals_rule}')
print('All experiments completed!')
print(equals_rule)

In [None]:
# Cell 11: Summary of results
table_rule = '-' * 80

print('\nExperiment Summary:')
print(table_rule)
print(f'{"Config":^25} | {"Params":>10} | {"Train PPL":>10} | {"Val PPL":>10} | {"Test PPL":>10}')
print(table_rule)

# One table row per finished experiment, in the order they were run.
for r in results:
    cfg = r['config']
    label = f"n_emb={cfg['embedding_dim']}, n_h={cfg['hidden_dim']}"
    print(f'{label:^25} | {r["n_params"]:>10,} | {r["train_perplexity"]:>10.2f} | {r["val_perplexity"]:>10.2f} | {r["test_perplexity"]:>10.2f}')

print(table_rule)

# Find best model by validation perplexity (lowest wins)
best_result = min(results, key=lambda entry: entry['val_perplexity'])
best_config = best_result['config']
print(f'\nBest model by validation perplexity:')
print(f'  Config: n_emb={best_config["embedding_dim"]}, n_hidden={best_config["hidden_dim"]}')
print(f'  Val Perplexity: {best_result["val_perplexity"]:.2f}')
print(f'  Run ID: {best_result["run_id"]}')

In [None]:
# Cell 12: Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, scatter_ax = axes

# Get data for plotting: one entry per experiment in each list
configs, val_ppls, train_ppls, n_params = [], [], [], []
for r in results:
    cfg = r['config']
    configs.append(f"n_emb={cfg['embedding_dim']}\nn_h={cfg['hidden_dim']}")
    val_ppls.append(r['val_perplexity'])
    train_ppls.append(r['train_perplexity'])
    n_params.append(r['n_params'] / 1e6)  # in millions

# Plot 1: Perplexity comparison (train and validation bars side by side)
x = range(len(configs))
width = 0.35
half_width = width / 2
bar_ax.bar([pos - half_width for pos in x], train_ppls, width, label='Train', alpha=0.8)
bar_ax.bar([pos + half_width for pos in x], val_ppls, width, label='Validation', alpha=0.8)
bar_ax.set_xticks(x)
bar_ax.set_xticklabels(configs)
bar_ax.set_ylabel('Perplexity')
bar_ax.set_title('Train vs Validation Perplexity by Configuration')
bar_ax.legend()

# Plot 2: Parameters vs Perplexity
scatter_ax.scatter(n_params, val_ppls, s=100, c='blue', alpha=0.7, label='Validation')
scatter_ax.scatter(n_params, train_ppls, s=100, c='orange', alpha=0.7, label='Train')
# Label each validation point with its configuration on a single line.
for label, size, ppl in zip(configs, n_params, val_ppls):
    scatter_ax.annotate(label.replace('\n', ', '), (size, ppl),
                        textcoords="offset points", xytext=(5, 5), fontsize=8)
scatter_ax.set_xlabel('Parameters (millions)')
scatter_ax.set_ylabel('Perplexity')
scatter_ax.set_title('Model Size vs Perplexity')
scatter_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Cell 13: Text generation with best model
@torch.no_grad()
def generate_sentence(model, itos, context_size=3, max_len=20, device='cuda'):
    """Sample a sentence from the trained model, one token at a time.

    Generation starts from a context filled with index 0 and stops when index
    0 is sampled again or after ``max_len`` tokens. It runs with autograd
    disabled, because sampling needs no gradients even though the model's
    tensors have them enabled for training.

    Args:
        model: Callable mapping a ``[1, context_size]`` index tensor to logits
        itos: Mapping from token index to token string
        context_size: Number of previous tokens fed to the model
        max_len: Maximum number of tokens to generate
        device: Device on which the context tensor is created

    Returns:
        str: The generated tokens joined by single spaces (may be empty).
    """
    context = [0] * context_size  # Start with padding/block
    out = []

    for _ in range(max_len):
        x = torch.tensor([context], device=device)
        logits = model(x)
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1).item()

        if ix == 0:  # End of sentence / Block
            break

        # Slide the window: drop the oldest token, append the new one.
        context = context[1:] + [ix]
        out.append(ix)

    return ' '.join(itos[i] for i in out)

# Generate sentences with best model
print('Generated sentences from best model:')
print('-' * 50)
# Five independent samples, numbered from 1.
for sample_number in range(1, 6):
    sentence = generate_sentence(
        best_result['model'],
        best_result['itos'],
        context_size=best_config['context_window'],
        device=device,
    )
    print(f'{sample_number}. {sentence}')

In [None]:
# Cell 14: View MLflow experiment info
print(f'\nMLflow UI: Run `mlflow ui` in terminal at {mlflow_tracking_path}')
print(f'Or in Colab, access via ngrok tunneling.')
print(f'\nExperiment runs saved to: {mlflow_tracking_path}')

# List runs in the experiment
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment:
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    run_count = len(runs)
    print(f'\nNumber of runs in experiment: {run_count}')
    if run_count > 0:
        # Show only the columns needed to tell the configurations apart.
        summary_columns = [
            'run_id',
            'params.embedding_dim',
            'params.hidden_dim',
            'metrics.final_val_perplexity',
        ]
        print('\nRecent runs:')
        print(runs[summary_columns].to_string())