# Project : Optimization for Machine Learning

Comparison script for the impact of learning rate on the convergence of different optimizers.

Tested optimizers: Adam, RMSprop, AdaGrad, AdamW, AmsGrad, SGD, NAdam, RAdam
Objective: Study how robust each optimizer is to the choice of learning rate
Analyzed metrics: loss, accuracy, convergence speed

### Initial Setup

In [1]:
import os
import pickle
import random
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm.notebook import tqdm

In [2]:
def set_seed(seed=1):
    """
    Seed every random number generator used in this notebook.

    Args:
        seed (int): Value used to seed Python's `random`, NumPy and PyTorch.

    Besides seeding, cuDNN is switched to deterministic mode and its
    auto-tuner (benchmark mode) is turned off. When CUDA is available,
    the seed is also applied to all CUDA devices.
    """
    # cuDNN configuration: deterministic mode on, benchmark auto-tuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Seed the three CPU-side generators
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Seed every CUDA device when a GPU is present
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def seed_worker(worker_id):
    """
    Seed NumPy and Python's `random` inside a DataLoader worker process.

    Intended to be passed to a PyTorch DataLoader as `worker_init_fn`.

    Args:
        worker_id (int): Unique ID of the worker process. It is accepted
            because DataLoader passes it to `worker_init_fn`, but the seed
            is taken from PyTorch's per-process seed rather than from the
            raw ID.

    The seed is `torch.initial_seed() % 2**32`. According to the PyTorch
    reproducibility notes, the torch seed inside a worker is derived from
    the DataLoader's base seed and the worker ID, so it is distinct per
    worker and follows the loader's generator. Seeding with the bare
    `worker_id` would give identical NumPy / `random` streams no matter
    which generator or run seed is in use.
    """
    # NumPy only accepts seeds in [0, 2**32 - 1], hence the modulo
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [3]:
# Mini-batch size used by both the training and the test data loaders
BATCH_SIZE = 256

# How many full passes over the training set each run performs
EPOCHS = 20

# Compute device: the GPU when CUDA is available, the CPU otherwise
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Dataset Preprocessing and CNN benchmark model

In [4]:
# Preprocessing applied to every image:
#   1. convert the PIL image to a tensor
#   2. normalize each of the three channels with mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 train and test splits share the same root folder and preprocessing
_cifar_kwargs = dict(root='./data', download=True, transform=transform)
train_dataset = datasets.CIFAR10(train=True, **_cifar_kwargs)
test_dataset = datasets.CIFAR10(train=False, **_cifar_kwargs)

# Dedicated, seeded generator that drives the shuffling of the training loader
g = torch.Generator().manual_seed(42)

# Training loader: shuffled through the seeded generator, with workers
# initialized by `seed_worker`
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=g,
    worker_init_fn=seed_worker,
)

# Test loader: fixed order, since shuffling is not needed for evaluation
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
class CNN_model(nn.Module):
    """
    Small convolutional classifier used as the benchmark model.

    Two conv -> ReLU -> max-pool stages are followed by a two-layer fully
    connected head that outputs 10 class scores. The first linear layer
    expects 128 * 8 * 8 features, which corresponds to 3-channel 32x32
    inputs after the two 2x poolings.
    """

    def __init__(self):
        super().__init__()

        # Convolutional stages: the 3x3 convolutions use padding 1, and each
        # max-pool halves the spatial resolution
        conv_stages = [
            nn.Conv2d(3, 64, kernel_size=3, padding=1),    # RGB input -> 64 channels
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),  # 64 -> 128 channels
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        self.features = nn.Sequential(*conv_stages)

        # Classification head: flatten, one hidden layer, then 10 output scores
        head_layers = [
            nn.Flatten(),                 # 128 x 8 x 8 feature maps -> vector
            nn.Linear(128 * 8 * 8, 256),  # hidden layer with 256 units
            nn.ReLU(),
            nn.Linear(256, 10),           # one score per CIFAR-10 class
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the class scores for the batch `x`."""
        return self.classifier(self.features(x))


### Training and testing functions

In [6]:
def test(model, test_loader):
    """
    Compute the classification accuracy of `model` on `test_loader`.

    Args:
        model (nn.Module): Model to evaluate. It is switched to evaluation
            mode and is left in that mode when the function returns.
        test_loader (DataLoader): Yields `(inputs, targets)` batches.

    Returns:
        float: Fraction of samples whose predicted class equals the target.

    Batches are moved to the module-level `DEVICE` before the forward pass.
    """
    model.eval()

    n_correct = 0
    n_seen = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            batch_inputs = batch_inputs.to(DEVICE)
            batch_targets = batch_targets.to(DEVICE)

            # Predicted class = index of the largest output score per sample
            predicted = model(batch_inputs).max(dim=1).indices

            n_seen += batch_targets.size(0)
            n_correct += (predicted == batch_targets).sum().item()

    return n_correct / n_seen

In [7]:
def train(model, optimizer, criterion, epochs, train_loader):
    """
    Train the model for a number of epochs while tracking the training loss
    and the test accuracy before training and after every epoch.

    Args:
        model (nn.Module): The neural network model to train.
        optimizer (torch.optim.Optimizer): Optimizer to update model parameters.
        criterion (nn.Module): Loss function.
        epochs (int): Number of training epochs.
        train_loader (DataLoader): DataLoader for training data.

    Returns:
        losses (list of float): Average training loss per epoch; the first
            entry is the loss of the untrained model.
        acc_history (list of float): Test accuracy per epoch; the first entry
            is the accuracy of the untrained model.

    Note:
        Accuracy is measured with the module-level `test` function on the
        module-level `test_loader`, and batches are moved to the module-level
        `DEVICE`.
    """
    losses = []       # Average loss: baseline first, then one entry per epoch
    acc_history = []  # Test accuracy: baseline first, then one entry per epoch

    # Baseline pass before any update. It is a pure measurement, so it runs in
    # eval mode and without building autograd graphs.
    model.eval()
    initial_loss = 0.0
    with torch.no_grad():
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            initial_loss += criterion(model(inputs), targets).item()
    losses.append(initial_loss / len(train_loader))

    # Baseline accuracy on the test set
    acc_history.append(test(model, test_loader))

    for _ in range(epochs):
        # test() switches the model to eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once.
        model.train()
        total_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()               # Clear gradients of the previous step
            loss = criterion(model(inputs), targets)
            loss.backward()                     # Backpropagation
            optimizer.step()                    # Parameter update
            total_loss += loss.item()

        # Average of the per-batch losses seen during this epoch
        losses.append(total_loss / len(train_loader))

        # Test accuracy after this epoch (leaves the model in eval mode)
        acc_history.append(test(model, test_loader))

    return losses, acc_history

### Comparison of optimizers

In [8]:
# Learning rates swept in the experiments
learning_rates = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

# Seeds used for the repeated runs that are later averaged
seeds = [0, 1, 2]


def _optimizer_factory(optimizer_cls, **fixed_kwargs):
    """Return a `(model, lr)` callable that builds `optimizer_cls` on the model's parameters."""
    def build(model, lr):
        return optimizer_cls(model.parameters(), lr=lr, **fixed_kwargs)
    return build


# Optimizer name -> constructor taking (model, lr).
# The trailing value on some entries is presumably that optimizer's default
# learning rate in PyTorch (to be confirmed).
optimizers_dict = {
    'Adam': _optimizer_factory(optim.Adam),
    'RMSprop': _optimizer_factory(optim.RMSprop),             # 0.01
    'AdaGrad': _optimizer_factory(optim.Adagrad),             # 0.01
    'AdamW': _optimizer_factory(optim.AdamW),                 # 0.001
    'AmsGrad': _optimizer_factory(optim.Adam, amsgrad=True),  # 0.001
    'NAdam': _optimizer_factory(optim.NAdam),                 # 0.002
    'RAdam': _optimizer_factory(optim.RAdam),                 # 0.001
    'SGD': _optimizer_factory(optim.SGD),                     # 0.001
}

In [9]:
# Folder that receives the per-optimizer checkpoints and the final aggregated results
output_dir = 'simple_studies/8model_20epoch_saved_loss'
# Create it up front: otherwise the first open(..., 'wb') below would fail
# only after a complete optimizer sweep has already been trained.
os.makedirs(output_dir, exist_ok=True)

# Nested dictionary: results[optimizer name][learning rate] -> metrics
results = defaultdict(dict)

# Loop over each optimizer and its corresponding constructor function
for opt_name, opt_fn in tqdm(optimizers_dict.items()):
    print(f"\n>>> Testing optimizer: {opt_name}")

    # Loop over the defined learning rates
    for lr in tqdm(learning_rates):
        print(f"  - LR = {lr}")

        # Per-seed histories for this optimizer / learning-rate combination
        results[opt_name][lr] = {
            'accuracies': [],
            'losses': []
        }

        # Repeat the run for each seed so that randomness can be averaged out
        for seed in seeds:
            set_seed(seed)  # Seed torch, numpy and random
            # The training loader shuffles with its own generator `g`, which
            # set_seed() does not touch. Re-seed it for every run so that the
            # batch order depends only on `seed`, not on how many runs came before.
            g.manual_seed(seed)

            model = CNN_model().to(DEVICE)     # Fresh model on the target device
            optimizer = opt_fn(model, lr)      # Optimizer with the learning rate under test
            criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for classification

            # Train the model and get loss and accuracy history
            losses, acc_history = train(model, optimizer, criterion, EPOCHS, train_loader)

            # Append the accuracy and loss history for this seed
            results[opt_name][lr]['accuracies'].append(acc_history)
            results[opt_name][lr]['losses'].append(losses)

    # Checkpoint after each optimizer to limit data loss on long runs.
    # The file holds everything accumulated so far, not only this optimizer.
    with open(f'{output_dir}/{opt_name}', 'wb') as f:
        pickle.dump(results, f)


# After all training runs, compute the mean and standard deviation across seeds
# (axis 0 indexes the seed) for accuracy and loss
for opt_name in results:
    for lr in results[opt_name]:
        run_stats = results[opt_name][lr]

        accs_all_seeds = np.array(run_stats['accuracies'])
        run_stats['mean_acc'] = accs_all_seeds.mean(axis=0)
        run_stats['std_acc'] = accs_all_seeds.std(axis=0)

        losses_all_seeds = np.array(run_stats['losses'])
        run_stats['mean_losses'] = losses_all_seeds.mean(axis=0)
        run_stats['std_losses'] = losses_all_seeds.std(axis=0)

# Save the final aggregated results with means and stds for all optimizers and learning rates
with open(f'{output_dir}/final_losses', 'wb') as f:
    pickle.dump(results, f)

  0%|          | 0/8 [00:00<?, ?it/s]


>>> Testing optimizer: Adam


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: RMSprop


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: AdaGrad


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: AdamW


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: AmsGrad


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: NAdam


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: RAdam


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1

>>> Testing optimizer: SGD


  0%|          | 0/5 [00:00<?, ?it/s]

  - LR = 1e-05
  - LR = 0.0001
  - LR = 0.001
  - LR = 0.01
  - LR = 0.1
