1. Since time did not allow, I just run for 10 epochs for fashion_mnist, and 50 epochs for cifar, but it still shows which setting is better. 

2. Don't look at the printings becuase I had to check for something and I printed out something else. But if you look at the files, the logging is for both datasets.

In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import csv
from matplotlib import pyplot as plt
import numpy as np
import os

In [2]:
# Function to set the random seed for reproducibility
def set_seed(seed_value):
    np.random.seed(seed_value)
set_seed(42)

## 1 and 2

In [3]:
# Step 1: Dataset Preparation (FashionMNIST and CIFAR10)
transform_fashion = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
transform_cifar = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Load FashionMNIST
fashion_mnist_train = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform_fashion)
fashion_mnist_test = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform_fashion)

# Load CIFAR10
cifar_train = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_cifar)
cifar_test = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_cifar)

# Split datasets into training and validation sets
train_size = int(0.8 * len(fashion_mnist_train))
val_size = len(fashion_mnist_train) - train_size
fashion_mnist_train, fashion_mnist_val = random_split(fashion_mnist_train, [train_size, val_size])


train_size_cifar = int(0.8 * len(cifar_train))
val_size_cifar = len(cifar_train) - train_size_cifar
cifar_train, cifar_val = random_split(cifar_train, [train_size_cifar, val_size_cifar])



# Create DataLoaders
batch_size = 64
fashion_mnist_train_loader = DataLoader(fashion_mnist_train, batch_size=batch_size, shuffle=True)
fashion_mnist_val_loader = DataLoader(fashion_mnist_val, batch_size=batch_size)
fashion_mnist_test_loader = DataLoader(fashion_mnist_test, batch_size=batch_size)


cifar_train_loader = DataLoader(cifar_train, batch_size=batch_size, shuffle=True)
cifar_val_loader = DataLoader(cifar_val, batch_size=batch_size)
cifar_test_loader = DataLoader(cifar_test, batch_size=batch_size)


Files already downloaded and verified
Files already downloaded and verified


## 3

In [4]:
class SimpleCNN(nn.Module):
    def __init__(self, in_channels=1, num_classes=10, input_size=28):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # We initialize fc1 with None here, and set it once we know the size in forward
        self.fc1 = None
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten

        # Initialize the fc1 layer dynamically based on flattened size if not already set
        if self.fc1 is None:
            self.fc1 = nn.Linear(x.size(1), 128).to(x.device)
        
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x



fashion_mnist_model = SimpleCNN(in_channels=1, num_classes=10, input_size=28)
cifar_model = SimpleCNN(in_channels=3, num_classes=10, input_size=32)


## 4

In [5]:
def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, csv_filename):
    model.train()
    losses = []  # Store training losses per epoch
    best_val_accuracy = 0  # Track the best validation accuracy during training

    with open(csv_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['epoch', 'train_loss', 'grad_norm', 'val_accuracy'])
        
        for epoch in range(num_epochs):
            running_loss = 0.0
            grad_norms = []
            for images, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
                grad_norm = sum((p.grad.norm() ** 2).item() for p in model.parameters() if p.grad is not None)
                grad_norms.append(grad_norm)

            # Calculate average training loss and gradient norm for the epoch
            train_loss = running_loss / len(train_loader)
            avg_grad_norm = sum(grad_norms) / len(grad_norms)
            losses.append(train_loss)
            
            # Calculate validation accuracy for this epoch
            val_accuracy = calculate_accuracy(model, val_loader)
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy  # Update best validation accuracy

            # Write data to CSV
            writer.writerow([epoch, train_loss, avg_grad_norm, val_accuracy])
            print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {train_loss:.4f}, Grad Norm: {avg_grad_norm:.4f}, Val Accuracy: {val_accuracy:.4f}")
    
    return losses, best_val_accuracy 

# Accuracy calculation function
def calculate_accuracy(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total


The tuning and going through all the optimization models and datasets are being coded in section 5

## 5

In [6]:
# Step 5: Custom Implementations of Adagrad and Adam
class CustomAdagrad:
    def __init__(self, params, lr=0.01, epsilon=1e-8):
        self.params = list(params)
        self.lr = lr
        self.epsilon = epsilon
        self.squared_gradients = [torch.zeros_like(p) for p in self.params]

    def step(self):
        for i, param in enumerate(self.params):
            if param.grad is None:
                continue
            grad = param.grad.data
            self.squared_gradients[i] += grad ** 2
            param.data -= self.lr * grad / (torch.sqrt(self.squared_gradients[i]) + self.epsilon)

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

class CustomAdam:
    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.params = list(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0
        self.m = [torch.zeros_like(p) for p in self.params]
        self.v = [torch.zeros_like(p) for p in self.params]

    def step(self):
        self.t += 1
        for i, param in enumerate(self.params):
            if param.grad is None:
                continue
            grad = param.grad.data
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad ** 2
            m_hat = self.m[i] / (1 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1 - self.beta2 ** self.t)
            param.data -= self.lr * m_hat / (torch.sqrt(v_hat) + self.epsilon)

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()


## 8

In [7]:
# Define criterion and hyperparameters
criterion = nn.CrossEntropyLoss()
learning_rates = [0.01, 0.001]
momentums = [0.9, 0.99]
num_epochs = 10
batch_size = 64

# Define datasets
datasets = {
    "FashionMNIST": (fashion_mnist_train_loader, fashion_mnist_val_loader),
    "CIFAR10": (cifar_train_loader, cifar_val_loader)  # Uncomment if CIFAR10 data loaders are defined
}

# Define optimizers, including custom implementations
optimizers = {
    "SGD": lambda params, lr: optim.SGD(params, lr=lr),
    "SGD with Momentum": lambda params, lr, momentum: optim.SGD(params, lr=lr, momentum=momentum),
    "Adagrad": lambda params, lr: optim.Adagrad(params, lr=lr),
    "Adam": lambda params, lr: optim.Adam(params, lr=lr),
    "Custom Adagrad": lambda params, lr: CustomAdagrad(params, lr=lr),
    "Custom Adam": lambda params, lr: CustomAdam(params, lr=lr)
}

# Directory for storing results
os.makedirs('results', exist_ok=True)

# Initialize variables to store the best hyperparameters and validation accuracy
best_overall_val_accuracy = 0
best_overall_hyperparams = None
best_overall_optimizer = None
best_overall_dataset = None
best_model = None

# Loop through each dataset, optimizer, and hyperparameter combination
for dataset_name, (train_loader, val_loader) in datasets.items():
    for opt_name, opt_fn in optimizers.items():
        for lr in learning_rates:
            for momentum in (momentums if "Momentum" in opt_name else [None]):
                model = SimpleCNN(in_channels=1 if dataset_name == "FashionMNIST" else 3, num_classes=10)
                
                # Initialize optimizer based on the presence of momentum
                if opt_name == "SGD with Momentum":
                    optimizer = opt_fn(model.parameters(), lr=lr, momentum=momentum)
                else:
                    optimizer = opt_fn(model.parameters(), lr=lr)
                
                # Generate the CSV filename with the dataset name, even if momentum is None
                csv_filename = f'results/{opt_name}_lr_{lr}_momentum_{momentum}_{dataset_name}.csv'
                
                # Train the model and save results
                print(f"Training {opt_name} on {dataset_name} with lr={lr}, momentum={momentum}")
                losses, val_accuracy = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, csv_filename)
                
                # Update best hyperparameters if the current validation accuracy is higher
                if val_accuracy > best_overall_val_accuracy:
                    best_overall_val_accuracy = val_accuracy
                    best_overall_hyperparams = {'learning_rate': lr, 'momentum': momentum}
                    best_overall_optimizer = opt_name
                    best_overall_dataset = dataset_name
                    best_model = model

# Print the best hyperparameters and corresponding validation accuracy
print("\nBest Hyperparameters and Optimizer:")
print(f"Dataset: {best_overall_dataset}")
print(f"Optimizer: {best_overall_optimizer}")
print(f"Learning Rate: {best_overall_hyperparams['learning_rate']}")
print(f"Momentum: {best_overall_hyperparams['momentum']}")
print(f"Best Validation Accuracy: {best_overall_val_accuracy}")


Training SGD on FashionMNIST with lr=0.01, momentum=None
Epoch [1/10] Loss: 2.1677, Grad Norm: 91851.1619, Val Accuracy: 0.5457
Epoch [2/10] Loss: 1.1511, Grad Norm: 4031981.6935, Val Accuracy: 0.7085
Epoch [3/10] Loss: 0.7266, Grad Norm: 12239228.2205, Val Accuracy: 0.7491
Epoch [4/10] Loss: 0.6394, Grad Norm: 18692607.1182, Val Accuracy: 0.7744
Epoch [5/10] Loss: 0.5934, Grad Norm: 24157574.8056, Val Accuracy: 0.7896
Epoch [6/10] Loss: 0.5612, Grad Norm: 29794282.6937, Val Accuracy: 0.7997
Epoch [7/10] Loss: 0.5348, Grad Norm: 35424131.3419, Val Accuracy: 0.8154
Epoch [8/10] Loss: 0.5141, Grad Norm: 41060839.7834, Val Accuracy: 0.8214
Epoch [9/10] Loss: 0.4961, Grad Norm: 46536602.0518, Val Accuracy: 0.8242
Epoch [10/10] Loss: 0.4806, Grad Norm: 52160637.3433, Val Accuracy: 0.8332
Training SGD on FashionMNIST with lr=0.001, momentum=None
Epoch [1/10] Loss: 2.2930, Grad Norm: 38142.9929, Val Accuracy: 0.1235
Epoch [2/10] Loss: 2.2760, Grad Norm: 288143.8323, Val Accuracy: 0.1693
Epoch