<a href="https://colab.research.google.com/github/yashc73080/CS462-Deep-Learning/blob/main/HW1/architecture_and_hyperparam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random
import matplotlib.pyplot as plt

Generating and Preparing Data

In [2]:
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

100%|██████████| 9.91M/9.91M [00:00<00:00, 18.0MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 481kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.43MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 13.4MB/s]


In [3]:
def preprocess_data(dataset, device):
  data = (dataset.data / 255.0) - 0.5
  flattened_data = data.view(data.size(0), -1)
  targets = dataset.targets
  return flattened_data.to(device), targets.to(device)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train, y_train = preprocess_data(trainset, device)
X_test, y_test = preprocess_data(testset, device)

In [None]:
X_train.size()

torch.Size([60000, 784])

# 1. Basic Model

$F(\vec{x}) = \text{softmax}(A\vec{v}+\vec{b})$

In [28]:
class BasicModel():
  def __init__(self, input=784, output=10, device='cpu'):
    # Set device to CPU or GPU
    self.device = torch.device('cuda' if torch.cuda.is_available() else device)

    # Model parameters
    self.input = input
    self.output = output
    self.A = nn.Parameter(torch.randn(output, input, device=self.device), requires_grad=True) # shape (10, 784)
    self.b = nn.Parameter(torch.randn(output, device=self.device), requires_grad=True)  # shape (10,)

  def forward(self, x):
    logits = x @ self.A.t() + self.b # CrossEntropyLoss applies softmax internally
    return logits

  def train(self, X_train, y_train, epochs=100, lr=0.7, batch_size=64):
    self.loss_function = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD([self.A, self.b], lr=lr)

    n = X_train.size(0)

    for epoch in range(epochs):
        total_loss = 0.0

        # Process by batch for more efficiency
        for i in range(0, n, batch_size):
            x_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            logits = self.forward(x_batch)
            loss = self.loss_function(logits, y_batch)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        if epoch % 5 == 0:
          print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / (n // batch_size):.4f}")

    with torch.no_grad():
      logits = self.forward(X_train)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_train).float().mean().item()
    print(f"Final Train Accuracy: {acc:.4f}")

  def test(self, X_test, y_test):
    with torch.no_grad():
      logits = self.forward(X_test)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_test).float().mean().item()
    print(f"Test Accuracy: {acc:.4f}")
    return acc

In [29]:
basic_model = BasicModel(input=784, output=10)
basic_model.train(X_train, y_train, epochs=150, lr=0.7, batch_size=64)
basic_test_acc = basic_model.test(X_test, y_test)

Epoch 1/150, Training Loss: 1.3803
Epoch 6/150, Training Loss: 0.5289
Epoch 11/150, Training Loss: 0.4805
Epoch 16/150, Training Loss: 0.4582
Epoch 21/150, Training Loss: 0.4446
Epoch 26/150, Training Loss: 0.4340
Epoch 31/150, Training Loss: 0.4256
Epoch 36/150, Training Loss: 0.4194
Epoch 41/150, Training Loss: 0.4144
Epoch 46/150, Training Loss: 0.4103
Epoch 51/150, Training Loss: 0.4067
Epoch 56/150, Training Loss: 0.4033
Epoch 61/150, Training Loss: 0.4006
Epoch 66/150, Training Loss: 0.3983
Epoch 71/150, Training Loss: 0.3960
Epoch 76/150, Training Loss: 0.3938
Epoch 81/150, Training Loss: 0.3918
Epoch 86/150, Training Loss: 0.3899
Epoch 91/150, Training Loss: 0.3883
Epoch 96/150, Training Loss: 0.3867
Epoch 101/150, Training Loss: 0.3853
Epoch 106/150, Training Loss: 0.3840
Epoch 111/150, Training Loss: 0.3827
Epoch 116/150, Training Loss: 0.3816
Epoch 121/150, Training Loss: 0.3805
Epoch 126/150, Training Loss: 0.3795
Epoch 131/150, Training Loss: 0.3785
Epoch 136/150, Training

# 2. Fixed Size Layer Model

In [31]:
class FixedSizeNetwork(nn.Module):
  def __init__(self, input_size=784, output_size=10, num_layers=2, hidden_width=200, device='cpu'):
    '''
    Args:
      input_size (int): MNIST input size
      output_size (int): MNIST output size
      num_layers (int): Number of hidden layers (k)
      hidden_width (int): Width of hidden layers (m)
    '''
    super().__init__()

    # Set device to CPU or GPU
    self.device = torch.device('cuda' if torch.cuda.is_available() else device)

    # All layers of network
    layers = []

    # First layer (Input -> H1)
    layers.append(nn.Linear(input_size, hidden_width))
    layers.append(nn.Tanh())

    # Rest of the layers
    for _ in range(num_layers - 1):
      layers.append(nn.Linear(hidden_width, hidden_width))
      layers.append(nn.Tanh())

    # Wrap hidden layers
    self.hidden_layers = nn.Sequential(*layers)

    # Add output layer (Hk -> Output)
    self.output_layer = nn.Linear(hidden_width, output_size)

    # Move whole network to device
    self.to(self.device)

  def forward(self, x):
    # Flatten input
    x = x.view(x.size(0), -1)

    # Pass input through all hidden layers
    x = self.hidden_layers(x)

    # Pass through output layer
    logits = self.output_layer(x)

    return logits

  def train(self, X_train, y_train, epochs=100, lr=0.7, batch_size=64):
    self.loss_function = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD(self.parameters(), lr=lr)

    n = X_train.size(0)

    for epoch in range(epochs):
      total_loss = 0.0

      for i in range(0, n, batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = self.forward(x_batch)
        loss = self.loss_function(logits, y_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        total_loss += loss.item()

      if epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / (n // batch_size):.4f}")

    with torch.no_grad():
      logits = self.forward(X_train)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_train).float().mean().item()
    print(f"Final Train Accuracy: {acc:.4f}")

  def test(self, X_test, y_test):
    with torch.no_grad():
      logits = self.forward(X_test)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_test).float().mean().item()
    print(f"Test Accuracy: {acc:.4f}")
    return acc

In [32]:
fixed_network = FixedSizeNetwork(input_size=784, output_size=10, num_layers=2, hidden_width=200)
fixed_network.train(X_train, y_train, epochs=100, lr=0.7, batch_size=64)
fixed_test_acc = fixed_network.test(X_test, y_test)

Epoch 1/100, Training Loss: 58.5420
Epoch 6/100, Training Loss: 60.8285
Epoch 11/100, Training Loss: 59.5987
Epoch 16/100, Training Loss: 59.9111
Epoch 21/100, Training Loss: 60.0360
Epoch 26/100, Training Loss: 60.6065
Epoch 31/100, Training Loss: 59.6161
Epoch 36/100, Training Loss: 60.1842
Epoch 41/100, Training Loss: 60.4737
Epoch 46/100, Training Loss: 60.0701
Epoch 51/100, Training Loss: 59.2184
Epoch 56/100, Training Loss: 60.3127
Epoch 61/100, Training Loss: 60.0912
Epoch 66/100, Training Loss: 59.9352
Epoch 71/100, Training Loss: 59.8623
Epoch 76/100, Training Loss: 59.5196
Epoch 81/100, Training Loss: 59.5686
Epoch 86/100, Training Loss: 59.3367
Epoch 91/100, Training Loss: 59.3611
Epoch 96/100, Training Loss: 58.9617
Final Train Accuracy: 0.1124
Test Accuracy: 0.1135
