<a href="https://colab.research.google.com/github/yashc73080/CS462-Deep-Learning/blob/main/HW1/architecture_and_hyperparam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random
import matplotlib.pyplot as plt

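# Loading and Preparing Data

MNIST is downloaded through torchvision; the pixel values are then shifted and scaled to the range [-0.5, 0.5] and every image is flattened into a 784-dimensional vector.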

In [2]:
# Both splits share the same storage location, download flag and transform.
mnist_kwargs = dict(root='./data', download=True, transform=transforms.ToTensor())
trainset = torchvision.datasets.MNIST(train=True, **mnist_kwargs)
testset = torchvision.datasets.MNIST(train=False, **mnist_kwargs)

100%|██████████| 9.91M/9.91M [00:00<00:00, 18.0MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 481kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.43MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 13.4MB/s]


In [3]:
def preprocess_data(dataset, device):
  """Turn a dataset's raw image tensor into centred, flattened feature rows.

  Args:
    dataset: object exposing `.data` (image tensor, first dimension = samples)
      and `.targets` (label tensor).
    device: torch device that both returned tensors are moved to.

  Returns:
    Tuple `(features, labels)`; `features` has shape (num_samples, -1) and
    holds `data / 255.0 - 0.5`.
  """
  num_samples = dataset.data.size(0)
  # Assumes 0-255 pixel intensities, which this maps onto [-0.5, 0.5].
  scaled = dataset.data / 255.0 - 0.5
  features = scaled.view(num_samples, -1).to(device)
  labels = dataset.targets.to(device)
  return features, labels

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Run the identical preprocessing over the train split, then the test split.
(X_train, y_train), (X_test, y_test) = (
    preprocess_data(split, device) for split in (trainset, testset)
)

In [None]:
# Sanity check: one flattened row per training image.
X_train.shape

torch.Size([60000, 784])

# Basic Model

$F(\vec{x}) = \text{softmax}(A\vec{x}+\vec{b})$

In [14]:
class BasicModel():
  """Single-layer softmax classifier: logits = x @ A^T + b.

  `A` has shape (output, input) and `b` has shape (output,); both are drawn
  from a standard normal distribution and trained with mini-batch SGD on a
  cross-entropy loss.
  """

  def __init__(self, input=784, output=10, device='cpu'):
    """Create the randomly initialised parameters.

    Args:
      input: number of input features per sample.
      output: number of classes (rows of `A`).
      device: fallback device; CUDA is used instead whenever it is available.
    """
    # CUDA takes precedence when present; `device` is only the fallback.
    self.device = torch.device('cuda' if torch.cuda.is_available() else device)

    # Model parameters
    self.input = input
    self.output = output
    self.A = nn.Parameter(torch.randn(output, input, device=self.device), requires_grad=True)  # shape (output, input)
    self.b = nn.Parameter(torch.randn(output, device=self.device), requires_grad=True)  # shape (output,)

  def forward(self, x):
    """Return raw class logits for a batch `x` of shape (batch, input).

    No softmax is applied here because CrossEntropyLoss applies it internally.
    """
    logits = x @ self.A.t() + self.b
    return logits

  def _accuracy(self, X, y):
    """Fraction of rows in `X` whose arg-max logit equals the label in `y`."""
    with torch.no_grad():
      preds = torch.argmax(self.forward(X), dim=1)
      return (preds == y).float().mean().item()

  def train(self, X_train, y_train, epochs=100, lr=0.7, batch_size=64):
    """Fit `A` and `b` with mini-batch SGD and print progress.

    Batches are taken in order (no shuffling). Every fifth epoch the mean of
    the per-batch losses is printed, and the final training accuracy is
    printed once training ends.

    Args:
      X_train: feature tensor of shape (n, input), already on the model device.
      y_train: integer class labels of shape (n,).
      epochs: number of full passes over the data.
      lr: SGD learning rate.
      batch_size: number of samples per optimisation step.

    Raises:
      ValueError: if `X_train` is empty or `batch_size` is not positive.
    """
    n = X_train.size(0)
    if n == 0:
      raise ValueError("X_train must contain at least one sample")
    if batch_size <= 0:
      raise ValueError("batch_size must be a positive integer")

    self.loss_function = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD([self.A, self.b], lr=lr)

    batch_starts = range(0, n, batch_size)
    # Count the trailing partial batch too; `n // batch_size` drops it and is
    # zero whenever the whole dataset fits inside one batch.
    num_batches = len(batch_starts)

    for epoch in range(epochs):
      total_loss = 0.0

      # Process by batch for more efficiency
      for i in batch_starts:
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = self.forward(x_batch)
        loss = self.loss_function(logits, y_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        total_loss += loss.item()

      if epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / num_batches:.4f}")

    acc = self._accuracy(X_train, y_train)
    print(f"Final Train Accuracy: {acc:.4f}")

  def test(self, X_test, y_test):
    """Print and return the classification accuracy on `(X_test, y_test)`."""
    acc = self._accuracy(X_test, y_test)
    print(f"Test Accuracy: {acc:.4f}")
    return acc

In [15]:
# Train the single-layer baseline, then keep its test accuracy in `basic_test_acc`.
basic_hparams = dict(epochs=150, lr=0.7, batch_size=64)
basic_model = BasicModel(input=784, output=10)
basic_model.train(X_train, y_train, **basic_hparams)
basic_test_acc = basic_model.test(X_test, y_test)

Epoch 1/150, Training Loss: 1.3129
Epoch 6/150, Training Loss: 0.5426
Epoch 11/150, Training Loss: 0.4896
Epoch 16/150, Training Loss: 0.4661
Epoch 21/150, Training Loss: 0.4504
Epoch 26/150, Training Loss: 0.4388
Epoch 31/150, Training Loss: 0.4301
Epoch 36/150, Training Loss: 0.4222
Epoch 41/150, Training Loss: 0.4163
Epoch 46/150, Training Loss: 0.4115
Epoch 51/150, Training Loss: 0.4077
Epoch 56/150, Training Loss: 0.4044
Epoch 61/150, Training Loss: 0.4014
Epoch 66/150, Training Loss: 0.3987
Epoch 71/150, Training Loss: 0.3963
Epoch 76/150, Training Loss: 0.3941
Epoch 81/150, Training Loss: 0.3921
Epoch 86/150, Training Loss: 0.3903
Epoch 91/150, Training Loss: 0.3885
Epoch 96/150, Training Loss: 0.3869
Epoch 101/150, Training Loss: 0.3854
Epoch 106/150, Training Loss: 0.3840
Epoch 111/150, Training Loss: 0.3827
Epoch 116/150, Training Loss: 0.3815
Epoch 121/150, Training Loss: 0.3803
Epoch 126/150, Training Loss: 0.3793
Epoch 131/150, Training Loss: 0.3782
Epoch 136/150, Training

# Fixed Size Layer Model

In [None]:
class FixedSizeNetwork(nn.Module):
  """Fully connected tanh network whose hidden layers all share one width.

  Architecture: Linear(input_size, hidden_width) + Tanh, followed by
  `num_layers - 1` further Linear(hidden_width, hidden_width) + Tanh pairs,
  followed by a Linear(hidden_width, output_size) layer producing raw logits.
  """

  def __init__(self, input_size=784, output_size=10, num_layers=2, hidden_width=200, device='cpu'):
    """Build the layers and move them to the selected device.

    Args:
      input_size: number of features per (flattened) sample.
      output_size: number of classes.
      num_layers: number of hidden layers; the first hidden layer is always
        created, so values below 1 still yield one hidden layer.
      hidden_width: number of units in every hidden layer.
      device: fallback device; CUDA is used instead whenever it is available.
    """
    super().__init__()

    # CUDA takes precedence when present; `device` is only the fallback.
    self.device = torch.device('cuda' if torch.cuda.is_available() else device)

    # First layer (Input -> H1)
    layers = [nn.Linear(input_size, hidden_width), nn.Tanh()]

    # Rest of the layers
    for _ in range(num_layers - 1):
      layers.append(nn.Linear(hidden_width, hidden_width))
      layers.append(nn.Tanh())

    # Wrap hidden layers
    self.hidden_layers = nn.Sequential(*layers)

    # Add output layer (Hk -> Output)
    self.output_layer = nn.Linear(hidden_width, output_size)

    # Move whole network to device
    self.to(self.device)

  def forward(self, x):
    """Return raw class logits for a batch; inputs are flattened per sample."""
    # Flatten input
    x = x.view(x.size(0), -1)

    # Pass input through all hidden layers
    x = self.hidden_layers(x)

    # Pass through output layer
    logits = self.output_layer(x)

    return logits

  def _accuracy(self, X, y):
    """Fraction of rows in `X` whose arg-max logit equals the label in `y`."""
    self.eval()
    with torch.no_grad():
      preds = torch.argmax(self.forward(X), dim=1)
      return (preds == y).float().mean().item()

  def train(self, X_train=True, y_train=None, epochs=100, lr=0.7, batch_size=64):
    """Fit the network with mini-batch SGD, or switch the module's mode.

    This method shares its name with `nn.Module.train(mode)`, which
    `nn.Module.eval()` relies on. To keep that machinery working, a call
    with a single bool (or no arguments) is forwarded to `nn.Module.train`
    and returns `self`. A call with both `X_train` and `y_train` runs the
    training loop, prints the mean per-batch loss every fifth epoch and the
    final training accuracy, and returns None. The module is left in eval
    mode after fitting.

    Args:
      X_train: feature tensor with one sample per row, already on the model
        device (or a bool when used as `nn.Module.train(mode)`).
      y_train: integer class labels of shape (n,).
      epochs: number of full passes over the data.
      lr: SGD learning rate.
      batch_size: number of samples per optimisation step.

    Raises:
      TypeError: if `y_train` is omitted and `X_train` is not a bool.
      ValueError: if `X_train` is empty or `batch_size` is not positive.
    """
    if y_train is None:
      if isinstance(X_train, bool):
        return super().train(X_train)
      raise TypeError("train() expects either a bool mode or both X_train and y_train")

    n = X_train.size(0)
    if n == 0:
      raise ValueError("X_train must contain at least one sample")
    if batch_size <= 0:
      raise ValueError("batch_size must be a positive integer")

    super().train(True)
    # Kept as locals: assigning a loss module to `self` would register it as
    # a child module of the network.
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(self.parameters(), lr=lr)

    batch_starts = range(0, n, batch_size)
    # Includes the trailing partial batch, so the average is never over zero batches.
    num_batches = len(batch_starts)

    for epoch in range(epochs):
      total_loss = 0.0

      for i in batch_starts:
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = self.forward(x_batch)
        loss = loss_function(logits, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

      if epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / num_batches:.4f}")

    acc = self._accuracy(X_train, y_train)
    print(f"Final Train Accuracy: {acc:.4f}")

  def test(self, X_test, y_test):
    """Print and return the classification accuracy on `(X_test, y_test)`."""
    acc = self._accuracy(X_test, y_test)
    print(f"Test Accuracy: {acc:.4f}")
    return acc

