<a href="https://colab.research.google.com/github/yashc73080/CS462-Deep-Learning/blob/main/HW1/architecture_and_hyperparam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random
import matplotlib.pyplot as plt

Set the random seed

In [19]:
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

Generating and Preparing Data

In [20]:
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

In [21]:
def preprocess_data(dataset, device):
  data = (dataset.data / 255.0) - 0.5
  flattened_data = data.view(data.size(0), -1)
  targets = dataset.targets
  return flattened_data.to(device), targets.to(device)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train, y_train = preprocess_data(trainset, device)
X_test, y_test = preprocess_data(testset, device)

In [38]:
X_train.size()

torch.Size([60000, 784])

# 1. Basic Model

$F(\vec{x}) = \text{softmax}(A\vec{v}+\vec{b})$

In [17]:
class BasicModel():
  def __init__(self, input=784, output=10, device='cpu'):
    # Set device to CPU or GPU
    self.device = torch.device('cuda' if torch.cuda.is_available() else device)

    # Model parameters
    self.input = input
    self.output = output
    self.A = nn.Parameter(torch.randn(output, input, device=self.device), requires_grad=True) # shape (10, 784)
    self.b = nn.Parameter(torch.randn(output, device=self.device), requires_grad=True)  # shape (10,)

  def forward(self, x):
    logits = x @ self.A.t() + self.b # CrossEntropyLoss applies softmax internally
    return logits

  def train(self, X_train, y_train, epochs=100, lr=0.7, batch_size=64):
    self.loss_function = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD([self.A, self.b], lr=lr)

    n = X_train.size(0)
    final_train_loss = 0.0

    for epoch in range(epochs):
        total_loss = 0.0

        # Process by batch for more efficiency
        for i in range(0, n, batch_size):
            x_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            logits = self.forward(x_batch)
            loss = self.loss_function(logits, y_batch)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        final_train_loss = total_loss / (n // batch_size)
        if epoch % 5 == 0:
          print(f"Epoch {epoch+1}/{epochs}, Training Loss: {final_train_loss:.4f}")

    with torch.no_grad():
      logits = self.forward(X_train)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_train).float().mean().item()
    print(f"Final Train Loss: {final_train_loss:.4f}, Final Train Accuracy: {acc:.4f}")

  def test(self, X_test, y_test):
    with torch.no_grad():
      logits = self.forward(X_test)
      loss = self.loss_function(logits, y_test)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_test).float().mean().item()
    print(f"Test Loss: {loss.item():.4f}, Test Accuracy: {acc:.4f}")
    return loss.item(), acc

In [22]:
basic_model = BasicModel(input=784, output=10)
basic_model.train(X_train, y_train, epochs=350, lr=0.05, batch_size=64)
basic_test_loss, basic_test_acc = basic_model.test(X_test, y_test)

Epoch 1/350, Training Loss: 3.0408
Epoch 6/350, Training Loss: 0.6910
Epoch 11/350, Training Loss: 0.5564
Epoch 16/350, Training Loss: 0.4908
Epoch 21/350, Training Loss: 0.4497
Epoch 26/350, Training Loss: 0.4210
Epoch 31/350, Training Loss: 0.3995
Epoch 36/350, Training Loss: 0.3826
Epoch 41/350, Training Loss: 0.3688
Epoch 46/350, Training Loss: 0.3573
Epoch 51/350, Training Loss: 0.3475
Epoch 56/350, Training Loss: 0.3391
Epoch 61/350, Training Loss: 0.3318
Epoch 66/350, Training Loss: 0.3253
Epoch 71/350, Training Loss: 0.3195
Epoch 76/350, Training Loss: 0.3143
Epoch 81/350, Training Loss: 0.3097
Epoch 86/350, Training Loss: 0.3054
Epoch 91/350, Training Loss: 0.3016
Epoch 96/350, Training Loss: 0.2980
Epoch 101/350, Training Loss: 0.2948
Epoch 106/350, Training Loss: 0.2918
Epoch 111/350, Training Loss: 0.2891
Epoch 116/350, Training Loss: 0.2865
Epoch 121/350, Training Loss: 0.2841
Epoch 126/350, Training Loss: 0.2819
Epoch 131/350, Training Loss: 0.2798
Epoch 136/350, Training

# 2. Fixed Size Layer Model

$\text{Parameters}(k,m)$ in terms of input size $n_{in}$, output size $n_{out}$, number of hidden layers $k$, and number of nodes per hidden layer $m$.

The connection from the Input Layer to Hidden Layer 1 has the following number of trainable parameters. There are $m$ nodes in the first (and every) hidden layer multiplied by $n_{in}$ input nodes. Then, there are $m$ bias terms from the hidden layer.
$$(n_{in} \cdot m) + m$$

The connection from each hidden layer to the next hidden layer follow a similar pattern. Each layer has $m$ nodes connected to $m$ nodes, and each next layer has $m$ bias terms. So, the following expression represents the number of parameters in the hidden layers.
$$(k-1) \cdot (m \cdot m + m)$$

Finally, the connection from the last hidden layer to the output layer also follows a similar pattern. $m$ nodes are connected to $n_{out}$ nodes, with $n_{out}$ bias terms. So, the following expression represents the parameters in this last part.
$$(m \cdot n_{out}) + n_{out}$$

All together, the following formula is the number of trainable parameters in the network.
$$\text{Parameters}(k,m) = P = [(n_{in} \cdot m) + m] + [(k-1) \cdot (m \cdot m + m)] + [(m \cdot n_{out}) + n_{out}]$$

In [41]:
class FixedSizeNetwork(nn.Module):
  def __init__(self, input_size=784, output_size=10, num_layers=2, hidden_width=200, device='cpu'):
    '''
    Args:
      input_size (int): MNIST input size
      output_size (int): MNIST output size
      num_layers (int): Number of hidden layers (k)
      hidden_width (int): Width of hidden layers (m)
    '''
    super().__init__()

    # Set device to CPU or GPU
    self.device = torch.device('cuda' if torch.cuda.is_available() else device)

    # All layers of network
    layers = []

    # First layer (Input -> H1)
    layers.append(nn.Linear(input_size, hidden_width))
    layers.append(nn.Tanh())

    # Rest of the layers
    for _ in range(num_layers - 1):
      layers.append(nn.Linear(hidden_width, hidden_width))
      layers.append(nn.Tanh())

    # Wrap hidden layers
    self.hidden_layers = nn.Sequential(*layers)

    # Add output layer (Hk -> Output)
    self.output_layer = nn.Linear(hidden_width, output_size)

    # Move whole network to device
    self.to(self.device)

  def forward(self, x):
    # Flatten input
    x = x.view(x.size(0), -1)

    # Pass input through all hidden layers
    x = self.hidden_layers(x)

    # Pass through output layer
    logits = self.output_layer(x)

    return logits

  def train(self, X_train, y_train, epochs=100, lr=0.7, batch_size=64):
    self.loss_function = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD(self.parameters(), lr=lr)

    n = X_train.size(0)

    for epoch in range(epochs):
      total_loss = 0.0

      for i in range(0, n, batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = self.forward(x_batch)
        loss = self.loss_function(logits, y_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        total_loss += loss.item()

      if epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss / (n // batch_size):.4f}")

    with torch.no_grad():
      logits = self.forward(X_train)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_train).float().mean().item()
    print(f"Final Train Accuracy: {acc:.4f}")

  def test(self, X_test, y_test):
    with torch.no_grad():
      logits = self.forward(X_test)
      preds = torch.argmax(logits, dim=1)
      acc = (preds == y_test).float().mean().item()
    print(f"Test Accuracy: {acc:.4f}")
    return acc

  def num_parameters(self, input_size=784, output_size=10, k=2, m=200):
    '''
    k = number of hidden layers
    m = width of hidden layers (number of nodes per hidden layer)
    '''
    input_H1 = (input_size * m) + m
    all_H = (k - 1) * (m * m + m)
    lastH_output = (m * output_size) + output_size
    return input_H1 + all_H + lastH_output

In [42]:
fixed_network = FixedSizeNetwork(input_size=784, output_size=10, num_layers=2, hidden_width=200)
P = fixed_network.num_parameters()
print(f"Number of trainable parameters: {P}\n")

fixed_network.train(X_train, y_train, epochs=100, lr=0.7, batch_size=64)
fixed_test_acc = fixed_network.test(X_test, y_test)

Number of trainable parameters: 199210

Epoch 1/100, Training Loss: 59.0971
Epoch 6/100, Training Loss: 60.6884
Epoch 11/100, Training Loss: 59.5781
Epoch 16/100, Training Loss: 60.1648
Epoch 21/100, Training Loss: 60.4451
Epoch 26/100, Training Loss: 59.9619
Epoch 31/100, Training Loss: 59.7945
Epoch 36/100, Training Loss: 59.7677
Epoch 41/100, Training Loss: 59.6390
Epoch 46/100, Training Loss: 60.6941
Epoch 51/100, Training Loss: 59.7445
Epoch 56/100, Training Loss: 60.1853
Epoch 61/100, Training Loss: 60.0227
Epoch 66/100, Training Loss: 59.7482
Epoch 71/100, Training Loss: 59.7897
Epoch 76/100, Training Loss: 59.2132
Epoch 81/100, Training Loss: 60.0394
Epoch 86/100, Training Loss: 60.8052
Epoch 91/100, Training Loss: 59.8381
Epoch 96/100, Training Loss: 59.9904
Final Train Accuracy: 0.0993
Test Accuracy: 0.1032
