In [1]:
# Optimizing Model Parameters
# Putting this all together, and using the optimizer to actually estimate the model

# preliminary
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose

from torchvision import datasets
from torchvision.transforms import ToTensor

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else 'cpu'
print(f"Using {device} device")

# load in the data (FashionMNIST)
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

# set up DataLoader
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

# can check size of the data like so
# training_data.data.shape

# Define the model class
class NeuralNetwork(nn.Module):
    # initialisation layer
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            # this is input layer, and follows because the data is 28x28
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )
    # forward pass method
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# create the model, and move it to the optimal device
model = NeuralNetwork().to(device)
print(model)

Using cuda device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)


In [2]:
# Model Training

# Set up the hyperparameters
learning_rate = 1e-3
batch_size = 64
epochs = 5

# Initialise the loss function
loss_fn = nn.CrossEntropyLoss()

# Optimizer
# very vanilla stochastic gradient descent
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [3]:
# Training Loop main ingredients
# call optimizer.zero_grad() to reset the gradients first
# calculate the gradients via backprop using loss.backward()
# call optimizer step to "nudge" the parameters in the right direction using those gradients
# repeat

# PyTorch wants you to define your own train loop function, then then manually loop through it
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # first set the model in training mode, this is important for batch normalization
    # and dropout layers. 
    # Actually unnecessary in this specific model (which doesn't have either), but
    # added for best practice
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Move tensors to appropraite device first
        X, y = X.to(device), y.to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # if there are no more batches, then print the loss
        if batch % 100 == 0:
            loss, current = loss.item(), batch*batch_size + len(X)
            # print the loss to 5 decimal places
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    # set the model to evaluation mode, important for batch normalization and dropout
    # layers.
    # Technically unnecessary in this specific model, but added for best practice
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # initialize empty variables to store the loss and the number of correct predictions
    test_loss, correct = 0, 0

    # Evaluate the model with torch.no_grad() to prevent gradient tracking 
    # (i.e. gradients accumulating when computing the forward pass)
    # how does this with statement work?
    with torch.no_grad():
        for X, y in dataloader:
            # Move tensors to appropraite device first
            X, y = X.to(device), y.to(device)
            
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")



In [5]:
# Begin training!
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 10

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")





Epoch 1
-------------------------------
loss: 1.381158  [   64/60000]
loss: 1.330277  [ 6464/60000]
loss: 1.098648  [12864/60000]
loss: 1.112260  [19264/60000]
loss: 1.165929  [25664/60000]
loss: 1.223262  [32064/60000]
loss: 1.240569  [38464/60000]
loss: 1.333045  [44864/60000]
loss: 1.255942  [51264/60000]
loss: 1.145032  [57664/60000]
Test Error: 
 Accuracy: 60.5%, Avg loss: 1.210869 

Epoch 2
-------------------------------
loss: 1.348202  [   64/60000]
loss: 1.307628  [ 6464/60000]
loss: 1.066619  [12864/60000]
loss: 1.083502  [19264/60000]
loss: 1.145869  [25664/60000]
loss: 1.196983  [32064/60000]
loss: 1.220352  [38464/60000]
loss: 1.314123  [44864/60000]
loss: 1.229030  [51264/60000]
loss: 1.127525  [57664/60000]
Test Error: 
 Accuracy: 62.2%, Avg loss: 1.189526 

Epoch 3
-------------------------------
loss: 1.318947  [   64/60000]
loss: 1.288324  [ 6464/60000]
loss: 1.039439  [12864/60000]
loss: 1.058799  [19264/60000]
loss: 1.129309  [25664/60000]
loss: 1.174480  [32064/600