In [None]:
# start with consolidating the training routine
%%time

%matplotlib inline

import torch
import torchvision
from torch import nn
from torch.optim import lr_scheduler

import numpy
import matplotlib.pyplot as plt
import time

class SquareRootScheduler:
    """Callable learning-rate schedule with inverse-square-root decay.

    Calling the instance with an update index ``num_update`` returns
    ``lr * (num_update + 1) ** -0.5``, so index 0 yields the base rate.
    """

    def __init__(self, lr=0.1):
        # Base learning rate that the decay factor is applied to.
        self.lr = lr

    def __call__(self, num_update):
        """Return the decayed learning rate for update index ``num_update``."""
        decay = (num_update + 1.0) ** -0.5
        return self.lr * decay

def evaluate(dataloader, model, loss_fn):
    """Measure accuracy and mean batch loss of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and left in that mode.

    Returns:
        A tuple ``(accuracy, loss)``: ``accuracy`` is the percentage of
        samples whose arg-max prediction equals the label, and ``loss`` is
        the sum of per-batch ``loss_fn`` values divided by the batch count.
    """
    # Evaluation mode matters for layers that behave differently in training.
    model.eval()

    running_loss = 0
    hits = 0

    # No parameters are updated here, so gradient tracking is switched off.
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            running_loss += loss_fn(outputs, targets).item()
            # Count the samples in this batch that were classified correctly.
            hits += (outputs.argmax(1) == targets).type(torch.float).sum().item()

    mean_loss = running_loss / len(dataloader)
    accuracy = 100 * (hits / len(dataloader.dataset))
    return accuracy, mean_loss

def train_one_epoch(dataloader, model, loss_fn, optimizer, scheduler=None, epoch=1):
    """Train ``model`` for one pass over ``dataloader``, then adjust the learning rate.

    For every batch: forward pass, loss, backward pass, optimizer step, and
    gradient reset. After the pass, a truthy ``scheduler`` is applied once:

    * if its class is defined in the ``lr_scheduler`` module, ``step()`` is called;
    * otherwise it is treated as a callable and ``scheduler(epoch)`` is written
      into the ``'lr'`` entry of every optimizer parameter group.
    """
    model.train()

    for inputs, targets in dataloader:
        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()   # compute gradients
        optimizer.step()        # apply one update with those gradients
        optimizer.zero_grad()   # clear gradients before the next batch

    if not scheduler:
        return

    is_builtin = scheduler.__module__ == lr_scheduler.__name__
    if is_builtin:
        # Built-in PyTorch scheduler: it updates the optimizer itself.
        scheduler.step()
        return

    # Custom callable scheduler: set the learning rate on each group by hand.
    for group in optimizer.param_groups:
        group['lr'] = scheduler(epoch)

class NonlinearClassifier(nn.Module):
    """Fully connected classifier: 28*28 inputs, three hidden layers of 50 units, 10 outputs.

    Each hidden layer is followed by ReLU; dropout (p=0.2) is applied only
    after the first hidden layer.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        hidden = 50
        stack = [
            nn.Linear(28 * 28, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 10),
        ]
        self.layers_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack, returning its output."""
        return self.layers_stack(self.flatten(x))

# Load the MNIST train and test splits (downloaded into ./data when missing),
# converting every image to a tensor on access.
mnist_options = dict(
    root="data",
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

training_data = torchvision.datasets.MNIST(train=True, **mnist_options)

test_data = torchvision.datasets.MNIST(train=False, **mnist_options)

In [None]:
# Hold out 20% of the loaded training set for validation; train on the other 80%.
num_samples = len(training_data)
train_size = int(0.8 * num_samples)
val_size = num_samples - train_size

# A dedicated, seeded generator makes the split reproducible.
split_rng = torch.Generator().manual_seed(55)
training_data, validation_data = torch.utils.data.random_split(
    training_data,
    [train_size, val_size],
    generator=split_rng,
)

# Custom square-root decay schedule, starting from the optimizer's learning rate.
scheduler = SquareRootScheduler(lr=0.05)

nonlinear_model = NonlinearClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)

In [None]:
# Compare per-epoch training/validation accuracy across batch sizes, with and
# without the square-root learning-rate schedule.
batch_sizes = [32, 64, 128, 256, 512]
epochs = 5
pltsize = 1
plt.figure(figsize=(10 * pltsize, 10 * pltsize))

# (legend tag, scheduler passed to train_one_epoch, line style)
runs = [
    ("sqrt scheduler", scheduler, "-"),
    ("no scheduler", None, "-."),
]

for run_name, run_scheduler, linestyle in runs:
    for batch_size in batch_sizes:
        # The dataloader makes our dataset iterable
        train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
        val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)

        # Start every run from a freshly initialised model and optimizer.
        # Reusing one model would make each curve continue training the
        # previous run's weights (and inherit its decayed learning rate),
        # so the curves would not be comparable.
        nonlinear_model = NonlinearClassifier()
        optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=scheduler.lr)

        train_acc_all = []
        val_acc_all = []

        for j in range(epochs):
            train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer,
                            scheduler=run_scheduler, epoch=j)

            # checking on the training accuracy once per epoch
            acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
            train_acc_all.append(acc)

            # checking on the validation accuracy once per epoch
            val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
            val_acc_all.append(val_acc)

        # Tag each label with the run so the legend separates the two settings.
        plt.plot(range(epochs), train_acc_all, linestyle=linestyle,
                 label=f'Training Acc. ({batch_size}, {run_name})')
        plt.plot(range(epochs), val_acc_all, linestyle=linestyle,
                 label=f'Validation Acc. ({batch_size}, {run_name})')

plt.xlabel('Epoch #')
# The curves are accuracies (in percent), not losses.
plt.ylabel('Accuracy (%)')
plt.title('Training')
plt.legend()

The above plot tells us a few things about the impact of increasing batch size and introducing a learning rate scheduler. 



The most straightforward way to demonstrate the effect of batch size is to train the model with several different batch sizes and compare the results. I used batch sizes from 32 to 512; in the session, we used 256. Larger batch sizes tend to take longer to train, so I include only one batch size larger than 256 and a few that are much smaller. The plot above shows that accuracy increases as batch size increases. In addition, the gap in accuracy between the first and last epochs shrinks significantly for larger batch sizes.



Here, I added a square-root learning rate scheduler. This means that the original learning rate ($\eta_0$) is reduced via $\eta = \eta_0 (t + 1)^{-1/2}$, where $t$ is the epoch. Overall, the models trained with the learning rate scheduler had higher accuracy than those trained without it.


