Practice training a deep neural network on the CIFAR10 image dataset:
Load CIFAR10 just like you loaded the FashionMNIST dataset in Chapter 10, but using torchvision.datasets.CIFAR10 instead of FashionMNIST. The dataset is composed of 60,000 32 × 32–pixel color images (50,000 for training, 10,000 for testing) with 10 classes.
Build a DNN with 20 hidden layers of 100 neurons each (that’s too many, but it’s the point of this exercise). Use He initialization and the Swish activation function (using nn.SiLU). Since this is a classification task, you will need an output layer with one neuron per class.
Using NAdam optimization and early stopping, train the network on the CIFAR10 dataset. Remember to search for the right learning rate each time you change the model’s architecture or hyperparameters.
Now try adding batch-norm and compare the learning curves: is it converging faster than before? Does it produce a better model? How does it affect training speed?
Try replacing batch-norm with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).
Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC dropout.
Retrain your model using 1cycle scheduling and see if it improves training speed and model accuracy.

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import random_split, DataLoader

In [28]:
# CIFAR10 training split; every image is converted to a tensor and each of
# its three channels is normalized with mean 0.5 and std 0.5.
trainset = CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]),
)

# Hold out the last 10% of the sample count as a validation set; the split
# itself is random.
train_size = int(len(trainset) * 0.9)
val_size = len(trainset) - train_size
train_subset, val_subset = random_split(trainset, [train_size, val_size])

# Only the training batches are reshuffled on every epoch.
train_loader = DataLoader(train_subset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_subset, shuffle=False, batch_size=64)

# images, labels = next(iter(train_loader))

# plt.imshow(torchvision.utils.make_grid(images).permute(1, 2, 0) / 2 + 0.5)
# plt.title(' '.join(trainset.classes[label] for label in labels))
# plt.show()

In [29]:
# Backend preference order: CUDA, then Apple MPS, then plain CPU. The MPS
# check is only evaluated when CUDA is unavailable.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [23]:
class CifarDNN(nn.Module):
    """Fully connected classifier: Flatten -> [Linear -> SiLU] * k -> Linear.

    Args:
        n_inputs: Number of features per sample once the input is flattened.
        n_hidden: Width of every hidden layer.
        n_outputs: Number of values produced by the final linear layer
            (one per class).
        n_hidden_layers: Number of Linear+SiLU hidden layers. The default of
            21 reproduces the stack that used to be hard-coded (one input
            layer followed by 20 more). NOTE(review): the exercise statement
            asks for 20 hidden layers; pass ``n_hidden_layers=20`` to match it.
        device: Device on which every layer is created. ``None`` uses
            PyTorch's default device; the model can still be moved later
            with ``.to(...)``.

    Raises:
        ValueError: If ``n_hidden_layers`` is negative.
    """

    def __init__(self, n_inputs, n_hidden, n_outputs, n_hidden_layers=21,
                 device=None):
        super().__init__()
        if n_hidden_layers < 0:
            raise ValueError(
                f"n_hidden_layers must be >= 0, got {n_hidden_layers}"
            )

        self.layers = nn.ModuleList([nn.Flatten()])

        # All layers share the same ``device`` argument so the model never
        # ends up with parameters spread over different devices.
        in_features = n_inputs
        for _ in range(n_hidden_layers):
            self.layers.append(nn.Linear(in_features, n_hidden, device=device))
            self.layers.append(nn.SiLU())
            in_features = n_hidden

        # No activation after the last layer: it outputs unnormalized scores.
        self.layers.append(nn.Linear(in_features, n_outputs, device=device))

    def forward(self, x):
        """Apply every stored layer to ``x`` in order and return the result."""
        for layer in self.layers:
            x = layer(x)
        return x

In [31]:
class EarlyStopping:
    """Signal when validation accuracy has stopped improving.

    Keeps the best accuracy seen so far together with the number of
    consecutive ``step`` calls that failed to beat it.
    """

    def __init__(self, patience=5):
        # The best score starts at 0, so the first improvement must be > 0.
        self.best_acc = 0
        self.counter = 0
        self.patience = patience

    def step(self, val_acc):
        """Record one validation accuracy.

        Returns:
            True once ``patience`` consecutive calls have not strictly
            improved on the best accuracy, otherwise False.
        """
        improved = val_acc > self.best_acc
        if not improved:
            self.counter += 1
            return self.counter >= self.patience

        # New best: remember it and restart the patience countdown.
        self.best_acc = val_acc
        self.counter = 0
        return False


In [30]:
def init_weights_he(m):
    """Apply He (Kaiming) normal initialization to a linear layer.

    Intended for use with ``Module.apply``: modules that are not
    ``nn.Linear`` are left untouched. The weight is drawn with the ReLU
    gain and the bias, when the layer has one, is set to zero.
    """
    if not isinstance(m, nn.Linear):
        return

    nn.init.kaiming_normal_(m.weight, nonlinearity='relu')

    bias = m.bias
    if bias is not None:
        nn.init.zeros_(bias)

# NOTE(review): the exercise text asks for 100 neurons per hidden layer;
# this run uses 50 -- confirm that is intentional.
model = CifarDNN(
    3 * 32 * 32,  # one flattened 3-channel 32x32 image
    50,           # units per hidden layer
    10,           # one output per class
)
model = model.to(device)
# Kept as the last statement so the notebook still displays the model.
model.apply(init_weights_he)


CifarDNN(
  (layers): ModuleList(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3072, out_features=50, bias=True)
    (2): SiLU()
    (3): Linear(in_features=50, out_features=50, bias=True)
    (4): SiLU()
    (5): Linear(in_features=50, out_features=50, bias=True)
    (6): SiLU()
    (7): Linear(in_features=50, out_features=50, bias=True)
    (8): SiLU()
    (9): Linear(in_features=50, out_features=50, bias=True)
    (10): SiLU()
    (11): Linear(in_features=50, out_features=50, bias=True)
    (12): SiLU()
    (13): Linear(in_features=50, out_features=50, bias=True)
    (14): SiLU()
    (15): Linear(in_features=50, out_features=50, bias=True)
    (16): SiLU()
    (17): Linear(in_features=50, out_features=50, bias=True)
    (18): SiLU()
    (19): Linear(in_features=50, out_features=50, bias=True)
    (20): SiLU()
    (21): Linear(in_features=50, out_features=50, bias=True)
    (22): SiLU()
    (23): Linear(in_features=50, out_features=50, bias=True)
    (24): Si

In [34]:
def accuracy(pred, target):
    """Return the fraction of rows whose highest-scoring column equals the target.

    ``pred`` is reduced with an argmax over dimension 1 and compared
    element-wise with ``target``; the mean of the matches is returned as a
    Python float.
    """
    predicted_classes = torch.argmax(pred, dim=1)
    hits = predicted_classes.eq(target)
    return hits.float().mean().item()

In [32]:
# Training setup: cross-entropy loss, NAdam optimizer, and an early-stopping
# rule that allows 4 non-improving validation epochs.
criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(params=model.parameters(), lr=1e-3)
early_stopper = EarlyStopping(patience=4)

In [36]:
# Train for at most 20 epochs, validating after each one and stopping early
# when the validation accuracy stops improving.
#
# Epoch metrics are per-sample averages: each batch's mean loss/accuracy is
# weighted by its size, so a short final batch does not count as much as a
# full one.
#
# NOTE(review): neither this loop nor EarlyStopping saves the best weights,
# so after an early stop `model` holds the weights of the last epoch run.
for epoch in range(20):

    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    n_train = 0

    for X_batch, y_batch in train_loader:

        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        batch_size = y_batch.size(0)

        # Clear gradients first so nothing left over from an interrupted
        # earlier run leaks into this step.
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = criterion(pred, y_batch)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; scale back to a batch sum.
        train_loss += loss.item() * batch_size
        train_acc += accuracy(pred, y_batch) * batch_size
        n_train += batch_size

    train_loss /= n_train
    train_acc /= n_train

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    n_val = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            batch_size = y_batch.size(0)

            pred = model(X_batch)
            loss = criterion(pred, y_batch)

            val_loss += loss.item() * batch_size
            val_acc += accuracy(pred, y_batch) * batch_size
            n_val += batch_size

    val_loss /= n_val
    val_acc /= n_val

    print(f"Epoch {epoch+1:02d} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if early_stopper.step(val_acc):
        print("Early stopping triggered.")
        break

Epoch 01 | Train Loss: 1.9253 | Train Acc: 0.2702 | Val Loss: 1.8156 | Val Acc: 0.3212
Epoch 02 | Train Loss: 1.7104 | Train Acc: 0.3745 | Val Loss: 1.7320 | Val Acc: 0.3742
Epoch 03 | Train Loss: 1.6166 | Train Acc: 0.4120 | Val Loss: 1.6491 | Val Acc: 0.3902
Epoch 04 | Train Loss: 1.5462 | Train Acc: 0.4378 | Val Loss: 1.6348 | Val Acc: 0.4108
Epoch 05 | Train Loss: 1.4966 | Train Acc: 0.4645 | Val Loss: 1.5998 | Val Acc: 0.4126
Epoch 06 | Train Loss: 1.4458 | Train Acc: 0.4818 | Val Loss: 1.5933 | Val Acc: 0.4351
Epoch 07 | Train Loss: 1.4029 | Train Acc: 0.5002 | Val Loss: 1.5322 | Val Acc: 0.4563
Epoch 08 | Train Loss: 1.3694 | Train Acc: 0.5125 | Val Loss: 1.5619 | Val Acc: 0.4508
Epoch 09 | Train Loss: 1.3425 | Train Acc: 0.5248 | Val Loss: 1.5143 | Val Acc: 0.4602
Epoch 10 | Train Loss: 1.3169 | Train Acc: 0.5344 | Val Loss: 1.5033 | Val Acc: 0.4763
Epoch 11 | Train Loss: 1.2914 | Train Acc: 0.5428 | Val Loss: 1.5209 | Val Acc: 0.4650
Epoch 12 | Train Loss: 1.2712 | Train Acc: 