# Chapter 9

## Exercise 1:

Implement a deep convolutional neural network from scratch using a popular deep learning framework (e.g., TensorFlow or PyTorch). Train and evaluate the network on a standard image classification dataset, such as CIFAR-10 or MNIST. First, experiment blindly with various hyperparameters and architectures and observe the model’s performance. Second, apply the measurements proposed in this book to reduce the hyperparameter search space and observe the model’s performance.

In [1]:
# Prepare dataset

from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

train_data = datasets.MNIST(
    root = 'data',
    train = True,                         
    transform = ToTensor(), 
    download = True,            
)
test_data = datasets.MNIST(
    root = 'data', 
    train = False, 
    transform = ToTensor()
)

train_dataloader = DataLoader(train_data, batch_size=100, shuffle=True, num_workers=1)
test_dataloader = DataLoader(test_data, batch_size=100, shuffle=True, num_workers=1)

  from .autonotebook import tqdm as notebook_tqdm


60000


In [2]:
# Define the model

import torch.nn as nn

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.out = nn.Linear(32 * 7 * 7, 10)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        
        x = x.view(x.size(0), -1)
        output = self.out(x)
        
        return output

In [4]:
from torch import optim
from torch.autograd import Variable

cnn = NeuralNetwork()
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=0.01)
num_epochs = 10

def train():
    cnn.train()
    
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_dataloader):
            b_x = Variable(images)
            b_y = Variable(labels)
            
            outputs = cnn(b_x)
            loss = loss_func(outputs, b_y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i+1) % 100 == 0:
                print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

train()

Epoch 1, Loss: 0.061979740858078
Epoch 1, Loss: 0.07952898740768433
Epoch 1, Loss: 0.03716222196817398
Epoch 1, Loss: 0.022422507405281067
Epoch 1, Loss: 0.05648098140954971
Epoch 1, Loss: 0.07930899411439896
Epoch 2, Loss: 0.017399856820702553
Epoch 2, Loss: 0.017844809219241142
Epoch 2, Loss: 0.02048613131046295
Epoch 2, Loss: 0.09929203242063522
Epoch 2, Loss: 0.007487455382943153
Epoch 2, Loss: 0.05051618069410324
Epoch 3, Loss: 0.0382247120141983
Epoch 3, Loss: 0.05837143585085869
Epoch 3, Loss: 0.012061964720487595
Epoch 3, Loss: 0.028368398547172546
Epoch 3, Loss: 0.02847195230424404
Epoch 3, Loss: 0.020262017846107483
Epoch 4, Loss: 0.03359116241335869
Epoch 4, Loss: 0.09503994137048721
Epoch 4, Loss: 0.06591596454381943
Epoch 4, Loss: 0.011651252396404743
Epoch 4, Loss: 0.10653070360422134
Epoch 4, Loss: 0.009702167473733425
Epoch 5, Loss: 0.015484996140003204
Epoch 5, Loss: 0.057421039789915085
Epoch 5, Loss: 0.028067696839571
Epoch 5, Loss: 0.08785694092512131
Epoch 5, Loss:

In [8]:
import torch

def test():
    cnn.eval()
    
    with torch.no_grad():
        for images, labels in test_dataloader:
            output = cnn(images)
            pred = torch.max(output, 1)[1].data.squeeze()
            accuracy = (pred == labels).sum().item() / float(labels.size(0))
    
    print(f'Accuracy: {accuracy}')

test()

Accuracy: 0.97


The MEC of this CNN is mostly based on the last linear layer (1568 -> 10), which is roughly 2.5 million bits. However, there are only 60000 instances of 10 classes, which takes only 240000 bits. This shows that the MEC is more than 10 times larger than the input information. Hence, we can shrink the network.

In [8]:
# Define the model

import torch.nn as nn

class NewNeuralNetwork(nn.Module):
    def __init__(self):
        super(NewNeuralNetwork, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 4, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(4, 16, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.out = nn.Linear(16 * 7 * 7, 10)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        
        x = x.view(x.size(0), -1)
        output = self.out(x)
        
        return output

In [9]:
from torch import optim
from torch.autograd import Variable

cnn = NewNeuralNetwork()
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=0.01)
num_epochs = 10

def train():
    cnn.train()
    
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_dataloader):
            b_x = Variable(images)
            b_y = Variable(labels)
            
            outputs = cnn(b_x)
            loss = loss_func(outputs, b_y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i+1) % 100 == 0:
                print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

train()

Epoch 1, Loss: 0.08102487772703171
Epoch 1, Loss: 0.11710499972105026
Epoch 1, Loss: 0.09750881791114807
Epoch 1, Loss: 0.06769582629203796
Epoch 1, Loss: 0.21913009881973267
Epoch 1, Loss: 0.12206210196018219
Epoch 2, Loss: 0.07750270515680313
Epoch 2, Loss: 0.04412339627742767
Epoch 2, Loss: 0.07159687578678131
Epoch 2, Loss: 0.035802341997623444
Epoch 2, Loss: 0.11703293025493622
Epoch 2, Loss: 0.08621673285961151
Epoch 3, Loss: 0.11933111399412155
Epoch 3, Loss: 0.078800268471241
Epoch 3, Loss: 0.048164866864681244
Epoch 3, Loss: 0.03147512674331665
Epoch 3, Loss: 0.02031896263360977
Epoch 3, Loss: 0.03376016020774841
Epoch 4, Loss: 0.016535067930817604
Epoch 4, Loss: 0.06465445458889008
Epoch 4, Loss: 0.026407934725284576
Epoch 4, Loss: 0.08631669729948044
Epoch 4, Loss: 0.08895490318536758
Epoch 4, Loss: 0.03865287825465202
Epoch 5, Loss: 0.027885565534234047
Epoch 5, Loss: 0.0008036562940105796
Epoch 5, Loss: 0.03352247178554535
Epoch 5, Loss: 0.04877978935837746
Epoch 5, Loss: 

In [11]:
import torch

def test():
    cnn.eval()
    
    with torch.no_grad():
        for images, labels in test_dataloader:
            output = cnn(images)
            pred = torch.max(output, 1)[1].data.squeeze()
            accuracy = (pred == labels).sum().item() / float(labels.size(0))
    
    print(f'Accuracy: {accuracy}')

test()

Accuracy: 0.98


I shrank the last linear layer to 784 -> 10, which only has a MEC of 610000 bits, which should still be sufficient for the dataset. As a result, the testing accuracy increased, showing that the prior model is probablly overfitting, and reducing the model size saved both time and gave better performance.