## 0. Setup

In [1]:
# Import dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [2]:
# Set up your device: use the first CUDA GPU when one is available, otherwise the CPU
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Set up random seed to 1008. Do not change the random seed.
seed = 1008

# Seed torch's default generator first ...
torch.manual_seed(seed)

# ... and, when a GPU is in use, seed every CUDA device explicitly as well.
if cuda:
    torch.cuda.manual_seed_all(seed)

## 1. Data: MNIST
#### Load the MNIST training and test datasets using $\texttt{torch.utils.data.DataLoader}$ and $\texttt{torchvision.datasets}$. 

Hint: You might find Alf's notebook useful: https://github.com/Atcold/pytorch-Deep-Learning-Minicourse/blob/master/06-convnet.ipynb

### 1.1. Load Training Set [4 pts]

In [4]:
# Load the MNIST training set with batch size 128, apply data shuffling and normalization
train_loader = DataLoader(
    dataset=datasets.MNIST(
        root='.',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            # Normalize the single channel with the given mean / std constants
            transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
        ]),
    ),
    batch_size=128,
    shuffle=True,
)

### 1.2. Load Test Set [4 pts]

In [5]:
# Load the MNIST test set with batch size 128, apply data shuffling and normalization
# download=True matches the training-set cell, so this cell also works when the
# MNIST files are not already present under '.' (already-downloaded files are reused).
test_loader = DataLoader(
    datasets.MNIST('.', train=False, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       # Same mean / std constants as the training-set transform
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=128, shuffle=True)

## 2. Models
#### You are going to define two convolutional neural networks which are trained to classify MNIST digits

### 2.1. CNN without Batch Norm [15 pts]

In [6]:
# Fill in the values below that make this network valid for MNIST data
conv1_in_ch = 1                # channels fed to conv1
conv2_in_ch = 20               # equals the out_channels of conv1
fc1_in_features = 50 * 4 * 4   # flattened size of the conv2 feature maps after pooling
fc2_in_features = 500          # equals the out_features of fc1
n_classes = 10                 # one output per digit class

In [7]:
# Define the CNN with architecture explained in Part 2.1
class NetWithoutBatchNorm(nn.Module):
    """Two conv + two fully-connected layer classifier without batch normalization.

    Layer sizes come from the module-level constants `conv1_in_ch`,
    `conv2_in_ch`, `fc1_in_features`, `fc2_in_features` and `n_classes`,
    which must be defined before the network is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=conv1_in_ch, out_channels=20, kernel_size=5, stride=1)
        # Use the configured constant instead of repeating the literal 20.
        self.conv2 = nn.Conv2d(in_channels=conv2_in_ch, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=fc1_in_features, out_features=500)
        self.fc2 = nn.Linear(in_features=fc2_in_features, out_features=n_classes)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for a batch `x`."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten everything except the batch dimension. Unlike
        # `view(-1, fc1_in_features)`, this can never silently change the batch
        # size: a wrongly sized input fails loudly in fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

### 2.2. CNN with Batch Norm [15 pts]

In [14]:
# Fill in the values below that make this network valid for MNIST data
conv1_bn_size = 20   # matches the out_channels of conv1
conv2_bn_size = 50   # matches the out_channels of conv2
fc1_bn_size = 500    # matches the out_features of fc1

In [15]:
# Define the CNN with architecture explained in Part 2.2
class NetWithBatchNorm(nn.Module):
    """Two conv + two fully-connected layer classifier with batch normalization.

    A batch-norm layer is applied to the output of conv1, conv2 and fc1,
    before the ReLU. Layer sizes come from the module-level constants
    `conv1_in_ch`, `conv2_in_ch`, `conv1_bn_size`, `conv2_bn_size`,
    `fc1_in_features`, `fc1_bn_size`, `fc2_in_features` and `n_classes`,
    which must be defined before the network is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=conv1_in_ch, out_channels=20, kernel_size=5, stride=1)
        self.conv1_bn = nn.BatchNorm2d(conv1_bn_size)
        # Use the configured constant instead of repeating the literal 20.
        self.conv2 = nn.Conv2d(in_channels=conv2_in_ch, out_channels=50, kernel_size=5, stride=1)
        self.conv2_bn = nn.BatchNorm2d(conv2_bn_size)
        self.fc1 = nn.Linear(in_features=fc1_in_features, out_features=500)
        self.fc1_bn = nn.BatchNorm1d(fc1_bn_size)
        self.fc2 = nn.Linear(in_features=fc2_in_features, out_features=n_classes)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for a batch `x`."""
        x = F.relu(self.conv1_bn(self.conv1(x)))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2_bn(self.conv2(x)))
        x = F.max_pool2d(x, 2, 2)
        # Flatten everything except the batch dimension. Unlike
        # `view(-1, fc1_in_features)`, this can never silently change the batch
        # size: a wrongly sized input fails loudly in fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1_bn(self.fc1(x)))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

## 3. Training & Evaluation

### 3.1. Define training method [10 pts]

In [10]:
def train(model, device, train_loader, optimizer, epoch, log_interval = 100):
    """Train `model` for one pass over `train_loader`, printing the loss periodically.

    Args:
        model: network to optimize; its output is passed to `F.nll_loss`.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of `(data, target)` batches that supports `len()`
            and exposes a `.dataset` with a length (both used for progress output).
        optimizer: optimizer holding the model's parameters.
        epoch: epoch number, used only in the progress message.
        log_interval: print the loss every `log_interval` batches; a falsy value
            (0 or None) disables progress output.

    Returns:
        None.
    """
    # Set model to training mode
    model.train()
    # Samples consumed before the current batch. Tracked explicitly because
    # `batch_idx * len(data)` under-reports when the final batch is short.
    samples_seen = 0
    # Loop through data points
    for batch_idx, (data, target) in enumerate(train_loader):
        # Send data to device
        data, target = data.to(device), target.to(device)
        # Zero out the gradients held by the optimizer
        optimizer.zero_grad()
        # Pass data through model
        output = model(data)
        # Compute the negative log likelihood loss with reduction='mean'
        loss = F.nll_loss(output, target, reduction='mean')
        # Backpropagate loss
        loss.backward()
        # Make a step with the optimizer
        optimizer.step()
        # Print loss; the truthiness guard avoids a modulo-by-zero for log_interval=0
        if log_interval and batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, samples_seen, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
        samples_seen += len(data)

### 3.2. Define test method [10 pts]

In [11]:
# Define test method
def test(model, device, test_loader):
    # Set model to evaluation mode
    model.eval()
    # Variable for the total loss 
    test_loss = 0
    # Counter for the correct predictions
    num_correct = 0
    
    with torch.no_grad():
        # Loop through data points
        for data, target in test_loader:
            # Send data to device
            data, target = data.to(device), target.to(device)
            # Pass data through model
            output = model(data)
            # Compute the negative log likelihood loss and add to total test_loss
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            # Get predictions from the model 
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            # Add number of correct predictions to total num_correct 
            num_correct += pred.eq(target.view_as(pred)).sum().item()

    # Compute the average test_loss
    avg_test_loss = test_loss/len(test_loader.dataset)
    
    # Print loss (uncomment lines below once implemented)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_test_loss, num_correct, len(test_loader.dataset),
        100. * num_correct / len(test_loader.dataset)))

### 3.3. Train NetWithoutBatchNorm() [5 pts]

In [13]:
# Define the model and send it to the device
model = NetWithoutBatchNorm().to(device)

# Optimizer: SGD with learning rate of 1e-2 and momentum of 0.5
optimizer = optim.SGD(params=model.parameters(), lr=1e-2, momentum=0.5)

# Training loop with 10 epochs (numbered 1..10); evaluate on the test set after each one
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)


Test set: Average loss: 0.1536, Accuracy: 9553/10000 (96%)


Test set: Average loss: 0.0913, Accuracy: 9716/10000 (97%)


Test set: Average loss: 0.0750, Accuracy: 9748/10000 (97%)


Test set: Average loss: 0.0598, Accuracy: 9826/10000 (98%)


Test set: Average loss: 0.0561, Accuracy: 9814/10000 (98%)


Test set: Average loss: 0.0572, Accuracy: 9823/10000 (98%)


Test set: Average loss: 0.0393, Accuracy: 9880/10000 (99%)


Test set: Average loss: 0.0412, Accuracy: 9873/10000 (99%)


Test set: Average loss: 0.0437, Accuracy: 9851/10000 (99%)


Test set: Average loss: 0.0348, Accuracy: 9882/10000 (99%)



### 3.4. Train NetWithBatchNorm() [5 pts]

In [16]:
# Define the model and send it to the device
model = NetWithBatchNorm().to(device)

# Optimizer: SGD with learning rate of 1e-2 and momentum of 0.5
optimizer = optim.SGD(params=model.parameters(), lr=1e-2, momentum=0.5)

# Training loop with 10 epochs (numbered 1..10); evaluate on the test set after each one
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)


Test set: Average loss: 0.1109, Accuracy: 9774/10000 (98%)


Test set: Average loss: 0.0676, Accuracy: 9828/10000 (98%)


Test set: Average loss: 0.0533, Accuracy: 9860/10000 (99%)


Test set: Average loss: 0.0446, Accuracy: 9876/10000 (99%)


Test set: Average loss: 0.0391, Accuracy: 9894/10000 (99%)


Test set: Average loss: 0.0368, Accuracy: 9898/10000 (99%)


Test set: Average loss: 0.0321, Accuracy: 9917/10000 (99%)


Test set: Average loss: 0.0333, Accuracy: 9906/10000 (99%)


Test set: Average loss: 0.0299, Accuracy: 9914/10000 (99%)


Test set: Average loss: 0.0275, Accuracy: 9923/10000 (99%)



## 4. Empirically, which of the models achieves higher accuracy faster? [2 pts]

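Answer: NetWithBatchNorm(). In the runs above it is ahead after every epoch: 9774/10000 correct after epoch 1 versus 9553/10000 without batch norm, and 9923/10000 versus 9882/10000 after epoch 10.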