In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

In [None]:
# MNIST is stored under ./data, relative to the notebook's working directory.
# Every image is converted to a tensor when it is read from the dataset.
to_tensor = transforms.ToTensor()

# download=True fetches the files into ./data if they are not already there
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=to_tensor)
# download=False: the files were already fetched by the call above
test_dataset = datasets.MNIST('./data', train=False, download=False, transform=to_tensor)

# **3-B**

In [None]:
# Fully connected classifier. The 2D structure of the image is not used:
# Flatten turns every input into a flat vector of 784 pixel values.
layers = [
    nn.Flatten(),
    nn.Linear(784, 100),  # single hidden layer with 100 units
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(100, 10),   # one output score per class
]
model = nn.Sequential(*layers)
print(model)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=100, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=100, out_features=10, bias=True)
)


In [None]:
# Cross-entropy objective, minimised with Adam at a learning rate of 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Mini-batches of 32 samples. The training data is reshuffled every epoch;
# the test data is only evaluated, so it is read in a fixed order to keep the
# evaluation pass deterministic.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Training mode: layers such as Dropout behave differently than at evaluation
model.train()

for epoch in range(10):
    for data, target in train_loader:
        optimizer.zero_grad()                # clear gradients left from the previous step
        loss = loss_fn(model(data), target)  # forward pass and loss for this mini-batch
        loss.backward()                      # backward pass
        optimizer.step()                     # weight update

    # Once per epoch, report the loss of the last mini-batch that was processed
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

Train Epoch: 1  Loss: 0.1254
Train Epoch: 2  Loss: 0.2105
Train Epoch: 3  Loss: 0.1366
Train Epoch: 4  Loss: 0.0387
Train Epoch: 5  Loss: 0.1046
Train Epoch: 6  Loss: 0.0460
Train Epoch: 7  Loss: 0.0662
Train Epoch: 8  Loss: 0.0623
Train Epoch: 9  Loss: 0.0286
Train Epoch: 10  Loss: 0.0502


In [None]:
# Putting layers like Dropout into evaluation mode
model.eval()

test_loss = 0.0
correct = 0

# Turning off automatic differentiation
with torch.no_grad():
    for data, target in test_loader:
        output = model(data)
        # loss_fn is nn.CrossEntropyLoss with its default reduction, i.e. the
        # mean over the batch. Weight it by the batch size so that test_loss
        # accumulates a per-sample sum (the last batch may be smaller).
        test_loss += loss_fn(output, target).item() * target.size(0)
        pred = output.argmax(dim=1)  # Index of the max class score
        correct += (pred == target).sum().item()

num_samples = len(test_loader.dataset)
# Per-sample sum divided by the sample count gives the true average loss
test_loss /= num_samples

print('Test set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
      (test_loss, correct, num_samples,
       100. * correct / num_samples))

Test set: Average loss: 0.0028, Accuracy: 9757/10000 (97.5700)


# **3-C**

In [None]:
# Two-hidden-layer fully connected classifier on the flattened 784-pixel input
layers = [
    nn.Flatten(),
    # hidden layer 1
    nn.Linear(784, 148),
    nn.ReLU(),
    nn.Dropout(0.23),
    # hidden layer 2
    nn.Linear(148, 52),
    nn.ReLU(),
    nn.Dropout(0.23),
    # output layer: one score per class
    nn.Linear(52, 10),
]
model2 = nn.Sequential(*layers)
print(model2)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=148, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.23, inplace=False)
  (4): Linear(in_features=148, out_features=52, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.23, inplace=False)
  (7): Linear(in_features=52, out_features=10, bias=True)
)


In [None]:
# Fresh Adam optimizer bound to model2's parameters, with the same cross-entropy loss
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model2.parameters(), lr=1e-3)

In [None]:
# Mini-batches of 32 samples. The training data is reshuffled every epoch;
# the test data is only evaluated, so it is read in a fixed order to keep the
# evaluation pass deterministic.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Training mode: layers such as Dropout behave differently than at evaluation
model2.train()

for epoch in range(10):
    for data, target in train_loader:
        optimizer.zero_grad()                 # clear gradients left from the previous step
        loss = loss_fn(model2(data), target)  # forward pass and loss for this mini-batch
        loss.backward()                       # backward pass
        optimizer.step()                      # weight update

    # Once per epoch, report the loss of the last mini-batch that was processed
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

Train Epoch: 1  Loss: 0.0176
Train Epoch: 2  Loss: 0.0041
Train Epoch: 3  Loss: 0.0026
Train Epoch: 4  Loss: 0.1498
Train Epoch: 5  Loss: 0.0247
Train Epoch: 6  Loss: 0.0029
Train Epoch: 7  Loss: 0.0269
Train Epoch: 8  Loss: 0.0231
Train Epoch: 9  Loss: 0.0023
Train Epoch: 10  Loss: 0.0207


In [None]:
# Putting layers like Dropout into evaluation mode
model2.eval()

test_loss = 0.0
correct = 0

# Turning off automatic differentiation
with torch.no_grad():
    for data, target in test_loader:
        output = model2(data)
        # loss_fn is nn.CrossEntropyLoss with its default reduction, i.e. the
        # mean over the batch. Weight it by the batch size so that test_loss
        # accumulates a per-sample sum (the last batch may be smaller).
        test_loss += loss_fn(output, target).item() * target.size(0)
        pred = output.argmax(dim=1)  # Index of the max class score
        correct += (pred == target).sum().item()

num_samples = len(test_loader.dataset)
# Per-sample sum divided by the sample count gives the true average loss
test_loss /= num_samples

print('Test set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
      (test_loss, correct, num_samples,
       100. * correct / num_samples))

Test set: Average loss: 0.0026, Accuracy: 9807/10000 (98.0700)


# **3-D**

In [None]:
model3 = nn.Sequential(
    # The 2D structure of the image is not used: the network takes a flat
    # vector of the 784 pixel values as input.
    nn.Flatten(),
    # Three hidden layers (500 -> 350 -> 150 units), each followed by
    # ReLU and Dropout(0.25)
    nn.Linear(784, 500),
    nn.ReLU(),
    nn.Dropout(0.25),
    nn.Linear(500, 350),
    nn.ReLU(),
    nn.Dropout(0.25),
    nn.Linear(350, 150),
    nn.ReLU(),
    nn.Dropout(0.25),
    # Output layer: one score per class
    nn.Linear(150, 10)
)
# Show the network defined in this cell (previously this printed `model`,
# the network from an earlier section)
print(model3)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=100, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=100, out_features=10, bias=True)
)


In [None]:
# Fresh Adam optimizer bound to model3's parameters, with the same cross-entropy loss
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model3.parameters(), lr=1e-3)

In [None]:
# Mini-batches of 32 samples. The training data is reshuffled every epoch;
# the test data is only evaluated, so it is read in a fixed order to keep the
# evaluation pass deterministic.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Training mode: layers such as Dropout behave differently than at evaluation
model3.train()

for epoch in range(10):
    for data, target in train_loader:
        optimizer.zero_grad()                 # clear gradients left from the previous step
        loss = loss_fn(model3(data), target)  # forward pass and loss for this mini-batch
        loss.backward()                       # backward pass
        optimizer.step()                      # weight update

    # Once per epoch, report the loss of the last mini-batch that was processed
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

Train Epoch: 1  Loss: 0.0042
Train Epoch: 2  Loss: 0.0001
Train Epoch: 3  Loss: 0.0046
Train Epoch: 4  Loss: 0.0184
Train Epoch: 5  Loss: 0.1198
Train Epoch: 6  Loss: 0.0114
Train Epoch: 7  Loss: 0.0023
Train Epoch: 8  Loss: 0.0013
Train Epoch: 9  Loss: 0.0001
Train Epoch: 10  Loss: 0.0003


In [None]:
# Putting layers like Dropout into evaluation mode
model3.eval()

test_loss = 0.0
correct = 0

# Turning off automatic differentiation
with torch.no_grad():
    for data, target in test_loader:
        output = model3(data)
        # loss_fn is nn.CrossEntropyLoss with its default reduction, i.e. the
        # mean over the batch. Weight it by the batch size so that test_loss
        # accumulates a per-sample sum (the last batch may be smaller).
        test_loss += loss_fn(output, target).item() * target.size(0)
        pred = output.argmax(dim=1)  # Index of the max class score
        correct += (pred == target).sum().item()

num_samples = len(test_loader.dataset)
# Per-sample sum divided by the sample count gives the true average loss
test_loss /= num_samples

print('Test set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
      (test_loss, correct, num_samples,
       100. * correct / num_samples))

Test set: Average loss: 0.0029, Accuracy: 9853/10000 (98.5300)
