In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

In [62]:
# Device config
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
n_epochs = 10
batch_size = 4
learning_rate = 0.001

In [63]:
# load dataset
# dataset has PILImage images of range [0, 1]
# here we transform them to tensors of normalized range [-1, 1]
transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             transform=transform)

test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                            transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                          shuffle=True)

test_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                          shuffle=False)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
          'dog', 'frog', 'horse', 'ship', 'truck')

In [64]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # input_layer: 3 input channels, 6 output channels, 5 kernel size
        self.conv1 = nn.Conv2d(3, 6, 5) # 3*32*32
        self.pool = nn.MaxPool2d(2, stride=2) # 6*28*28
        self.conv2 = nn.Conv2d(6, 16, 5) # 6*14*14
        self.fc1 = nn.Linear(16*5*5, 120)
        # 16*5*5 is the size after two consecutive Conv2d and MaxPool2d
        self.fc2 = nn.Linear(120, 72)
        self.fc3 = nn.Linear(72, 10)
    
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16*5*5) # Flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x) 
        # Note that no activation function and softmax should be included in the last layer
        # since they are both included in the nn.CrossEntropyLoss()
        return(x)

model = ConvNet().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [65]:
# Training
n_total_steps = len(train_loader)
for epoch in range(n_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # 4, 3, 32, 32 --> 4, 3, 1024
        # input_layer: 3 input channels, 6 output channels, 5 kernel size
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
            
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}, Time: {str(datetime.now())[:19]}")

print("Finished Training")

Epoch [1/10], Loss: 2.2973, Time: 2020-06-04 00:10:45
Epoch [2/10], Loss: 1.6392, Time: 2020-06-04 00:11:31
Epoch [3/10], Loss: 2.0236, Time: 2020-06-04 00:12:19
Epoch [4/10], Loss: 2.0376, Time: 2020-06-04 00:13:10
Epoch [5/10], Loss: 1.2965, Time: 2020-06-04 00:14:03
Epoch [6/10], Loss: 0.6566, Time: 2020-06-04 00:14:52
Epoch [7/10], Loss: 0.3501, Time: 2020-06-04 00:15:40
Epoch [8/10], Loss: 0.9825, Time: 2020-06-04 00:16:27
Epoch [9/10], Loss: 0.9566, Time: 2020-06-04 00:17:12
Epoch [10/10], Loss: 1.6371, Time: 2020-06-04 00:17:57
Finished Training


In [66]:
# Testing
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        
        _, preds = torch.max(outputs, dim = 1)
        n_samples += labels.size(0)
        n_correct += (preds == labels).sum().item()
        
        for i in range(batch_size):
            label = labels[i]
            pred = preds[i]
            if (label == pred):
                n_class_correct[label] += 1
            n_class_samples[label] += 1
    
    acc = 100.0 * n_correct / n_samples
    print(f"Accuracy of CNN: {acc}%")
    
    for i in range(10):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f"Accuracy of {classes[i]}: {acc}%")

Accuracy of CNN: 60.062%
Accuracy of plane: 59.32%
Accuracy of car: 76.62%
Accuracy of bird: 44.3%
Accuracy of cat: 44.58%
Accuracy of deer: 40.12%
Accuracy of dog: 52.7%
Accuracy of frog: 73.06%
Accuracy of horse: 66.08%
Accuracy of ship: 74.56%
Accuracy of truck: 69.28%


## Note: Formula for the output size

$$(Input-F+2P)/S + 1$$

Example: 5x5 input, 3x3 filter, padding = 0, stride = 1

$$ (5-3+0)/1 + 1 = 3$$

Thus, the output will be of size 3x3.