# Baseline CNN with stochastic gradient descent

In [8]:
import torch
from torch import nn, no_grad
from torch.nn import Conv2d, CrossEntropyLoss, LeakyReLU, Linear, MaxPool2d
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR10

In [9]:
# Pre-processing applied to every image: convert it to a tensor, then
# normalise each of the three channels as (value - 0.5) / 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training and test splits share the same root directory and pre-processing.
cifar_10_training_data = CIFAR10(
    root='datasets/', train=True, transform=transform, download=True)
cifar_10_test_data = CIFAR10(
    root='datasets/', train=False, transform=transform, download=True)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
# Reshuffle the training set at every epoch so that the mini-batches seen by
# the optimizer differ between epochs; without shuffling each epoch replays
# exactly the same sequence of batches.
train_loader = DataLoader(
    cifar_10_training_data, batch_size=4, shuffle=True, num_workers=2)

# The evaluation order does not influence the accuracy, so keep it fixed.
test_loader = DataLoader(
    cifar_10_test_data, batch_size=4, shuffle=False, num_workers=2)

In [24]:
num_input_channels = 3
num_output_classes = 10
# Height and width of the (square) input images; CIFAR-10 images are 32x32.
input_image_size = 32

num_conv1_channels = 6
conv_kernel_size = 5
pool_kernel_size = 2
num_conv2_channels = 16

fc1_output_size = 120
fc2_output_size = 84


def conv_pool_output_size(input_size, conv_kernel, pool_kernel):
    """Return the side length of a square feature map after conv + max-pool.

    Models one unpadded, stride-1 convolution followed by a max-pool whose
    stride equals its kernel size (the configuration used by ``Net``).

    Args:
        input_size: Side length of the square input feature map.
        conv_kernel: Side length of the convolution kernel.
        pool_kernel: Side length (and stride) of the pooling window.

    Returns:
        The side length of the resulting square feature map.

    Raises:
        ValueError: If the input is too small to produce any output.
    """
    output_size = (input_size - conv_kernel + 1) // pool_kernel
    if output_size < 1:
        raise ValueError(
            f"input of size {input_size} is too small for conv kernel "
            f"{conv_kernel} and pool kernel {pool_kernel}")
    return output_size


class Net(nn.Module):
    """Small CNN: two conv/pool stages followed by three linear layers.

    Args:
        activation: Activation module class; one instance is created and
            shared by all hidden layers.
        **kwargs: Forwarded unchanged to the ``activation`` constructor
            (for example ``negative_slope=0.1`` for ``LeakyReLU``).
    """

    def __init__(self, activation=LeakyReLU, **kwargs):
        super().__init__()

        self.conv1 = Conv2d(
            num_input_channels, num_conv1_channels, conv_kernel_size)
        self.pool1 = MaxPool2d(pool_kernel_size, pool_kernel_size)
        self.conv2 = Conv2d(
            num_conv1_channels, num_conv2_channels, conv_kernel_size)
        self.pool2 = MaxPool2d(pool_kernel_size, pool_kernel_size)

        # The flattened size depends on the spatial size of the last feature
        # map, not on the kernel size; the two merely coincide (5) for the
        # default 32x32 input with 5x5 kernels and 2x2 pooling.
        feature_map_size = conv_pool_output_size(
            conv_pool_output_size(
                input_image_size, conv_kernel_size, pool_kernel_size),
            conv_kernel_size, pool_kernel_size)
        self.convolution_output_size = num_conv2_channels * feature_map_size**2

        # Fully connected layers
        self.fc1 = Linear(self.convolution_output_size, fc1_output_size)
        self.fc2 = Linear(fc1_output_size, fc2_output_size)
        self.fc3 = Linear(fc2_output_size, num_output_classes)
        # Named ``relu`` for historical reasons; holds whichever activation
        # was requested.
        self.relu = activation(**kwargs)

    def forward(self, x):
        """Map a batch of images to unnormalised class scores."""
        # Convolutional layers
        x = self.pool1(self.relu(self.conv1(x)))
        x = self.pool2(self.relu(self.conv2(x)))
        # Flatten everything except the batch dimension, so a size mismatch
        # surfaces as an error in fc1 instead of silently changing the batch.
        x = torch.flatten(x, start_dim=1)
        # Fully connected layers
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Training the classifier

In [15]:
def train_model(data_loader, network, optimizer, loss_function,
                number_of_epochs=None, batch_to_print=None):
    """Train ``network`` in place and print the running loss periodically.

    Args:
        data_loader: Iterable yielding ``(inputs, labels)`` mini-batches.
        network: Model that is called on ``inputs``.
        optimizer: Optimizer whose ``zero_grad`` and ``step`` are called once
            per mini-batch.
        loss_function: Callable mapping ``(outputs, labels)`` to a scalar loss.
        number_of_epochs: Number of passes over ``data_loader``. Falls back to
            the notebook-level ``NUMBER_OF_EPOCHS`` when ``None``.
        batch_to_print: Number of mini-batches between two progress lines.
            Falls back to the notebook-level ``BATCH_TO_PRINT`` when ``None``.

    Returns:
        The same ``network`` object that was passed in.
    """
    if number_of_epochs is None:
        number_of_epochs = NUMBER_OF_EPOCHS
    if batch_to_print is None:
        batch_to_print = BATCH_TO_PRINT

    for epoch in range(number_of_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(data_loader):
            optimizer.zero_grad()

            outputs = network(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % batch_to_print == batch_to_print - 1:
                # Average over the batches actually accumulated; the divisor
                # must follow the reporting interval, not a fixed 2000.
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / batch_to_print:.3f}")
                running_loss = 0.0

    return network

In [25]:
# Baseline network: leaky ReLU activations with a negative slope of 0.1.
net = Net(activation=LeakyReLU, negative_slope=0.1)

In [12]:
# Settings read by train_model: the number of passes over the training data
# and how many mini-batches lie between two printed running-loss lines.
NUMBER_OF_EPOCHS = 10
BATCH_TO_PRINT = 2000
# Step size for plain stochastic gradient descent.
LEARNING_RATE = 0.0001

optimizer = SGD(params=net.parameters(), lr=LEARNING_RATE)
criterion = CrossEntropyLoss()

# Fit the baseline network; train_model returns the network it was given.
trained_net = train_model(
    data_loader=train_loader,
    network=net,
    optimizer=optimizer,
    loss_function=criterion,
)

[1,  2000] loss: 2.305
[1,  4000] loss: 2.305
[1,  6000] loss: 2.305
[1,  8000] loss: 2.305
[1, 10000] loss: 2.305
[1, 12000] loss: 2.304
[2,  2000] loss: 2.303
[2,  4000] loss: 2.303
[2,  6000] loss: 2.303
[2,  8000] loss: 2.303
[2, 10000] loss: 2.303
[2, 12000] loss: 2.302
[3,  2000] loss: 2.301
[3,  4000] loss: 2.302
[3,  6000] loss: 2.301
[3,  8000] loss: 2.301
[3, 10000] loss: 2.301
[3, 12000] loss: 2.300
[4,  2000] loss: 2.300
[4,  4000] loss: 2.300
[4,  6000] loss: 2.299
[4,  8000] loss: 2.299
[4, 10000] loss: 2.299
[4, 12000] loss: 2.298
[5,  2000] loss: 2.298
[5,  4000] loss: 2.298
[5,  6000] loss: 2.297
[5,  8000] loss: 2.297
[5, 10000] loss: 2.297
[5, 12000] loss: 2.296
[6,  2000] loss: 2.295
[6,  4000] loss: 2.296
[6,  6000] loss: 2.294
[6,  8000] loss: 2.294
[6, 10000] loss: 2.294
[6, 12000] loss: 2.293
[7,  2000] loss: 2.292
[7,  4000] loss: 2.292
[7,  6000] loss: 2.290
[7,  8000] loss: 2.289
[7, 10000] loss: 2.289
[7, 12000] loss: 2.287
[8,  2000] loss: 2.286
[8,  4000] 

Calculate test accuracy

In [19]:
def calculate_test_accuracy(test_loader, network):
    """Return the fraction of samples in ``test_loader`` classified correctly.

    The network is evaluated in eval mode with gradient tracking disabled.
    Every sub-module's previous train/eval mode is restored afterwards, so the
    call does not alter the state of the network.

    Args:
        test_loader: Iterable yielding ``(images, labels)`` mini-batches.
        network: Model returning one score per class for every image; the
            predicted class is the index of the highest score.

    Returns:
        Number of correct predictions divided by the number of samples.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    # Remember the mode of each module so that mixed train/eval setups are
    # restored exactly, then evaluate with layers such as dropout or batch
    # normalisation in inference behaviour.
    previous_modes = [(module, module.training) for module in network.modules()]
    network.eval()

    correct = 0
    total = 0
    try:
        with no_grad():
            for images, labels in test_loader:
                predicted = network(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        for module, mode in previous_modes:
            module.training = mode

    return correct / total

In [13]:
# Evaluate the SGD-trained baseline on the held-out test set.
accuracy = calculate_test_accuracy(test_loader, net)

# Fixed precision avoids floating-point noise in the printed percentage and
# matches the format used for the later experiments.
print(f'Accuracy of the network on the test images: {100 * accuracy:.3f}%')

Accuracy of the network on the test images: 18.36 %


The test accuracy is very low. Most likely the learning rate is too low, since the cross-entropy loss decreases very slowly over the epochs: it decreased as much during the 10th epoch as during the previous nine combined.

# Swapping the optimizer for Adam

In [22]:
from torch.optim import Adam


# Fresh network with the default activation settings, optimised with Adam
# using its default hyper-parameters.
new_network = Net()
adam_optimizer = Adam(params=new_network.parameters())

trained_model_with_adam = train_model(
    data_loader=train_loader,
    network=new_network,
    optimizer=adam_optimizer,
    loss_function=criterion,
)

[1,  2000] loss: 1.806
[1,  4000] loss: 1.562
[1,  6000] loss: 1.432
[1,  8000] loss: 1.373
[1, 10000] loss: 1.371
[1, 12000] loss: 1.319
[2,  2000] loss: 1.271
[2,  4000] loss: 1.247
[2,  6000] loss: 1.193
[2,  8000] loss: 1.172
[2, 10000] loss: 1.196
[2, 12000] loss: 1.158
[3,  2000] loss: 1.132
[3,  4000] loss: 1.120
[3,  6000] loss: 1.080
[3,  8000] loss: 1.061
[3, 10000] loss: 1.094
[3, 12000] loss: 1.056
[4,  2000] loss: 1.044
[4,  4000] loss: 1.023
[4,  6000] loss: 0.994
[4,  8000] loss: 0.978
[4, 10000] loss: 1.015
[4, 12000] loss: 0.989
[5,  2000] loss: 0.970
[5,  4000] loss: 0.954
[5,  6000] loss: 0.947
[5,  8000] loss: 0.906
[5, 10000] loss: 0.959
[5, 12000] loss: 0.931
[6,  2000] loss: 0.915
[6,  4000] loss: 0.909
[6,  6000] loss: 0.891
[6,  8000] loss: 0.855
[6, 10000] loss: 0.916
[6, 12000] loss: 0.884
[7,  2000] loss: 0.879
[7,  4000] loss: 0.870
[7,  6000] loss: 0.846
[7,  8000] loss: 0.810
[7, 10000] loss: 0.872
[7, 12000] loss: 0.842
[8,  2000] loss: 0.841
[8,  4000] 

In [23]:
# Evaluate the Adam-trained network on the held-out test set.
accuracy = calculate_test_accuracy(test_loader, new_network)

# Fixed precision avoids floating-point noise in the printed percentage and
# matches the format used for the tanh experiment.
print(f'Accuracy of the network on the test images: {100 * accuracy:.3f}%')

Accuracy of the network on the test images: 60.07%


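The accuracy is still not very high, but it is significantly better after swapping stochastic gradient descent for Adam. Note that this run also uses Adam's default learning rate of 1e-3 instead of 1e-4 and LeakyReLU's default negative slope of 0.01 instead of 0.1, so the optimizer is not the only thing that differs from the baseline.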

# Swapping the activation function for tanh

In [27]:
from torch.nn import Tanh


# Same architecture, with tanh as the activation in every hidden layer.
network_with_tanh = Net(activation=Tanh)

In [28]:
# A new Adam instance is required because it must track the parameters of the
# tanh network rather than those of the previous model.
adam_optimizer = Adam(params=network_with_tanh.parameters())
_ = train_model(
    data_loader=train_loader,
    network=network_with_tanh,
    optimizer=adam_optimizer,
    loss_function=criterion,
)

[1,  2000] loss: 1.816
[1,  4000] loss: 1.591
[1,  6000] loss: 1.477
[1,  8000] loss: 1.401
[1, 10000] loss: 1.416
[1, 12000] loss: 1.350
[2,  2000] loss: 1.316
[2,  4000] loss: 1.310
[2,  6000] loss: 1.254
[2,  8000] loss: 1.231
[2, 10000] loss: 1.259
[2, 12000] loss: 1.220
[3,  2000] loss: 1.210
[3,  4000] loss: 1.218
[3,  6000] loss: 1.160
[3,  8000] loss: 1.155
[3, 10000] loss: 1.194
[3, 12000] loss: 1.156
[4,  2000] loss: 1.151
[4,  4000] loss: 1.168
[4,  6000] loss: 1.126
[4,  8000] loss: 1.115
[4, 10000] loss: 1.145
[4, 12000] loss: 1.116
[5,  2000] loss: 1.119
[5,  4000] loss: 1.150
[5,  6000] loss: 1.093
[5,  8000] loss: 1.094
[5, 10000] loss: 1.117
[5, 12000] loss: 1.088
[6,  2000] loss: 1.106
[6,  4000] loss: 1.127
[6,  6000] loss: 1.058
[6,  8000] loss: 1.061
[6, 10000] loss: 1.108
[6, 12000] loss: 1.075
[7,  2000] loss: 1.079
[7,  4000] loss: 1.084
[7,  6000] loss: 1.048
[7,  8000] loss: 1.045
[7, 10000] loss: 1.093
[7, 12000] loss: 1.061
[8,  2000] loss: 1.070
[8,  4000] 

Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
  (relu): Tanh()
)

In [32]:
# Evaluate the tanh network on the held-out test set.
accuracy = calculate_test_accuracy(
    test_loader=test_loader, network=network_with_tanh)

print(f'Accuracy of the network on the test images: {100 * accuracy:.3f}%')

Accuracy of the network on the test images: 57.510%


The accuracy is lower when using the hyperbolic tangent as the activation function than when using the leaky ReLU.