# Baseline CNN with stochastic gradient descent

In [1]:
import torch
from torch import nn, no_grad
from torch.nn import Conv2d, CrossEntropyLoss, LeakyReLU, Linear, MaxPool2d
from torch.optim import SGD
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from torchvision.datasets import CIFAR10

In [2]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

cifar_10_training_data = CIFAR10('datasets/', download=True, transform=transform)
cifar_10_test_data = CIFAR10('datasets/', train=False, download=True, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


In [3]:
train_loader = DataLoader(cifar_10_training_data, batch_size=4, num_workers=2)

test_loader = DataLoader(cifar_10_test_data, batch_size=4, num_workers=2)

In [4]:
writer = SummaryWriter()

In [5]:
num_input_channels = 3
num_output_classes = 10

num_conv1_channels = 6
conv_kernel_size = 5
pool_kernel_size = 2
num_conv2_channels = 16

fc1_output_size = 120
fc2_output_size = 84


class Net(nn.Module):
    def __init__(self, activation=LeakyReLU, **kwargs):
        super().__init__()

        self.conv1 = Conv2d(
            num_input_channels, num_conv1_channels, conv_kernel_size)
        self.pool1 = MaxPool2d(pool_kernel_size, pool_kernel_size)
        self.conv2 = Conv2d(
            num_conv1_channels, num_conv2_channels, conv_kernel_size)
        self.pool2 = MaxPool2d(pool_kernel_size, pool_kernel_size)
        self.convolution_output_size = num_conv2_channels * conv_kernel_size**2
        # Fully connected layers
        self.fc1 = Linear(
            num_conv2_channels * conv_kernel_size * conv_kernel_size, fc1_output_size)
        self.fc2 = Linear(fc1_output_size, fc2_output_size)
        self.fc3 = Linear(fc2_output_size, num_output_classes)

        self.name = kwargs.pop('name', '')
        self.relu = activation(**kwargs)

    def forward(self, x):
        # Convolutional layers
        x = self.pool1(self.relu(self.conv1(x)))
        x = self.pool2(self.relu(self.conv2(x)))
        # Flatten the output of the convolutional layers
        x = x.view(-1, self.convolution_output_size)
        # Fully connected layers
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Training the classifier

In [6]:
def train_model(data_loader, network, optimizer, loss_function):
    for epoch in range(NUMBER_OF_EPOCHS):
        running_loss = epoch_loss = 0.
        for i, data in enumerate(data_loader):
            inputs, labels = data
            optimizer.zero_grad()

            outputs = network(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_loss += loss.item()
            if i % BATCH_TO_PRINT == BATCH_TO_PRINT - 1:
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / BATCH_TO_PRINT:.3f}")
                running_loss = 0.0

        writer.add_scalar(f'Loss/train: {network.name}', epoch_loss / len(data_loader), epoch)
        print(f"[{epoch + 1}] loss: {epoch_loss / len(data_loader):.3f}")

    writer.flush()
    return network

In [7]:
net = Net(negative_slope=0.1, name='Baseline model')

In [8]:
NUMBER_OF_EPOCHS = 10
BATCH_TO_PRINT = 2000
LEARNING_RATE = 1e-4

criterion = CrossEntropyLoss()
optimizer = SGD(net.parameters(), lr=LEARNING_RATE)

# Train the network
trained_net = train_model(train_loader, net, optimizer, criterion)

[1,  2000] loss: 2.306
[1,  4000] loss: 2.305
[1,  6000] loss: 2.303
[1,  8000] loss: 2.302
[1, 10000] loss: 2.304
[1, 12000] loss: 2.303
[1] loss: 2.304
[2,  2000] loss: 2.304
[2,  4000] loss: 2.303
[2,  6000] loss: 2.302
[2,  8000] loss: 2.301
[2, 10000] loss: 2.302
[2, 12000] loss: 2.301
[2] loss: 2.302
[3,  2000] loss: 2.302
[3,  4000] loss: 2.301
[3,  6000] loss: 2.299
[3,  8000] loss: 2.299
[3, 10000] loss: 2.300
[3, 12000] loss: 2.298
[3] loss: 2.300
[4,  2000] loss: 2.299
[4,  4000] loss: 2.298
[4,  6000] loss: 2.296
[4,  8000] loss: 2.296
[4, 10000] loss: 2.297
[4, 12000] loss: 2.295
[4] loss: 2.297
[5,  2000] loss: 2.294
[5,  4000] loss: 2.293
[5,  6000] loss: 2.291
[5,  8000] loss: 2.290
[5, 10000] loss: 2.291
[5, 12000] loss: 2.288
[5] loss: 2.291
[6,  2000] loss: 2.286
[6,  4000] loss: 2.284
[6,  6000] loss: 2.281
[6,  8000] loss: 2.279
[6, 10000] loss: 2.279
[6, 12000] loss: 2.274
[6] loss: 2.280
[7,  2000] loss: 2.270
[7,  4000] loss: 2.268
[7,  6000] loss: 2.262
[7,  80

Calculate test accuracy

In [9]:
def calculate_test_accuracy(test_loader, network):
    correct = 0
    total = 0
    with no_grad():
        for data in test_loader:
            images, labels = data
            outputs = network(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [10]:
accuracy = calculate_test_accuracy(test_loader, net)

print(f'Accuracy of the network on the test images: {100 * accuracy} %')

Accuracy of the network on the test images: 24.37 %


The test accuracy is very low, most likely the learning rate is too low since the cross-entropy loss is decreasing very slowly over the epochs. It decreased the same amount during the 10th epoch as the previous nine combined.

# Swapping the optimizer for ADAM

In [11]:
from torch.optim import Adam


new_network = Net(name='Model used with ADAM')
adam_optimizer = Adam(new_network.parameters())

trained_model_with_adam = train_model(train_loader, new_network, adam_optimizer, criterion)

[1,  2000] loss: 1.852
[1,  4000] loss: 1.604
[1,  6000] loss: 1.491
[1,  8000] loss: 1.425
[1, 10000] loss: 1.419
[1, 12000] loss: 1.369
[1] loss: 1.520
[2,  2000] loss: 1.311
[2,  4000] loss: 1.309
[2,  6000] loss: 1.263
[2,  8000] loss: 1.233
[2, 10000] loss: 1.235
[2, 12000] loss: 1.202
[2] loss: 1.256
[3,  2000] loss: 1.165
[3,  4000] loss: 1.175
[3,  6000] loss: 1.138
[3,  8000] loss: 1.121
[3, 10000] loss: 1.122
[3, 12000] loss: 1.095
[3] loss: 1.133
[4,  2000] loss: 1.070
[4,  4000] loss: 1.083
[4,  6000] loss: 1.055
[4,  8000] loss: 1.039
[4, 10000] loss: 1.051
[4, 12000] loss: 1.021
[4] loss: 1.051
[5,  2000] loss: 1.013
[5,  4000] loss: 1.005
[5,  6000] loss: 0.991
[5,  8000] loss: 0.976
[5, 10000] loss: 1.000
[5, 12000] loss: 0.965
[5] loss: 0.991
[6,  2000] loss: 0.957
[6,  4000] loss: 0.963
[6,  6000] loss: 0.941
[6,  8000] loss: 0.930
[6, 10000] loss: 0.962
[6, 12000] loss: 0.924
[6] loss: 0.944
[7,  2000] loss: 0.917
[7,  4000] loss: 0.912
[7,  6000] loss: 0.893
[7,  80

In [12]:
accuracy = calculate_test_accuracy(test_loader, new_network)

print(f'Accuracy of the network on the test images: {100 * accuracy}%')

Accuracy of the network on the test images: 60.699999999999996%


The accuracy is still not very high, but significantly better by only changing the optimizer method from stochastic gradient descent to ADAM.

# Swapping the activation function for tanh

In [13]:
from torch.nn import Tanh


network_with_tanh = Net(Tanh, name='Model with tanh/ADAM')

In [14]:
adam_optimizer = Adam(network_with_tanh.parameters())
_ = train_model(train_loader, network_with_tanh, adam_optimizer, criterion)

[1,  2000] loss: 1.801
[1,  4000] loss: 1.609
[1,  6000] loss: 1.478
[1,  8000] loss: 1.407
[1, 10000] loss: 1.403
[1, 12000] loss: 1.361
[1] loss: 1.502
[2,  2000] loss: 1.315
[2,  4000] loss: 1.312
[2,  6000] loss: 1.259
[2,  8000] loss: 1.239
[2, 10000] loss: 1.279
[2, 12000] loss: 1.242
[2] loss: 1.272
[3,  2000] loss: 1.211
[3,  4000] loss: 1.224
[3,  6000] loss: 1.184
[3,  8000] loss: 1.181
[3, 10000] loss: 1.217
[3, 12000] loss: 1.178
[3] loss: 1.198
[4,  2000] loss: 1.154
[4,  4000] loss: 1.182
[4,  6000] loss: 1.140
[4,  8000] loss: 1.133
[4, 10000] loss: 1.171
[4, 12000] loss: 1.143
[4] loss: 1.153
[5,  2000] loss: 1.135
[5,  4000] loss: 1.139
[5,  6000] loss: 1.102
[5,  8000] loss: 1.104
[5, 10000] loss: 1.148
[5, 12000] loss: 1.113
[5] loss: 1.123
[6,  2000] loss: 1.110
[6,  4000] loss: 1.130
[6,  6000] loss: 1.091
[6,  8000] loss: 1.088
[6, 10000] loss: 1.113
[6, 12000] loss: 1.085
[6] loss: 1.102
[7,  2000] loss: 1.079
[7,  4000] loss: 1.104
[7,  6000] loss: 1.073
[7,  80

In [15]:
accuracy = calculate_test_accuracy(test_loader, network_with_tanh)

print(f'Accuracy of the network on the test images: {100 * accuracy:.3f}%')

Accuracy of the network on the test images: 55.810%


The accuracy is lower using the hyperbolical tangent function as activation function in the network, compared to using the leaky ReLU function as activation function.