<a href="https://colab.research.google.com/github/vineelkondapalli/Image-Classification-for-CIFAR-10/blob/main/firstcnn_cifar10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**We are building a CNN for CIFAR-10.**

Defining Transform

*   We turn the image into a tensor and normalize the values to allow faster training of the model.

*  It essentially makes the loss surface more evenly scaled, which makes gradient descent converge faster.

Setting up training data


*   We set the batch size fairly high because we have access to Colab Pro GPUs.
*   then we use dataset and dataloader to load in the data


*   Setting num_workers to 8 because we have access to more compute. It lets the DataLoader use 8 worker processes to load and preprocess batches in parallel.

In [None]:
import torch
print(torch.__version__)

import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 1. Define the transformations for the data.
# Training images are augmented, converted to tensors, and normalized.
transform_train = transforms.Compose([
    # Augmentation: pad 4 px on every side, then take a random 32x32 crop.
    transforms.RandomCrop(32, padding=4),
    # Augmentation: randomly mirror the image left-to-right (a flip, not a rotation).
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # Per-channel (x - 0.5) / 0.5; gives the optimizer evenly scaled inputs.
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    # Augmentation: randomly erases a rectangular region of the image.
    # It operates on tensors, so it has to come after ToTensor.
    transforms.RandomErasing(),
])

# Test images get no augmentation, only the same tensor conversion and
# normalization as the training images.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# 2. Set the batch size. Both loaders below read this value, so changing it
# here is enough (it was previously hard-coded in each DataLoader call).
batch_size = 256

# 3. Training split: augmented, reshuffled every epoch.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=8)

# 4. Test split: no augmentation, fixed order.
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=8)

# 5. Define the classes (index in this tuple == integer label)
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


2.8.0+cu128


Adding Residual Blocks to replace the plain conv layers, in order to avoid vanishing gradients (NEW MODEL)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual unit: two 3x3 conv + batch-norm layers plus a skip connection.

    The skip branch is an identity (empty ``nn.Sequential``) when the input
    and output shapes match, and a 1x1 conv + batch-norm projection when
    ``stride != 1`` or the channel count changes.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        stride: Stride of the first 3x3 conv and of the projection conv.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Residual branch. Convs carry no bias because each is followed by
        # a BatchNorm2d layer.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip branch: project only when the residual branch changes the
        # spatial size or channel count, otherwise pass the input through.
        if stride != 1 or in_channels != out_channels:
            projection = [
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            ]
        else:
            projection = []
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        residual = self.conv1(x)
        residual = F.relu(self.bn1(residual))
        residual = self.bn2(self.conv2(residual))
        skip = self.shortcut(x)
        # The final activation is applied after the two branches are summed.
        return F.relu(residual + skip)


class Net(nn.Module):
    """Small residual CNN that maps 3-channel images to 10 class logits.

    Layout: a 3x3 conv stem (3 -> 32 channels), three ``ResidualBlock``
    stages (32 -> 64 -> 128 -> 256 channels, each with stride 2), global
    average pooling, and a single linear classifier.
    """

    def __init__(self):
        super().__init__()
        stem_width = 32
        # Not read anywhere in this class; kept as a public attribute.
        self.in_channels = stem_width

        # Stem: keeps the spatial size (stride 1, padding 1).
        self.conv1 = nn.Conv2d(3, stem_width, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(stem_width)

        # Residual stages: every stage doubles the channels and uses stride 2.
        self.layer1 = ResidualBlock(stem_width, 64, stride=2)
        self.layer2 = ResidualBlock(64, 128, stride=2)
        self.layer3 = ResidualBlock(128, 256, stride=2)

        # Head: pool each of the 256 feature maps to one value, then classify.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, 10)

    def forward(self, x):
        """Return the raw (un-normalized) class scores for a batch ``x``."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)

        pooled = self.avgpool(features)
        # Collapse everything but the batch dimension before the linear layer.
        return self.fc(torch.flatten(pooled, 1))

**Defining the model and forward pass(OLD MODEL)**


1.   using the init function to define the params for each of our layers, input and output size
2.   then using the defined layers in forward to define what a forward pass looks like on a singular batch of data.



In [None]:
# import torch.nn as nn
# import torch.nn.functional as F

# class Net(nn.Module):
#     def __init__(self):
#         super().__init__()
#         # Layer Block 1
#         self.conv1 = nn.Conv2d(3, 32, kernel_size=5, padding=2) # Added padding
#         self.bn1 = nn.BatchNorm2d(32) # Added Batch Norm
#         self.pool = nn.MaxPool2d(2, 2)

#         # Layer Block 2
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding=2) # Added padding
#         self.bn2 = nn.BatchNorm2d(64) # Added Batch Norm

#         # Layer Block 3 (New)
#         self.conv3 = nn.Conv2d(64, 128, kernel_size=5, padding=2) # New conv layer
#         self.bn3 = nn.BatchNorm2d(128) # New Batch Norm

#         # Fully connected layers
#         # Input size is now 128 * 4 * 4 = 2048
#         self.fc1 = nn.Linear(128 * 4 * 4, 256)
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 10)

#         #defining the dropout layer
#         self.dropout = nn.Dropout(p=0.5)

#     def forward(self, x):
#         # Block 1
#         x = self.pool(F.relu(self.bn1(self.conv1(x))))
#         # Block 2
#         x = self.pool(F.relu(self.bn2(self.conv2(x))))
#         # Block 3
#         x = self.pool(F.relu(self.bn3(self.conv3(x))))

#         # Flatten and classify
#         x = torch.flatten(x, 1)
#         x = self.dropout(F.relu(self.fc1(x)))
#         x = self.dropout(F.relu(self.fc2(x)))
#         x = self.fc3(x)
#         return x

Now we define the Loss function and optimizer

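We choose Cross Entropy loss (CEL), the standard loss for multi-class classification. It is smooth and convex with respect to the logits, which gives well-behaved gradients during training. CEL runs a softmax over the logits and then computes the negative log of the probability assigned to the correct class.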

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Use the first CUDA device when one is available, otherwise the CPU,
# and move the freshly built model onto it.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = Net().to(device)

# Loss: cross-entropy computed directly on the raw logits the model returns.
criterion = nn.CrossEntropyLoss()

# Earlier experiment, kept for reference:
#   optimizer = optim.Adam(net.parameters(), lr=0.001)
#   scheduler = StepLR(optimizer, step_size=7, gamma=0.1)

# Current setup: SGD with momentum and L2 weight decay.
optimizer = optim.SGD(
    net.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=5e-4,
)
# Lower the learning rate once the monitored value (passed to
# scheduler.step(...)) has not decreased for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2)

print("Loss function and optimizer are defined.")

Loss function and optimizer are defined.


Time to train!

In [None]:
# Train for `num_epochs` epochs. After every epoch the model is evaluated on
# `testloader`, and the resulting loss drives the ReduceLROnPlateau scheduler.
num_epochs = 50
log_interval = 200  # print the running training loss every this many batches

for epoch in range(num_epochs):

    # --- Training Phase ---
    net.train()  # training mode: batch-norm layers use per-batch statistics
    running_loss = 0.0
    pending_batches = 0  # batches summed into running_loss since the last print
    for i, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pending_batches += 1
        if pending_batches == log_interval:
            print(f'[{epoch + 1}, {i + 1:5d}] training loss: {running_loss / log_interval:.3f}')
            running_loss = 0.0
            pending_batches = 0

    # Flush whatever is left. Without this, an epoch with fewer than
    # `log_interval` batches would never report a training loss at all.
    if pending_batches:
        print(f'[{epoch + 1}, {i + 1:5d}] training loss: {running_loss / pending_batches:.3f}')

    # --- Evaluation Phase ---
    validation_loss = 0.0  # sum of the per-batch losses on the test loader
    correct = 0
    total = 0
    net.eval()  # inference mode: batch-norm layers use their running statistics
    with torch.no_grad():  # no gradients needed while evaluating
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)

            validation_loss += criterion(outputs, labels).item()

            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # --- Print Statistics and Step Scheduler ---
    accuracy = 100 * correct // total  # whole percent, rounded down
    avg_val_loss = validation_loss / len(testloader)  # mean of the per-batch losses

    print(f'Accuracy on the test set after epoch {epoch + 1}: {accuracy} %')
    print(f'Validation loss after epoch {epoch + 1}: {avg_val_loss:.3f}')

    # NOTE(review): the scheduler is steered by the loss on `testloader`, so the
    # test split is doubling as a validation set here; a held-out validation
    # split would keep the reported test accuracy unbiased.
    scheduler.step(avg_val_loss)
    print(f"Current Learning Rate: {optimizer.param_groups[0]['lr']}")

print('Finished Training')

Accuracy on the test set after epoch 1: 43 %
Validation loss after epoch 1: 1.660
Current Learning Rate: 0.1
Accuracy on the test set after epoch 2: 56 %
Validation loss after epoch 2: 1.198
Current Learning Rate: 0.1
Accuracy on the test set after epoch 3: 65 %
Validation loss after epoch 3: 1.009
Current Learning Rate: 0.1
Accuracy on the test set after epoch 4: 70 %
Validation loss after epoch 4: 0.834
Current Learning Rate: 0.1
Accuracy on the test set after epoch 5: 72 %
Validation loss after epoch 5: 0.792
Current Learning Rate: 0.1
Accuracy on the test set after epoch 6: 69 %
Validation loss after epoch 6: 0.893
Current Learning Rate: 0.1
Accuracy on the test set after epoch 7: 76 %
Validation loss after epoch 7: 0.676
Current Learning Rate: 0.1
Accuracy on the test set after epoch 8: 78 %
Validation loss after epoch 8: 0.643
Current Learning Rate: 0.1
Accuracy on the test set after epoch 9: 76 %
Validation loss after epoch 9: 0.724
Current Learning Rate: 0.1
Accuracy on the tes

**Conclusion**



1. We started with a simple 3 layer cnn using batch normalization layers, relu, and maxpooling layers
2.   Then, to prevent overfitting, we introduced data augmentation using random horizontal flips and crops, and a learning rate scheduler that drops the learning rate every few epochs to reach the minimum faster
3. Then we increased the width of the conv layers and added a dropout layer to decrease overfitting.
4. Then we implemented residual blocks to add more layers without having to deal with vanishing gradients and replaced the max pooling with global average pooling to maintain more information.
5. Finally, we swapped the optimizer from Adam to SGD with momentum and switched the scheduler to ReduceLROnPlateau, so the learning rate is lowered only when the validation loss stops improving

