# Convolutional Neural Networks (CNN)

## Setup

In [621]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

from src.data import DataLoaderScratch
from src.trainer import TrainerScratch
from src.optimizers import SGDScratch

## Load Data

In [622]:
# Preprocessing pipeline shared by both MNIST splits: tensor conversion
# followed by normalization with mean 0.5 and std 0.5.
preprocessing_steps = [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
transform = transforms.Compose(preprocessing_steps)

# Both splits live under ./data and are downloaded on first use.
mnist_kwargs = dict(root="./data", download=True, transform=transform)
mnist_trainset = datasets.MNIST(train=True, **mnist_kwargs)
mnist_testset = datasets.MNIST(train=False, **mnist_kwargs)

In [623]:
# Training split: divide the raw pixel values by 255 and flatten every image
# into a single row vector (one row per sample).
# NOTE(review): reading `.data` directly appears to bypass the `transform`
# pipeline defined above — confirm that this is intended.
train_pixels = mnist_trainset.data.float().div(255.0)
X_train = train_pixels.reshape(len(train_pixels), -1)
y_train = mnist_trainset.targets

# Validation split: identical scaling and flattening applied to the test set.
val_pixels = mnist_testset.data.float().div(255.0)
X_val = val_pixels.reshape(len(val_pixels), -1)
y_val = mnist_testset.targets

# Only the training loader shuffles; the validation order stays fixed.
loader_batch_size = 256
train_dataloader = DataLoaderScratch(
    X_train, y_train, batch_size=loader_batch_size, shuffle=True
)
val_dataloader = DataLoaderScratch(
    X_val, y_val, batch_size=loader_batch_size, shuffle=False
)

## Convolution Layer

### Defining a Simple Example

Let's start by creating a very simple 6x6 greyscale image with a vertical edge down the middle.

In [624]:
# One image row: three bright pixels followed by three dark pixels.
edge_row = torch.tensor([10, 10, 10, -10, -10, -10], dtype=torch.float)

# Stack six copies of that row to obtain a 6x6 image with a vertical edge.
input = edge_row.repeat(6, 1)

print(input)
print(input.shape)

tensor([[ 10.,  10.,  10., -10., -10., -10.],
        [ 10.,  10.,  10., -10., -10., -10.],
        [ 10.,  10.,  10., -10., -10., -10.],
        [ 10.,  10.,  10., -10., -10., -10.],
        [ 10.,  10.,  10., -10., -10., -10.],
        [ 10.,  10.,  10., -10., -10., -10.]])
torch.Size([6, 6])


Now let's create an edge detection filter.

In [625]:
# One filter row: positive weight on the left, negative weight on the right.
filter_row = torch.tensor([1, 0, -1], dtype=torch.float)

# Repeat the row three times to build the 3x3 vertical-edge filter
# (repeat returns a contiguous copy, so later .view calls keep working).
filter = filter_row.repeat(3, 1)

print(filter)
print(filter.shape)

tensor([[ 1.,  0., -1.],
        [ 1.,  0., -1.],
        [ 1.,  0., -1.]])
torch.Size([3, 3])


### Implementation using For Loops

In [626]:
padding = 0
stride = 1

# Zero-pad all four borders of the image (a no-op while padding == 0)
pad_widths = (padding,) * 4
input_padded = torch.nn.functional.pad(input, pad_widths, mode="constant", value=0)

filter_height, filter_width = filter.shape
input_height, input_width = input_padded.shape

# Number of valid filter positions along each axis of the padded image
output_height = (input_height - filter_height) // stride + 1
output_width = (input_width - filter_width) // stride + 1
output = torch.zeros(output_height, output_width)

for row in range(output_height):
    top = row * stride
    for col in range(output_width):
        left = col * stride
        # Patch of the image currently covered by the filter
        window = input_padded[top:top + filter_height, left:left + filter_width]
        # Element-wise product with the filter, summed into a single value
        output[row, col] = (window * filter).sum()

print(output)
print(output.shape)

tensor([[ 0., 60., 60.,  0.],
        [ 0., 60., 60.,  0.],
        [ 0., 60., 60.,  0.],
        [ 0., 60., 60.,  0.]])
torch.Size([4, 4])


### Vectorized Implementation

We reshape the input and filter to simulate an example with added dimensions for the images, channels and filters.
For the inputs, we want our new dimensions to be $(batch \_ size, channels, input \_ height, input\_width)$.
For the filters, we want our new dimensions to be $(out \_ channels, in \_ channels, filter \_ height, filter\_width)$.

In [627]:
# Add leading batch and channel axes: (H, W) -> (1, 1, H, W) for the image and
# (kH, kW) -> (1, 1, kH, kW) for the filter. Reshaping from the trailing two
# axes (instead of unsqueezing twice) keeps this cell idempotent: re-running it
# on tensors that are already 4-D leaves them unchanged rather than stacking
# additional singleton dimensions.
input = input.reshape(1, 1, *input.shape[-2:])
filter = filter.reshape(1, 1, *filter.shape[-2:])

print("inputs shape:", input.shape)
print("filters shape:", filter.shape)

inputs shape: torch.Size([1, 1, 6, 6])
filters shape: torch.Size([1, 1, 3, 3])


The resulting shapes indicate that we have 
- 1 image with 1 color channel of size 6x6
- 1 filter with 1 color channel of size 3x3.

We can use **unfold** to extract the patch of the image under each filter position, flatten it, and stack the flattened patches as the rows of a matrix. This way, each row contains the pixels covered by the filter at one position.

In [628]:
padding = 0
stride = 1

batch_size, in_channels, input_height, input_width = input.shape
out_channels, _, filter_height, filter_width = filter.shape

# Add padding to the input (pads the last two axes, i.e. width and height)
input_padded = torch.nn.functional.pad(input, (padding, padding, padding, padding))

# Calculate the output dimensions
output_height = (input_height + 2 * padding - filter_height) // stride + 1
output_width = (input_width + 2 * padding - filter_width) // stride + 1

# Unfold the input tensor to get all the sliding windows.
# Shape after both unfolds:
#   (batch_size, in_channels, output_height, output_width, filter_height, filter_width)
input_unfolded = input_padded.unfold(2, filter_height, stride).unfold(3, filter_width, stride)

# Move the channel axis next to the window axes before flattening, so that each
# row holds exactly one window across all channels. Without this permute the
# flattening is only correct when in_channels == 1.
input_unfolded = input_unfolded.permute(0, 2, 3, 1, 4, 5)
input_unfolded = input_unfolded.reshape(batch_size, output_height * output_width, in_channels * filter_height * filter_width)

print("original input: ")
print(input)
print(input.shape)
print("unfolded input:")
print(input_unfolded)
print(input_unfolded.shape)

original input: 
tensor([[[[ 10.,  10.,  10., -10., -10., -10.],
          [ 10.,  10.,  10., -10., -10., -10.],
          [ 10.,  10.,  10., -10., -10., -10.],
          [ 10.,  10.,  10., -10., -10., -10.],
          [ 10.,  10.,  10., -10., -10., -10.],
          [ 10.,  10.,  10., -10., -10., -10.]]]])
torch.Size([1, 1, 6, 6])
unfolded input:
tensor([[[ 10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.],
         [ 10.,  10., -10.,  10.,  10., -10.,  10.,  10., -10.],
         [ 10., -10., -10.,  10., -10., -10.,  10., -10., -10.],
         [-10., -10., -10., -10., -10., -10., -10., -10., -10.],
         [ 10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.],
         [ 10.,  10., -10.,  10.,  10., -10.,  10.,  10., -10.],
         [ 10., -10., -10.,  10., -10., -10.,  10., -10., -10.],
         [-10., -10., -10., -10., -10., -10., -10., -10., -10.],
         [ 10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.],
         [ 10.,  10., -10.,  10.,  10., -10.,  10.,  10., -10.],
 

Now we want to multiply each row by the filter and we should have an efficient implementation of convolution. 

For each image we have a matrix for the input and for the filters. So, to do the matrix multiplication we will use torch.bmm which performs a batch matrix-matrix product of matrices. 

For each individual image, our input shape is $(filter \_ positions, filter \_ height * filter \_ width)$. We want to matrix multiply each input image with the filter that is $(filter \_ height, filter \_ width)$. Therefore, we want to reshape the filter to make it shape $(filter \_ height * filter \_ width, 1)$. In this way, we get a valid matrix multiplication.

In [633]:
# Flatten every filter into a vector, insert a singleton axis in the middle,
# and inspect the resulting shape.
filter_reshaped = filter.view(out_channels, -1)[:, None, :]
filter_reshaped.shape

torch.Size([1, 1, 9])

In [682]:
# Reshape the kernel for matrix multiplication: one flattened filter per column,
# giving shape (in_channels * filter_height * filter_width, out_channels).
filter_reshaped = filter.reshape(out_channels, -1).t()

# torch.bmm requires both operands to have the same batch size, so the same
# filter matrix is shared (without copying) across every image in the batch.
filter_reshaped = filter_reshaped.unsqueeze(0).expand(batch_size, -1, -1)

# Perform batch matrix multiplication
# Result shape: (batch_size, output_height * output_width, out_channels)
output = torch.bmm(input_unfolded, filter_reshaped)

# Move out_channels in front of the positions, then restore the spatial layout:
# (batch_size, out_channels, output_height, output_width)
output = output.transpose(1, 2).reshape(batch_size, out_channels, output_height, output_width)

print(output)
print(output.shape)

tensor([[[[ 0., 60., 60.,  0.],
          [ 0., 60., 60.,  0.],
          [ 0., 60., 60.,  0.],
          [ 0., 60., 60.,  0.]]]])
torch.Size([1, 1, 4, 4])


The result is a correct convolution of the image. Let's now put all of this into a function.

In [683]:
def conv2d(input, filter, padding=0, stride=1):
    """Apply a bank of filters to a batch of images via unfold + batched matmul.

    Each filter is slid over the (zero-padded) input and, at every position,
    the element-wise product of filter and window is summed over all input
    channels. No bias is added.

    Args:
        input: Tensor of shape (batch_size, in_channels, height, width).
        filter: Tensor of shape
            (out_channels, in_channels, filter_height, filter_width).
        padding: Number of zeros added to each side of both spatial axes.
        stride: Step between neighbouring filter positions on both spatial axes.

    Returns:
        Tensor of shape (batch_size, out_channels, output_height, output_width).
    """
    batch_size, in_channels, input_height, input_width = input.shape
    out_channels, _, filter_height, filter_width = filter.shape

    # Add padding to the input (pads the last two axes, i.e. width and height)
    input_padded = torch.nn.functional.pad(input, (padding, padding, padding, padding))

    # Calculate the output dimensions
    output_height = (input_height + 2 * padding - filter_height) // stride + 1
    output_width = (input_width + 2 * padding - filter_width) // stride + 1

    # Unfold the input tensor to get all the sliding windows. Shape:
    # (batch_size, in_channels, output_height, output_width, filter_height, filter_width)
    windows = input_padded.unfold(2, filter_height, stride).unfold(3, filter_width, stride)

    # Bring the channel axis next to the window axes so that, once flattened,
    # each row is one complete window in (channel, row, column) order — the same
    # order in which the filter is flattened below.
    windows = windows.permute(0, 2, 3, 1, 4, 5)
    input_unfolded = windows.reshape(
        batch_size, output_height * output_width, in_channels * filter_height * filter_width
    )

    # Reshape the kernel for matrix multiplication: one flattened filter per
    # column, shared (without copying) across the batch because torch.bmm
    # requires matching batch sizes on both operands.
    filter_reshaped = filter.reshape(out_channels, -1).t()
    filter_reshaped = filter_reshaped.unsqueeze(0).expand(batch_size, -1, -1)

    # Perform batch matrix multiplication
    # Result shape: (batch_size, output_height * output_width, out_channels)
    output = torch.bmm(input_unfolded, filter_reshaped)

    # Reshape to (batch_size, out_channels, output_height, output_width)
    output = output.transpose(1, 2).reshape(batch_size, out_channels, output_height, output_width)

    return output

In [684]:
# Run the function on the example image and filter with the default settings
# spelled out explicitly (no padding, unit stride).
conv2d(input, filter, padding=0, stride=1)

tensor([[[[ 0., 60., 60.,  0.],
          [ 0., 60., 60.,  0.],
          [ 0., 60., 60.,  0.],
          [ 0., 60., 60.,  0.]]]])