### W281 Final Project Supplemental Notebook
### Basic CNN Approach to MAVOC Vehicle Classification

This notebook fits a simple convolutional neural network (CNN) in order to compare a non-linear classifier that learns its own features through convolution against the hand-engineered features in our main report notebook. The basic CNN outperforms our hand-engineered features and linear classifiers, reaching an accuracy of about 87.8% on the test dataset.

In [1]:
import os
import re
import copy
import time
from collections import defaultdict
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, Subset

from torchvision import datasets, models, transforms
import torchvision.transforms.functional as TVF
from torchvision.io import read_image

In [2]:
# Pin every random number generator used in this notebook so runs can be repeated
seed = 281

# Python and NumPy generators
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: default generator, current CUDA device, all CUDA devices
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN flags: turn off benchmark mode and request deterministic behaviour
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
class MavocDataset(Dataset):
    """MAVOC dataset containing pairs of EO + SAR images.

    Every image id found under ``root_dir`` becomes one row of
    ``img_labels_df`` with the columns ``image_id``, ``eo_img``, ``sar_img``
    and ``label``. ``__getitem__`` serves only the EO image (resized to
    32x32) and the label; the SAR path is recorded but never loaded, and the
    transform callables are stored on the instance but not applied.
    """

    def __init__(self, root_dir, eo_transform=None, sar_transform=None, composite_transform=None):
        """
        Constructs a dataframe reference to each sample pair and label in the MAVOC "training" dataset
        :param root_dir (string): path to the folder containing the class folders; each class
            folder is named with its integer label
        :param eo_transform (callable, optional): stored as ``self.eo_transform`` (not applied by ``__getitem__``)
        :param sar_transform (callable, optional): stored as ``self.sar_transform`` (not applied by ``__getitem__``)
        :param composite_transform (callable, optional): stored as ``self.composite_transform``
            (not applied by ``__getitem__``)
        :raises ValueError: if a class folder name is not an integer, or an EO/SAR file name
            contains no digits to use as the image id
        """

        self.root_dir = root_dir
        # A plain dict is enough here: every insertion below is explicit, so no
        # default factory is needed.
        self.img_pairs = {}

        self.eo_transform = eo_transform
        self.sar_transform = sar_transform
        self.composite_transform = composite_transform

        eo_prefix = "EO" # case sensitive!
        sar_prefix = "SAR"
        id_pattern = re.compile(r"\d+")

        # populate a dictionary with image_id number, the eo and sar file path, and class label.
        # sorted() makes the row order (and therefore every positional index derived from it)
        # independent of the order in which the filesystem lists entries.
        for class_dir in sorted(os.listdir(self.root_dir)):
            class_path = os.path.join(self.root_dir, class_dir)
            # ignore hidden entries and stray files that sit next to the class folders
            if class_dir.startswith('.') or not os.path.isdir(class_path):
                continue
            label = int(class_dir)

            for file in sorted(os.listdir(class_path)):
                if file.startswith('.'):
                    continue
                if file.startswith(eo_prefix):
                    path_key = "eo_img"
                elif file.startswith(sar_prefix):
                    path_key = "sar_img"
                else:
                    continue # neither an EO nor a SAR file

                # grab the first integer (image_id) in the filename and use it as key
                id_match = id_pattern.search(file)
                if id_match is None:
                    raise ValueError(f"no image id found in file name: {os.path.join(class_path, file)}")
                image_id = int(id_match.group())

                # The first file seen for an id creates the entry (and fixes its label);
                # the partner file only fills in its own path.
                pair = self.img_pairs.setdefault(
                    image_id, {"eo_img": None, "sar_img": None, "label": label}
                )
                pair[path_key] = os.path.join(class_path, file)

        # convert the dict to a dataframe so that we can properly index into the dataset with __getitem__
        self.img_labels_df = pd.DataFrame.from_dict(self.img_pairs, orient='index')
        self.img_labels_df.reset_index(inplace=True)
        self.img_labels_df = self.img_labels_df.rename(columns = {'index':'image_id'})

    def __getitem__(self, idx):
        """
        Load the EO image and label stored at row position ``idx``
        :param idx (int): position of the sample in ``img_labels_df``
        :return: tuple ``(eo_image, label)`` where ``eo_image`` is the image tensor resized to 32x32
        :raises ValueError: if the row has no EO image path (only a SAR file was found for that id)
        """
        df = self.img_labels_df
        row = df.index[idx]
        eo_img_path = df.loc[row, "eo_img"]
        if not isinstance(eo_img_path, str):
            raise ValueError(f"image_id {df.loc[row, 'image_id']} has no EO image")

        eo_image = read_image(eo_img_path) # reads jpeg or png into a 3d RGB or grayscale tensor (uint8 in [0,255])
        eo_image = TVF.resize(eo_image, (32, 32))

        label = df.loc[row, "label"]

        return eo_image, label

    def __len__(self):
        """Number of image ids (rows of ``img_labels_df``) in the dataset"""
        return len(self.img_labels_df.index)

In [4]:
# Index every EO/SAR pair found under the local "train3" folder; no transforms are supplied.
dataset = MavocDataset(
    root_dir="train3",
    eo_transform=None,
    sar_transform=None,
)

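Create a train-val-test (80-10-10) split with a balanced class distribution, so that every class contributes roughly 80% of its samples to train and 10% each to val and test. To balance the classes we first downsample every class to the size of the minority class (class 7, which is 'bus' in the `class_names` ordering used later in this notebook).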

In [5]:
# Work on a copy of the sample table and record each row's position in `dataset`
# in an "idx" column, so the position survives the filtering and resampling below.
partition_df = dataset.img_labels_df.copy(deep=True)
partition_df["idx"] = partition_df.index

# Derive the minority class and its size from the data instead of hard-coding them.
class_counts = partition_df["label"].value_counts()
downsample_amount = class_counts.min() # 624 in the recorded run
minority_label = class_counts.idxmin()

# Every other class gets cut down to the minority class size.
classes_to_downsample = sorted(lbl for lbl in class_counts.index if lbl != minority_label)
minority_class_df = partition_df[partition_df["label"]==minority_label]

appended_data = [minority_class_df]

for label in classes_to_downsample:
    down_df = partition_df[partition_df["label"]==label]
    # draw `downsample_amount` rows without replacement; fixed random_state keeps the draw repeatable
    down_df = resample(down_df, replace=False, n_samples=downsample_amount, random_state=seed)
    appended_data.append(down_df)

samples_bal_df = pd.concat(appended_data)

print("Downsampled train split class counts")
samples_bal_df["label"].value_counts()

Downsampled train split class counts


7    624
0    624
1    624
2    624
3    624
4    624
5    624
6    624
8    624
9    624
Name: label, dtype: int64

Split into train/val/test, 80-10-10, and preserve the class distribution. Ensure class counts are as expected. Create pytorch dataset classes for each partition.

In [6]:
X = samples_bal_df
y = samples_bal_df["label"].tolist()
# 80% training; stratify on the label so each class keeps the same share in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=seed)

# 10%/10% dev/test: split the held-out 20% in half, again stratified
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, stratify=y_test, test_size=0.5, random_state=seed)

# assign() returns new frames, so the partition column is never written into the
# frames handed back by train_test_split (avoids pandas' SettingWithCopyWarning).
X_train = X_train.assign(partition="train")
X_dev = X_dev.assign(partition="dev")
X_test = X_test.assign(partition="test")

partition_scheme_df = pd.concat([X_train, X_dev, X_test], axis=0)
# partition_scheme_df.to_csv("mavoc_partition_scheme.csv",index=False) # write this out for teammates

print(f"Train samples: {len(X_train.index)}")
print(f"Validation samples: {len(X_dev.index)}")
print(f"Test samples: {len(X_test.index)}")

# "idx" holds each row's position in the full dataset
train_indices = X_train["idx"].tolist()
val_indices = X_dev["idx"].tolist()
test_indices = X_test["idx"].tolist()

# Create pytorch subsets of the full dataset by the new stratified-partitioned indices
train_dataset = Subset(dataset, train_indices)
val_dataset = Subset(dataset, val_indices)
test_dataset = Subset(dataset, test_indices)

print(f"Train set class counts \n{X_train.label.value_counts()}")
print(f"Dev set class counts \n{X_dev.label.value_counts()}")
print(f"Test set class counts \n{X_test.label.value_counts()}")

Train samples: 4992
Validation samples: 624
Test samples: 624
Train set class counts 
2    500
3    500
7    499
5    499
9    499
0    499
4    499
1    499
8    499
6    499
Name: label, dtype: int64
Dev set class counts 
1    63
4    63
5    63
7    63
8    62
6    62
9    62
3    62
0    62
2    62
Name: label, dtype: int64
Test set class counts 
0    63
8    63
6    63
9    63
7    62
5    62
1    62
3    62
2    62
4    62
Name: label, dtype: int64


In [7]:
BATCH_SIZE = 4

# Human-readable class names; position i is the name printed for integer label i
class_names = ('sedan', 'suv', 'pickup truck', 'van','box truck', 'motorcycle', 'flatbed truck','bus' , 'pickup truck with trailer',
'flatbed truck with trailer')

# Load individual partition
# Only the training data is shuffled; validation and test batches are served in a
# fixed order so evaluation runs are repeatable.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Look up a loader by partition name
dataloaders = {
    "train": train_loader,
    "val": val_loader,
    "test": test_loader,
}

Checking the proper dimension sizes so we can properly specify the layer input and output dimensions.

In [8]:
# Pull a single training batch and push it through stand-alone copies of the
# conv/pool layers, printing the tensor shape after every step.
dataiter = iter(train_loader)
# Use the builtin next(): the iterator's .next() method is not available in recent PyTorch releases
images, labels = next(dataiter)

# the conv layers need float input
images=images.type(torch.float32)

conv1 = nn.Conv2d(1,6,5)
pool = nn.MaxPool2d(2,2)
conv2 = nn.Conv2d(6, 16, 5)
print(images.shape)
x = conv1(images)
print(x.shape)
x = pool(x)
print(x.shape)
x=conv2(x)
print(x.shape)
x = pool(x)
print(x.shape)

torch.Size([4, 1, 32, 32])
torch.Size([4, 6, 28, 28])
torch.Size([4, 6, 14, 14])
torch.Size([4, 16, 10, 10])
torch.Size([4, 16, 5, 5])


We created a simple convolutional neural network to see how it would perform relative to the linear classifiers. Our baseline CNN consists of two convolutional layers with ReLU activations, each followed by max pooling, and a final feed-forward network with two hidden layers plus a final 10-node output layer.
We trained for only 5 epochs using a learning rate of 0.001. We did not introduce dropout or any other enhancements to the network (including no augmentations up front to see how effective the convolutional layers were in generating local features on their own).

In [9]:
# Training hyper-parameters
learning_rate = 1e-3  # step size handed to the SGD optimizer
num_epochs = 5        # number of full passes over the training loader

class ConvNet(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three fully connected layers.

    The first fully connected layer is sized for 16 x 5 x 5 feature maps, which is
    what the two conv/pool stages produce from 32 x 32 inputs.
    """

    def __init__(self, num_classes=None, in_channels=1):
        """
        :param num_classes (int, optional): size of the output layer; defaults to
            ``len(class_names)`` (the module-level tuple of class names)
        :param in_channels (int, optional): number of channels of the input images (default 1)
        """
        super().__init__()
        if num_classes is None:
            num_classes = len(class_names)
        self.conv1 = nn.Conv2d(in_channels, 6, 5) # input channels, output channels, kernel size
        self.pool = nn.MaxPool2d(2, 2) # kernel size 2x2, stride 2
        self.conv2 = nn.Conv2d(6, 16, 5) # input channels == previous output channels, output channels, kernel size
        self.fc1 = nn.Linear(16 * 5 * 5, 120) # flattened feature maps -> first hidden layer
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """
        :param x: float tensor of shape (n, in_channels, 32, 32)
        :return: tensor of shape (n, num_classes) with the raw (un-normalized) class scores
        """
        # -> n, in_channels, 32, 32
        x = self.pool(F.relu(self.conv1(x)))  # -> n, 6, 14, 14
        x = self.pool(F.relu(self.conv2(x)))  # -> n, 16, 5, 5
        # Flatten everything except the batch dimension. Unlike view(-1, 400) this can
        # never fold samples together: a wrong input size fails loudly in fc1 instead.
        x = torch.flatten(x, 1)               # -> n, 400
        x = F.relu(self.fc1(x))               # -> n, 120
        x = F.relu(self.fc2(x))               # -> n, 84
        x = self.fc3(x)                       # -> n, num_classes
        return x

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ConvNet().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
# Report the loss four times per epoch (every 312 steps for the 1248-step epochs of the recorded run)
log_every = max(1, n_total_steps // 4)

model.train()
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # batch shape: [BATCH_SIZE, 1, 32, 32] (single-channel 32x32 images)
        # the loader yields uint8 pixels; the conv layers need float32 and the loss needs int64 labels
        images = images.type(torch.float32)
        labels = labels.type(torch.long)

        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % log_every == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')
# Persist only the learned weights (state dict)
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)

# Evaluate on the held-out test split: overall accuracy plus one accuracy per class
model.eval()
n_classes = len(class_names)
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0] * n_classes
    n_class_samples = [0] * n_classes
    for images, labels in test_loader:
        images = images.type(torch.float32)
        labels = labels.type(torch.long)

        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        # Walk the samples actually present in this batch; the last batch can be
        # shorter than BATCH_SIZE when the split size is not a multiple of it.
        for label, pred in zip(labels.tolist(), predicted.tolist()):
            if label == pred:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    if n_samples > 0:
        acc = 100.0 * n_correct / n_samples
        print(f'Test Accuracy of the network: {acc} %')
    else:
        print('Test Accuracy of the network: n/a (no test samples)')

    for i in range(n_classes):
        # a class without test samples has no defined accuracy
        if n_class_samples[i] == 0:
            print(f'Accuracy of {class_names[i]}: n/a (no test samples)')
            continue
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {class_names[i]}: {acc} %')

Epoch [1/5], Step [312/1248], Loss: 0.4337
Epoch [1/5], Step [624/1248], Loss: 1.1835
Epoch [1/5], Step [936/1248], Loss: 0.7446
Epoch [1/5], Step [1248/1248], Loss: 0.2660
Epoch [2/5], Step [312/1248], Loss: 0.8823
Epoch [2/5], Step [624/1248], Loss: 0.1992
Epoch [2/5], Step [936/1248], Loss: 0.1264
Epoch [2/5], Step [1248/1248], Loss: 0.6384
Epoch [3/5], Step [312/1248], Loss: 0.4437
Epoch [3/5], Step [624/1248], Loss: 0.3215
Epoch [3/5], Step [936/1248], Loss: 0.1676
Epoch [3/5], Step [1248/1248], Loss: 1.1826
Epoch [4/5], Step [312/1248], Loss: 0.4726
Epoch [4/5], Step [624/1248], Loss: 0.6038
Epoch [4/5], Step [936/1248], Loss: 0.3948
Epoch [4/5], Step [1248/1248], Loss: 0.0874
Epoch [5/5], Step [312/1248], Loss: 0.4270
Epoch [5/5], Step [624/1248], Loss: 0.6105
Epoch [5/5], Step [936/1248], Loss: 0.1099
Epoch [5/5], Step [1248/1248], Loss: 0.0863
Finished Training
Test Accuracy of the network: 87.82051282051282 %
Accuracy of sedan: 30.158730158730158 %
Accuracy of suv: 82.2580645

The CNN struggles to predict sedans in our test set, but performs well on all other vehicle classes.