# SSD (Single Shot Multibox Detector)
https://arxiv.org/pdf/1512.02325

In [1]:
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')
DRIVE_ROOT = Path('/content/drive/MyDrive')

DRIVE_DATA_ROOT = DRIVE_ROOT / 'datasets'
DATA_ROOT= Path('./data/')
MODEL_ROOT = DRIVE_ROOT / 'weights'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%pip install -q matplotlib
%pip install -q torchviz torchinfo
%pip install -q tqdm

In [3]:
from IPython.display import display

import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm.notebook import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchinfo import summary
from torchvision import datasets, models, transforms

%matplotlib inline
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (3, 3)
plt.rcParams['axes.grid'] = True
np.set_printoptions(suppress=True, precision=4)

# Enable GPU device if available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Copy cached data to work directory
def copy_cache(cache_file_path, dst_dir, chunk_size=100*(1024**2)):
    src = Path(cache_file_path)
    dst = Path(dst_dir) / src.name
    if not src.exists() or dst.exists():
        return

    print(f'Cache Hit. Copying {src} to {dst}.')
    dst.parent.mkdir(parents=True, exist_ok=True)
    with src.open('rb') as src_file, dst.open('wb') as dst_file:
        total_size = src.stat().st_size
        with tqdm(total=total_size, unit='MB', unit_scale=True) as pbar:
            while chunk := src_file.read(chunk_size):
                dst_file.write(chunk)
                pbar.update(len(chunk))

# Copy dataset.
copy_cache(DRIVE_DATA_ROOT/'VOCtrainval_11-May-2012.tar', DATA_ROOT)

# Download backbone network's weight file.
if not (MODEL_ROOT/'vgg16_reducedfc.pth').exists():
    print('Downloading backbone network\'s weight file')
    !wget -q https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth -P {MODEL_ROOT}

## Dataset, Dataloader, Transformer

In [5]:
! cp -r {DRIVE_DATA_ROOT}/utils ./

cp: cannot stat '/content/drive/MyDrive/datasets/utils': No such file or directory


In [6]:
import PIL
from utils.ssd_augmentations import (
    Compose, ConvertFromInts, ToAbsoluteCoords, PhotometricDistort, Expand,
    RandomSampleCrop, RandomMirror, ToPercentCoords, Resize, SubtractMeans
)

class ImageTransform:
    def __init__(self, input_size, color_mean):
        self.transform = {
            'train': Compose([
                ConvertFromInts(),
                ToAbsoluteCoords(),
                PhotometricDistort(),
                Expand(color_mean),
                RandomSampleCrop(),
                RandomMirror(),
                ToPercentCoords(),
                Resize(input_size),
                SubtractMeans(color_mean),
            ]),
            'val': Compose([
                ConvertFromInts(),
                Resize(input_size),
                SubtractMeans(color_mean),
            ]),
        }

    def __call__(self, image: PIL.Image, phase, boxes, labels):
        bgr_image = np.array(image)[:, :, (2, 1, 0)]
        bgr_image, boxes, labels = self.transform[phase](bgr_image, boxes, labels)
        rgb_image = bgr_image[:, :, (2, 1, 0)]  # BGR → RGB
        return rgb_image, boxes, labels


class AnnotationTransform:
    def __call__(self, annotation):
        target = []
        for region in annotation['annotation']['object']:
            name = region["name"]
            bbox = [int(region['bndbox'][key]) - 1 for key in ['xmin', 'ymin', 'xmax', 'ymax']]
            target.append((name, bbox))
        return target

In [7]:
class VOCDataset(Dataset):
    CLASSES = [
        "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
        "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
        "pottedplant", "sheep", "sofa", "train", "tvmonitor",
    ]
    CLASS_TO_INDEX = {
        class_name: index for index, class_name in enumerate(CLASSES)
    }
    CMAP = plt.cm.hsv(np.linspace(0, 1, len(CLASSES)))

    def __init__(self, root, phase, download=True, transform=None, target_transform=None):
        self.dataset = datasets.VOCDetection(root, year='2012', image_set=phase, download=download,
                                             target_transform=target_transform)
        self.phase = phase
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.pull_item(idx)

    def pull_item(self, idx):
        # Get inputs.
        image, annotation = self.dataset[idx]
        width, height = image.size

        # Create normalized boxes and labels.
        boxes = []
        labels = []
        for name, (xmin, ymin, xmax, ymax) in annotation:
            boxes.append((xmin/width, ymin/height, xmax/width, ymax/height))
            labels.append(self.CLASS_TO_INDEX[name])
        boxes, labels = np.array(boxes), np.array(labels)

        # Transform image, boxes, and labels.
        if self.transform:
            image, boxes, labels = self.transform(image, self.phase, boxes, labels)
        target = np.hstack((boxes, labels.reshape(-1, 1)))

        # Return as TorchTensor
        image = transforms.ToTensor()(image)
        target = transforms.ToTensor()(target)
        return image, target

def collate_fn(batch):
    images, targets = list(zip(*batch))
    images = torch.stack(images, dim=0)
    return images, targets

In [9]:
INPUT_SIZE = 300
COLOR_MEAN = (104, 117, 123)
BATCH_SIZE = 48

train_dataset = VOCDataset(root=DATA_ROOT, phase='train', download=True,
                           transform=ImageTransform(INPUT_SIZE, COLOR_MEAN),
                           target_transform=AnnotationTransform())
val_dataset = VOCDataset(root=DATA_ROOT, phase='val', download=True,
                         transform=ImageTransform(INPUT_SIZE, COLOR_MEAN),
                         target_transform=AnnotationTransform())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        collate_fn=collate_fn, num_workers=2)

data_loader_dict = {'train': train_loader, 'val': val_loader}

Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data
Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data


## Network Models

In [10]:
def init_weights(model):
    '''Initialize conv-layer weights by He-Initialization'''
    if isinstance(model, nn.Conv2d):
        nn.init.kaiming_normal_(model.weight.data)
        if model.bias is not None:
            nn.init.constant_(model.bias.data, 0.0)

In [11]:
def make_vgg(weights=None):
    layers = []

    layers += models.vgg16().features[:-1]
    _, _, pool3_idx, pool4_idx = (i for i, layer in enumerate(layers) if isinstance(layer, nn.MaxPool2d))
    layers[pool3_idx].ceil_mode = True

    layers += [
        nn.MaxPool2d(kernel_size=3, stride=1, padding=1),            # pool5
        nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6),  # fc6
        nn.ReLU(inplace=True),
        nn.Conv2d(1024, 1024, kernel_size=1),                        # fc7
        nn.ReLU(inplace=True),
    ]

    vgg = nn.Sequential(*layers)
    if weights:
        vgg.load_state_dict(weights)
    else:
        vgg.apply(init_weights)
    return nn.ModuleList([vgg[:pool4_idx], vgg[pool4_idx:]])


make_vgg()

ModuleList(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [12]:
def make_extras():
    return nn.ModuleList([
        nn.Sequential(
            nn.Conv2d(1024, 256, kernel_size=1),                      # conv8_1
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1),  # conv8_2
            nn.ReLU(inplace=True),
        ),
        nn.Sequential(
            nn.Conv2d(512, 128, kernel_size=1),                       # conv9_1
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1),  # conv9_2
            nn.ReLU(inplace=True),
        ),
        nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=1),                       # conv10_1
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1),  # conv10_2
            nn.ReLU(inplace=True),
        ),
        nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=1),                       # conv11_1
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=3),  # conv11_2
            nn.ReLU(inplace=True),
        ),
    ])


make_extras()

ModuleList(
  (0): Sequential(
    (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (1): Sequential(
    (0): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (2): Sequential(
    (0): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (3): Sequential(
    (0): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (3): ReLU(inplace=True)
  )
)

In [13]:
def make_reggressors_classifiers(num_classes, bbox_aspect_ratios):
    reg_layers = []
    cls_layers = []

    in_channels = [512, 1024, 512, 256, 256, 256]
    for in_channel, bbox_aspect_ratio in zip(in_channels, bbox_aspect_ratios):
        # Conv-layer for sourceX.
        reg_layers += [nn.Conv2d(in_channel, bbox_aspect_ratio * 4, kernel_size=3, padding=1)]
        cls_layers += [nn.Conv2d(in_channel, bbox_aspect_ratio * num_classes, kernel_size=3, padding=1)]
    return nn.ModuleList(reg_layers), nn.ModuleList(cls_layers)

make_reggressors_classifiers(21, [4, 6, 6, 6, 4, 4])

(ModuleList(
   (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (2): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (3): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (4-5): 2 x Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
 ),
 ModuleList(
   (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (4-5): 2 x Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
 ))

In [14]:
class L2Norm(nn.Module):
    def __init__(self, in_channels=512, scale=20):
        super(L2Norm, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(in_channels))
        nn.init.constant_(self.weight, scale)
        self.eps = 1e-10

    def forward(self, x):
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps  # Size([num_batch, 1, 38, 38])
        x = x / norm

        # Size([512]) -> Size([num_batch, 512, 38, 38]).
        weghts = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3),expand_as(x)
        return x * weghts

In [15]:
class SSD(nn.Module):
    def __init__(self, config, vgg_weights=None):
        super(SSD, self).__init__()
        self.num_classes = config['num_classes']
        self.num_bbox_aspect = config['num_bbox_aspect']

        # SSD Network components.
        self.vgg = make_vgg(vgg_weights)
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.regressors, self.classifiers = make_reggressors_classifiers(
            self.num_classes, self.num_bbox_aspect)

    def forward(self, x: torch.Tensor):
        sources = []

        x = self.vgg[0](x)
        source1 = self.L2Norm(x)
        sources.append(source1)

        x = self.vgg[1](x)
        sources.append(x)

        # source3 ~ source6.
        for layer in self.extras:
            x = layer(x)
            sources.append(x)

        loc_list = []
        conf_list = []
        for source, regressor, classifier in zip(sources, self.regressors, self.classifiers):
            l = regressor(source)
            c = classifier(source)

            # Permute torch.Tensor from (N, KA, H, W) to (N, H*W*KA).
            num_batch = loc.size(0)
            l = l.permute(0, 2, 3, 1).contiguous().view(num_batch, -1)
            c = c.permute(0, 2, 3, 1).contiguous().view(num_batch, -1)

            loc_list.append(l)
            conf_list.append(c)

        # Concat all results.
        loc = torch.cat(loc_list, dim=1)
        conf = torch.cat(conf_list, dim=1)
        return loc, conf

In [16]:
SSD_CONFIG = {
    'num_classes': 21,  # background + 20 classes.
    'image_size': 300,
    'num_bbox_aspect': [4, 6, 6, 6, 4, 4],
    'feature_map_size': [38, 19, 10, 5, 3, 1],  # Image size of each source.
    'lr_steps': [8, 16, 32, 64, 100, 300],
    'dbox_min_sizes': [30, 60, 111, 162, 213, 264],
    'dbox_max_sizes': [60, 111, 162, 213, 264, 315],
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
}

vgg_weights = torch.load(MODEL_ROOT/'vgg16_reducedfc.pth', weights_only=False)
model = SSD(SSD_CONFIG, vgg_weights)