In [1]:
from torchvision.datasets import VOCDetection
from utils.utils import *
from models.fpnssd300 import SSD300
from data.dataset import VOCDataset
from detect import detect

from pprint import PrettyPrinter
from tqdm import tqdm
from pprint import pprint

import numpy as np
from numpy import random
from torchinfo import summary

import pandas as pd
import cv2
import os
import os.path as osp
import time
import random
import matplotlib.pyplot as plt
%matplotlib inline
import torchvision.transforms.functional as FT

from math import sqrt
from sklearn import preprocessing 
from torch.utils.data import DataLoader, Dataset

import torch
from torchvision import transforms
import types
import torchvision
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from PIL import Image, ImageDraw

In [2]:
# PASCAL VOC 2012 'train' split. download=True fetches VOCtrainval_11-May-2012.tar
# into data/ (if it is not already there) and extracts it.
dataset = VOCDetection(root='data/', year='2012', image_set='train', download=True)

# PASCAL VOC 2012 'val' split. It lives in the same VOCtrainval archive that the call
# above just fetched and unpacked, so download=False avoids re-verifying and
# re-extracting the ~2 GB tar a second time.
test_dataset = VOCDetection(root='data/', year='2012', image_set='val', download=False)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to data/VOCtrainval_11-May-2012.tar


  0%|          | 0/1999639040 [00:00<?, ?it/s]

Extracting data/VOCtrainval_11-May-2012.tar to data/
Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data/


In [None]:
# Index -> class name. Position in the tuple is the class index: 0 is the background
# class, 1..20 are the PASCAL VOC object categories.
rev_label_map = dict(enumerate((
    'background',
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
)))

# Class name -> index for the object categories only (background, index 0, is left out).
label_map = {name: index for index, name in rev_label_map.items() if index > 0}

In [7]:
# Number of samples in the VOCDetection 'train' and 'val' splits, in that order.
print(len(dataset), len(test_dataset))

5717 5823


In [3]:
def image_dict(split, datasets=None):
    """
    Build a lookup from image id to image for one dataset split.

    The image id is the annotation's 'filename' up to its first '.', e.g.
    '2008_000090.jpg' -> '2008_000090'.

    :param split: "train" or "test"
    :param datasets: optional mapping of split name -> iterable of (image, target) pairs,
                     where target['annotation']['filename'] holds the file name. When None,
                     the notebook-level `dataset` ("train") and `test_dataset` ("test")
                     are used.
    :return: dict mapping image id -> image
    :raises ValueError: if `split` is not a known split name (previously this silently
                        returned an empty dict, which hid typos)
    """
    if datasets is not None:
        if split not in datasets:
            raise ValueError(f"Unknown split {split!r}; expected one of {sorted(datasets)}.")
        source = datasets[split]
    elif split == "train":
        source = dataset
    elif split == "test":
        source = test_dataset
    else:
        raise ValueError(f"Unknown split {split!r}; expected 'train' or 'test'.")

    # Note: iterating the dataset materialises every image of the split in memory.
    images_by_id = {}
    for image, target in source:
        image_id = target['annotation']['filename'].split(".")[0]
        images_by_id[image_id] = image
    return images_by_id

In [4]:
# Image-id -> image lookups for both splits. Each call iterates its whole split, and
# image_dict("test") reads the global `test_dataset`, so this must run while that name
# still refers to the VOCDetection 'val' split (it is rebound to a VOCDataset later).
train_image_dict = image_dict("train")
test_image_dict = image_dict("test")

In [14]:
# Annotation tables for the two splits, later handed to VOCDataset.
# NOTE(review): the column schema is not visible in this notebook - confirm against VOCDataset.
df_train = pd.read_csv("data/train_voc_pascal12.csv/working/train_voc_pascal12.csv")
df_test = pd.read_csv("data/test_voc_pascal12.csv/working/test_voc_pascal12.csv")

## Create the dataset

In [18]:
# Wrap each annotation table and its image lookup in the project's VOCDataset.
# keep_difficult is forwarded as the fourth positional argument.
keep_difficult = True
train_dataset = VOCDataset(df_train, train_image_dict, "TRAIN",keep_difficult)
# NOTE(review): this rebinds `test_dataset`, which until now held the VOCDetection 'val'
# split that image_dict("test") iterates. Re-running image_dict("test") after this cell
# would iterate the VOCDataset instead - consider a distinct name for the raw split.
test_dataset = VOCDataset(df_test, test_image_dict, "TEST",keep_difficult)

# del train_image_dict, test_image_dict
# del df_train, df_test

## Dataloader 

In [19]:
# Batch loaders for both datasets: 16 samples per batch, 2 worker processes, and each
# dataset's own collate_fn to assemble a batch. Only the training loader shuffles.
train_data_loader = DataLoader(train_dataset,
                               batch_size=16,
                               shuffle=True,
                               num_workers=2,
                               collate_fn=train_dataset.collate_fn)

test_data_loader = DataLoader(test_dataset,
                              batch_size=16,
                              shuffle=False,
                              num_workers=2,
                              collate_fn=test_dataset.collate_fn)

In [20]:
# Number of batches each loader yields per full pass over its dataset.
print("Number of training iterations: ", len(train_data_loader))
print("Number of testing iterations: ", len(test_data_loader))

Number of training iterations:  358
Number of testing iterations:  364


# MODEL

In [22]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Training on device {device}")

Training on device cuda


In [30]:
# Instantiate the detector with 21 output classes (rev_label_map has 21 entries:
# background plus the 20 VOC categories).
model = SSD300(n_classes = 21)
# torchinfo layer-by-layer summary for an input of shape (16, 3, 300, 300).
summary(model, (16, 3, 300, 300))

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]



Layer (type:depth-idx)                             Output Shape              Param #
SSD300                                             [16, 8532, 4]             256
├─FPNBase: 1-1                                     [16, 256, 38, 38]         --
│    └─ResNet: 2-1                                 --                        2,049,000
│    │    └─Conv2d: 3-1                            [16, 64, 150, 150]        9,408
│    │    └─BatchNorm2d: 3-2                       [16, 64, 150, 150]        128
│    │    └─Sequential: 3-3                        [16, 256, 75, 75]         215,808
│    │    └─Sequential: 3-4                        [16, 512, 38, 38]         1,219,584
│    │    └─Sequential: 3-5                        [16, 1024, 19, 19]        7,098,368
│    │    └─Sequential: 3-6                        [16, 2048, 10, 10]        14,964,736
│    └─Conv2d: 2-2                                 [16, 256, 5, 5]           4,718,848
│    └─BatchNorm2d: 2-3                            [16, 256, 5, 5]   

## Training

In [None]:
# Data parameters
data_folder = './'  # folder with data files

# Model parameters
# Not too many here since the SSD300 has a very specific structure
# NOTE(review): if `label_map` is the one defined in this notebook it omits background,
# making this 20, while main() builds SSD300(n_classes = 21). main() does not read n_classes.
n_classes = len(label_map)  # number of different types of objects
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Learning parameters
# path to model checkpoint, None if none
# Built with osp.join instead of a backslash literal: "..\checkpoint\..." contains the
# invalid escape sequence "\c" and only resolves to ../checkpoint/ on Windows.
checkpoint = osp.join("..", "checkpoint", "checkpoint_ssd300_280.pth.tar")
batch_size = 16  # batch size
iterations = 100000  # number of iterations to train
# workers = 4  # number of workers for loading data in the DataLoader
print_freq = 50  # print training status every __ batches
lr = 5e-4  # learning rate
decay_lr_at = [60000, 80000]  # decay learning rate after these many iterations
decay_lr_to = 0.1  # decay learning rate to this fraction of the existing learning rate
momentum = 0.9  # momentum
weight_decay = 5e-4  # weight decay
grad_clip = 1  # clip if gradients are exploding, which may happen at larger batch sizes (sometimes at 32) - you will recognize it by a sorting error in the MultiBox loss calculation


def main():
    """
    Training.

    Builds a fresh SSD300 and SGD optimizer when the notebook-level `checkpoint` is None,
    otherwise resumes model and optimizer from that checkpoint file, then trains for the
    number of epochs equivalent to `iterations` and saves a checkpoint after every epoch.

    Reads the notebook-level settings (`checkpoint`, `iterations`, `decay_lr_at`,
    `decay_lr_to`, `lr`, `momentum`, `weight_decay`, `device`) without modifying them, so
    the function can be called more than once. `start_epoch` and `epoch` are published as
    globals.
    """
    global start_epoch, epoch

    # Initialize model or load checkpoint
    if checkpoint is None:
        start_epoch = 0
        model = SSD300(n_classes = 21)
        # Freeze every child of the ResNet backbone except the one at index 7.
        # NOTE(review): presumably index 7 is the last residual stage - confirm against
        # models/fpnssd300.py.
        for index, child in enumerate(model.base.resnet.children()):
            if index != 7:
                for param in child.parameters():
                    param.requires_grad = False
        # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    biases.append(param)
                else:
                    not_biases.append(param)
        optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                                    lr=lr, momentum = momentum, weight_decay=weight_decay)

    else:
        # Load into a local name so the global `checkpoint` keeps holding the path.
        # map_location lets a checkpoint saved on a GPU be resumed on a CPU-only machine.
        # SECURITY: torch.load unpickles arbitrary objects - only load trusted checkpoints.
        state = torch.load(checkpoint, map_location=device)
        start_epoch = state['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = state['model']
        optimizer = state['optimizer']

    # Move to default device
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

    # Calculate total number of epochs to train and the epochs to decay learning rate at (i.e. convert iterations to epochs)
    # To convert iterations to epochs, divide iterations by the number of iterations per epoch
    # The paper trains for 120,000 iterations with a batch size of 32, decays after 80,000 and 100,000 iterations
    # The batch size is read from the loader that is actually iterated rather than hard-coded;
    # max(1, ...) avoids a ZeroDivisionError when the dataset is smaller than one batch.
    iters_per_epoch = max(1, len(train_dataset) // train_data_loader.batch_size)
    epochs = iterations // iters_per_epoch
    print("Number of epochs: ", epochs)
    # Kept local: overwriting the global decay_lr_at would convert it a second time
    # (iterations -> epochs -> garbage) if main() were called again.
    decay_epochs = [it // iters_per_epoch for it in decay_lr_at]

    # Epochs
    for epoch in range(start_epoch, epochs):

        # Decay learning rate at particular epochs
        if epoch in decay_epochs:
            adjust_learning_rate(optimizer, decay_lr_to)

        # One epoch's training
        train(train_loader=train_data_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # Save checkpoint
        print("Saving checkpoint epoch:", epoch)
        save_checkpoint(epoch, model, optimizer)


def train(train_loader, model, criterion, optimizer, epoch, val_loader=None, val_freq=10):
    """
    One epoch's training, followed by a validation-loss pass every `val_freq` epochs.

    :param train_loader: DataLoader for training data
    :param model: model
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number
    :param val_loader: DataLoader for the periodic validation pass; when None the
                       notebook-level `test_data_loader` is used
    :param val_freq: validate when `epoch % val_freq == 0`; 0 or None disables validation
    """
    model.train()  # training mode enables dropout

    losses = AverageMeter()  # loss

    # Batches
    for i, (images, boxes, labels, _) in enumerate(train_loader):

        start = time.time()

        # Move to default device
        images = images.to(device)  # (batch_size (N), 3, 300, 300)
        boxes = [b.to(device) for b in boxes]
        labels = [l.to(device) for l in labels]

        # Forward prop.
        # (N, n_priors, 4), (N, n_priors, n_classes); the torchinfo summary reports 8532 priors
        predicted_locs, predicted_scores = model(images)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients, if necessary
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))

        # Print status
        # "Training Time" is an estimate: this batch's duration scaled by print_freq.
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Training Time {3:.3f} \t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(train_loader),
                                                                  (time.time()-start)*print_freq, loss=losses))

    # No explicit `del` of the batch tensors here: locals are released when the function
    # returns, and deleting them raised NameError whenever no batch had been processed.
    if not val_freq or epoch % val_freq != 0:
        return

    if val_loader is None:
        val_loader = test_data_loader

    model.eval()
    val_losses = AverageMeter()

    with torch.no_grad():
        # Batches
        for images, boxes, labels, difficulties in val_loader:
            images = images.to(device)  # (batch_size (N), 3, 300, 300)
            boxes = [b.to(device) for b in boxes]
            labels = [l.to(device) for l in labels]

            # Forward prop.
            predicted_locs, predicted_scores = model(images)

            # Loss
            loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

            val_losses.update(loss.item(), images.size(0))

        # Print status
        print('Validation loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(loss=val_losses))

# Start training; resumes from `checkpoint` when that setting is not None.
main()

## Calculating mAP

In [43]:
# Pretty-printer used by evaluate() to print the per-class AP dictionary.
pp = PrettyPrinter()

In [46]:
# Parameters
data_folder = './'
batch_size = 16
workers = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Built with osp.join instead of a backslash literal: "..\checkpoint\..." contains the
# invalid escape sequence "\c" and only resolves to ../checkpoint/ on Windows.
checkpoint_path = osp.join("..", "checkpoint", "checkpoint_ssd300_280.pth.tar")

# Load model checkpoint that is to be evaluated
# map_location lets a checkpoint saved on a GPU be evaluated on a CPU-only machine.
# SECURITY: torch.load unpickles arbitrary objects - only load trusted checkpoints.
checkpoint = torch.load(checkpoint_path, map_location=device)
model = checkpoint['model']
model = model.to(device)

# Switch to eval mode
model.eval()

def evaluate(test_loader, model):
    """
    Run the model over a test DataLoader, then print per-class AP and the overall mAP.

    Nothing is returned; the results are only printed.

    :param test_loader: DataLoader for test data, yielding (images, boxes, labels, difficulties)
    :param model: model
    """
    # Inference behaviour only, regardless of the state the caller left the model in
    model.eval()

    # Accumulators over the whole loader: predictions first, ground truth second.
    # Difficulty flags are collected too because calculate_mAP takes them as an argument.
    pred_boxes, pred_labels, pred_scores = [], [], []
    gt_boxes, gt_labels, gt_difficulties = [], [], []

    with torch.no_grad():
        for images, boxes, labels, difficulties in tqdm(test_loader, desc='Evaluating'):
            # Forward pass on the default device; images are (N, 3, 300, 300)
            images = images.to(device)
            predicted_locs, predicted_scores = model(images)

            # Turn the raw SSD output into per-image detections
            batch_boxes, batch_labels, batch_scores = model.detect_objects(
                predicted_locs, predicted_scores,
                min_score=0.01, max_overlap=0.45, top_k=200)

            pred_boxes.extend(batch_boxes)
            pred_labels.extend(batch_labels)
            pred_scores.extend(batch_scores)

            # Ground truth goes onto the same device as the detections
            gt_boxes.extend([box.to(device) for box in boxes])
            gt_labels.extend([label.to(device) for label in labels])
            gt_difficulties.extend([flag.to(device) for flag in difficulties])

        # Per-class average precision and their mean; 0.5 is passed as the last argument
        APs, mAP = calculate_mAP(pred_boxes, pred_labels, pred_scores,
                                 gt_boxes, gt_labels, gt_difficulties, 0.5)

    # Report
    pp.pprint(APs)

    print('\nMean Average Precision (mAP): %.3f' % mAP)


# Evaluate the loaded checkpoint on the test DataLoader. Inside a notebook kernel
# __name__ is '__main__', so this guard always passes here.
if __name__ == '__main__':
    evaluate(test_data_loader, model)

Evaluating: 100%|██████████| 364/364 [03:35<00:00,  1.69it/s]


{'aeroplane': 0.7581235766410828,
 'bicycle': 0.7262179851531982,
 'bird': 0.6922540068626404,
 'boat': 0.4577009677886963,
 'bottle': 0.3729631304740906,
 'bus': 0.7464362382888794,
 'car': 0.5342469215393066,
 'cat': 0.8741917610168457,
 'chair': 0.46735140681266785,
 'cow': 0.6185383796691895,
 'diningtable': 0.5899243950843811,
 'dog': 0.8436293601989746,
 'horse': 0.7462180852890015,
 'motorbike': 0.725793719291687,
 'person': 0.6655958890914917,
 'pottedplant': 0.3164544105529785,
 'sheep': 0.5664616823196411,
 'sofa': 0.6055200099945068,
 'train': 0.7921307682991028}

Mean Average Precision (mAP): 0.637


In [None]:
# Run the detector on a fixed sample of images from the test lookup.
# Named image_ids / image_id so the builtin id() is not shadowed and the list is not
# overwritten by its own loop variable.
image_ids = ['2008_000090', '2008_000107', '2008_000115', '2008_000116', '2008_000119', '2008_000120', '2008_000123', '2008_000133', '2008_000134', '2008_000138', '2008_000140', '2008_000145', '2008_000149', '2008_000163', '2008_000174', '2008_000177', '2008_000182', '2008_000183', '2008_000190', '2008_000194', '2008_000195', '2008_000203', '2008_000204', '2008_000213', '2008_000215', '2008_000219', '2008_000222']
for image_id in image_ids:
    original_image = test_image_dict[image_id].convert('RGB')
    # NOTE(review): the return value of detect() is discarded; if it returns the annotated
    # image, wrap the call in display(...) to show it - confirm against detect.py.
    detect(original_image, min_score=0.2, max_overlap=0.5, top_k=200)