Import:

In [3]:
from google.colab import drive
# Mount Google Drive into the Colab runtime; the dataset archive unzipped below is read from drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable
import torch.nn.functional as F

import random

import matplotlib.pyplot as plt
from collections import Counter

from PIL import Image

import numpy as np
import os
import cv2

import argparse
import time

#import wandb

from scipy.special import softmax
import  re

In [None]:
!unzip "drive/MyDrive/machine-learning/ssd/data_ssd.zip"

In [None]:
!pip install wandb
!wandb login 22ed476c0b9f3220c32f86c9de19e34fe91112cf

In [6]:
#@title Dataset Methods:

#generate default bounding boxes
def default_box_generator(layers, large_scale, small_scale):
    """Generate the SSD default (anchor) boxes for every cell of every output layer.

    Args:
        layers: grid sizes of the output layers, e.g. [10, 5, 3, 1].
        large_scale: per-layer size of the larger boxes, e.g. [0.2, 0.4, 0.6, 0.8].
        small_scale: per-layer size of the smaller boxes, e.g. [0.1, 0.3, 0.5, 0.7].

    Returns:
        np.ndarray of shape [box_num, 8] with box_num = 4 * sum(g * g for g in layers).
        Each row is [x_center, y_center, box_width, box_height, x_min, y_min, x_max, y_max]
        in normalized image coordinates. Every cell contributes four consecutive rows with
        (width, height) equal to [ssize, ssize], [lsize, lsize],
        [lsize*sqrt(2), lsize/sqrt(2)] and [lsize/sqrt(2), lsize*sqrt(2)].

    Raises:
        ValueError: if the three argument lists do not have the same length.

    Cells are enumerated row by row (y outer, x inner). This is the order in which a
    [batch, channels, H, W] feature map is flattened by reshape, so row k of the result
    describes the same cell as the k-th group of network outputs.
    """
    if not (len(layers) == len(large_scale) == len(small_scale)):
        raise ValueError("layers, large_scale and small_scale must have the same length")

    sqrt2 = np.sqrt(2.0)
    n_cells = sum(grid * grid for grid in layers)
    boxes = np.zeros([n_cells, 4, 8])

    cell = 0
    for grid, lsize, ssize in zip(layers, large_scale, small_scale):
        # (width, height) of the four default boxes shared by every cell of this layer
        shapes = [
            (ssize, ssize),
            (lsize, lsize),
            (lsize * sqrt2, lsize / sqrt2),
            (lsize / sqrt2, lsize * sqrt2),
        ]
        for row in range(grid):
            y_center = (row + 0.5) / grid
            for col in range(grid):
                x_center = (col + 0.5) / grid
                for k, (width, height) in enumerate(shapes):
                    boxes[cell, k] = [
                        x_center, y_center, width, height,
                        x_center - width / 2, y_center - height / 2,
                        x_center + width / 2, y_center + height / 2,
                    ]
                cell += 1

    boxes = boxes.reshape((n_cells * 4, 8))
    # Keep the corner coordinates inside the image on all four sides; the centre and
    # size columns are left untouched because the offset encoding relies on them.
    boxes[:, 4:] = np.clip(boxes[:, 4:], 0.0, 1.0)

    return boxes


#this is an example implementation of IOU.
#It is different from the one used in YOLO, please pay attention.
#you can define your own iou function if you are not used to the inputs of this one.
def iou(boxs_default, x_min,y_min,x_max,y_max):
    """Intersection-over-union between every default box and one reference box.

    Args:
        boxs_default: array [num_of_boxes, 8]; columns 4..7 hold x_min, y_min, x_max, y_max.
        x_min, y_min, x_max, y_max: corners of the reference box.

    Returns:
        array [num_of_boxes] with the IoU of each default box against the reference box.
    """
    # corners of the overlap rectangle
    left = np.maximum(boxs_default[:,4], x_min)
    top = np.maximum(boxs_default[:,5], y_min)
    right = np.minimum(boxs_default[:,6], x_max)
    bottom = np.minimum(boxs_default[:,7], y_max)

    # a negative extent means no overlap along that axis
    overlap_w = np.maximum(right - left, 0)
    overlap_h = np.maximum(bottom - top, 0)
    intersection = overlap_w * overlap_h

    default_area = (boxs_default[:,6]-boxs_default[:,4])*(boxs_default[:,7]-boxs_default[:,5])
    reference_area = (x_max-x_min)*(y_max-y_min)
    total = default_area + reference_area - intersection

    # the floor on the denominator avoids dividing by zero for degenerate boxes
    return intersection/np.maximum(total,1e-8)


def match(ann_box,ann_confidence,boxs_default,threshold,cat_id,x_min,y_min,x_max,y_max, image=None):
    """Assign one ground-truth box to the default boxes and update the annotations in place.

    Args:
        ann_box: array [num_of_boxes, 4], ground-truth box offsets to be updated.
        ann_confidence: array [num_of_boxes, number_of_classes], one-hot labels to be updated.
        boxs_default: array [num_of_boxes, 8], default bounding boxes.
        threshold: a default box with iou > threshold against the ground-truth box is
            treated as carrying the object.
        cat_id: class id of the object, 0-cat, 1-dog, 2-person.
        x_min, y_min, x_max, y_max: ground-truth box in normalized coordinates.
        image: unused; accepted only so existing call sites keep working.

    Returns:
        None. ann_box and ann_confidence are modified in place.

    Every default box with iou > threshold, plus the single best-overlapping default box
    (so each object gets at least one anchor), receives the class label and the offsets
        tx = (gx - px) / pw,  ty = (gy - py) / ph,  tw = log(gw / pw),  th = log(gh / ph)
    where g is the ground-truth box and p the default box (centre x, centre y, width, height).
    """
    box_width = (x_max-x_min)
    box_height = (y_max-y_min)
    if box_width <= 0 or box_height <= 0:
        # A zero-area box has no meaningful overlap and would produce log(0) offsets.
        return

    x_center = (x_min+x_max)/2
    y_center = (y_min+y_max)/2

    ious = iou(boxs_default, x_min,y_min,x_max,y_max)
    matched = ious>threshold
    matched[np.argmax(ious)] = True  # always keep the best default box

    # Reset the whole row first: a default box claimed earlier by another object must
    # stay one-hot instead of accumulating a second class flag.
    ann_confidence[matched] = 0
    ann_confidence[matched, cat_id] = 1

    ann_box[matched, 0] = (x_center-boxs_default[matched, 0])/boxs_default[matched, 2]
    ann_box[matched, 1] = (y_center-boxs_default[matched, 1])/boxs_default[matched, 3]
    ann_box[matched, 2] = np.log(box_width/boxs_default[matched, 2])
    ann_box[matched, 3] = np.log(box_height/boxs_default[matched, 3])




def random_crop(image,w_crop,h_crop):
    """Cut a randomly placed w_crop x h_crop window out of an image array.

    Args:
        image: array indexed as [row, column, ...].
        w_crop: width of the window in pixels.
        h_crop: height of the window in pixels.

    Returns:
        (x, y, window): the column and row of the window's top-left corner in the
        original image, followed by the cropped slice.
    """
    max_left = image.shape[1] - w_crop
    max_top = image.shape[0] - h_crop

    # draw the horizontal position first, then the vertical one
    left = random.randint(0, max_left)
    top = random.randint(0, max_top)

    window = image[top:top + h_crop, left:left + w_crop]
    return left, top, window


In [7]:
#@title COCO Dataset:

class COCO(torch.utils.data.Dataset):
    """Detection dataset yielding (image, ann_box, ann_confidence) for one image file.

    Every file in `imgdir` is paired with the annotation file of the same base name and
    a ".txt" extension in `anndir`. Each annotation line is
    "class_id x_min y_min box_width box_height" in pixels.

    Args:
        imgdir: directory containing the images.
        anndir: directory containing the annotation text files.
        class_num: number of classes including background (last one-hot column).
        boxs_default: default bounding boxes, array [num_of_boxes, 8].
        train: True selects the training part of the split, False the remainder.
        image_size: side length of the square image returned by __getitem__.
        train_test_split: fraction of the (sorted) file list used for training.
        augment: when True, images larger than image_size in both dimensions are
            randomly cropped to image_size x image_size instead of being resized.
    """

    def __init__(self, imgdir, anndir, class_num, boxs_default, train = True, image_size=320, train_test_split = 0.9, augment = False):

        self.train = train
        self.imgdir = imgdir
        self.anndir = anndir
        self.class_num = class_num
        self.train_test_split = train_test_split
        self.augment = augment

        #overlap threshold for deciding whether a bounding box carries an object or not
        self.threshold = 0.5
        self.boxs_default = boxs_default
        self.box_num = len(self.boxs_default)

        # os.listdir returns names in arbitrary order; sort them so that the
        # train/validation split is the same on every run and every machine.
        self.img_names = sorted(os.listdir(self.imgdir))
        self.image_size = image_size

    def _split_names(self):
        """Return the file names belonging to the active (train or validation) split."""
        n_train = int(len(self.img_names)*self.train_test_split)
        if self.train:
            return self.img_names[:n_train]
        return self.img_names[n_train:]

    @staticmethod
    def _read_annotations(ann_name):
        """Parse an annotation file into (class_id, x_min, y_min, box_width, box_height) tuples."""
        objects = []
        with open(ann_name, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                class_id, x_min, y_min, box_width, box_height = fields
                objects.append((int(class_id), float(x_min), float(y_min), float(box_width), float(box_height)))
        return objects

    def __len__(self):
        return len(self._split_names())

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            image: float tensor [3, image_size, image_size], RGB, values in 0..255.
            ann_box: float tensor [num_of_boxes, 4] of box offsets.
            ann_confidence: float tensor [num_of_boxes, class_num] of one-hot labels.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        ann_box = np.zeros([self.box_num,4], np.float32) #bounding boxes
        ann_confidence = np.zeros([self.box_num,self.class_num], np.float32) #one-hot vectors
        #one-hot vectors with four classes
        #[1,0,0,0] -> cat
        #[0,1,0,0] -> dog
        #[0,0,1,0] -> person
        #[0,0,0,1] -> background
        ann_confidence[:,-1] = 1 #the default class for all cells is set to "background"

        file_name = self._split_names()[index]
        img_name = os.path.join(self.imgdir, file_name)
        # splitext copes with extensions of any length (".jpg", ".jpeg", ...)
        ann_name = os.path.join(self.anndir, os.path.splitext(file_name)[0] + ".txt")

        # cv2.imread returns None instead of raising when the file cannot be decoded;
        # with the default flag it always yields a 3-channel BGR image.
        image = cv2.imread(img_name)
        if image is None:
            raise FileNotFoundError("could not read image: " + img_name)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        height = image.shape[0]
        width = image.shape[1]
        objects = self._read_annotations(ann_name)

        size = self.image_size
        if self.augment and width > size and height > size:
            crop_x, crop_y, image = random_crop(image, size, size)
            for class_id, x_min, y_min, box_width, box_height in objects:
                # move the box into the crop's coordinate frame and clip it to the window
                x0 = min(max(x_min - crop_x, 0.0), size)
                y0 = min(max(y_min - crop_y, 0.0), size)
                x1 = min(max(x_min + box_width - crop_x, 0.0), size)
                y1 = min(max(y_min + box_height - crop_y, 0.0), size)
                if x1 <= x0 or y1 <= y0:
                    continue  # the object lies entirely outside the crop
                match(ann_box, ann_confidence, self.boxs_default, self.threshold, class_id, x0/size, y0/size, x1/size, y1/size, image)
            # the crop is a strided view of the full image; make it a standalone array
            image = np.ascontiguousarray(image)
        else:
            for class_id, x_min, y_min, box_width, box_height in objects:
                #normalize with respect to the width or height of the image,
                #e.g. point (x=100, y=200) in a (width=1000, height=500) image becomes (0.1, 0.4)
                x_min_normalized = x_min/width
                y_min_normalized = y_min/height
                x_max_normalized = x_min_normalized + box_width/width
                y_max_normalized = y_min_normalized + box_height/height
                match(ann_box, ann_confidence, self.boxs_default, self.threshold, class_id, x_min_normalized, y_min_normalized, x_max_normalized, y_max_normalized, image)

        image = torch.from_numpy(image)
        image = image.type(torch.FloatTensor)
        image = torch.permute(image, (2, 0, 1))  # HWC -> CHW

        image = transforms.Resize([self.image_size,self.image_size])(image)
        ann_box = torch.from_numpy(ann_box)
        ann_confidence = torch.from_numpy(ann_confidence)

        return image, ann_box, ann_confidence

In [8]:
#@title Model:

def SSD_loss(pred_confidence, pred_box, ann_confidence, ann_box):
    """SSD training loss: classification on all cells plus box regression on object cells.

    Args:
        pred_confidence: class probabilities predicted by SSD (already softmax-normalized
            by SSD.forward), [batch_size, num_of_boxes, num_of_classes].
        pred_box: predicted box offsets, [batch_size, num_of_boxes, 4].
        ann_confidence: one-hot ground-truth labels, same shape as pred_confidence; the
            last column is the background class.
        ann_box: ground-truth box offsets, [batch_size, num_of_boxes, 4].

    Returns:
        Scalar tensor: CE(object cells) + 3 * CE(background cells) + smooth-L1(object boxes).
        A term whose cell selection is empty contributes 0 instead of NaN.
    """
    num_classes = pred_confidence.shape[-1]
    pred_confidence = pred_confidence.reshape(-1, num_classes)
    pred_box = pred_box.reshape(-1, pred_box.shape[-1])
    ann_confidence = ann_confidence.reshape(-1, num_classes)
    ann_box = ann_box.reshape(-1, ann_box.shape[-1])

    # Boolean masks stay on the tensors' device; no CPU/numpy round trip is needed.
    is_background = ann_confidence[:, -1] > 0.5
    is_object = ~is_background

    # The inputs are probabilities, so cross entropy is the negative log-probability of
    # the target class. F.cross_entropy would apply a second softmax on top of them.
    # The clamp keeps log() finite when a probability underflows to zero.
    log_prob = torch.log(pred_confidence.clamp_min(1e-8))
    target = ann_confidence.argmax(dim=1)

    loss = pred_confidence.new_zeros(())
    if is_object.any():
        loss = loss + F.nll_loss(log_prob[is_object], target[is_object])
        loss = loss + F.smooth_l1_loss(pred_box[is_object], ann_box[is_object])
    if is_background.any():
        loss = loss + 3*F.nll_loss(log_prob[is_background], target[is_background])

    return loss


def conv_base(in_channels, out_channels, kernelsize, stride):
    """Build a Conv2d -> BatchNorm2d -> ReLU block.

    The convolution is padded with (kernelsize - 1) // 2 pixels on each side.
    Returns the three layers wrapped in an nn.Sequential.
    """
    pad = (kernelsize - 1) // 2
    stages = [
        nn.Conv2d(in_channels, out_channels, kernel_size=kernelsize, stride=stride, padding=pad),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(),
    ]
    return nn.Sequential(*stages)

class SSD(nn.Module):
    """Single-shot detector with four prediction scales (10x10, 5x5, 3x3 and 1x1 cells
    for a 320x320 input) and four default boxes per cell.

    Args:
        class_num: number of classes including background (4 in this assignment:
            cat, dog, person, background).

    The prediction heads (conv_side1..8) are plain convolutions without BatchNorm/ReLU,
    so box offsets can take negative values and class scores are unconstrained before
    the softmax. Odd-numbered heads predict boxes, even-numbered heads predict classes.
    Checkpoints saved from a variant whose heads contained BatchNorm layers carry extra
    keys and need load_state_dict(..., strict=False).
    """

    def __init__(self, class_num):
        super(SSD, self).__init__()

        self.class_num = class_num #num_of_classes, in this assignment, 4: cat, dog, person, background

        # backbone: 320 -> 160 -> 80 -> 40 -> 20 -> 10
        self.conv1 = conv_base(3, 64, 3, 2)
        self.conv2 = conv_base(64, 64, 3, 1)
        self.conv3 = conv_base(64, 64, 3, 1)
        self.conv4 = conv_base(64, 128, 3, 2)
        self.conv5 = conv_base(128, 128, 3, 1)
        self.conv6 = conv_base(128, 128, 3, 1)
        self.conv7 = conv_base(128, 256, 3, 2)
        self.conv8 = conv_base(256, 256, 3, 1)
        self.conv9 = conv_base(256, 256, 3, 1)
        self.conv10 = conv_base(256, 512, 3, 2)
        self.conv11 = conv_base(512, 512, 3, 1)
        self.conv12 = conv_base(512, 512, 3, 1)
        self.conv13 = conv_base(512, 256, 3, 2)
        # extra scales: 10 -> 5 -> 3 -> 1
        self.conv14 = conv_base(256, 256, 1, 1)
        self.conv15 = conv_base(256, 256, 3, 2)
        self.conv16 = conv_base(256, 256, 1, 1)
        self.conv17 = conv_base(256, 256, 3, 2)
        self.conv18 = conv_base(256, 256, 1, 1)
        self.conv19 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3,stride=3,padding=1),
            nn.ReLU()
        )

        box_channels = 4 * 4            # 4 default boxes x 4 offsets per cell
        conf_channels = 4 * class_num   # 4 default boxes x class scores per cell
        self.conv_side1 = self._head(256, box_channels, 3)    # boxes, 10x10
        self.conv_side2 = self._head(256, conf_channels, 3)   # classes, 10x10
        self.conv_side3 = self._head(256, box_channels, 3)    # boxes, 5x5
        self.conv_side4 = self._head(256, conf_channels, 3)   # classes, 5x5
        self.conv_side5 = self._head(256, box_channels, 3)    # boxes, 3x3
        self.conv_side6 = self._head(256, conf_channels, 3)   # classes, 3x3
        self.conv_side7 = self._head(256, box_channels, 1)    # boxes, 1x1
        self.conv_side8 = self._head(256, conf_channels, 1)   # classes, 1x1

    @staticmethod
    def _head(in_channels, out_channels, kernelsize):
        """Linear prediction head; wrapped in Sequential so the conv keeps the key '<name>.0'."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=kernelsize, stride=1, padding=(kernelsize-1) // 2)
        )

    @staticmethod
    def _flatten_heads(maps, values_per_box):
        """Turn per-scale maps [B, 4*values, H, W] into one tensor [B, total_boxes, values]."""
        flat = torch.cat([m.reshape(m.shape[0], m.shape[1], -1) for m in maps], 2)  # [B, C, cells]
        flat = torch.permute(flat, (0, 2, 1))                                       # [B, cells, C]
        return flat.reshape(flat.shape[0], -1, values_per_box)

    def forward(self, x):
        """Run the detector.

        Args:
            x: images, [batch_size, 3, 320, 320], pixel values in 0..255.

        Returns:
            confidence: class probabilities (softmax over the last dimension),
                [batch_size, 4*(10*10+5*5+3*3+1*1), num_of_classes].
            bboxes: box offsets, [batch_size, 4*(10*10+5*5+3*3+1*1), 4].
        """
        x = x/255.0 #normalize image. If you already normalized your input image in the dataloader, remove this line.

        out = x
        for layer in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5,
                      self.conv6, self.conv7, self.conv8, self.conv9, self.conv10,
                      self.conv11, self.conv12, self.conv13):
            out = layer(out)

        # the concatenating order needs to be the same as the default bounding boxes
        box_maps, conf_maps = [], []

        # 10 * 10
        box_maps.append(self.conv_side1(out))
        conf_maps.append(self.conv_side2(out))
        out = self.conv15(self.conv14(out))
        # 5 * 5
        box_maps.append(self.conv_side3(out))
        conf_maps.append(self.conv_side4(out))
        out = self.conv17(self.conv16(out))
        # 3 * 3
        box_maps.append(self.conv_side5(out))
        conf_maps.append(self.conv_side6(out))
        out = self.conv19(self.conv18(out))
        # 1 * 1
        box_maps.append(self.conv_side7(out))
        conf_maps.append(self.conv_side8(out))

        bboxes = self._flatten_heads(box_maps, 4)
        confidence = self._flatten_heads(conf_maps, self.class_num)
        confidence = F.softmax(confidence, dim=2)

        return confidence, bboxes



In [9]:
#@title Utils:

colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
#use [red green blue] to represent the classes 0-cat, 1-dog, 2-person (images are RGB here: loaded with BGR2RGB and shown with matplotlib)

def visualize_pred(windowname, pred_confidence, pred_box, ann_confidence, ann_box, image_, boxs_default):
    """Show a 3x2 mosaic comparing ground truth, raw predictions and predictions after NMS.

    Args:
        windowname: name of the window to display the images (not used by this function).
        pred_confidence: predicted class scores, [num_of_boxes, num_of_classes].
        pred_box: predicted box offsets, [num_of_boxes, 4].
        ann_confidence: ground-truth class labels, [num_of_boxes, num_of_classes].
        ann_box: ground-truth box offsets, [num_of_boxes, 4].
        image_: the network input image, channels first.
        boxs_default: default bounding boxes, [num_of_boxes, 8].

    Mosaic layout (left column: decoded boxes, right column: the default boxes used):
        row 1 - ground truth, row 2 - predictions above 0.5, row 3 - predictions kept by NMS.
    """
    _, class_num = pred_confidence.shape
    class_num = class_num-1  # the last column is background and is never drawn

    image = np.transpose(image_, (1,2,0)).astype(np.uint8)

    # six independent canvases, all starting from the same picture
    gt_boxes_img, gt_default_img, pred_boxes_img, pred_default_img, nms_boxes_img, nms_default_img = [
        image.copy() for _ in range(6)
    ]

    def default_corners(cell):
        # pixel corners of the default box attached to this cell
        anchor = boxs_default[cell]
        return get_minmax_box(anchor[0], anchor[1], anchor[2], anchor[3])

    #draw ground truth
    for cell in range(len(ann_confidence)):
        for cls in range(class_num):
            if not ann_confidence[cell,cls]>0.5:
                continue
            min_gt, max_gt = get_box_shape(ann_box[cell],boxs_default[cell])
            min_default, max_default = default_corners(cell)

            cv2.rectangle(gt_boxes_img, min_gt, max_gt, color=colors[cls], thickness=2)
            cv2.rectangle(gt_default_img, min_default, max_default, color=colors[cls], thickness=2)

    # candidates handed to NMS, together with the cell and class they came from
    confidence_, box_, boxs_default_, index = [], [], [], []
    color_index = []

    #draw raw predictions
    for cell in range(len(pred_confidence)):
        for cls in range(class_num):
            if not pred_confidence[cell,cls]>0.5:
                continue
            confidence_.append(pred_confidence[cell,cls])
            box_.append(pred_box[cell])
            boxs_default_.append(boxs_default[cell])
            index.append(cell)
            color_index.append(cls)

            min_pred, max_pred = get_box_shape(pred_box[cell], boxs_default[cell])
            min_default, max_default = default_corners(cell)

            cv2.rectangle(pred_boxes_img, min_pred, max_pred, color=colors[cls], thickness=2)
            cv2.rectangle(pred_default_img, min_default, max_default, color=colors[cls], thickness=2)

    results = non_maximum_suppression(confidence_, box_, boxs_default_, 0.2, 0.5)

    #draw the predictions that survive NMS
    for kept in results:
        cell = index[kept]
        min_pred, max_pred = get_box_shape(pred_box[cell], boxs_default[cell])
        min_default, max_default = default_corners(cell)

        cv2.rectangle(nms_boxes_img, min_pred, max_pred, color= colors[color_index[kept]], thickness=2)
        cv2.rectangle(nms_default_img, min_default, max_default, color=colors[color_index[kept]], thickness=2)

    # tile the six canvases into a grid three rows high and two columns wide
    h,w,_ = gt_boxes_img.shape
    grid = [
        [gt_boxes_img, gt_default_img],
        [pred_boxes_img, pred_default_img],
        [nms_boxes_img, nms_default_img],
    ]
    mosaic = np.zeros([h*3,w*2,3], np.uint8)
    for r, row_tiles in enumerate(grid):
        for c, tile in enumerate(row_tiles):
            mosaic[r*h:(r+1)*h, c*w:(c+1)*w] = tile

    ax=plt.gca()
    plt.imshow(mosaic)
    ax.figure.set_size_inches(6, 6)
    plt.show()



def non_maximum_suppression(confidence_, box_, boxs_default, overlap=0.1, threshold=0.5):
    """Greedy non-maximum suppression over a flat list of candidate detections.

    Args:
        confidence_: one confidence score per candidate.
        box_: predicted box offsets of each candidate, as consumed by get_box_shape.
        boxs_default: default box of each candidate, in the same order as box_.
        overlap: a candidate whose IoU with an already kept box exceeds this value is dropped.
        threshold: selection stops once the best remaining score is <= threshold.

    Returns:
        List of indices into the input lists for the kept candidates, best score first.

    The inputs are not modified. Candidates carry no class label here, so suppression
    is applied across all candidates regardless of class.
    """
    size = len(box_)
    scores = np.array(confidence_)  # private copy; suppressed entries are zeroed here

    # Decode every candidate once instead of re-decoding inside the pairwise loop.
    corners = [get_box_shape(box_[k], boxs_default[k]) for k in range(size)]

    kept = []
    for _ in range(size):
        if scores.max() <= threshold:
            break

        best = int(np.argmax(scores))
        kept.append(best)
        scores[best] = 0  # never pick the same candidate twice
        best_min, best_max = corners[best]

        for j in range(size):
            if scores[j] == 0:
                continue  # already kept or suppressed
            other_min, other_max = corners[j]
            if iou_ver2(best_min, best_max, other_min, other_max) > overlap:
                scores[j] = 0

    return kept


def get_box_shape(offset, box_default):
    """Decode box offsets relative to a default box into pixel corner points.

    This is the inverse of the encoding written by match:
        centre = offset_xy * default_size + default_centre,  size = exp(offset_wh) * default_size
    Returns the (start_point, end_point) pair produced by get_minmax_box.
    """
    default_cx, default_cy = box_default[0], box_default[1]
    default_w, default_h = box_default[2], box_default[3]

    decoded_cx = offset[0]*default_w+default_cx
    decoded_cy = offset[1]*default_h+default_cy
    decoded_w = np.exp(offset[2])*default_w
    decoded_h = np.exp(offset[3])*default_h

    return get_minmax_box(decoded_cx, decoded_cy, decoded_w, decoded_h)

def get_minmax_box(x_c, y_c, w, h):
    """Convert a normalized centre/size box into integer pixel corners.

    Coordinates are scaled by 320 and truncated to int; the top-left corner is raised
    to at least 10 and the bottom-right corner is capped at 300 on both axes.
    Returns (min_point, max_point), each an (x, y) tuple.
    """
    half_w = w / 2
    half_h = h / 2

    left = int((x_c - half_w)*320)
    top = int((y_c - half_h)*320)
    right = int((x_c + half_w)*320)
    bottom = int((y_c + half_h)*320)

    return (max(left, 10), max(top, 10)), (min(right, 300), min(bottom, 300))

def iou_ver2(min_1, max_1, min_2, max_2):
    """Intersection-over-union of two boxes given by their corner points.

    Args:
        min_1, max_1: (x, y) top-left and bottom-right corners of the first box.
        min_2, max_2: (x, y) top-left and bottom-right corners of the second box.

    Returns:
        IoU in [0, 1]; 0 when the boxes do not overlap.
    """
    # Clamp each axis separately: if both extents are negative (boxes disjoint along x
    # and y) their product would otherwise be a positive, bogus intersection area.
    overlap_w = np.maximum(np.minimum(max_2[0],max_1[0])-np.maximum(min_1[0],min_2[0]), 0)
    overlap_h = np.maximum(np.minimum(max_2[1],max_1[1])-np.maximum(min_1[1],min_2[1]), 0)
    inter = overlap_w * overlap_h

    area_a = (max_2[0] - min_2[0]) * (max_2[1] - min_2[1])
    area_b = (max_1[0] - min_1[0]) * (max_1[1] - min_1[1])

    union = area_a + area_b - inter
    # the floor on the denominator avoids dividing by zero for degenerate boxes
    return inter / np.maximum(union, 1e-8)



In [10]:
#@title Eval Utils:
def update_precision_recall(identifier, pred_confidence_, pred_box_, ann_confidence_, ann_box_, boxs_default, thres, all_pred_boxes, all_true_boxes):
    """Append the ground-truth and NMS-filtered predicted boxes of a batch to running lists.

    Each appended entry is [image_id, class_id, confidence, start_point, end_point].
    `identifier` is incremented once per image and used as the image id. Only the
    object classes 0..2 are considered; NMS runs with overlap 0.5 and threshold 0.5.

    Returns:
        (all_pred_boxes, all_true_boxes, identifier) - the same lists, extended in place,
        and the updated identifier.
    """
    object_classes = range(0, 3)

    for sample in range(ann_confidence_.shape[0]):
        identifier+=1
        pred_confidence, pred_box = pred_confidence_[sample], pred_box_[sample]
        ann_confidence, ann_box = ann_confidence_[sample], ann_box_[sample]

        # ground truth: one entry per default box labelled with an object class
        for cell in range(len(ann_confidence)):
            for cls in object_classes:
                if ann_confidence[cell,cls]>thres:
                    start_point, end_point = get_box_shape(ann_box[cell], boxs_default[cell])
                    all_true_boxes.append([identifier, cls, ann_confidence[cell,cls], start_point, end_point])

        # predictions: gather every (cell, class) above the threshold, then suppress overlaps
        candidates = [
            (cell, cls)
            for cell in range(len(pred_confidence))
            for cls in object_classes
            if pred_confidence[cell,cls]>thres
        ]
        confidence_ = [pred_confidence[cell,cls] for cell, cls in candidates]
        box_ = [pred_box[cell] for cell, _ in candidates]
        boxs_default_ = [boxs_default[cell] for cell, _ in candidates]

        results = non_maximum_suppression(confidence_,box_,boxs_default_,0.5,0.5)
        for kept in results:
            cell, cls = candidates[kept]
            start_point, end_point = get_box_shape(pred_box[cell], boxs_default[cell])
            all_pred_boxes.append([identifier, cls, pred_confidence[cell, cls], start_point, end_point])

    return all_pred_boxes, all_true_boxes, identifier


def update_precision_recall_test(identifier, pred_confidence_, pred_box_, ann_confidence_, ann_box_, boxs_default, thres, all_pred_boxes, all_true_boxes):
    """Accumulate ground-truth and predicted boxes for a single test image.

    Unlike the batched variant, the predictions arrive already un-batched
    (indexed [box, class] / [box]) while the annotations still carry a leading
    batch axis, of which only element 0 is read. Records appended to the two
    output lists have the layout
    ``[image_id, class_id, confidence, start_point, end_point]``.

    Args:
        identifier: last image id handed out; incremented once for this image.
        pred_confidence_: predicted confidences, indexed [box, class].
        pred_box_: predicted box values, indexed [box].
        ann_confidence_: ground-truth confidences; ``ann_confidence_[0]`` is used.
        ann_box_: ground-truth box values; ``ann_box_[0]`` is used.
        boxs_default: default boxes, indexed [box].
        thres: a (box, class) pair is kept only when its confidence is > thres.
        all_pred_boxes: list extended in place with surviving predictions.
        all_true_boxes: list extended in place with ground-truth boxes.

    Returns:
        (all_pred_boxes, all_true_boxes, identifier) with the updated id.
    """
    identifier += 1

    gt_conf = ann_confidence_[0]
    gt_box = ann_box_[0]

    # Only class columns 0..2 are scanned; column 3 is presumably background.
    object_classes = range(3)

    # Ground truth: one record per (box, class) above the threshold.
    for cell in range(len(gt_conf)):
        for cls in object_classes:
            if not gt_conf[cell][cls] > thres:
                continue
            start_point, end_point = get_box_shape(gt_box[cell], boxs_default[cell])
            all_true_boxes.append([identifier, cls, gt_conf[cell, cls], start_point, end_point])

    # Predictions: gather the candidates above the threshold ...
    candidates = [
        (cell, cls)
        for cell in range(len(pred_confidence_))
        for cls in object_classes
        if pred_confidence_[cell][cls] > thres
    ]
    cand_scores = [pred_confidence_[cell, cls] for cell, cls in candidates]
    cand_boxes = [pred_box_[cell] for cell, _ in candidates]
    cand_defaults = [boxs_default[cell] for cell, _ in candidates]

    # ... and keep what NMS returns. The returned values are used as positions
    # in the candidate lists — TODO confirm against non_maximum_suppression.
    for keep in non_maximum_suppression(cand_scores, cand_boxes, cand_defaults, 0.5, 0.5):
        cell, cls = candidates[keep]
        start_point, end_point = get_box_shape(pred_box_[cell], boxs_default[cell])
        all_pred_boxes.append([identifier, cls, pred_confidence_[cell, cls], start_point, end_point])

    return all_pred_boxes, all_true_boxes, identifier




def generate_mAP(pred_boxes_all, true_boxes_all, iou_threshold=0.5):
    """Compute mean average precision over class ids 0, 1 and 2.

    Both inputs are lists of records laid out as
    ``[image_id, class_id, confidence, start_point, end_point]``.
    Records with any other class id are ignored.

    For each class, detections are visited in order of decreasing confidence.
    A detection is a true positive when its best-overlapping ground truth in
    the same image has IoU > ``iou_threshold`` and has not been claimed by an
    earlier detection; otherwise it is a false positive. Average precision is
    the trapezoidal area under the precision/recall curve, anchored at
    (recall=0, precision=1).

    Args:
        pred_boxes_all: predicted box records.
        true_boxes_all: ground-truth box records.
        iou_threshold: minimum IoU (exclusive) for a match.

    Returns:
        (mAP, precisions_all, recalls_all). ``precisions_all`` and
        ``recalls_all`` hold one numpy array per class that has at least one
        ground-truth box; classes without ground truth are skipped, so list
        position does not necessarily equal class id. ``mAP`` is 0 when every
        class was skipped.
    """
    average_precisions = []

    precisions_all = []
    recalls_all = []

    epsilon = 1e-6

    for c in range(0, 3):
        detections = [det for det in pred_boxes_all if det[1] == c]
        ground_truths = [gt for gt in true_boxes_all if gt[1] == c]

        # Number of ground-truth boxes per image id, e.g. {0: 3, 1: 5}.
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        print("Generate mAP amount bboxes:", str(amount_bboxes[0]))

        # Group ground truths by image once instead of re-filtering the whole
        # list for every detection. Order within an image is preserved so the
        # positions line up with the "already matched" flags below.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)
        matched = {img: torch.zeros(len(gts)) for img, gts in gts_by_image.items()}

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        print("Generate mAP detections: " + str(len(detections)))
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)
        print("Generate mAP gts: " + str(len(ground_truths)))

        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # ground truths in the same image as this detection
            image_gts = gts_by_image.get(detection[0], [])

            best_iou = 0
            best_gt_idx = None  # reset per detection so a stale index is never reused

            for idx, gt in enumerate(image_gts):
                iou = iou_ver2(gt[3], gt[4], detection[3], detection[4])

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            is_match = (
                best_gt_idx is not None
                and best_iou > iou_threshold
                # each ground-truth box may be claimed only once
                and matched[detection[0]][best_gt_idx] == 0
            )
            if is_match:
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
        # Anchor the curve at (recall=0, precision=1); float literals keep the
        # dtype consistent with the float tensors they are concatenated with.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

        precisions_all.append(precisions.cpu().detach().numpy())
        recalls_all.append(recalls.cpu().detach().numpy())

    # No class had any ground truth: report 0 rather than dividing by zero.
    if not average_precisions:
        return torch.tensor(0.0), precisions_all, recalls_all

    mAP = sum(average_precisions) / len(average_precisions)
    return mAP, precisions_all, recalls_all

In [None]:
#@title Training:


class_num = 4 #cat dog person background

num_epochs = 100
batch_size = 32

boxs_default = default_box_generator([10,5,3,1], [0.2,0.4,0.6,0.8], [0.1,0.3,0.5,0.7])

#Create network
network = SSD(class_num)
network.cuda()
cudnn.benchmark = True

# Both datasets read the same folder; the `train` flag presumably selects the
# train/validation split inside COCO — TODO confirm against the dataset class.
dataset = COCO("data/train/images/", "data/train/annotations/", class_num, boxs_default, train = True, image_size=320)
dataset_test = COCO("data/train/images/", "data/train/annotations/", class_num, boxs_default, train = False, image_size=320)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=True, num_workers=0)

optimizer = optim.Adam(network.parameters(), lr = 1e-4)
#feel free to try other optimizers and parameters.

# Optional Weights & Biases logging. Authenticate through the WANDB_API_KEY
# environment variable (or an interactive `wandb login`); never paste an API
# key into the notebook, not even inside a comment.
#wandb.init()
#wandb.config = {
#  "learning_rate": 1e-4,
#  "epochs": 100,
#  "batch_size": 32
#}
#wandb.watch(network)

precision_ = []
recall_ = []

start_time = time.time()

for epoch in range(num_epochs):
    #TRAINING
    network.train()

    avg_loss = 0.0
    avg_count = 0
    for i, data in enumerate(dataloader, 0):
        images_, ann_box_, ann_confidence_ = data
        images = images_.cuda()
        ann_box = ann_box_.cuda()
        ann_confidence = ann_confidence_.cuda()

        optimizer.zero_grad()
        pred_confidence, pred_box = network(images)

        loss_net = SSD_loss(pred_confidence, pred_box, ann_confidence, ann_box)
        loss_net.backward()
        optimizer.step()

        # .item() gives a plain Python float, so the running sum does not
        # stay on the GPU as a tensor.
        avg_loss += loss_net.item()
        avg_count += 1

    print('Epoch Log: [%d] time: %f train loss: %f' % (epoch, time.time()-start_time, avg_loss/avg_count))

    #wandb.log({"loss": avg_loss/avg_count})
    #visualize the first sample of the last training batch
    pred_confidence_ = pred_confidence[0].detach().cpu().numpy()
    pred_box_ = pred_box[0].detach().cpu().numpy()
    visualize_pred("train", pred_confidence_, pred_box_, ann_confidence_[0].numpy(), ann_box_[0].numpy(), images_[0].numpy(), boxs_default)

    #VALIDATION
    network.eval()

    # TODO: split the dataset into 90% training and 10% validation
    # use the training set to train and the validation set to evaluate

    pred_boxes_all = []
    ann_boxes_all = []
    avg_loss_val = 0.0
    avg_count_val = 0
    identifier = 0

    # Validation only measures the loss: no autograd graph is needed and
    # backward() must not be called here.
    with torch.no_grad():
        for i, data in enumerate(dataloader_test, 0):
            images_, ann_box_, ann_confidence_ = data
            images = images_.cuda()
            ann_box = ann_box_.cuda()
            ann_confidence = ann_confidence_.cuda()

            pred_confidence, pred_box = network(images)

            pred_confidence_ = pred_confidence.detach().cpu().numpy()
            pred_box_ = pred_box.detach().cpu().numpy()

            loss_net = SSD_loss(pred_confidence, pred_box, ann_confidence, ann_box)

            avg_loss_val += loss_net.item()
            avg_count_val += 1

            # need precision and recall:

            #optional: implement a function to accumulate precision and recall to compute mAP or F1.
            #pred_boxes_all, ann_boxes_all, identifier = update_precision_recall(identifier, pred_confidence_, pred_box_, ann_confidence_.numpy(), ann_box_.numpy(), boxs_default, 0.5, pred_boxes_all, ann_boxes_all)

    #visualize the first sample of the last validation batch
    pred_confidence_ = pred_confidence[0].detach().cpu().numpy()
    pred_box_ = pred_box[0].detach().cpu().numpy()
    visualize_pred("val", pred_confidence_, pred_box_, ann_confidence_[0].numpy(), ann_box_[0].numpy(), images_[0].numpy(), boxs_default)

    print('Epoch Val Log: [%d] validation loss: %f' % (epoch, avg_loss_val/avg_count_val))

    #optional: compute F1
    #F1score = 2*precision*recall/np.maximum(precision+recall,1e-8)
    #print(F1score)
    #mAP = generate_mAP(pred_boxes_all, ann_boxes_all, 0.5)
    #print('Precisions: ', precisions.shape)
    #print('Recalls: ', recalls.shape)
    #print('Epoch Validation mAP: [%d] mAP: %f' % (epoch, mAP))

    #save weights every 10 epochs (epochs 0, 10, ..., 90)
    if epoch%10==0:
        #save last network
        print('saving net...')
        torch.save(network.state_dict(), 'drive/MyDrive/machine-learning/network' + str(epoch) + '.pth')


In [None]:
# Save the weights left in `network` under the "100" suffix. The training loop
# only checkpoints when epoch % 10 == 0 (epochs 0, 10, ..., 90), so the state
# after the final epoch is written here by hand.
# NOTE(review): the testing cell loads
# 'drive/MyDrive/machine-learning/ssd/network100.pth', which is a different
# folder from the one written here — confirm the file is moved, or align the paths.
torch.save(network.state_dict(), 'drive/MyDrive/machine-learning/network' + str(100) + '.pth')

In [11]:
#@title Testing:

class_num = 4 #cat dog person background

num_epochs = 100
batch_size = 32

boxs_default = default_box_generator([10,5,3,1], [0.2,0.4,0.6,0.8], [0.1,0.3,0.5,0.7])

network = SSD(class_num)
network.cuda()

# batch_size=1 and shuffle=False: update_precision_recall_test handles one
# image per call, and a fixed order keeps image ids reproducible across runs.
dataset_test = COCO("data/validate/images/", "data/validate/annotations/", class_num, boxs_default, train = True, image_size=320, train_test_split = 1)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1, shuffle=False, num_workers=0)
network.load_state_dict(torch.load('drive/MyDrive/machine-learning/ssd/network100.pth'))
network.eval()

pred_boxes_all = []
ann_boxes_all = []
identifier = 0

# Inference only: disable autograd so no graph is built per image.
with torch.no_grad():
    for i, data in enumerate(dataloader_test, 0):
        images_, ann_box_, ann_confidence_ = data
        images = images_.cuda()
        ann_box = ann_box_.cuda()
        ann_confidence = ann_confidence_.cuda()

        pred_confidence, pred_box = network(images)

        pred_confidence_ = pred_confidence[0].detach().cpu().numpy()
        pred_box_ = pred_box[0].detach().cpu().numpy()

        #print(pred_confidence_)

        pred_boxes_all, ann_boxes_all, identifier = update_precision_recall_test(identifier, pred_confidence_, pred_box_, ann_confidence_.numpy(), ann_box_.numpy(), boxs_default, 0.5, pred_boxes_all, ann_boxes_all)

        #pred_confidence_, pred_box_ = non_maximum_suppression(pred_confidence_,pred_box_,boxs_default)

        visualize_pred("test", pred_confidence_, pred_box_, ann_confidence_[0].numpy(), ann_box_[0].numpy(), images_[0].numpy(), boxs_default)
        #cv2.waitKey(1000)

#TODO: save predicted bounding boxes and classes to a txt file.
#you will need to submit those files for grading this assignment
# Written once after the loop: the file is opened in 'w' mode and receives the
# complete list, so rewriting it on every iteration only repeated the same work.
with open('drive/MyDrive/machine-learning/ssd_test_log.txt', 'w') as f:
    f.write(str(pred_boxes_all))

mAP, precisions_all, recalls_all = generate_mAP(pred_boxes_all, ann_boxes_all, 0.5)
print('Validation mAP: mAP: %f' % (mAP))


# One precision/recall curve per entry returned by generate_mAP. Classes
# without ground truth are skipped there, so `i` is the position in the
# returned list and may differ from the class id.
for i in range(len(precisions_all)):
  fig = plt.figure()
  ax = fig.add_subplot(1, 1, 1)
  ax.plot(recalls_all[i], precisions_all[i], color ='tab:blue')

  ax.set_title('precisions recall for class: ' + str(i))
  ax.set_xlabel('recall')
  ax.set_ylabel('precision')

  # display the plot
  plt.show()


Output hidden; open in https://colab.research.google.com to view.