In [1]:
import torch
import os
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.utils.data as data
import numpy as np
import cv2
import pickle
from ssd.augmentations import SSDAugmentation
from ssd.voc0712 import VOCDetection,VOCAnnotationTransform
from ssd.ssd import build_ssd
from ssd.multibox_loss import MultiBoxLoss
from ssd.eval import evaluate_detections

In [2]:
# Directory the pretrained weights are loaded from and trained weights are
# saved to (see train() / eval()).
save_folder=os.path.expanduser('~/model/ssd/')
# Names of the 20 object categories. Detection class indices are offset by
# one relative to this tuple: eval() skips class index 0 and sizes the
# network with len(labelmap)+1 (index 0 is presumably background -- confirm
# against the ssd package).
labelmap=(
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')
# Model / training configuration. Only 'num_classes', 'min_dim' and
# 'max_iter' are read by the cells visible in this notebook; the remaining
# keys look like the SSD300 prior-box settings -- NOTE(review): confirm
# whether the ssd package consumes them or keeps its own copy.
cfg={
    # Derived from labelmap so it cannot drift from the value eval() computes.
    'num_classes': len(labelmap)+1,
    'lr_steps': (80000, 100000, 120000),
    'max_iter': 120000,
    'feature_maps': [38, 19, 10, 5, 3, 1],
    'min_dim': 300,
    'steps': [8, 16, 32, 64, 100, 300],
    'min_sizes': [30, 60, 111, 162, 213, 264],
    'max_sizes': [60, 111, 162, 213, 264, 315],
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    'variance': [0.1, 0.2],
    'clip': True,
    'name': 'VOC',
}
# Per-channel mean subtracted from every image by base_transform and passed
# to SSDAugmentation.
MEANS = (104, 117, 123)
def weights_init(m):
    """Initialise a convolution layer in place; intended for ``Module.apply``.

    ``nn.Conv2d`` modules get Xavier-uniform weights and a zeroed bias.
    Modules of any other type are left untouched.

    Args:
        m: the module handed in by ``apply``.
    """
    if isinstance(m, nn.Conv2d):
        init.xavier_uniform_(m.weight.data)
        # Conv2d(bias=False) stores bias as None; skip it instead of crashing.
        if m.bias is not None:
            m.bias.data.zero_()
def detection_collate(batch):
    """Collate ``(image, annotations)`` samples into one batch.

    Images are stacked along a new leading dimension. Annotations are not
    stacked: each sample's annotations are converted to their own float
    tensor and returned as a list, one entry per sample.

    Args:
        batch: iterable of samples, where ``sample[0]`` is an image tensor
            and ``sample[1]`` is that image's annotation data.

    Returns:
        Tuple of (stacked image tensor, list of annotation tensors).
    """
    # Single pass over the batch so one-shot iterables are handled the same
    # way a plain loop would handle them.
    pairs = [(sample[0], torch.FloatTensor(sample[1])) for sample in batch]
    images = [image for image, _ in pairs]
    annotations = [annotation for _, annotation in pairs]
    return torch.stack(images, 0), annotations

def base_transform(image, size, mean):
    """Resize an image to a ``size`` x ``size`` square and subtract ``mean``.

    Args:
        image: image array accepted by ``cv2.resize``.
        size: side length of the square output.
        mean: value(s) subtracted from the resized image, broadcast by NumPy.

    Returns:
        A new ``float32`` array holding the resized, mean-subtracted image.
    """
    pixels = cv2.resize(image, (size, size)).astype(np.float32)
    # Subtract in place so the result stays float32 whatever dtype mean has.
    np.subtract(pixels, mean, out=pixels)
    return pixels.astype(np.float32)

class BaseTransform:
    """Callable wrapper around ``base_transform`` with a fixed size and mean.

    The call signature takes ``(image, boxes, labels)`` and returns a
    matching triple; only the image is transformed.
    """

    def __init__(self, size, mean):
        """Remember the target size and keep the mean as a float32 array."""
        self.size = size
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        """Transform ``image``; ``boxes`` and ``labels`` pass through as given."""
        transformed = base_transform(image, self.size, self.mean)
        return transformed, boxes, labels

In [3]:
def train():
    """Train the SSD network on VOC and save the final weights.

    Builds the network in 'train' mode, loads pretrained weights for the VGG
    part from ``save_folder``, Xavier-initialises the extras/loc/conf layers,
    then runs ``cfg['max_iter']`` SGD steps on device ``cuda:0``. The final
    state dict is written to ``ssd_voc.pth`` inside ``save_folder``.

    Requires a CUDA device and the VOC data under ``~/data/VOCdevkit/``.
    """
    dataset_root=os.path.expanduser('~/data/VOCdevkit/')
    dataset=VOCDetection(root=dataset_root,transform=SSDAugmentation(cfg['min_dim'],MEANS))
    net=build_ssd('train',cfg['min_dim'],cfg['num_classes'])
    # Only the VGG part starts from a checkpoint; the remaining layers are
    # freshly initialised just below.
    net.vgg.load_state_dict(torch.load(os.path.join(save_folder,'vgg16_reducedfc.pth')))
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)
    device=torch.device('cuda:0')
    net=net.to(device)
    # NOTE(review): cfg['lr_steps'] is never consulted, so the learning rate
    # stays at 1e-3 for the whole run -- confirm whether step decay was intended.
    optimizer=optim.SGD(net.parameters(),lr=1e-3,weight_decay=5e-4,momentum=0.9)
    # Positional arguments are interpreted by ssd.multibox_loss.MultiBoxLoss;
    # see that module for their meaning.
    criterion=MultiBoxLoss(cfg['num_classes'],0.5,True,0,True,3,0.5,False,True)
    net.train()
    data_loader=data.DataLoader(dataset,batch_size=32,num_workers=4,shuffle=True,pin_memory=True,collate_fn=detection_collate)
    data_iter=iter(data_loader)
    for iteration in range(cfg['max_iter']):
        try:
            images,targets=next(data_iter)
        except StopIteration:
            # The iterator is exhausted after one pass over the dataset;
            # start a new (reshuffled) pass so training can reach max_iter.
            data_iter=iter(data_loader)
            images,targets=next(data_iter)
        images,targets=images.to(device),[ann.to(device) for ann in targets]
        out=net(images)
        optimizer.zero_grad()
        loss_l,loss_c=criterion(out,targets)
        loss=loss_l+loss_c
        loss.backward()
        optimizer.step()
    torch.save(net.state_dict(),os.path.join(save_folder,'ssd_voc.pth'))

In [4]:
def eval(max_images=None):
    """Run the trained SSD network over the VOC2007 test split and score it.

    Loads ``ssd300_mAP_77.43_v2.pth`` from ``save_folder``, runs detection on
    each test image, scales the boxes by the ``w`` and ``h`` values returned
    by ``dataset.pull_item``, pickles every detection to
    ``../ssd_eval/detections.pkl`` and hands the results to
    ``evaluate_detections``.

    NOTE: this function shadows Python's built-in ``eval`` at module level.
    The name is kept because other cells call it.

    Args:
        max_images: optional cap on the number of images to run (useful for
            a quick smoke test). ``None`` (default) evaluates the whole split.
    """
    # Global side effect: every tensor created afterwards defaults to CUDA.
    # NOTE(review): presumably required by tensors the ssd package allocates
    # internally -- confirm before removing.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    dataset_root=os.path.expanduser('~/data/VOCdevkit/')
    num_classes=len(labelmap)+1
    net=build_ssd('test',cfg['min_dim'],num_classes)
    net.load_state_dict(torch.load(os.path.join(save_folder,'ssd300_mAP_77.43_v2.pth')))
    net.eval()
    dataset=VOCDetection(dataset_root,[('2007','test')],BaseTransform(cfg['min_dim'],MEANS),
                         VOCAnnotationTransform())
    device=torch.device('cuda:0')
    net=net.to(device)
    num_images=len(dataset)
    # all_boxes[class_index][image_index] -> array of rows (4 box coords, score);
    # entries stay [] for images/classes without detections.
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(num_classes)]
    output_dir = '../ssd_eval'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    det_file=os.path.join(output_dir,'detections.pkl')
    images_to_run=num_images if max_images is None else min(max_images,num_images)
    for i in range(images_to_run):
        im,gt,h,w=dataset.pull_item(i)
        x=im.unsqueeze(0)
        x=x.to(device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            detections=net(x).data
        # Class index 0 is skipped; the remaining indices map to labelmap[j-1].
        for j in  range(1,detections.size(1)):
            dets=detections[0,j,:]
            # Keep only rows with a positive score (column 0). Boolean
            # indexing returns a copy, so scaling below leaves `detections` intact.
            dets=dets[dets[:, 0].gt(0.)]
            # An empty selection has shape (0, 5), so test the row count
            # (dim() is 2 even when there are no rows).
            if dets.size(0)==0:
                continue
            boxes=dets[:,1:]
            boxes[:, 0] *= w
            boxes[:, 2] *= w
            boxes[:, 1] *= h
            boxes[:, 3] *= h
            scores = dets[:, 0].cpu().numpy()
            cls_dets = np.hstack((boxes.cpu().numpy(),
                                  scores[:, np.newaxis])).astype(np.float32,
                                                                 copy=False)
            all_boxes[j][i] = cls_dets
    with open(det_file,'wb') as f:
        pickle.dump(all_boxes,f,pickle.HIGHEST_PROTOCOL)
    evaluate_detections(all_boxes, output_dir, dataset)

In [5]:
# Run the VOC2007 test evaluation defined in the previous cell
# (needs a CUDA device, the VOC data and the pretrained checkpoint).
eval()

Writing aeroplane VOC results file
Writing bicycle VOC results file
Writing bird VOC results file
Writing boat VOC results file
Writing bottle VOC results file
Writing bus VOC results file
Writing car VOC results file
Writing cat VOC results file
Writing chair VOC results file
Writing cow VOC results file
Writing diningtable VOC results file
Writing dog VOC results file
Writing horse VOC results file
Writing motorbike VOC results file
Writing person VOC results file
Writing pottedplant VOC results file
Writing sheep VOC results file
Writing sofa VOC results file
Writing train VOC results file
Writing tvmonitor VOC results file
VOC07 metric? Yes


EOFError: Ran out of input