In [1]:

import time
from torchvision import models as Models
from os import path as osp
import os
from typing import List

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchvision

import sys
import torch.utils.data as data
import cv2
import numpy as np
from utils_yolo.augmentations import Yolov1Augmentation
from utils_yolo.dataset import VOCDetection, detection_collate
from utils_yolo.lr_scheduler import WarmUpMultiStepLR

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value lives in this cell.
# ---------------------------------------------------------------------------

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Paths.
VOC_DATA_SET_ROOT = ''  # not referenced by the visible cells
MODEL_SAVE_DIR = '/home/vankhoa/code/Object_detection/YOLOv1-Pytorch/model'

# Input / output geometry.
YOLOv1_PIC_SIZE = 448  # side length of the square network input, in pixels
GRID_NUM = 7           # the prediction grid is GRID_NUM x GRID_NUM cells
# Per-channel means; presumably subtracted during preprocessing -- TODO confirm
# (not referenced by the visible cells).
CHANNEL_MEANS = (104, 117, 123)

# SGD hyper-parameters.
LEARNING_RATE: float = 0.001
MOMENTUM: float = 0.9
WEIGHT_DECAY: float = 0.0005

# Learning-rate schedule: warm-up followed by step decay at the milestones.
WARM_UP_FACTOR: float = 0.1
WARM_UP_NUM_ITERS: int = 1000
STEP_LR_SIZES: List[int] = [200_000, 400_000]
STEP_LR_GAMMA: float = 0.1

# Step bookkeeping. Only NUM_STEPS_TO_FINISH is used by the visible training
# loop; checkpoints there are written every `save_step` steps instead.
NUM_STEPS_TO_SAVE: int = 100
NUM_STEPS_TO_SNAPSHOT: int = 10_000
NUM_STEPS_TO_FINISH: int = 600_000

# Data loading / checkpointing / model selection.
batch_size = 4
num_workers = 8
save_step = 500
backbone = "resnet50"

In [3]:
# Directory handed to VOCDetection as `root` (the extracted VOCdevkit folder).
voc_data_set_root = '/home/vankhoa/datasets/VOCtrainval_11-May-2012/VOCdevkit'

In [4]:
# Training set: the 'trainval' splits of VOC2007 and VOC2012.
# The augmentation size comes from YOLOv1_PIC_SIZE so the input resolution is
# configured in exactly one place (it was a duplicated literal 448 before).
# percent_coord=True presumably yields box coordinates normalised to 0-1 --
# TODO confirm against utils_yolo.augmentations.
dataset = VOCDetection(
    root=voc_data_set_root,
    image_sets=(('2007', 'trainval'), ('2012', 'trainval')),
    transform=Yolov1Augmentation(size=YOLOv1_PIC_SIZE, percent_coord=True),
)

In [5]:
# Shuffled mini-batch loader; detection_collate assembles each batch from the
# individual dataset samples.
dataloader = data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=detection_collate,
    pin_memory=False,
)

In [6]:

def get_backbone(model_name: str):
    r"""
    Build the pre-trained torchvision ResNet trunk used as the YOLOv1 feature extractor.

    The last two children of the network (global pooling and the FC classifier)
    are dropped, and the parameters of children[:5] (for torchvision ResNets:
    conv1, bn1, relu, maxpool, layer1) are frozen.

    :param model_name: name of the torchvision ResNet to build, one of
        ``resnet18``, ``resnet34``, ``resnet50``, ``resnet101``, ``resnet152``
    :return: tuple ``(backbone, feature_maps_shape)``; ``backbone`` is an
        ``nn.Sequential`` without pooling/FC and ``feature_maps_shape`` is the
        ``(channels, height, width)`` of its output for a ``[N, 3, 448, 448]`` input
    :raises ValueError: if ``model_name`` is not one of the supported names
    """
    # Channel count produced by the last residual stage of each variant.
    out_channels_by_name = {
        'resnet18': 512,
        'resnet34': 512,
        'resnet50': 2048,
        'resnet101': 2048,
        'resnet152': 2048,
    }
    if model_name not in out_channels_by_name:
        raise ValueError('unsupported backbone {!r}; expected one of {}'.format(
            model_name, sorted(out_channels_by_name)))

    # Positional True requests the pre-trained weights (same call form as before,
    # but now for the architecture that was actually asked for).
    network = getattr(Models, model_name)(True)
    features = list(network.children())[:-2]

    # Freeze the stem and the first residual stage.
    for frozen_module in features[:5]:
        for parameter in frozen_module.parameters():
            parameter.requires_grad = False

    # ResNets downsample by 32 overall, so a 448x448 input yields a 14x14 map.
    return nn.Sequential(*features), (out_channels_by_name[model_name], 14, 14)


class Yolov1(nn.Module):
    r"""
    YOLOv1 detector: a pre-trained backbone, one stride-2 3x3 convolution and a
    two-layer fully-connected head.

    ``forward`` returns a tensor of shape ``[N, grid_num, grid_num, 30]`` whose
    values are squashed into 0-1 by a sigmoid. The 30 values per cell are
    consumed by the loss as 2 boxes x [x, y, w, h, c] followed by 20 class scores.

    The class also carries its own checkpoint helpers (``save_model`` /
    ``load_model``) which read and write files under ``model_save_dir``.
    """

    def __init__(self, backbone_name: str, grid_num=GRID_NUM, model_save_dir=MODEL_SAVE_DIR):
        r"""
        :param backbone_name: name forwarded to ``get_backbone``; also used in the checkpoint file name
        :param grid_num: number of grid cells per side of the prediction map
        :param model_save_dir: directory where checkpoints are written / read
        """
        super(Yolov1, self).__init__()
        self.model_save_dir = model_save_dir
        self.grid_num = grid_num
        self.backbone, feature_maps_shape = get_backbone(backbone_name)
        self.model_save_name = '{}_{}'.format(self.__class__.__name__, backbone_name)

        last_conv3x3_out_channel = 1024
        # Stride 2 halves the spatial size of the backbone feature map
        # (14x14 -> 7x7 for the shape reported by get_backbone).
        self.last_conv3x3 = nn.Sequential(
            nn.Conv2d(in_channels=feature_maps_shape[0], out_channels=last_conv3x3_out_channel,
                      kernel_size=3, stride=2, padding=1),
            nn.ReLU(True),
            nn.BatchNorm2d(last_conv3x3_out_channel)
        )

        # The first Linear layer expects the flattened conv output to be
        # channels x grid_num x grid_num, i.e. the conv map must be grid_num wide.
        flattened_features = int(last_conv3x3_out_channel * self.grid_num * self.grid_num)
        self.cls = nn.Sequential(
            nn.Linear(flattened_features, 4096),
            nn.ReLU(True),
            nn.Dropout(0.2),
            nn.Linear(4096, int(self.grid_num * self.grid_num * 30)),
        )

    def forward(self, x):
        r"""
        :param x: batch of images (``[N, 3, 448, 448]`` for the shapes assumed by ``get_backbone``)
        :return: predictions of shape ``[N, grid_num, grid_num, 30]`` with values in 0-1
        """
        x = self.backbone(x)
        x = self.last_conv3x3(x)
        x = x.view(x.size(0), -1)
        x = self.cls(x)
        x = torch.sigmoid(x)  # squash every output into the 0-1 range
        x = x.view(-1, self.grid_num, self.grid_num, 30)
        return x

    def save_model(self, step=None, optimizer=None, lr_scheduler=None):
        r"""
        Write the model weights and, when ``step``, ``optimizer`` and
        ``lr_scheduler`` are all given, a second file holding the step counter
        and the optimizer / scheduler state dicts.
        """
        weights_name = self.model_save_name + '.pkl'
        self.save_safely(self.state_dict(), self.model_save_dir, weights_name)
        print('*** model weights saved successfully at {}!'.format(
            osp.join(self.model_save_dir, weights_name)))

        if optimizer is not None and lr_scheduler is not None and step is not None:
            aux_name = self.model_save_name + '_para.pkl'
            aux_state = {
                'step': step,
                'lr_scheduler': lr_scheduler.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            self.save_safely(aux_state, self.model_save_dir, aux_name)
            # Report the auxiliary file itself (the message used to show the
            # weights path instead).
            print('*** auxiliary part saved successfully at {}!'.format(
                osp.join(self.model_save_dir, aux_name)))

    def load_model(self, optimizer=None, lr_scheduler=None):
        r"""
        Best-effort restore of a previous run; failures are reported, never raised.

        :param optimizer: optimizer of the run; only used to decide whether the
            auxiliary file should be read
        :param lr_scheduler: scheduler whose state is restored from the auxiliary file
        :return: the saved step counter, or 0 when nothing could be restored
            (including when no optimizer / scheduler is passed)
        """
        try:
            # Checkpoints are pickles: only load files written by this notebook.
            saved_model = torch.load(osp.join(self.model_save_dir, self.model_save_name + '.pkl'),
                                     map_location='cpu')
            self.load_state_dict(saved_model)
            print('*** loading model weight successfully!')
        except Exception:
            print('*** loading model weight fail!')

        step = 0
        if optimizer is not None and lr_scheduler is not None:
            try:
                temp = torch.load(osp.join(self.model_save_dir, self.model_save_name + '_para.pkl'),
                                  map_location='cpu')
                # Read every entry before mutating anything so a malformed
                # file cannot leave the scheduler half-restored.
                saved_step = temp['step']
                scheduler_state = temp['lr_scheduler']
                lr_scheduler.load_state_dict(scheduler_state)
                # NOTE(review): temp['optimizer'] is saved but not restored here.
                # Restoring it is only safe once the model already lives on its
                # training device; otherwise the momentum buffers would
                # presumably stay on the CPU -- confirm before enabling.
                step = saved_step
                print('*** loading optimizer&lr_scheduler&step successfully!')
            except Exception:
                step = 0
                print('*** loading optimizer&lr_scheduler&step fail!')
        return step

    @staticmethod
    def save_safely(file, dir_path, file_name):
        r"""
        Save ``file`` with ``torch.save`` without ever leaving the target missing:
        when the target already exists the new content is first written next to
        it and then swapped in with a single ``os.replace``.

        :param file: object to serialise
        :param dir_path: destination directory, created (with parents) if absent
        :param file_name: file name inside ``dir_path``
        """
        if not osp.exists(dir_path):
            # makedirs also creates missing parents and tolerates a concurrent creator.
            os.makedirs(dir_path, exist_ok=True)
            print('*** dir not exist, created one')
        save_path = osp.join(dir_path, file_name)
        if osp.exists(save_path):
            temp_name = save_path + '.temp'
            torch.save(file, temp_name)
            # Swap in one step; the old remove-then-rename sequence had a window
            # in which no checkpoint existed on disk.
            os.replace(temp_name, save_path)
            print('*** find the file conflict while saving, saved safely')
        else:
            torch.save(file, save_path)

In [None]:
def compute_iou(box1, box2):
    r"""Pairwise intersection-over-union between two sets of [x1,y1,x2,y2] boxes.
    Args:
        box1: (tensor) bounding boxes, sized [N,4].
        box2: (tensor) bounding boxes, sized [M,4].
    Return:
        (tensor) iou, sized [N,M]; entry (i, j) compares box1[i] with box2[j].
    """
    # Give box1 a singleton M axis and box2 a singleton N axis so that the
    # element-wise max / min below broadcast to every (i, j) pair.
    first = box1.unsqueeze(1)   # [N,4] -> [N,1,4]
    second = box2.unsqueeze(0)  # [M,4] -> [1,M,4]

    top_left = torch.max(first[..., :2], second[..., :2])      # [N,M,2]
    bottom_right = torch.min(first[..., 2:], second[..., 2:])  # [N,M,2]

    # Boxes that do not overlap give a negative extent; clip it to zero.
    extent = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = extent[..., 0] * extent[..., 1]          # [N,M]

    area_first = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])    # [N,]
    area_second = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M,]

    union = area_first.unsqueeze(1) + area_second.unsqueeze(0) - inter  # [N,M]
    return inter / union

In [7]:

class Yolov1Loss(nn.Module):
    r"""
    YOLOv1 sum-of-squared-errors loss.

    Every grid cell carries 30 values: two boxes [x, y, w, h, c] followed by 20
    class scores. For each cell that contains an object, the predicted box with
    the larger IoU against the ground truth is "responsible" for it. The loss is

        (l_coord * loc + 2 * contain + not_contain + l_noobj * noobj + class) / N

    where N is the batch size.
    """

    def __init__(self, s=GRID_NUM, b=2, l_coord=5, l_noobj=0.5):
        r"""
        :param s: number of grid cells per side
        :param b: number of boxes per cell (the matching logic in ``forward`` assumes 2)
        :param l_coord: weight of the localisation term
        :param l_noobj: weight of the confidence term of cells without an object
        """
        super(Yolov1Loss, self).__init__()
        self.S = float(s)
        self.B = int(b)
        self.l_coord = l_coord
        self.l_noobj = l_noobj

    def _to_corners(self, boxes):
        r"""Convert rows of [x, y, w, h, ...] into [x1, y1, x2, y2] rows (x, y are divided by S)."""
        centers = boxes[:, :2] / self.S
        half_sizes = 0.5 * boxes[:, 2:4]
        return torch.cat((centers - half_sizes, centers + half_sizes), dim=1)

    def forward(self, pred_tensor, target_tensor):
        r"""
        pred_tensor: (tensor) size(batchsize,S,S,Bx5+20=30) [x,y,w,h,c]
        target_tensor: (tensor) size(batchsize,S,S,30)
        Return: scalar loss tensor, averaged over the batch.
        """
        N = pred_tensor.size()[0]

        # A cell contains an object when the confidence of its first target box is positive.
        coo_mask = (target_tensor[:, :, :, 4] > 0).unsqueeze(-1).expand_as(target_tensor)
        noo_mask = (target_tensor[:, :, :, 4] == 0).unsqueeze(-1).expand_as(target_tensor)

        # Cells with an object, split into box rows and class scores.
        coo_pred = pred_tensor[coo_mask].view(-1, 30)
        box_pred = coo_pred[:, :10].contiguous().view(-1, 5)  # two consecutive rows per cell
        class_pred = coo_pred[:, 10:]

        coo_target = target_tensor[coo_mask].view(-1, 30)
        box_target = coo_target[:, :10].contiguous().view(-1, 5)
        class_target = coo_target[:, 10:]

        # Cells without an object: only the two confidences (columns 4 and 9) are penalised.
        noo_pred = pred_tensor[noo_mask].view(-1, 30)
        noo_target = target_tensor[noo_mask].view(-1, 30)
        confidence_columns = [4, 9]
        nooobj_loss = F.mse_loss(noo_pred[:, confidence_columns],
                                 noo_target[:, confidence_columns], reduction='sum')

        # Masks are created on the same device as the data instead of relying
        # on the notebook-level DEVICE, and use bool rather than uint8.
        coo_response_mask = torch.zeros_like(box_target, dtype=torch.bool)
        coo_not_response_mask = torch.zeros_like(box_target, dtype=torch.bool)
        box_target_iou = torch.zeros_like(box_target)

        # Matching is a pure bookkeeping step: the IoU is the *target* of the
        # confidence regression, so no gradient may flow through it.
        with torch.no_grad():
            for i in range(0, box_target.size()[0], 2):
                pred_pair_xyxy = self._to_corners(box_pred[i:i + 2])         # [2,4]
                target_xyxy = self._to_corners(box_target[i].view(-1, 5))    # [1,4]
                iou = compute_iou(pred_pair_xyxy, target_xyxy)               # [2,1]
                max_iou, max_index = iou.max(0)

                coo_response_mask[i + max_index] = True
                coo_not_response_mask[i + 1 - max_index] = True
                # The confidence of the responsible box should equal its IoU
                # with the ground truth.
                box_target_iou[i + max_index, 4] = max_iou

        # 1. responsible boxes: confidence and localisation
        box_pred_response = box_pred[coo_response_mask].view(-1, 5)
        box_target_response_iou = box_target_iou[coo_response_mask].view(-1, 5)
        box_target_response = box_target[coo_response_mask].view(-1, 5)
        contain_loss = F.mse_loss(box_pred_response[:, 4], box_target_response_iou[:, 4],
                                  reduction='sum')

        xy_loss = F.mse_loss(box_pred_response[:, :2], box_target_response[:, :2], reduction='sum')
        # Square roots damp the influence of large boxes on the size error.
        wh_loss = F.mse_loss(torch.sqrt(box_pred_response[:, 2:4]),
                             torch.sqrt(box_target_response[:, 2:4]), reduction='sum')
        loc_loss = xy_loss + wh_loss

        # 2. the other box of an object cell: its confidence is pushed towards 0
        not_response_confidence = box_pred[coo_not_response_mask].view(-1, 5)[:, 4]
        not_contain_loss = F.mse_loss(not_response_confidence,
                                      torch.zeros_like(not_response_confidence), reduction='sum')

        # 3. class scores of object cells
        class_loss = F.mse_loss(class_pred, class_target, reduction='sum')

        return (self.l_coord * loc_loss + 2 * contain_loss
                + not_contain_loss
                + self.l_noobj * nooobj_loss
                + class_loss) / N

In [8]:
# Move the model to its training device *before* the optimizer is built, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# the parameters in their final location.
model = Yolov1(backbone_name=backbone).to(DEVICE)

optimizer = optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)
scheduler = WarmUpMultiStepLR(
    optimizer,
    milestones=STEP_LR_SIZES,
    gamma=STEP_LR_GAMMA,
    warm_up_factor=WARM_UP_FACTOR,
    warm_up_iters=WARM_UP_NUM_ITERS,
)

# Resume from the latest checkpoint when one exists; load_model reports a
# failure and yields step 0 otherwise.
step = model.load_model(optimizer=optimizer, lr_scheduler=scheduler)
criterion = Yolov1Loss()

*** loading model weight fail!
*** loading optimizer&lr_scheduler&step fail!


In [9]:
model.to(DEVICE)
model.train()
# Iterate over the dataset repeatedly until the configured number of
# optimisation steps has been reached.
while step < NUM_STEPS_TO_FINISH:
    t1 = time.perf_counter()
    # The loader also yields the raw boxes and labels; only the encoded grid
    # target (gt_outs) is needed by the loss.
    for imgs, _gt_boxes, _gt_labels, gt_outs in dataloader:
        step += 1
        imgs = imgs.to(DEVICE)
        gt_outs = gt_outs.to(DEVICE)

        model_outs = model(imgs)
        loss = criterion(model_outs, gt_outs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The scheduler advances after the optimizer (PyTorch >= 1.1 ordering);
        # stepping it first skips the first value of the schedule.
        scheduler.step()

        t2 = time.perf_counter()
        print('step:{} | loss:{:.8f} | time:{:.4f}'.format(step, loss.item(), t2 - t1))
        t1 = time.perf_counter()

        if step % save_step == 0:
            model.save_model(step, optimizer, scheduler)
        # Stop exactly at the step budget instead of finishing the epoch.
        if step >= NUM_STEPS_TO_FINISH:
            break

step:1 | loss:22.84800339 | time:1.2162


RuntimeError: CUDA out of memory. Tried to allocate 784.00 MiB (GPU 0; 7.79 GiB total capacity; 3.57 GiB already allocated; 410.44 MiB free; 3.67 GiB reserved in total by PyTorch)