In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from yolov2 import YOLOv2D19 as YOLOv2
from detection_datasets import VOCDatasetV2
import pickle
with open('anchors_VOC0712trainval.pickle', 'rb') as handle:
    anchors = pickle.load(handle)

In [2]:
model = YOLOv2(device=torch.device('cpu'), dtype=torch.float32)

  state_dict = torch.load(state_dict_path, map_location=self.device)


In [3]:
transforms = A.Compose([
    A.Resize(width=416, height=416),
    A.VerticalFlip(p=1.0),
    ToTensorV2()
], bbox_params=A.BboxParams(format='pascal_voc'))
train_set = VOCDatasetV2(devkit_path = '../../datasets/VOCdevkit/', scales=[13], anchors=anchors, transforms=transforms, 
                         dtype=torch.float32, device=torch.device('cpu'))
image, gt_out = train_set[2]

True ../../datasets/VOCdevkit/VOC2007\ImageSets\Main\trainval.txt
True ../../datasets/VOCdevkit/VOC2012\ImageSets\Main\trainval.txt
class_label  12
cell_x  4
cell_y  4
best_anchor  tensor(4)
class_label  14
cell_x  4
cell_y  5
best_anchor  tensor(3)
class_label  14
cell_x  7
cell_y  3
best_anchor  tensor(3)
class_label  14
cell_x  7
cell_y  3
best_anchor  tensor(3)


In [97]:
class YOLOv2Loss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = torch.nn.MSELoss(reduction='sum')
        self.lambda_noobj = 0.5
        self.lambda_coord = 5.0
        
    def forward(self, out, gt_out, anchors):
        is_obj = gt_out[:, 0::25, ...] == 1.0
        no_obj = gt_out[:, 0::25, ...] == 0.0

        # CONFIDENCE LOSS
        conf_true = gt_out[:, 0::25, ...]
        conf_pred = out[:, 0::25, ...].sigmoid()

        conf_loss_obj = is_obj

        # obj_xc, obj_yc, obj_w, obj_h
        xc_true = gt_out[:, 1::25, ...]
        yc_true = gt_out[:, 2::25, ...]
        w_true = gt_out[:, 3::25, ...]
        h_true = gt_out[:, 4::25, ...]
        
        xc_pred = out[:, 1::25, ...].sigmoid()
        yc_pred = out[:, 2::25, ...].sigmoid()
        
        scale = gt_out.shape[-1]
        _anchors = torch.tensor(anchors) * scale
        pw = _anchors[:, 0]
        ph = _anchors[:, 1]
        
        w_pred = pw[None, :, None, None] * out[:, 3::25, ...].exp()
        h_pred = ph[None, :, None, None] * out[:, 4::25, ...].exp()

        loss = self.lambda_coord * is_obj * (
            self.mse(xc_pred, xc_true) + self.mse(yc_pred, yc_true)
        ) + \
            self.lambda_coord * is_obj * (
                self.mse(w_pred, w_true) + self.mse(h_pred, h_true)
        ) + \
            is_obj * (
                self.mse(conf_pred, conf_pred)
        ) + \
            self.lambda_noobj * no_obj * (
                self.mse(conf_pred, conf_pred)       
        ) + \
            is_obj * (
                0 # WRITE CLASS LOSS
            )

        return loss

        

In [98]:
loss = YOLOv2Loss()

In [102]:
loss(out, gt_out, anchors).sum()

tensor(1168711.1250, grad_fn=<SumBackward0>)

In [5]:
image, gt_out = train_set[2]
gt_out = gt_out.unsqueeze(0)
image = image.unsqueeze(0)
out = model(image)

class_label  12
cell_x  4
cell_y  4
best_anchor  tensor(4)
class_label  14
cell_x  4
cell_y  5
best_anchor  tensor(3)
class_label  14
cell_x  7
cell_y  3
best_anchor  tensor(3)
class_label  14
cell_x  7
cell_y  3
best_anchor  tensor(3)


In [6]:
is_obj = gt_out[:, 0::25, ...] == 1.0
_all_anchors = []
for i in range(5):
    _all_anchors.append(gt_out[:, i*25+5:25+i*25, ...])

In [7]:
class_true = torch.stack(_all_anchors, dim=1)

In [9]:
(is_obj[:, :, None, :, :] * class_true).shape

torch.Size([1, 5, 20, 13, 13])

In [10]:
softmax = torch.nn.Softmax(dim=2)

In [11]:
softmax(class_true)

tensor([[[[[0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           ...,
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500]],

          [[0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           ...,
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500]],

          [[0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500, 0.0500,  ..., 0.0500, 0.0500, 0.0500],
           [0.0500, 0.0500

In [12]:
class_true.sum(dim=2).shape

torch.Size([1, 5, 13, 13])

In [18]:
class_true[0, 4, :, 4, 4]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0.])

In [20]:
softmax(class_true)[0, 4, :, 4, 4]

tensor([0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460,
        0.0460, 0.0460, 0.0460, 0.1252, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460,
        0.0460, 0.0460])

In [30]:
nn.Softmax(dim=0)(class_true[0, 4, :, 4, 4])

tensor([0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460,
        0.0460, 0.0460, 0.0460, 0.1252, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460,
        0.0460, 0.0460])

In [24]:
class_true[0, 4, :, 4, 4].shape

torch.Size([20])