In [1]:
import sys
sys.path.insert(1, '../')

In [2]:
import datetime
from collections import OrderedDict

import torch
import torch.nn as nn
from torch import optim
from torch.autograd.profiler import record_function
from torch.utils.data import DataLoader

from evaluation import count_parameters
from Models.block import Conv

In [3]:
class YOLOv2(torch.nn.Module):
    """YOLOv2-style detector assembled from project ``Conv`` blocks.

    The input is downsampled five times by 2x2 max pooling (overall stride 32).
    The stride-16 feature map is additionally routed through a 1x1 convolution
    and a stride-2 space-to-depth rearrangement ("passthrough"), concatenated
    with the stride-32 features and turned into
    ``num_boxes * (num_classes + 5)`` output channels per grid cell.

    Args:
        device: Forwarded to every ``Conv`` block.
        dtype: Forwarded to every ``Conv`` block.
        num_classes: Number of object classes.
        anchors: Sequence of anchor pairs. ``None`` (default) selects a fresh
            copy of ``DEFAULT_ANCHORS``. Only stored on the instance; the
            layers themselves do not read it.
        num_boxes: Number of boxes predicted per grid cell.
        act: Activation name forwarded to every ``Conv`` block.
    """

    # Immutable so that instances can never share (and mutate) one default list.
    DEFAULT_ANCHORS = ((1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892),
                       (9.47112, 4.84053), (11.2364, 10.0071))

    def __init__(self, device=None, dtype=None, num_classes=20,
                 anchors=None, num_boxes=5, act='Leaky'):
        super().__init__()
        self.num_classes = num_classes
        self.anchors = list(self.DEFAULT_ANCHORS) if anchors is None else anchors
        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2)

        def conv(c_in, c_out, k):
            # Every layer shares stride 1, no bias and the same act/device/dtype;
            # 3x3 kernels get padding 1 and 1x1 kernels padding 0.
            return Conv(c_in, out_channels=c_out, kernel_size=(k, k), stride=(1, 1),
                        padding=(k // 2, k // 2), bias=False, act=act,
                        device=device, dtype=dtype)

        self.conv1 = conv(3, 32, 3)
        self.conv2 = conv(32, 64, 3)

        self.seq3_5 = nn.Sequential(OrderedDict([
            ('conv3', conv(64, 128, 3)),
            ('conv4', conv(128, 64, 1)),
            ('conv5', conv(64, 128, 3)),
        ]))

        self.seq6_8 = nn.Sequential(OrderedDict([
            ('conv6', conv(128, 256, 3)),
            ('conv7', conv(256, 128, 1)),
            ('conv8', conv(128, 256, 3)),
        ]))

        self.seq9_13 = nn.Sequential(OrderedDict([
            ('conv9', conv(256, 512, 3)),
            ('conv10', conv(512, 256, 1)),
            ('conv11', conv(256, 512, 3)),
            ('conv12', conv(512, 256, 1)),
            ('conv13', conv(256, 512, 3)),
        ]))

        # route --> (the output of seq9_13 also feeds passthrough_conv)

        self.seq14_18 = nn.Sequential(OrderedDict([
            ('conv14', conv(512, 1024, 3)),
            ('conv15', conv(1024, 512, 1)),
            ('conv16', conv(512, 1024, 3)),
            ('conv17', conv(1024, 512, 1)),
            ('conv18', conv(512, 1024, 3)),
        ]))

        # Detection part

        self.seq19_20 = nn.Sequential(OrderedDict([
            ('conv19', conv(1024, 1024, 3)),
            ('conv20', conv(1024, 1024, 3)),
        ]))

        # --> route

        self.passthrough_conv = conv(512, 64, 1)

        # NOTE(review): conv22 is the prediction layer but is built with the same
        # `act` and bias=False as the hidden layers — confirm against Models.block.Conv
        # that this is intended.
        self.seq21_22 = nn.Sequential(OrderedDict([
            # 4 * 64 passthrough channels (after space-to-depth) + 1024 head channels.
            ('conv21', conv(4 * 64 + 1024, 1024, 3)),
            ('conv22', conv(1024, num_boxes * (num_classes + 5), 1)),
        ]))

    @staticmethod
    def _space_to_depth(x, stride):
        """Move each ``stride x stride`` spatial patch into the channel axis.

        ``(N, C, H, W) -> (N, C * stride**2, H // stride, W // stride)``; output
        channel ``(i * stride + j) * C + c`` holds ``x[:, c, i::stride, j::stride]``.
        Raises if ``H`` or ``W`` is not divisible by ``stride``.
        """
        n, c, h, w = x.shape
        x = x.reshape(n, c, h // stride, stride, w // stride, stride)
        x = x.permute(0, 3, 5, 1, 2, 4)
        return x.reshape(n, c * stride * stride, h // stride, w // stride)

    def forward(self, x):
        """Return the raw prediction map for a batch of images ``x``.

        Sizes in the comments assume a 416x416 input.
        """
        # 416x416, stride: 1
        out = self.conv1(x)
        with record_function("Max pooling"):
            out = self.max_pool(out)

        # 208x208, stride: 2
        out = self.conv2(out)
        with record_function("Max pooling"):
            out = self.max_pool(out)

        # 104x104, stride: 4
        out = self.seq3_5(out)
        with record_function("Max pooling"):
            out = self.max_pool(out)

        # 52x52, stride: 8
        out = self.seq6_8(out)
        with record_function("Max pooling"):
            out = self.max_pool(out)

        # 26x26, stride: 16
        out = self.seq9_13(out)
        # A plain reshape to (N, 256, 13, 13) would mix unrelated spatial positions
        # and only works for 416x416 inputs; rearrange 2x2 patches instead.
        passthrough_out = self._space_to_depth(self.passthrough_conv(out), 2)

        with record_function("Max pooling"):
            out = self.max_pool(out)

        # 13x13, stride: 32
        out = self.seq14_18(out)

        # Detection part
        out = self.seq19_20(out)
        out = torch.cat([passthrough_out, out], 1)
        out = self.seq21_22(out)

        return out

Reference implementation: https://github.com/yjh0410/yolov2-yolov3_PyTorch

In [37]:
class Conv_BN_LeakyReLU(nn.Module):
    """``Conv2d -> BatchNorm2d -> LeakyReLU(0.1)`` building block.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        ksize: Convolution kernel size.
        padding: Convolution padding (default 0).
        stride: Convolution stride (default 1).
        dilation: Convolution dilation (default 1).
        device: Device on which the parameters are created. ``None`` (default)
            selects ``cuda:0`` when CUDA is available and the CPU otherwise, so
            the block no longer crashes on machines without a GPU.
    """

    def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1, device=None):
        super().__init__()
        if device is None:
            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # Keep the attribute name and layer order: checkpoint keys are
        # 'convs.0.*' (conv) and 'convs.1.*' (batch norm).
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride,
                      dilation=dilation, device=device),
            nn.BatchNorm2d(out_channels, device=device),
            nn.LeakyReLU(0.1, inplace=True),
        )

    def forward(self, x):
        """Apply convolution, batch normalisation and leaky ReLU to ``x``."""
        return self.convs(x)

class reorg_layer(nn.Module):
    """Space-to-depth layer: folds ``stride x stride`` patches into channels.

    Maps ``(B, C, H, W)`` to ``(B, C * stride**2, H // stride, W // stride)``.
    Output channel ``(i * stride + j) * C + c`` holds
    ``x[:, c, i::stride, j::stride]``.
    """

    def __init__(self, stride):
        super().__init__()
        self.stride = stride

    def forward(self, x):
        """Rearrange ``x`` as described in the class docstring."""
        s = self.stride
        n, c, h, w = x.size()
        out_h, out_w = h // s, w // s

        # Split both spatial axes into (cell, offset-within-cell) ...
        patches = x.view(n, c, out_h, s, out_w, s)
        # ... then order the axes as (row offset, col offset, channel, cell row, cell col).
        patches = patches.permute(0, 3, 5, 1, 2, 4).contiguous()
        return patches.view(n, -1, out_h, out_w)


class DarkNet_19(nn.Module):
    """DarkNet-19 convolutional backbone returning three feature maps.

    ``forward`` yields a dict with the outputs of ``conv_4`` (``'layer1'``,
    stride 8, 256 channels), ``conv_5`` (``'layer2'``, stride 16, 512 channels)
    and ``conv_6`` (``'layer3'``, stride 32, 1024 channels).
    """

    def __init__(self):
        super().__init__()

        def stage(*specs, pool=False):
            # Each spec is (in_channels, out_channels, ksize); 3x3 kernels get
            # padding 1, 1x1 kernels padding 0. `pool` appends a 2x2/2 max pool.
            layers = [Conv_BN_LeakyReLU(c_in, c_out, k, k // 2) for c_in, c_out, k in specs]
            if pool:
                layers.append(nn.MaxPool2d((2, 2), 2))
            return nn.Sequential(*layers)

        # output : stride = 2, c = 32
        self.conv_1 = stage((3, 32, 3), pool=True)

        # output : stride = 4, c = 64
        self.conv_2 = stage((32, 64, 3), pool=True)

        # output : stride = 8, c = 128
        self.conv_3 = stage((64, 128, 3), (128, 64, 1), (64, 128, 3), pool=True)

        # output : stride = 8, c = 256 (pooling is deferred to maxpool_4 so this
        # map can be returned at stride 8)
        self.conv_4 = stage((128, 256, 3), (256, 128, 1), (128, 256, 3))

        # output : stride = 16, c = 512
        self.maxpool_4 = nn.MaxPool2d((2, 2), 2)
        self.conv_5 = stage((256, 512, 3), (512, 256, 1), (256, 512, 3),
                            (512, 256, 1), (256, 512, 3))

        # output : stride = 32, c = 1024
        self.maxpool_5 = nn.MaxPool2d((2, 2), 2)
        self.conv_6 = stage((512, 1024, 3), (1024, 512, 1), (512, 1024, 3),
                            (1024, 512, 1), (512, 1024, 3))

    def forward(self, x):
        """Run the backbone and return the stride-8/16/32 feature maps as a dict."""
        stride8 = self.conv_4(self.conv_3(self.conv_2(self.conv_1(x))))
        stride16 = self.conv_5(self.maxpool_4(stride8))
        stride32 = self.conv_6(self.maxpool_5(stride16))

        return {
            'layer1': stride8,
            'layer2': stride16,
            'layer3': stride32,
        }

In [46]:
class YOLOv2D19(nn.Module):
    """YOLOv2 detection head on top of a pretrained ``DarkNet_19`` backbone.

    Args:
        num_classes: Number of object classes.
        num_anchors: Number of anchor boxes predicted per grid cell.
        state_dict_path: Path to the backbone checkpoint. Its ``conv_7.*``
            entries (if present) are dropped because ``DarkNet_19`` defines no
            ``conv_7`` layer and the state dict is loaded strictly.
        device: Device the model ends up on. ``None`` (default) selects
            ``cuda:0`` when CUDA is available and the CPU otherwise.
        dtype: Optional floating-point dtype to cast the parameters to.

    ``forward`` returns a tensor with ``num_anchors * (1 + 4 + num_classes)``
    channels.
    """

    def __init__(self, num_classes=20, num_anchors=5, state_dict_path='./darknet19_72.96.pth', device=None, dtype=None):
        super().__init__()
        if device is None:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        device = torch.device(device)

        self.num_classes = num_classes
        self.num_anchors = num_anchors
        self.device = device
        self.dtype = dtype

        # Load pretrained backbone.
        # NOTE(review): torch.load unpickles the file; only load checkpoints from a
        # trusted source (or pass weights_only=True where the installed torch supports it).
        state_dict = torch.load(state_dict_path, map_location=device)
        for key in ('conv_7.weight', 'conv_7.bias'):
            # pop() instead of del: a checkpoint that was already stripped must not fail.
            state_dict.pop(key, None)

        self.backbone = DarkNet_19()
        self.backbone.load_state_dict(state_dict)

        # detection head
        self.convsets_1 = nn.Sequential(
            Conv_BN_LeakyReLU(1024, 1024, 3, 1),
            Conv_BN_LeakyReLU(1024, 1024, 3, 1)
        )

        self.route_layer = Conv_BN_LeakyReLU(512, 64, 1)
        self.reorg = reorg_layer(stride=2)

        # 1280 = 64 * 4 (reorganised route) + 1024 (convsets_1 output)
        self.convsets_2 = Conv_BN_LeakyReLU(1280, 1024, 3, 1)

        # prediction layer
        self.pred = nn.Conv2d(1024, self.num_anchors * (1 + 4 + self.num_classes),
                              kernel_size=1, device=device)

        # Sub-blocks pick their own creation device; move everything to the
        # requested device/dtype so the `device` and `dtype` arguments are honoured.
        self.to(device=device, dtype=dtype)

    def forward(self, x):
        """Return the raw prediction map for a batch of images ``x``."""
        # backbone
        feats = self.backbone(x)

        # reorg layer: bring the stride-16 map to the stride-32 resolution and fuse
        p5 = self.convsets_1(feats['layer3'])
        p4 = self.reorg(self.route_layer(feats['layer2']))
        p5 = torch.cat([p4, p5], dim=1)

        # head
        p5 = self.convsets_2(p5)

        # pred
        return self.pred(p5)

In [47]:
# Build the detector with its defaults (20 classes, 5 anchors). The constructor
# loads the pretrained backbone weights from './darknet19_72.96.pth'.
model = YOLOv2D19()

  state_dict = torch.load(state_dict_path, map_location='cuda:0')


In [48]:
from Models.loss import *
from Models.voc_dataset import *

In [49]:
from torch.utils.data.dataloader import default_collate

def custom_collate_fn(batch):
    """Collate samples whose second field varies in size from sample to sample.

    Args:
        batch: Sequence of per-sample tuples ``(image, target, ...)``.

    Returns:
        A list whose first entry is the ``default_collate``-d first fields,
        whose second entry is the plain list of second fields (left uncollated),
        followed by any remaining fields as tuples.
    """
    fields = [*zip(*batch)]
    stacked = default_collate(fields[0])
    targets = list(fields[1])
    return [stacked, targets, *fields[2:]]

In [50]:
# VOC 2007 splits at 416x416; the "train" split is built with is_training=True,
# the "val" split with is_training=False.
train_set = VOCDataset(root_path='../../datasets/VOCdevkit', year="2007", mode="train", image_size=416, is_training=True)
train_loader = DataLoader(train_set, batch_size=2, shuffle=True, collate_fn=custom_collate_fn)
test_set = VOCDataset(root_path='../../datasets/VOCdevkit', year="2007", mode="val", image_size=416, is_training=False)
# The evaluation loader must wrap test_set; it previously wrapped train_set,
# so any "test" metric would have been computed on training images.
test_loader = DataLoader(test_set, batch_size=2, shuffle=False, collate_fn=custom_collate_fn)

In [51]:
# YOLO loss configured with the dataset's class count and five anchor pairs.
# NOTE(review): reduction=32 presumably is the network stride (416 / 13) — confirm
# against Models.loss. The loss is created on cuda:0.
loss_fn = YoloLoss(train_set.num_classes, anchors=[(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892), (9.47112, 4.84053),
                          (11.2364, 10.0071)], reduction=32, device=torch.device('cuda:0'), dtype=None)

In [52]:
# SGD with momentum and weight decay over every parameter of `model`
# (model.parameters() includes the pretrained backbone, so it is fine-tuned too).
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001)

In [65]:
# Cap on the number of batches processed by this cell. 1 keeps the original
# single-batch debugging behaviour; set to None to train a full epoch.
MAX_BATCHES = 1

model.train()
device = next(model.parameters()).device  # feed images to wherever the model lives
loss_train = 0.0

n_batches = len(train_loader)
if MAX_BATCHES is not None:
    n_batches = min(n_batches, MAX_BATCHES)

for i, (imgs, labels) in enumerate(train_loader):
    imgs = imgs.to(device)
    # `labels` is a plain Python list of per-image arrays (see custom_collate_fn),
    # so it is handed to loss_fn as is instead of being moved to the device.

    if (i+1) % 15 == 0:
        _datetime = datetime.datetime.now()
        print(f"{_datetime} Batch {i+1} ")

    optimizer.zero_grad()

    outputs = model(imgs)
    loss, loss_coord, loss_conf, loss_cls, _dict = loss_fn(outputs, labels)

    loss.backward()
    optimizer.step()

    loss_train += loss.item()

    # Stop after the step (not before fetching): `labels` and `_dict` then still
    # belong to the last batch that was actually trained on.
    if i + 1 >= n_batches:
        break

# Mean loss over the processed batches (equals the batch loss when MAX_BATCHES == 1).
print(f'[Train] Loss: {loss_train / max(n_batches, 1)}')

[Train] Loss: 89.18915557861328


In [66]:
# Shapes of the 'coord' / 'tcoord' entries returned by loss_fn for the last batch
# (presumably predicted vs. target box coordinates — confirm in Models.loss).
_dict['coord'].shape, _dict['tcoord'].shape

(torch.Size([2, 5, 4, 169]), torch.Size([2, 5, 4, 169]))

In [67]:
# Axis layout of the coord tensors above: (N, boxes, coords, 13x13 grid cells flattened to 169)

In [68]:
# Shapes of the 'cls' / 'tcls' entries returned by loss_fn for the last batch
# (presumably class scores vs. target class indices — confirm in Models.loss).
_dict['cls'].shape, _dict['tcls'].shape

(torch.Size([9, 20]), torch.Size([9]))

In [69]:
# Shapes of the 'conf' / 'tconf' entries returned by loss_fn for the last batch
# (presumably predicted vs. target confidences — confirm in Models.loss).
_dict['conf'].shape, _dict['tconf'].shape

(torch.Size([2, 5, 169]), torch.Size([2, 5, 169]))

In [70]:
# NOTE(review): coord_mask presumably zeroes the coordinate loss for cells
# without an object — open question, confirm in Models.loss.
_dict['coord_mask'].shape

torch.Size([2, 5, 4, 169])

In [71]:
# Shape of the 'conf_mask' entry returned by loss_fn for the last batch.
_dict['conf_mask'].shape

torch.Size([2, 5, 169])

In [89]:
# Ground-truth annotations of the last processed batch: one array per image,
# each row being (xmin, ymin, xmax, ymax, label).
labels

[array([[141.20325,  88.56023, 147.12195, 327.4398 ,  11.     ],
        [  0.     ,   0.     , 211.38211, 435.8095 ,  14.     ]],
       dtype=float32),
 array([[ 65.743866,  46.92408 , 258.4414  , 276.13016 ,   7.      ],
        [120.15259 , 112.79826 , 295.8474  , 185.89154 ,  14.      ]],
       dtype=float32)]