In [10]:
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from util import *


In [3]:
def parse_cfg(cfgfile):
    """
    arg: configuration file

    return: a list of blocks in the neural network (block is represented as a dictionary in the list)

    store every block as a dict
    attributes of blocks and their values are stored as key-value pairs in dict
    """

    file = open(cfgfile,'r')
    lines = file.read().split('\n')
    lines = [x for x in lines if len(x)>0] #remove empty lines
    lines = [x for x in lines if x[0] != '#'] #remove comments
    lines = [x.rstrip().lstrip() for x in lines] #remove frigne whitespaces

    block = {}
    blocks = []

    for line in lines:
        if line[0] == "[": #start of new block
            if len(block) != 0:
                blocks.append(block)
                block = {} #re-init
            block["type"] = line[1:-1].rstrip()
        else:
            key,value = line.split("=")
            block[key.rstrip()] = value.lstrip()
    blocks.append(block)

    return blocks

In [7]:
class EmptyLayer(nn.Module):
    def __init__(self):
        super(EmptyLayer, self).__init__()
        
class DetectionLayer(nn.Module):
    def __init__(self, anchors):
        super(DetectionLayer, self).__init__()
        self.anchors = anchors

def create_modules(blocks):
    net_info = blocks[0] #info about input and pre-processing
    module_list = nn.ModuleList() #normal list containing nn.Module objects
    # when add nn.ModuleList as a member of nn.Module object (add module to network), 
    # all the parameters of nn.Module objects (modules) inside the nn.ModuleList 
    # are added as parameters of the nn.Module object (network) as well
    prev_filters = 3 #image has 3 filters (RGB)
    output_filters = []
    # defining new convolutional layer requires defining the dimension of its kernel
    # height & width of kernel: provided by cfg file
    # depth of kernel = number of filters (depth of feature map) present in the previous layer
    # need to keep track of number of filters in the layer on which the conv layer is being applied

    # iterate over the list of blocks and create a PyTorch module for each block as we go
    for index, x in enumerate(blocks[1:]):
        module = nn.Sequential()
        # nn.Sequential classs is used to sequentially execute a number of nn.Module objects

        if x["type"] == "convolutional":
            activation = x["activation"]

            try:
                batch_normalize = int(x["batch_normalize"])
                bias = False
            except:
                batch_normalize = 0
                bias = True
            
            filters = int(x["filters"])
            padding = int(x["pad"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])

            if padding:
                pad = (kernel_size - 1)
            else:
                pad = 0
            
            # add the convolutional layer
            conv = nn.Conv2d(prev_filters,filters,kernel_size,stride,pad,bias=bias)
            module.add_module("conv_{0}".format(index), conv)

            # add the batch norm layer
            if batch_normalize:
                bn = nn.BatchNorm2d(filters)
                module.add_module("batch_norm_{0}".format(index), bn)
            
            # check the activation: linear or leaky relu for YOLO
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1,inplace=True)
                module.add_module("leaky_{0}".format(index),activn)
        
        elif x["type"] == "upsample":
            stride = int(x["stride"])
            upsample = nn.Upsample(scale_factor=2,mode="bilinear")
            module.add_module("upsample_{0}".format(index),upsample)
        
        elif x["type"] == "route":
            x["layers"] = x["layers"].split(',')

            start = int(x["layers"][0]) #start of route
            try:
                end = int(x["layers"][0]) #end if there exists one
            except:
                end = 0
            
            # positive anotation
            if start>0:
                start = start-index
            if end>0:
                end = end-index
            
            route = EmptyLayer()
            module.add_module("route_{0}".format(index),route)
            if end<0: #if concatenating maps
                filters = output_filters[index+start] + output_filters[index+end]
            else:
                filters = output_filters[index+start]
        
        elif x["type"] == "shortcut":
            shortcut = EmptyLayer()
            module.add_module("shortcut_{}".format(index),shortcut)
            # no need to update filters variable 
            # as it merely adds a feature maps of a previous layer to those of layer just behind
        
        elif x["type"] == "yolo":
            mask = x["mask"].split(",")
            mask = [int(x) for x in mask]

            anchors = x["anchors"].split(",")
            anchors = [int(a) for a in anchors]
            anchors = [(anchors[i],anchors[i+1]) for i in range(0,len(anchors),2)]
            anchors = [anchors[i] for i in mask]

            detection = DetectionLayer(anchors) #holds the anchors used to detect bounding boxes
            module.add_module("Detection_{}".format(index),detection)
        
        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)
    
    return (net_info, module_list)


In [8]:
blocks = parse_cfg("../yolo-coco/yolov3.cfg")
print(create_modules(blocks))

({'type': 'net', 'batch': '64', 'subdivisions': '16', 'width': '608', 'height': '608', 'channels': '3', 'momentum': '0.9', 'decay': '0.0005', 'angle': '0', 'saturation': '1.5', 'exposure': '1.5', 'hue': '.1', 'learning_rate': '0.001', 'burn_in': '1000', 'max_batches': '500200', 'policy': 'steps', 'steps': '400000,450000', 'scales': '.1,.1'}, ModuleList(
  (0): Sequential(
    (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), bias=False)
    (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_0): LeakyReLU(negative_slope=0.1, inplace=True)
  )
  (1): Sequential(
    (conv_1): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2), bias=False)
    (batch_norm_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_1): LeakyReLU(negative_slope=0.1, inplace=True)
  )
  (2): Sequential(
    (conv_2): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
  

In [20]:
class Darknet(nn.Module):
    """custom architectures using nn.Module class to define a network for our detector"""
    def __init__(self,cfgfile):
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info,self.module_list = create_modules(self.blocks)
    
    def forward(self,x,CUDA):
        """
        2 purposes:
        - calculate the output
        - transform the output detection feature maps in a way that it can be proceased easier
            ex) transformaing them such that detection maps across multiple scales can be concatenated
        
        args:
        - CUDA: if true, use GPU to accelerate the forward pass
        """
        modules = self.blocks[1:] #blocks[0] is a net which isn't a part of the forward pass
        outputs = {} #cache output feature maps of every layer because route and shortcut layers need output maps from previous layers
        # keys: indices of layers
        # values: feature maps

        write = 0
        for i,module in enumerate(modules):
            module_type = (module["type"])

            if module_type == "convolutional" or module_type == "upsample":
                x = self.module_list[i](x)
            
            elif module_type == "route":
                #NOTE: input and output of a convolutional layer in PyTorch: B X C X H X W (depth = channel dimension)

                layers = module["layers"]
                layers = [int(a) for a in layers]

                if (layers[0]) > 0:
                    layers[0] = layers[0] - i

                if len(layers) == 1:
                    x = outputs[i + (layers[0])]

                else:
                    if (layers[1]) > 0:
                        layers[1] = layers[1] - i

                    map1 = outputs[i + layers[0]]
                    map2 = outputs[i + layers[1]]

                    x = torch.cat((map1, map2), 1)

            elif  module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[i-1] + outputs[i+from_]


            elif module_type == 'yolo':
                anchors = self.module_list[i][0].anchors
                inp_dim = int(self.net_info["height"]) #input dimensions
                num_classes = int(module["classes"]) #number of classes

                # transform
                x = x.data
                x = predict_transform(x,inp_dim,anchors,num_classes,CUDA)
                if not write: #if no collector has been initialized
                    detections = x
                    write = 1
                else:
                    detections = torch.cat((detections,x),1)
            
            outputs[i] = x

        return detections

In [13]:
def get_test_input(imgfile):
    img = cv2.imread(imgfile)
    img = cv2.resize(img,(416,416))
    img_ = img[:,:,::-1].transpose((2,0,1))
    img_ = img_[np.newaxis,:,:,:]/255.0
    img_ = torch.from_numpy(img_).float()
    img_ = Variable(img_)
    
    return img_

In [23]:
model = Darknet("../yolo-coco/yolov3.cfg")
inp = get_test_input("../images/dog-cycle-car.png")
pred = model(inp,torch.cuda.is_available())
print(pred)


RuntimeError: The size of tensor a (212) must match the size of tensor b (210) at non-singleton dimension 3