# Analysis of default SSD300 model

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Make the parent directory of the notebook's working directory importable,
# adding it to sys.path only when it is not listed there yet.
root_path = Path('..').absolute().resolve()
if sys.path.count(root_path.as_posix()) == 0:
    sys.path.append(root_path.as_posix())

In [3]:
import torch
from ignite._utils import to_variable

from torchvision.models.vgg import vgg16
from customized_torchcv.models.ssd.net import SSD300

## Model description:

```
SSD300 = Extractor + Location & Label predictors
```

Input image size is 300x300 pixels

Model parameters:
- Number of classes
- `num_anchors`: `(4, 6, 6, 6, 4, 4)` 
    - Related with number of boxes in location & class predictions
    - Number of default boxes:
      * for a given feature map of size fm_size x fm_size, (2 + num_aspect_ratios * 2) * fm_size^2 boxes are generated
      * num_anchors[i] = (2 + num_aspect_ratios[i] * 2)
      
- `steps`: `(8, 16, 32, 64, 100, 300)`
    - Defines default boxes in `SSDBoxCoder`
- `box_sizes`: `(30, 60, 111, 162, 213, 264, 315)`
    - Defines default boxes in `SSDBoxCoder`
- `aspect_ratios`: `((2,), (2,3), (2,3), (2,3), (2,), (2,))`
    - Defines default boxes in `SSDBoxCoder`

### Extractor

Extractor is based on VGG16 network and produces 6 feature maps of size:

- 512, 38x38
- 1024, 19x19
- 512, 10x10
- 256, 5x5
- 256, 3x3
- 256, 1x1

Extractor network is composed of the following layers:
```
                                                   6     7           8             9             10            11
Extractor = [VGG16 features (block1-block5)] --> [C|R]-[C|R] -T- [C|R|C|R] -T- [C|R|C|R] -T- [C|R|C|R] -T- [C|R|C|R] -- fm5
                                        |                     |             |             |             |
                                    [L2 norm]--- fm0         fm1           fm2           fm3           fm4
``` 

- Block 6 = Dilated convolution
- Blocks 8_1, 9_1, 10_1, 11_1 = Compressions 


In [4]:
# Seed torch's global RNG before anything random happens, so that any randomly
# initialised weights and the torch.rand inputs drawn in later cells are the
# same on every "Restart Kernel -> Run All".
torch.manual_seed(0)

# SSD300 detector under analysis and a plain torchvision VGG16 used as the
# reference when comparing extractor layers; both are built for 10 classes.
model = SSD300(num_classes=10)
vgg = vgg16(num_classes=10)

In [5]:
# Compare extractor layers (block1-block4) with VGG16:
# print the first 23 modules of both networks side by side.
for i in range(23):
    ssd_layer = model.extractor.features.layers[i]
    vgg_layer = vgg.features[i]
    print("{!r}  vs  {!r}".format(ssd_layer, vgg_layer))

Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)  vs  ReLU(inplace)
Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)  vs  ReLU(inplace)
MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=True)  vs  MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)  vs  ReLU(inplace)
Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)  vs  ReLU(inplace)
MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=True)  vs  MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode

In [6]:
# Compare block 5: the extractor's conv5_1..conv5_3 against the VGG16
# feature modules at indices 24, 26 and 28.
extractor_block5 = [getattr(model.extractor, name) for name in ('conv5_1', 'conv5_2', 'conv5_3')]
vgg16_block5 = [vgg.features[idx] for idx in (24, 26, 28)]

for ssd_conv, vgg_conv in zip(extractor_block5, vgg16_block5):
    print("{!r}  vs  {!r}".format(ssd_conv, vgg_conv))

Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))  vs  Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


In [7]:
# Blocks 6, 7, 8, 9, 10, 11

In [8]:
# Suffixes of the extractor's `conv<suffix>` attributes, in network order.
block_names = ['6', '7', '8_1', '8_2', '9_1', '9_2', '10_1', '10_2', '11_1', '11_2']

# (name, layer) pairs for every convolution that follows the VGG16 part.
extractor_last_blocks = [
    (name, getattr(model.extractor, 'conv' + name)) for name in block_names
]

for n, l in extractor_last_blocks:
    print("{}, {!r}".format(n, l))

6, Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(6, 6), dilation=(6, 6))
7, Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
8_1, Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
8_2, Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
9_1, Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
9_2, Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
10_1, Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
10_2, Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
11_1, Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
11_2, Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))


In [9]:
# Random batch of 12 three-channel 300x300 inputs (values drawn by torch.rand),
# passed through the extractor only to obtain its feature maps.
x = to_variable(torch.rand(12, 3, 300, 300))
feature_maps = model.extractor(x)

In [10]:
# Show how many feature maps the extractor returned and the container type,
# then the shape of each map in order.
print(len(feature_maps), type(feature_maps))
for i in range(len(feature_maps)):
    o = feature_maps[i]
    print(i, o.shape)

6 <class 'list'>
0 torch.Size([12, 512, 38, 38])
1 torch.Size([12, 1024, 19, 19])
2 torch.Size([12, 512, 10, 10])
3 torch.Size([12, 256, 5, 5])
4 torch.Size([12, 256, 3, 3])
5 torch.Size([12, 256, 1, 1])


### Location & Label predictors

These predictors are just convolutions with a particular number of output feature maps

The number of output feature maps of each location layer is fixed to number of anchors x 4. For the 6 feature maps they are:
```
[16, 24, 24, 24, 16, 16]
```
The number of output feature maps of each classification layer is computed as number of anchors x number of classes (e.g. 10):
```
[40, 60, 60, 60, 40, 40]
```


In [11]:
# Expected output channels of each location layer (4 values per anchor),
# printed above the layers themselves for comparison.
expected_loc_channels = [4 * num for num in model.num_anchors]
print(expected_loc_channels)
model.loc_layers

[16, 24, 24, 24, 16, 16]


ModuleList(
  (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)

In [12]:
# Expected output channels of each classification layer: anchors x classes.
# The class count is read from the model instead of repeating the literal 10,
# so this check stays valid if the model is built with another class count.
print([n * model.num_classes for n in model.num_anchors])
model.cls_layers

[40, 60, 60, 60, 40, 40]


ModuleList(
  (0): Conv2d(512, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 60, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 60, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 60, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)

The final predictions for a feature map from the extractor are computed as follows:

In [13]:
# Step-by-step shape trace of the location head on one feature map.
index = 0
fm = feature_maps[index]
print("0:", fm.shape)

# Convolution: one output channel per (anchor, coordinate) pair
loc_pred = model.loc_layers[index](fm)
print("1:", loc_pred.shape)

# Move the channel axis last so each spatial position's values are contiguous
loc_pred = loc_pred.permute(0, 2, 3, 1).contiguous()
print("2:", loc_pred.shape)

# Flatten to (batch, number of boxes, 4)
batch_size = loc_pred.size(0)
loc_pred = loc_pred.view(batch_size, -1, 4)
print("3:", loc_pred.shape)

0: torch.Size([12, 512, 38, 38])
1: torch.Size([12, 16, 38, 38])
2: torch.Size([12, 38, 38, 16])
3: torch.Size([12, 5776, 4])


In [14]:
# Step-by-step shape trace of the classification head on one feature map.
# The class count comes from the model so it cannot drift from the value the
# model was constructed with.
n_classes = model.num_classes
index = 0

print("0:", feature_maps[index].shape)
# Convolution: one output channel per (anchor, class) pair
cls_pred = model.cls_layers[index](feature_maps[index])
print("1:", cls_pred.shape)
# Move the channel axis last so each spatial position's values are contiguous
cls_pred = cls_pred.permute(0,2,3,1).contiguous()
print("2:", cls_pred.shape)
# Flatten to (batch, number of boxes, n_classes)
cls_pred = cls_pred.view(cls_pred.size(0),-1, n_classes)
print("3:", cls_pred.shape)

0: torch.Size([12, 512, 38, 38])
1: torch.Size([12, 40, 38, 38])
2: torch.Size([12, 38, 38, 40])
3: torch.Size([12, 5776, 10])


In [15]:
# Apply the location head to each of the 6 feature maps and report the
# flattened (batch, boxes, 4) shape.
for index in range(6):
    raw_loc = model.loc_layers[index](feature_maps[index])
    channels_last = raw_loc.permute(0, 2, 3, 1).contiguous()
    loc_pred = channels_last.view(channels_last.size(0), -1, 4)
    print("Output loc pred {} :".format(index), loc_pred.shape)

Output loc pred 0 : torch.Size([12, 5776, 4])
Output loc pred 1 : torch.Size([12, 2166, 4])
Output loc pred 2 : torch.Size([12, 600, 4])
Output loc pred 3 : torch.Size([12, 150, 4])
Output loc pred 4 : torch.Size([12, 36, 4])
Output loc pred 5 : torch.Size([12, 4, 4])


In [16]:
# Apply the classification head to each of the 6 feature maps and report the
# flattened (batch, boxes, n_classes) shape.
for index in range(6):
    raw_cls = model.cls_layers[index](feature_maps[index])
    channels_last = raw_cls.permute(0, 2, 3, 1).contiguous()
    cls_pred = channels_last.view(channels_last.size(0), -1, n_classes)
    print("Output cls pred {} :".format(index), cls_pred.shape)

Output cls pred 0 : torch.Size([12, 5776, 10])
Output cls pred 1 : torch.Size([12, 2166, 10])
Output cls pred 2 : torch.Size([12, 600, 10])
Output cls pred 3 : torch.Size([12, 150, 10])
Output cls pred 4 : torch.Size([12, 36, 10])
Output cls pred 5 : torch.Size([12, 4, 10])


Finally, all predictions are concatenated

In [17]:
# Full forward pass: the model returns location and class predictions
# for the whole batch `x`.
loc_preds, cls_preds = model(x)

In [18]:
# Print the per-sample prediction shapes for the first batch element only.
for i, (loc_pred, cls_pred) in enumerate(zip(loc_preds[:1], cls_preds[:1])):
    print(i, loc_pred.shape, cls_pred.shape)

0 torch.Size([8732, 4]) torch.Size([8732, 10])


### Default boxes

Default boxes are generated to 

- encode ground truth bounding boxes, see `SSDBoxCoder.encode`
- decode predictions, see `SSDBoxCoder.decode`

In [19]:
# Model parameters used to lay out the default boxes.
fm_sizes = (38, 19, 10, 5, 3, 1)  # side length of each square feature map
steps = (8, 16, 32, 64, 100, 300)  # spacing between box centres, one per feature map
box_sizes = (30, 60, 111, 162, 213, 264, 315)  # box sizes (one more entry than feature maps)

In [20]:
import itertools

# For every feature map, print the centre (cx, cy) and size of the default box
# at the first and at the last grid cell.
for i, (fm_size, step) in enumerate(zip(fm_sizes, steps)):
    print("Feature map: ", fm_size)
    grid_cells = list(itertools.product(range(fm_size), repeat=2))
    s = box_sizes[i]
    for h, w in (grid_cells[0], grid_cells[-1]):
        # Box centres sit in the middle of each grid cell
        cx, cy = (w + 0.5) * step, (h + 0.5) * step
        print("     ", cx, cy, s)

Feature map:  38
      4.0 4.0 30
      300.0 300.0 30
Feature map:  19
      8.0 8.0 60
      296.0 296.0 60
Feature map:  10
      16.0 16.0 111
      304.0 304.0 111
Feature map:  5
      32.0 32.0 162
      288.0 288.0 162
Feature map:  3
      50.0 50.0 213
      250.0 250.0 213
Feature map:  1
      150.0 150.0 264
      150.0 150.0 264


In [21]:
from customized_torchcv.models.ssd import SSDBoxCoder
from customized_torchcv.utils.box import box_iou, change_box_order

# Build the box coder from the model and display the default boxes it generates
# (one row of 4 values per default box, see the output below).
box_coder = SSDBoxCoder(model)
box_coder._get_default_boxes()


   4.0000    4.0000   30.0000   30.0000
   4.0000    4.0000   42.4264   42.4264
   4.0000    4.0000   42.4264   21.2132
                   ⋮                    
 150.0000  150.0000  288.3748  288.3748
 150.0000  150.0000  373.3524  186.6762
 150.0000  150.0000  186.6762  373.3524
[torch.FloatTensor of size 8732x4]

Tests on some example bounding boxes
- small objects
- large objects

Small objects

In [75]:
# Encode/decode round trip on three small ground-truth boxes.
objs = torch.Tensor([[1, 1, 5, 5], [10, 10, 11, 11], [150, 150, 155, 155]])
labels = torch.LongTensor([1, 0, 2])

loc_targets, cls_targets = box_coder.encode(objs, labels)
print(loc_targets.shape, cls_targets.shape)

# Synthetic class scores: one row per default box with a single non-zero entry
# in the column given by that box's encoded class target.
num_score_columns = 1 + 3  # column 0 plus one column per object class
target_score = 0.7  # score written for the target class
cls_preds = torch.zeros(cls_targets.shape[0], num_score_columns)
cls_preds.scatter_(1, cls_targets.unsqueeze(1), target_score)
# Decoding the encoded targets is expected to recover the input boxes
# (possibly in a different order).
boxes, labels, scores = box_coder.decode(loc_targets, cls_preds, score_thresh=0.0, nms_thresh=0.0)

print(boxes)
print(scores)

torch.Size([8732, 4]) torch.Size([8732])

  10.0000   10.0000   11.0000   11.0000
   1.0000    1.0000    5.0000    5.0000
 150.0000  150.0000  155.0000  155.0000
[torch.FloatTensor of size 3x4]


 0.7000
 0.7000
 0.7000
[torch.FloatTensor of size 3]



In [76]:
# Value range of each of the 4 encoded location components
for i in range(4):
    component = loc_targets[:, i]
    print(i, component.min(), component.max())

0 -140.00718688964844 1.53206467628479
1 -140.00718688964844 3.06412935256958
2 -22.681140899658203 -8.341645240783691
3 -22.681140899658203 -7.225927829742432


In [77]:
# Smallest and largest encoded class target over all default boxes
cls_targets.min(), cls_targets.max()

(0, 3)

In [79]:
# default_boxes = box_coder._get_default_boxes()  # xywh
# default_boxes = change_box_order(default_boxes, 'xywh2xyxy')
# ious = box_iou(default_boxes, objs)  # [#anchors, #obj]
# ious

Large objects

In [74]:
# Encode/decode round trip on two large ground-truth boxes.
objs = torch.Tensor([[1, 1, 50, 50], [150, 150, 255, 255]])
labels = torch.LongTensor([0, 1])

loc_targets, cls_targets = box_coder.encode(objs, labels)
print(loc_targets.shape, cls_targets.shape)

# Synthetic class scores: one row per default box with a single non-zero entry
# in the column given by that box's encoded class target.
num_score_columns = 1 + 2  # column 0 plus one column per object class
target_score = 0.7  # score written for the target class
cls_preds = torch.zeros(cls_targets.shape[0], num_score_columns)
cls_preds.scatter_(1, cls_targets.unsqueeze(1), target_score)
# Decoding the encoded targets is expected to recover the input boxes.
boxes, labels, scores = box_coder.decode(loc_targets, cls_preds, score_thresh=0.0, nms_thresh=0.0)

print(boxes)
print(scores)

torch.Size([8732, 4]) torch.Size([8732])

   1.0000    1.0000   50.0000   50.0000
 150.0000  150.0000  255.0000  255.0000
[torch.FloatTensor of size 2x4]


 0.7000
 0.7000
[torch.FloatTensor of size 2]



## Loss function

Loss function is a sum of Smooth L1 (location) and Cross-Entropy (classification) terms

In [84]:
from customized_torchcv.loss.ssd_loss import SSDLoss

# Loss object configured with the same number of classes as the model
loss_fn = SSDLoss(num_classes=model.num_classes)

# Single random 300x300 three-channel input and the model's predictions for it.
# NOTE: this rebinds x, loc_preds and cls_preds defined by earlier cells.
x = to_variable(torch.rand(1, 3, 300, 300))
loc_preds, cls_preds = model(x)

In [86]:
# Shape of the class predictions for the single-image batch
cls_preds.shape

torch.Size([1, 8732, 10])

In [90]:
# Raw class and location predictions of the first box of the first sample, as numpy arrays
cls_preds[0, 0, :].data.numpy(), loc_preds[0, 0, :].data.numpy()

(array([ 0.7277434 , -0.34593922, -0.44672066,  0.12167928, -0.32203   ,
         1.1578766 , -0.20052607,  0.52037036, -0.66037816,  0.11640968],
       dtype=float32),
 array([ 0.22206742,  0.211182  ,  0.4276462 , -0.02398141], dtype=float32))

In [98]:
# Encode the three small boxes again and add a leading batch dimension of
# size 1 to both targets, so they line up with the model's batched predictions.
objs = torch.Tensor([[1, 1, 5, 5], [10, 10, 11, 11], [150, 150, 155, 155]])
labels = torch.LongTensor([1, 0, 2])
loc_targets, cls_targets = (
    target.unsqueeze(0) for target in box_coder.encode(objs, labels)
)
cls_targets.shape

torch.Size([1, 8732])

In [110]:
# Positive targets: default boxes whose encoded class target is non-zero
pos = cls_targets.gt(0)
num_pos = pos.long().sum()
print(num_pos)

# Broadcast the positives over the coordinate axis so the mask has the same
# shape as loc_preds
mask = pos.unsqueeze(-1).expand_as(loc_preds)
print(mask.shape)

3
torch.Size([1, 8732, 4])


In [101]:
from torch.nn import functional as F

# Smooth L1 loss between predicted and target locations, restricted to the
# positive boxes selected by `mask`; size_average=False is passed so the loss
# is not averaged over elements.
loc_loss = F.smooth_l1_loss(loc_preds[mask], to_variable(loc_targets[mask]), size_average=False)

In [102]:
# Display the location loss computed in the previous cell
loc_loss

Variable containing:
 75.0979
[torch.FloatTensor of size 1]