In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.vgg import vgg16_bn

In [57]:
class FurnitureVGG16BNSSD300Like(nn.Module):
    """SSD-like image classifier built on ``VGG16Extractor300``.

    Each spatial cell ("box") of every extractor feature map gets one score
    vector over the classes from a 3x3 convolution. For every class a
    dedicated ``nn.Linear`` then merges the scores of all boxes into a single
    value; the merged vector goes through ReLU, dropout and a final linear
    classifier.

    Args:
        num_classes: number of target classes.
        pretrained: forwarded to ``VGG16Extractor300``.
    """

    # Spatial side of each extractor feature map; only used to count boxes.
    fm_sizes = (9, 5, 3, 1)

    def __init__(self, num_classes, pretrained=True):
        super().__init__()

        self.num_classes = num_classes
        self.num_anchors = (1, 1, 1, 1)
        self.in_channels = (512, 256, 256, 256)

        self.extractor = VGG16Extractor300(pretrained=pretrained)

        # One classification convolution per feature level.
        self.cls_layers = nn.ModuleList([
            nn.Conv2d(channels, anchors * self.num_classes, kernel_size=3, padding=1)
            for channels, anchors in zip(self.in_channels, self.num_anchors)
        ])

        # One "all boxes -> single score" linear layer per class.
        n_boxes = sum(side * side for side in self.fm_sizes)
        self.boxes_to_classes = nn.ModuleList([
            nn.Linear(n_boxes, 1) for _ in range(num_classes)
        ])

        self.relu = nn.ReLU(inplace=True)
        self.drop = nn.Dropout(p=0.4)
        self.final_classifier = nn.Linear(num_classes, num_classes)

        # The VGG backbone stays frozen; the extra layers remain trainable.
        for backbone_param in self.extractor.features.parameters():
            backbone_param.requires_grad = False

    def forward(self, x):
        """Return a ``(batch, num_classes)`` tensor of class scores for ``x``."""
        feature_maps = self.extractor(x)

        per_level = []
        for level, fmap in enumerate(feature_maps):
            # (B, C, H, W) -> (B, H, W, C) -> (B, H * W, num_classes)
            scores = self.cls_layers[level](fmap).permute(0, 2, 3, 1).contiguous()
            per_level.append(scores.view(scores.size(0), -1, self.num_classes))
        box_scores = torch.cat(per_level, 1)

        # Merge the scores of all boxes class by class: (B, n_boxes) -> (B, 1).
        merged = torch.cat(
            [merge(box_scores[:, :, cls_idx])
             for cls_idx, merge in enumerate(self.boxes_to_classes)],
            1,
        )

        hidden = self.drop(self.relu(merged))
        return self.final_classifier(hidden)


class VGG16Extractor300(nn.Module):
    """VGG16-BN backbone followed by SSD-style extra convolutions.

    ``forward`` returns a list with the ReLU-activated outputs of
    ``conv8_2``, ``conv9_2``, ``conv10_2`` and ``conv11_2`` (in that order).

    Args:
        pretrained: forwarded to torchvision's ``vgg16_bn``.
    """

    def __init__(self, pretrained):
        super().__init__()

        # Drop the last layer of the VGG feature stack (the final max pooling).
        self.features = vgg16_bn(pretrained=pretrained).features[0:-1]

        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

        # Each extra stage is a 1x1 channel reduction followed by a 3x3 conv.
        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3)

        # Alias grouping every layer added on top of the VGG backbone.
        self.top_features = nn.ModuleList([
            self.conv6, self.conv7,
            self.conv8_1, self.conv8_2,
            self.conv9_1, self.conv9_2,
            self.conv10_1, self.conv10_2,
            self.conv11_1, self.conv11_2,
        ])

    def forward(self, x):
        """Return the four multi-scale feature maps computed from ``x``."""
        h = self.features(x)
        h = F.max_pool2d(h, kernel_size=3, stride=1, padding=1, ceil_mode=True)

        for layer in (self.conv6, self.conv7):
            h = F.relu(layer(h))

        stages = (
            (self.conv8_1, self.conv8_2),
            (self.conv9_1, self.conv9_2),
            (self.conv10_1, self.conv10_2),
            (self.conv11_1, self.conv11_2),
        )
        outputs = []
        for reduce, expand in stages:
            h = F.relu(reduce(h))
            h = F.relu(expand(h))
            outputs.append(h)  # output of the stage's *_2 convolution
        return outputs

In [58]:
model = FurnitureVGG16BNSSD300Like(128)

In [59]:
# Smoke test: run only the feature extractor on a random batch of four
# 3-channel 300x300 inputs to inspect the feature-map shapes.
x = torch.rand(4, 3, 300, 300)
y = model.extractor(x)

In [60]:
print([i.shape for i in y])

[torch.Size([4, 512, 9, 9]), torch.Size([4, 256, 5, 5]), torch.Size([4, 256, 3, 3]), torch.Size([4, 256, 1, 1])]


In [62]:
y = model(x)
y.shape

torch.Size([4, 128])

In [64]:
import numpy as np

# Natural log of 1/128: the log-probability a uniform guess assigns to any one
# of 128 classes (128 is the num_classes used to build the model above). Its
# negation is the cross-entropy of such a uniform guess.
np.log(1.0/128.0)

-4.852030263919617

Inception-ResnetV2 extractor for 350x350 SSD-like classification model

In [2]:
from pretrainedmodels.models.inceptionresnetv2 import inceptionresnetv2

In [23]:
class FurnitureInceptionResNetV4350SSDLike_v2(nn.Module):
    """SSD-like classifier on top of ``Extractor350_v2``.

    Every extractor feature level gets its own head (3x3 convolution followed
    by a sigmoid) that produces one score vector per spatial cell ("box").
    The box scores of a level are summed, and the per-level sums are then
    aggregated in two parallel ways (see ``forward``) before the final
    classifier.

    Note:
        Each head is stored as ONE ``nn.Sequential`` entry of ``cls_layers``.
        Earlier revisions appended the convolution and the sigmoid as two
        separate ``ModuleList`` entries, so ``cls_layers[i]`` did not address
        the head of level ``i`` (level 1 ran a bare sigmoid, and the sigmoid
        was never applied after a convolution). Parameter names therefore
        changed from ``cls_layers.{0,2,4}.*`` to ``cls_layers.{0,1,2}.0.*``;
        checkpoints saved with the old layout need their keys remapped.

    Args:
        num_classes: number of target classes.
        pretrained: forwarded to ``Extractor350_v2``.
    """

    def __init__(self, num_classes, pretrained='imagenet'):
        super(FurnitureInceptionResNetV4350SSDLike_v2, self).__init__()

        self.extractor = Extractor350_v2(pretrained=pretrained)

        self.num_classes = num_classes
        self.num_anchors = (1, 1, 1)
        self.in_channels = self.extractor.channels

        # One (conv -> sigmoid) head per feature level, indexable by level.
        self.cls_layers = nn.ModuleList()
        for i in range(len(self.in_channels)):
            self.cls_layers.append(nn.Sequential(
                nn.Conv2d(self.in_channels[i], self.num_anchors[i] * self.num_classes,
                          kernel_size=3, padding=1),
                nn.Sigmoid()
            ))

        n_levels = len(self.extractor.featuremap_sizes)
        # Aggregation A: one "levels -> single score" linear layer per class.
        self.boxes_to_classes = nn.ModuleList(
            [nn.Linear(n_levels, 1) for _ in range(num_classes)]
        )

        # Aggregation B: all levels and classes at once -> class vector.
        self.inner_classifier = nn.Linear(n_levels * num_classes, num_classes)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.4)
        # Consumes the concatenation of aggregations A and B.
        self.final_classifier = nn.Linear(2 * num_classes, num_classes)

    def forward(self, x):
        """Return a ``(batch, num_classes)`` tensor of class scores for ``x``."""
        cls_preds = []
        feature_maps = self.extractor(x)
        # Transform output feature maps to bbox predictions
        for i, fm in enumerate(feature_maps):
            cls_pred = self.cls_layers[i](fm)
            # (B, C, H, W) -> (B, H, W, C) -> (B, H * W, num_classes)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous()
            cls_pred = cls_pred.view(cls_pred.size(0), -1, self.num_classes)
            # Sum all predictions of all boxes at single level: (B, 1, num_classes)
            cls_pred = torch.sum(cls_pred, dim=1).unsqueeze(1)
            cls_preds.append(cls_pred)

        # Two ways to aggregate
        # A) Predictions from each bbox level are transformed with FC to a single
        # value for each target class
        cls_preds_a = torch.cat(cls_preds, dim=1)  # (B, n_levels, num_classes)
        merged_cls_preds = []
        for i, m in enumerate(self.boxes_to_classes):
            merged_cls_preds.append(m(cls_preds_a[:, :, i]))
        merged_cls_preds = torch.cat(merged_cls_preds, 1)
        out_a = self.relu(merged_cls_preds)

        # B) Predictions from each bbox level are transformed with FC to a vector
        # of per-class outputs
        cls_preds_b = torch.cat(cls_preds, dim=2).squeeze(1)  # (B, n_levels * num_classes)
        out_b = self.inner_classifier(cls_preds_b)
        out_b = self.relu(out_b)

        # Aggregate results:
        out = torch.cat([out_a, out_b], dim=1)

        out = self.drop(out)
        out = self.final_classifier(out)
        return out


class Extractor350_v2(nn.Module):
    """Multi-scale feature extractor built from an Inception-ResNet-v2 backbone.

    ``forward`` returns three 256-channel feature maps: the smoothed outputs
    of ``low_features_b``, ``mid_features`` and ``top_features``.

    Args:
        pretrained: forwarded to ``inceptionresnetv2``.
    """

    # Spatial side of the three returned maps (as printed for a 350x350 input).
    featuremap_sizes = (20, 9, 1)
    # Channel count of each returned map.
    channels = (256, 256, 256)

    def __init__(self, pretrained):
        super().__init__()

        backbone = inceptionresnetv2(pretrained=pretrained)

        self.stem = nn.Sequential(
            backbone.conv2d_1a, backbone.conv2d_2a, backbone.conv2d_2b,
            backbone.maxpool_3a,
            backbone.conv2d_3b, backbone.conv2d_4a,
            backbone.maxpool_5a,
        )
        self.low_features_a = nn.Sequential(backbone.mixed_5b, backbone.repeat)
        self.low_features_b = nn.Sequential(backbone.mixed_6a, backbone.repeat_1)
        self.mid_features = nn.Sequential(backbone.mixed_7a, backbone.repeat_2, backbone.block8)
        self.top_features = nn.Sequential(backbone.conv2d_7b, backbone.avgpool_1a)

        def two_conv_relu(in_ch, mid_ch, kernel_size, padding):
            # conv -> ReLU -> conv -> ReLU, always ending with 256 channels.
            return nn.Sequential(
                nn.Conv2d(in_ch, mid_ch, kernel_size=kernel_size, stride=1, padding=padding),
                nn.ReLU(),
                nn.Conv2d(mid_ch, 256, kernel_size=kernel_size, stride=1, padding=padding),
                nn.ReLU(),
            )

        self.smooth2 = two_conv_relu(1088, 256, kernel_size=3, padding=1)
        self.smooth3 = two_conv_relu(2080, 320, kernel_size=3, padding=1)
        self.top_smooth = two_conv_relu(1536, 256, kernel_size=1, padding=0)

        # Alias grouping the layers added on top of the backbone.
        self.smooth_layers = nn.ModuleList([self.smooth2, self.smooth3, self.top_smooth])

    def forward(self, x):
        """Return ``[smooth2(low), smooth3(mid), top_smooth(top)]`` for ``x``."""
        low = self.low_features_b(self.low_features_a(self.stem(x)))
        pyramid = [self.smooth2(low)]

        mid = self.mid_features(low)
        pyramid.append(self.smooth3(mid))

        top = self.top_features(mid)
        pyramid.append(self.top_smooth(top))
        return pyramid


In [24]:
model = FurnitureInceptionResNetV4350SSDLike_v2(128)

In [25]:
# Smoke test: run only the feature extractor on a random batch of four
# 3-channel 350x350 inputs to inspect the feature-map shapes.
x = torch.rand(4, 3, 350, 350)
y = model.extractor(x)

In [26]:
print([i.shape for i in y])

[torch.Size([4, 256, 20, 20]), torch.Size([4, 256, 9, 9]), torch.Size([4, 256, 1, 1])]


In [27]:
y = model(x)
y.shape

torch.Size([4, 128])

RetinaNet for classification

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision.models.resnet import resnet50


class FurnitureRetinaNetClassification(nn.Module):
    """RetinaNet-style classifier on top of the ``FPN50`` feature pyramid.

    A single convolutional head, shared across all pyramid levels, produces
    one score vector per spatial cell. Per level the scores are rectified and
    max-pooled over all cells; the per-level vectors are concatenated and fed
    through two linear layers with dropout in between.

    Args:
        num_classes: number of target classes.
        pretrained: forwarded to ``FPN50``.
    """

    num_anchors = 1

    def __init__(self, num_classes, pretrained=True):
        super().__init__()
        self.fpn = FPN50(pretrained)
        self.num_classes = num_classes
        self.cls_head = self._make_head(self.num_anchors * self.num_classes)

        # Number of feature maps consumed from the pyramid in ``forward``.
        n_levels = 5
        self.inner_classifier = nn.Linear(n_levels * num_classes, num_classes)
        # NOTE(review): ``self.relu`` is never applied in ``forward``, so the two
        # linear layers are separated by dropout only. Confirm this is intended.
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.4)
        self.final_classifier = nn.Linear(num_classes, num_classes)

    def forward(self, x):
        """Return a ``(batch, num_classes)`` tensor of class scores for ``x``."""
        batch_size = x.size(0)
        pyramid = self.fpn(x)

        level_scores = []
        for fm in pyramid:
            # (B, C, H, W) -> (B, H, W, C) -> (B, H * W, num_classes)
            scores = self.cls_head(fm).permute(0, 2, 3, 1).contiguous()
            scores = F.relu(scores.view(batch_size, -1, self.num_classes))
            # Keep the strongest response per class over all cells of the level.
            strongest, _ = torch.max(scores, dim=1)
            level_scores.append(strongest)

        features = torch.cat(level_scores, dim=1)
        hidden = self.drop(self.inner_classifier(features))
        return self.final_classifier(hidden)

    def _make_head(self, out_planes):
        """Build the head: four 256->256 conv+ReLU pairs, then a conv to ``out_planes``."""
        layers = []
        for _ in range(4):
            layers += [
                nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
            ]
        layers += [nn.Conv2d(256, out_planes, kernel_size=3, stride=1, padding=1)]
        return nn.Sequential(*layers)


class FPN50(nn.Module):
    """Feature pyramid network on top of torchvision's ResNet-50.

    ``forward`` returns five 256-channel feature maps ``(p3, p4, p5, p6, p7)``.

    Args:
        pretrained: forwarded to ``resnet50``.
    """

    def __init__(self, pretrained):
        super(FPN50, self).__init__()

        # The whole ResNet is kept as an attribute; the groups below reference
        # its sub-modules, so the same layers are reachable under several names.
        self.resnet = resnet50(pretrained=pretrained)

        self.stem = nn.Sequential(
            self.resnet.conv1,
            self.resnet.bn1,
            self.resnet.relu,
            self.resnet.maxpool
        )

        self.low_features = nn.Sequential(
            self.resnet.layer1,
            self.resnet.layer2,
        )
        self.layer3 = self.resnet.layer3
        self.layer4 = self.resnet.layer4

        # Extra stride-2 levels computed on top of the last backbone stage.
        self.conv6 = nn.Conv2d(2048, 256, kernel_size=3, stride=2, padding=1)
        self.conv7 = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)

        # Top-down layers
        self.toplayer = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0)

        # Lateral layers
        self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer2 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)

        # Smooth layers
        self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Aliases
        self.mid_features = nn.ModuleList([
            self.layer3,
            self.layer4,
        ])

        self.top_features = nn.ModuleList([
            self.conv6,
            self.conv7,
            self.toplayer,
            self.latlayer1,
            self.latlayer2,
            self.smooth1,
            self.smooth2
        ])

    def _upsample_add(self, x, y):
        """Upsample and add two feature maps.

        Args:
          x: (Tensor) top feature map to be upsampled.
          y: (Tensor) lateral feature map.

        Returns:
          (Tensor) added feature map.

        Note in PyTorch, when input size is odd, a feature map upsampled
        with `F.interpolate(..., scale_factor=2, mode='nearest')`
        may not be equal to the lateral feature map size.

        e.g.
        original input size: [N,_,15,15] ->
        conv2d feature map size: [N,_,8,8] ->
        upsampled feature map size: [N,_,16,16]

        So we choose bilinear upsampling to an explicit target size, which
        supports arbitrary output sizes.
        """
        _, _, h, w = y.size()
        # F.interpolate replaces the deprecated F.upsample (same arguments).
        return F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + y

    def forward(self, x):
        """Return the pyramid ``(p3, p4, p5, p6, p7)`` computed from ``x``."""
        # Bottom-up
        c1 = self.stem(x)
        c3 = self.low_features(c1)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)
        p6 = self.conv6(c5)
        p7 = self.conv7(F.relu(p6))
        # Top-down
        p5 = self.toplayer(c5)
        p4 = self._upsample_add(p5, self.latlayer1(c4))
        p4 = self.smooth1(p4)
        p3 = self._upsample_add(p4, self.latlayer2(c3))
        p3 = self.smooth2(p3)
        return p3, p4, p5, p6, p7

In [40]:
model = FurnitureRetinaNetClassification(128)

In [41]:
# Smoke test: run only the feature pyramid on a random batch of four
# 3-channel 350x350 inputs to inspect the shapes of the pyramid levels.
x = torch.rand(4, 3, 350, 350)
y = model.fpn(x)

In [42]:
print([i.shape for i in y])

[torch.Size([4, 256, 44, 44]), torch.Size([4, 256, 22, 22]), torch.Size([4, 256, 11, 11]), torch.Size([4, 256, 6, 6]), torch.Size([4, 256, 3, 3])]


In [43]:
y = model(x)

In [44]:
y.shape

torch.Size([4, 128])

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from pretrainedmodels.models.inceptionresnetv2 import inceptionresnetv2


class FurnitureInceptionResNet350RetinaLike(nn.Module):
    """RetinaNet-like classifier on top of ``InceptionResnetFPN350``.

    Two branches are summed before the final classifier:

    * A) global average pooling of the last backbone map + linear layer;
    * B) a convolutional head shared across the pyramid levels, whose
      rectified per-level maxima are added up and passed to a linear layer.

    Args:
        num_classes: number of target classes.
        pretrained: forwarded to ``InceptionResnetFPN350``.
    """

    def __init__(self, num_classes, pretrained='imagenet'):
        super().__init__()

        self.fpn = InceptionResnetFPN350(pretrained=pretrained)

        self.num_classes = num_classes
        self.cls_head = self._make_head(self.num_classes)

        # Branch A: standard classification on the 1536-channel backbone map.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.base_classifier = nn.Linear(1536, num_classes)

        # Branch B: classification from the pyramid ("boxes") scores.
        self.boxes_classifier = nn.Linear(num_classes, num_classes)
        # NOTE(review): ``self.relu`` is never applied in ``forward``. Confirm
        # that the purely linear merge of both branches is intended.
        self.relu = nn.ReLU()
        self.final_classifier = nn.Linear(num_classes, num_classes)

    def forward(self, x):
        """Return a ``(batch, num_classes)`` tensor of class scores for ``x``."""
        c5, pyramid = self.fpn(x)
        batch_size = x.size(0)

        # A) Standard classification:
        pooled = self.avgpool(c5)
        out_a = self.base_classifier(pooled.view(pooled.size(0), -1))

        # B) Boxes classification:
        level_scores = []
        for fm in pyramid:
            # (B, C, H, W) -> (B, H, W, C) -> (B, H * W, num_classes)
            scores = self.cls_head(fm).permute(0, 2, 3, 1).contiguous()
            scores = F.relu(scores.view(batch_size, -1, self.num_classes))
            strongest, _ = torch.max(scores, dim=1)
            level_scores.append(strongest)
        # Built-in sum starts from 0 and adds the levels in order.
        out_b = self.boxes_classifier(sum(level_scores))

        # Merge A + B
        return self.final_classifier(out_a + out_b)

    def _make_head(self, out_planes):
        """Build the head: four 256->256 conv+ReLU pairs, then a conv to ``out_planes``."""
        layers = []
        for _ in range(4):
            layers += [
                nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
            ]
        layers += [nn.Conv2d(256, out_planes, kernel_size=3, stride=1, padding=1)]
        return nn.Sequential(*layers)


class InceptionResnetFPN350(nn.Module):
    """Feature pyramid network on top of an Inception-ResNet-v2 backbone.

    ``forward`` returns the last backbone feature map ``c5`` together with
    five 256-channel pyramid maps ``(p3, p4, p5, p6, p7)``.

    Args:
        pretrained: forwarded to ``inceptionresnetv2``.
    """

    def __init__(self, pretrained):
        super(InceptionResnetFPN350, self).__init__()

        model = inceptionresnetv2(pretrained=pretrained)
        self.stem = nn.Sequential(
            model.conv2d_1a,
            model.conv2d_2a,
            model.conv2d_2b,
            model.maxpool_3a,
            model.conv2d_3b,
            model.conv2d_4a,
            model.maxpool_5a,
        )

        self.low_features = nn.Sequential(
            model.mixed_5b,
            model.repeat,
        )

        self.layer3 = nn.Sequential(
            model.mixed_6a,
            model.repeat_1
        )

        self.layer4 = nn.Sequential(
            model.mixed_7a,
            model.repeat_2,
            model.block8,
            model.conv2d_7b,
        )

        # Extra stride-2 levels computed on top of the last backbone stage.
        self.conv6 = nn.Conv2d(1536, 256, kernel_size=3, stride=2, padding=1)
        self.conv7 = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)

        # Top-down layers
        self.toplayer = nn.Conv2d(1536, 256, kernel_size=1, stride=1, padding=0)

        # Lateral layers
        self.latlayer1 = nn.Conv2d(1088, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer2 = nn.Conv2d(320, 256, kernel_size=1, stride=1, padding=0)

        # Smooth layers
        self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Aliases
        self.mid_features = nn.ModuleList([
            self.layer3,
            self.layer4,
        ])

        self.top_features = nn.ModuleList([
            self.conv6,
            self.conv7,
            self.toplayer,
            self.latlayer1,
            self.latlayer2,
            self.smooth1,
            self.smooth2
        ])

    def _upsample_add(self, x, y):
        """Bilinearly resize ``x`` to the spatial size of ``y`` and add ``y``.

        Args:
          x: (Tensor) top feature map to be upsampled.
          y: (Tensor) lateral feature map; defines the target height/width.

        Returns:
          (Tensor) added feature map with the spatial size of ``y``.
        """
        _, _, h, w = y.size()
        # F.interpolate replaces the deprecated F.upsample (same arguments);
        # an explicit target size handles odd lateral sizes as well.
        return F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + y

    def forward(self, x):
        """Return ``(c5, (p3, p4, p5, p6, p7))`` computed from ``x``."""
        # Bottom-up
        c1 = self.stem(x)
        c3 = self.low_features(c1)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)
        p6 = self.conv6(c5)
        p7 = self.conv7(F.relu(p6))
        # Top-down
        p5 = self.toplayer(c5)
        p4 = self._upsample_add(p5, self.latlayer1(c4))
        p4 = self.smooth1(p4)
        p3 = self._upsample_add(p4, self.latlayer2(c3))
        p3 = self.smooth2(p3)
        return c5, (p3, p4, p5, p6, p7)


In [2]:
model = FurnitureInceptionResNet350RetinaLike(128)

In [3]:
# Smoke test: run only the feature pyramid on a random batch of four
# 3-channel 350x350 inputs; it returns the backbone map and the pyramid levels.
x = torch.rand(4, 3, 350, 350)
y = model.fpn(x)

In [7]:
print(y[0].shape, [i.shape for i in y[1]])

torch.Size([4, 1536, 9, 9]) [torch.Size([4, 256, 41, 41]), torch.Size([4, 256, 20, 20]), torch.Size([4, 256, 9, 9]), torch.Size([4, 256, 5, 5]), torch.Size([4, 256, 3, 3])]


In [8]:
y = model(x)

In [9]:
y.shape

torch.Size([4, 128])