In [1]:
import os

import torch as t
from torch.autograd import Variable as V
from torch import nn
from torchvision.models import vgg16

### 1. Use a conv layer in PyTorch

In [2]:
# Dummy batch holding one 3-channel 227x227 image. Tensors can be passed to
# modules directly; the deprecated autograd.Variable wrapper is not needed.
img = t.randn((1, 3, 227, 227))
# 3x3 convolution, 3 -> 64 channels, no padding: each spatial dim shrinks by 2.
conv1 = nn.Conv2d(3, 64, (3, 3))
features = conv1(img)
print(features.shape)

torch.Size([1, 64, 225, 225])


### 2. The full pretrained VGG16 net

In [3]:
# Load VGG16 with its pretrained weights and split it into its two
# sub-networks: the convolutional extractor and the classifier.
model = vgg16(pretrained=True)
extractor, classifier = model.features, model.classifier

In [4]:
# Inspect the convolutional part of VGG16 (model.features)
extractor

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(

In [5]:
# Inspect the fully connected part of VGG16 (model.classifier)
classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace)
  (2): Dropout(p=0.5)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace)
  (5): Dropout(p=0.5)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)

In [6]:
# Run the dummy image through the convolutional extractor only.
feature = extractor(img)
feature.shape # five stride-2 max-pools => total downsampling of 2**5 = 32 (227 -> 7)

torch.Size([1, 512, 7, 7])

### 3. Just use part of the pretrained model

In [2]:
# Put the pretrained VGG16 weights here. "~" is expanded to the current
# user's home directory, so the notebook is not tied to the root account
# (with a home directory of /root this is the same path as before).
vgg_checkpoint = os.path.expanduser("~/.torch/models/vgg16-397923af.pth")

#### 3.1 Drop the last FC layer

In [8]:
# VGG16 with the final fully connected layer removed
class EncoderCNN(nn.Module):
    """VGG16 loaded from ``vgg_checkpoint`` whose classifier stops before its last layer."""

    def __init__(self):
        super(EncoderCNN, self).__init__()
        backbone = vgg16()
        backbone.load_state_dict(t.load(vgg_checkpoint))
        # rebuild the classifier from layers 0-5 only, leaving out index 6
        head = list(backbone.classifier)[:6]
        backbone.classifier = nn.Sequential(*head)
        self.vgg = backbone

    def forward(self, images):
        """Run ``images`` through the truncated VGG16 and return its output."""
        return self.vgg(images)

In [9]:
# Build the truncated network and push the dummy image through it.
model = EncoderCNN()
r = model(img)
r.shape

torch.Size([1, 4096])

#### 3.2 Drop whatever layer you want

In [10]:
class EncoderCNN(nn.Module):
    """VGG16 encoder that keeps only selected classifier layers.

    The convolutional extractor is kept whole. From the classifier, the two
    dropout layers (indices 2 and 5) and the last FC layer (index 6) are
    removed, which leaves Linear -> ReLU -> Linear -> ReLU.
    """

    def __init__(self):
        super(EncoderCNN, self).__init__()

        vgg = vgg16()
        vgg.load_state_dict(t.load(vgg_checkpoint))

        # keep the extractor (torchvision exposes it as `features`)
        self.extractor = vgg.features

        # classifier
        # Select by ORIGINAL index instead of deleting one element at a time:
        # every `del` shifts the remaining indices, so later deletions would
        # hit the wrong position or run past the end of the list.
        dropped = (2, 5, 6)  # 1st dropout, 2nd dropout, last FC layer
        classifier = [
            layer
            for index, layer in enumerate(vgg.classifier)
            if index not in dropped
        ]
        self.classifier = nn.Sequential(*classifier)

    def forward(self, images):
        """Return the output of the trimmed classifier for ``images``."""
        features = self.extractor(images)
        # The Linear layers take a 2-D (batch, features) input, so the
        # extractor's feature maps are flattened per sample first.
        features = features.view(features.size(0), -1)
        r = self.classifier(features)
        return r

In [None]:
# Instantiate the encoder with the trimmed classifier and run the dummy image.
model = EncoderCNN()
r = model(img)
r.shape

#### 3.3 Use the VGG16 extractor to generate multiple feature maps

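- The output of extractor layer 15 is the conv3 feature map (ReLU after conv3_3)
- The output of extractor layer 22 is the conv4 feature map (ReLU after conv4_3)
- The output of extractor layer 29 is the conv5 feature map (ReLU after conv5_3)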

In [9]:
class EncoderCNN(nn.Module):
    """Split the VGG16 extractor into three consecutive stages.

    ``forward`` returns the output of every stage, i.e. the activations
    after extractor layers 15, 22 and 29.
    """

    def __init__(self):
        super(EncoderCNN, self).__init__()

        backbone = vgg16()
        backbone.load_state_dict(t.load(vgg_checkpoint))

        # turn the extractor into a plain list so it can be sliced
        layers = list(backbone.features)

        self.stage_1 = nn.Sequential(*layers[:16])    # layers 0-15
        self.stage_2 = nn.Sequential(*layers[16:23])  # layers 16-22
        self.stage_3 = nn.Sequential(*layers[23:30])  # layers 23-29

    def forward(self, images):
        """Return ``[stage_1 output, stage_2 output, stage_3 output]``."""
        outputs = []
        activation = images
        # each stage consumes the previous stage's output
        for stage in (self.stage_1, self.stage_2, self.stage_3):
            activation = stage(activation)
            outputs.append(activation)
        return outputs

In [10]:
# Instantiate the three-stage feature extractor.
model = EncoderCNN()

In [11]:
# Dummy batch holding one 3-channel 227x227 image, used as the model input
# below. Tensors can be passed to modules directly; the deprecated
# autograd.Variable wrapper is not needed.
img = t.randn((1, 3, 227, 227))
# Sanity check with a single 3x3 convolution (3 -> 64 channels, no padding).
conv1 = nn.Conv2d(3, 64, (3, 3))
features = conv1(img)
print(features.shape)

torch.Size([1, 64, 225, 225])


In [12]:
# r is a list holding the outputs of the three extractor stages
r = model(img)

In [15]:
# print the shape of every feature map returned by the model
for feature_map in r:
    print(feature_map.shape)

torch.Size([1, 256, 56, 56])
torch.Size([1, 512, 28, 28])
torch.Size([1, 512, 14, 14])


### 4. Build a classifier based on 3 feature maps 

- VGG16 is used as the backbone of a basic classifier for the CIFAR-10 dataset.
- Adaptive average pooling (used here in place of ROI pooling) brings every feature map to the same 7*7 spatial size.

#### 4.1 Test the output of adaptive pooling

In [3]:
import torch as t
from torch.nn import AdaptiveAvgPool2d
from torch.autograd import Variable as V

import numpy as np

In [47]:
# adaptive average pooling layer with a fixed 2x2 output size
p = AdaptiveAvgPool2d((2,2))

In [48]:
# 5x5 grid holding 0..24, with a leading axis added -> shape (1, 5, 5)
x = t.Tensor(np.arange(25).reshape((5, 5))).unsqueeze(dim=0)
# pool the grid down to 2x2
r = p(V(x))

In [55]:
# show the pooled result, the original input and the pooled shape
print(r)
print(x)
print(r.shape)

tensor([[[  6.,   8.],
         [ 16.,  18.]]])
tensor([[[  0.,   1.,   2.,   3.,   4.],
         [  5.,   6.,   7.,   8.,   9.],
         [ 10.,  11.,  12.,  13.,  14.],
         [ 15.,  16.,  17.,  18.,  19.],
         [ 20.,  21.,  22.,  23.,  24.]]])
torch.Size([1, 2, 2])


#### 4.2 build net

In [4]:
class shift_VGG(nn.Module):
    """10-way classifier built on three VGG16 feature maps.

    The outputs of the three extractor stages (256, 512 and 512 channels) are
    each pooled to ``roi_dim``, concatenated along the channel axis (1280
    channels), reduced to 512 channels by a 1x1 convolution, flattened, and
    passed through the VGG16 classifier (without its last layer) followed by
    a new 4096 -> 10 linear layer.

    Args:
        roi_dim: output size handed to ``nn.AdaptiveAvgPool2d``. The reused
            classifier starts with a Linear layer of 25088 (= 512 * 7 * 7)
            input features, so the pooled maps must be 7x7, i.e. ``roi_dim=7``.
    """

    def __init__(self, roi_dim):
        super(shift_VGG, self).__init__()

        vgg = vgg16()
        vgg.load_state_dict(t.load(vgg_checkpoint))

        # split the extractor into three stages
        extractor = list(vgg.features)
        # reuse classifier layers 0-5, dropping the last FC layer (index 6)
        classifier = list(vgg.classifier)[:6]

        self.stage_1 = nn.Sequential(*extractor[0:16])   # layers 0-15
        self.stage_2 = nn.Sequential(*extractor[16:23])  # layers 16-22
        self.stage_3 = nn.Sequential(*extractor[23:30])  # layers 23-29

        self.roipooling_1 = nn.AdaptiveAvgPool2d(roi_dim) # for conv3_3
        self.roipooling_2 = nn.AdaptiveAvgPool2d(roi_dim) # for conv4_3
        self.roipooling_3 = nn.AdaptiveAvgPool2d(roi_dim) # for conv5_3

        self.conv_1 = nn.Conv2d(1280, 512, (1,1)) # 1*1 conv layer

        self.classifier = nn.Sequential(*classifier)
        self.linear = nn.Linear(4096, 10)

    def forward(self, images):
        """Return the 10 class scores for every image in ``images``."""
        conv3_3 = self.stage_1(images)
        conv4_3 = self.stage_2(conv3_3)
        conv5_3 = self.stage_3(conv4_3)

        feature_3 = self.roipooling_1(conv3_3)  # 256
        feature_4 = self.roipooling_2(conv4_3)  # 512
        feature_5 = self.roipooling_3(conv5_3)  # 512

        # stack the pooled maps along the channel axis: 256 + 512 + 512 = 1280
        features = t.cat([feature_3, feature_4, feature_5], dim=1)

        re_features = self.conv_1(features)
        # The classifier's Linear layers take a 2-D (batch, features) input,
        # so the 4-D conv output is flattened per sample first.
        flat_features = re_features.view(re_features.size(0), -1)
        hidden = self.classifier(flat_features)
        output = self.linear(hidden)

        return output

In [5]:
# 112x112 grid holding 0..112**2-1 as a float tensor
x = t.Tensor(np.arange(112**2).reshape((112, 112)))
# add a channel axis and copy the grid into 3 identical channels
x = x.unsqueeze(dim=0).repeat((3, 1, 1))
print(x.shape)
# add the batch axis
x = x.unsqueeze(dim=0)
print(x.shape)

torch.Size([3, 112, 112])
torch.Size([1, 3, 112, 112])


In [None]:
# pool every feature map to 7x7
net = shift_VGG(roi_dim=7)

In [None]:
# forward pass on the dummy input x
net(x)