- http://localhost:8887/edit/git/pytorch-tutorial/tutorials/03-advanced/image_captioning/model.py
- http://localhost:8887/edit/git/pytorch-tutorial/tutorials/03-advanced/image_captioning/train.py

In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    """Image encoder: a ResNet-152 trunk followed by a linear projection and batch norm."""

    def __init__(self, embed_size):
        """Build the encoder from a pretrained ResNet-152 with its final fc layer removed.

        Args:
            embed_size: number of output features produced by the projection head.
        """
        super().__init__()
        backbone = models.resnet152(pretrained=True)
        # Keep every child of the backbone except the last one (the fc classifier).
        *trunk_layers, _ = backbone.children()
        self.resnet = nn.Sequential(*trunk_layers)
        # The projection consumes as many features as the discarded fc layer did.
        self.linear = nn.Linear(backbone.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return one feature vector per input image.

        The ResNet trunk runs under ``torch.no_grad()``, so no gradients are
        recorded for it; only the linear and batch-norm layers are tracked.
        """
        with torch.no_grad():
            pooled = self.resnet(images)
        # Collapse everything after the batch dimension into a single axis.
        flat = pooled.reshape(pooled.size(0), -1)
        projected = self.linear(flat)
        return self.bn(projected)

In [2]:
# Build the encoder with a 256-wide embedding and list its direct submodules by name.
encoder = EncoderCNN(embed_size=256)
nmd_children = [*encoder.named_children()]
[child_name for child_name, _ in nmd_children]

['resnet', 'linear', 'bn']

In [3]:
# Materialize the encoder's direct submodules so they can be counted and sliced.
children = [*encoder.children()]
len(children)

3

In [4]:
# Skip the first child (the ResNet trunk) and show the remaining modules.
children[1:]

[Linear(in_features=2048, out_features=256, bias=True),
 BatchNorm1d(256, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)]

In [5]:
def print_ioshape(self, input, output):
    """Forward hook that prints the types and sizes passing through a module.

    Args:
        self: the module the hook is attached to; only its class name is used.
        input: the inputs handed to the hook; only the first entry is inspected.
        output: the module's result; its ``.data`` size is reported.

    Returns:
        None, so the module's output is left as it is.
    """
    print(f'Inside {type(self).__name__} forward')
    print()
    # Report the container types first, then the sizes of the first input and the output.
    print(f'input:  {type(input)}')
    print(f'input[0]:  {type(input[0])}')
    print(f'output:  {type(output)}')
    print()
    print(f'input size: {input[0].size()}')
    print(f'output size: {output.data.size()}')
    print('-' * 10 + ' \n')

# Remove hooks registered by an earlier run of this cell; otherwise every
# re-run stacks another copy and each forward pass prints duplicates.
for stale_handle in globals().get('hook_handles', []):
    stale_handle.remove()

# Attach the shape printer to every child after the first, keeping the
# returned handles so the hooks can be detached later.
hook_handles = []
for m in children[1:]:
    hook_handles.append(m.register_forward_hook(print_ioshape))

In [6]:
# https://github.com/pytorch/pytorch/issues/4534
# A batch of one image is expected to fail: the batch-norm layer is in training
# mode and rejects a single value per channel. The error is the point of this
# cell, so it is caught and printed instead of halting a Restart & Run All.
input = torch.randn(1, 3, 224, 224)

try:
    out = encoder(input)
except ValueError as err:
    print('ValueError:', err)

Inside Linear forward

input:  <class 'tuple'>
input[0]:  <class 'torch.Tensor'>
output:  <class 'torch.Tensor'>

input size: torch.Size([1, 2048])
output size: torch.Size([1, 256])
---------- 



ValueError: Expected more than 1 value per channel when training, got input size [1, 256]

In [7]:
# Repeat the forward pass with a batch of 8 random images, so the batch-norm
# layer sees more than one value per channel.
input = torch.randn(8, 3, 224, 224)

out = encoder(input)

Inside Linear forward

input:  <class 'tuple'>
input[0]:  <class 'torch.Tensor'>
output:  <class 'torch.Tensor'>

input size: torch.Size([8, 2048])
output size: torch.Size([8, 256])
---------- 

Inside BatchNorm1d forward

input:  <class 'tuple'>
input[0]:  <class 'torch.Tensor'>
output:  <class 'torch.Tensor'>

input size: torch.Size([8, 256])
output size: torch.Size([8, 256])
---------- 

