In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN, EncoderCNNWithAttention, DecoderRNNWithAttention
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from PIL import Image

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# Vocabulary object unpickled from disk (the Vocabulary class is imported from
# build_vocab so that unpickling can resolve it).
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a vocab.pkl that you generated yourself.
vocab_path = 'data/vocab.pkl'
with open(vocab_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_size = len(vocab)

# Hyper-parameters handed to the model constructors below.
embed_size = 512
hidden_size = 512
encoded_size = 512
pixel_num = 16
num_layers = 1

# Build both networks and switch them to evaluation mode.
# NOTE(review): no checkpoint is loaded in this cell, so both networks keep
# whatever weights their constructors give them.
encoder = EncoderCNNWithAttention(pixel_num)
encoder.eval()
decoder = DecoderRNNWithAttention(
    embed_size, hidden_size, vocab_size, num_layers, encoded_size, device
)
decoder.eval()

423


In [4]:
# Print every module yielded by encoder.modules() next to its running index.
# The first entry is the encoder itself, so the output is long.
for index, module in enumerate(encoder.modules()):
    print(index, module)

0 EncoderCNNWithAttention(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
         

In [5]:
# Keep both networks on the same device as the inputs. nn.Module.to moves the
# parameters and buffers in place and returns the module itself, so this is a
# no-op when everything already lives on `device`.
encoder = encoder.to(device)
decoder = decoder.to(device)

In [6]:
# Preprocessing pipeline: convert the PIL image to a tensor, then normalise
# the three channels with the per-channel mean and standard deviation below.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

In [7]:
def load_image(image_path, transform=None):
    """Read an image file as RGB and optionally turn it into a batched tensor.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        transform: Optional callable applied to the RGB ``PIL.Image``. Its
            result must provide ``unsqueeze``; a leading batch dimension of
            size 1 is added to it.

    Returns:
        ``transform(image).unsqueeze(0)`` when ``transform`` is given,
        otherwise the RGB ``PIL.Image`` itself. The image is not resized.
    """
    # Image.open is lazy and keeps the underlying file open. The context
    # manager closes it as soon as convert() has produced an independent
    # in-memory copy, instead of leaving the handle to the garbage collector.
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    if transform is None:
        return image
    return transform(image).unsqueeze(0)

In [8]:
# load_image adds a leading batch dimension, so `image` is a one-image batch;
# `image_tensor` is that same batch placed on the compute device.
image = load_image('data/resized2014/coco_val_0.jpg', transform=transform)
image_tensor = image.to(device)

In [9]:
# Build a batch holding two copies of the image. Deriving it from `image`
# instead of overwriting `image_tensor` with itself keeps the cell idempotent:
# re-running it always yields a batch of 2 rather than 2, 4, 8, ...
image_tensor = torch.cat([image, image]).to(device)
print(image_tensor.shape)

torch.Size([2, 3, 256, 256])


In [10]:
# Forward the image batch through the encoder without tracking gradients.
# NOTE(review): the saved output of this cell is a ValueError ("expected 2D or
# 3D input (got 4D input)"), yet later cells use `feature` -- the stored
# output looks stale; re-run on a fresh kernel to confirm.
with torch.no_grad():
    feature = encoder(image_tensor)

ValueError: expected 2D or 3D input (got 4D input)

In [45]:
# Caption only the first image: feature[:1] keeps the batch dimension, giving
# the same tensor as feature[0].unsqueeze(0). Sampling is inference, so it
# runs under no_grad (as the encoder pass does) to avoid building an autograd
# graph through the decoder parameters.
with torch.no_grad():
    preds = decoder.sample(feature[:1])

In [46]:
# Display the sampled token ids.
# NOTE(review): the saved output repeats id 2032 for most steps; no checkpoint
# load is visible in this notebook, so this may be untrained weights -- confirm.
preds

tensor([[3597, 3376, 3376, 8958, 2032, 2032, 2032, 2032, 2032, 2032, 2032, 2032,
         2032, 2032, 2032, 2032, 2032, 2032, 2032, 2032]])

In [47]:
# Loader over the train2014 captions, used to pull a small batch for shape
# checks. `vocab` is reused from the configuration cell rather than opening
# and unpickling data/vocab.pkl a second time.
image_dir = 'data/resized2014/'
caption_path = 'data/annotations/captions_train2014.json'
batch_size = 2
data_loader = get_loader(image_dir, caption_path, vocab,
                         transform, batch_size,
                         shuffle=True, num_workers=2)

loading annotations into memory...
Done (t=0.51s)
creating index...
index created!


In [59]:
# Fetch a single batch. next(iter(...)) replaces the enumerate/break loop: it
# drops the unused index and raises StopIteration if the loader is empty,
# instead of silently leaving `captions`/`lengths` unset or stale.
_, captions, lengths = next(iter(data_loader))
captions = captions.to(device)
print(captions)

tensor([[   1,    4,  170, 2546, 2973,  162,    4,  170,   40,    4,  860,   19,
            2],
        [   1,    4,   60,   36, 1478, 1616,  207,   78,   33,  666,   19,    2,
            0]])
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType


In [60]:

# Run the decoder on the encoder features and the caption batch without
# tracking gradients. It returns five values; predictions, sorted_captions
# and decode_lengths feed the loss computation in the following cells.
with torch.no_grad():
    predictions, sorted_captions, alphas, sort_ind, decode_lengths = decoder(feature, captions, lengths)

tensor([13., 12.])


In [61]:
# Cross-entropy loss, applied below to the packed predictions and targets.
criterion = nn.CrossEntropyLoss()

In [62]:
# Drop the first column of every caption; the remaining tokens are the targets.
# NOTE(review): the caption batches printed in this notebook all start with
# id 1 -- presumably the start token; confirm against the vocabulary.
targets = sorted_captions[:, 1:]

In [63]:
# Pack both sequences with the same lengths and keep only the flattened
# `.data` tensor (field 0 of the returned PackedSequence).
packed_targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data
packed_outputs = pack_padded_sequence(predictions, decode_lengths, batch_first=True).data

In [64]:
# Number of target tokens kept after packing.
packed_targets.shape

torch.Size([23])

In [65]:
# Loss between the packed decoder outputs and the packed target token ids.
loss = criterion(packed_outputs, packed_targets)

In [66]:
# Show both packed shapes, one per line: outputs first, then targets.
print(packed_outputs.size(), packed_targets.size(), sep='\n')

torch.Size([23, 9948])
torch.Size([23])


In [67]:
# Shape of the caption batch as returned by the decoder.
sorted_captions.shape

torch.Size([2, 13])

In [68]:
# Averaging over dim 1 removes that axis from the feature tensor.
feature.mean(dim=1).shape

torch.Size([2, 16, 512])

In [69]:
# Shape of the encoder output for the current image batch.
feature.shape

torch.Size([2, 16, 16, 512])

In [70]:
# Flatten every axis between the first and the last into one. The first and
# last sizes are read from the tensor instead of being hard-coded as 2 and
# 512, so the cell keeps working when the batch size or feature width changes.
encoder_out = feature.view(feature.size(0), -1, feature.size(-1))

In [71]:
# Shape after flattening the middle axes of `feature`.
encoder_out.shape

torch.Size([2, 256, 512])

In [72]:
# len(vocab); the saved value matches the last dimension of packed_outputs.
vocab_size

9948

In [18]:

# Order the caption lengths from longest to shortest and keep the permutation
# that produced that order.
# assumes `lengths` is (batch, 1) so that squeeze(1) yields a 1-D tensor -- TODO confirm
caption_lengths, sort_ind = torch.sort(lengths.squeeze(1), dim=0, descending=True)

In [19]:
# One less than each sorted caption length, converted to a plain Python list.
# NOTE(review): this rebinds `decode_lengths`, which the decoder call also
# returns -- whichever cell ran last wins.
decode_lengths = (caption_lengths - 1).tolist()

In [21]:
# Largest entry of decode_lengths (the saved output is a float, 9.0).
max(decode_lengths)

9.0

In [21]:
# Display the current caption batch.
# NOTE(review): execution counts here are out of order, so this output may
# come from a different batch than the cells above.
captions

tensor([[   1,  367,   36,  460, 1108, 1515,   78,    4, 1774,  601,  162, 3639,
            2],
        [   1,    4,  116,  225,   78,   33,   64,  286,  295,  112,    4,  476,
            2]])

In [38]:
# Display the caption batch again; the saved output differs from the previous
# cell, so it was captured after another batch had been drawn.
captions

tensor([[   1,    4,  768,  805,   21, 5468, 1983,   78,    4,  272,   19,    2],
        [   1,   33,  455,   53,    3,   33,  540, 2361, 2168,   19,    2,    0]])