In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN, EncoderCNNWithAttention, DecoderRNNWithAttention
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from PIL import Image

In [2]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Sizes handed to the encoder / decoder constructors below.
embed_size, hidden_size, encoded_size = 256, 512, 512
pixel_num = 16
num_layers = 1

# Vocabulary produced ahead of time and stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only use a vocab file
# from a trusted source.
vocab_path = 'data/vocab.pkl'
with open(vocab_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_size = len(vocab)

# Build both networks, then switch them to evaluation mode.
encoder = EncoderCNNWithAttention(pixel_num)
decoder = DecoderRNNWithAttention(embed_size, hidden_size, vocab_size,
                                  num_layers, encoded_size, device)
encoder, decoder = encoder.eval(), decoder.eval()

In [4]:
# Move both networks onto the selected device. The decoder is later called with
# the encoder output and a caption batch that both live on `device`, so its
# parameters have to be there as well (a no-op when `device` is the CPU).
encoder = encoder.to(device)
decoder = decoder.to(device)

In [5]:
# Per-channel statistics used to normalise the image tensor.
# NOTE(review): these look like the usual ImageNet mean/std — confirm they
# match what the encoder backbone was trained with.
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

# PIL image -> tensor -> channel-wise normalisation.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

In [6]:
def load_image(image_path, transform=None):
    """Open an image file and return it in RGB mode, optionally transformed.

    Args:
        image_path: Path of the image file to open.
        transform: Optional callable applied to the RGB image. Its result must
            support ``unsqueeze(0)``, which is used to prepend a leading axis.

    Returns:
        The RGB image when ``transform`` is None; otherwise
        ``transform(image).unsqueeze(0)``.
    """
    # The context manager releases the file handle deterministically;
    # convert() reads the pixel data while the file is still open.
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image

In [7]:
# Load one sample image through the preprocessing pipeline and place the
# resulting tensor on the active device.
sample_image_path = 'data/resized2014/coco_val_0.jpg'
image = load_image(sample_image_path, transform=transform)
image_tensor = image.to(device)

In [8]:
# Duplicate the single preprocessed image along the first axis to get a batch
# of two. The batch is built from `image` rather than from `image_tensor`
# itself, so re-running this cell always produces the same two-image batch
# instead of doubling it on every execution.
image_tensor = torch.cat([image.to(device)] * 2)
image_tensor.shape

torch.Size([2, 3, 256, 256])


In [9]:
# Forward pass through the encoder with gradient tracking switched off.
with torch.set_grad_enabled(False):
    feature = encoder(image_tensor)

In [10]:
# Shape of the encoder output for the two-image batch.
feature.shape

torch.Size([2, 16, 16, 512])

In [11]:
# feature.view(1, 512, 256)

In [12]:
# features = torch.stack([feature[0], feature[0]])

In [13]:
# Data loader over the image directory and caption annotations below.
# `vocab` and `transform` are the objects created in the setup cells above;
# the vocabulary is not unpickled a second time here.
image_dir = 'data/resized2014/'
caption_path = 'data/annotations/captions_train2014.json'
batch_size = 2  # same batch size as the two-image tensor fed to the encoder
data_loader = get_loader(image_dir, caption_path, vocab,
                         transform, batch_size,
                         shuffle=True, num_workers=2)

loading annotations into memory...
Done (t=0.42s)
creating index...
index created!


In [33]:
# Pull a single batch from the loader. Each batch unpacks into three items;
# the first is not needed here, the other two are the caption tensor and the
# per-caption lengths.
# next(iter(...)) raises StopIteration on an empty loader instead of silently
# leaving `captions` / `lengths` undefined.
_, captions, lengths = next(iter(data_loader))
captions = captions.to(device)
captions

tensor([[   1,    4,  768,  805,   21, 5468, 1983,   78,    4,  272,   19,    2],
        [   1,   33,  455,   53,    3,   33,  540, 2361, 2168,   19,    2,    0]])
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType
expected Tensor as element 0 in argument 0, but got NoneType


In [37]:
# Wrap every length in its own one-element tensor, stack them, and inspect the
# resulting shape.
length_rows = [torch.Tensor([length]) for length in lengths]
torch.stack(length_rows).size()

torch.Size([2, 1])

In [15]:
# Per-caption token counts as a (batch, 1) column for the decoder.
# Padding positions are excluded: taking len() of each row of the padded batch
# would report the padded width for every caption, so shorter captions would
# have their pad tokens counted as real tokens.
PAD_IDX = 0  # assumes <pad> is index 0, as the trailing 0 in the printed batch suggests — TODO confirm against Vocabulary
# Kept as a float32 CPU tensor so the dtype and placement handed to the decoder
# stay the same as what torch.Tensor([...]) produces.
lengths = (captions != PAD_IDX).sum(dim=1, keepdim=True).to(device='cpu', dtype=torch.float32)
lengths.size()

torch.Size([2, 1])

In [16]:
# Run the decoder on the encoder features and caption batch without tracking
# gradients, then unpack the four values it returns.
with torch.no_grad():
    decoder_outputs = decoder(feature, captions, lengths)
predictions, sorted_captions, alphas, sort_ind = decoder_outputs

In [39]:
# Shape of the decoder's prediction tensor.
predictions.shape

torch.Size([2, 12, 9948])

In [24]:
# Shape of the first caption returned by the decoder.
sorted_captions[0].shape

torch.Size([13])

In [17]:
# Drop the trailing singleton axis so the lengths read as a flat vector.
torch.squeeze(lengths, dim=1)

tensor([13., 13.])

In [None]:
# Shape left after averaging the encoder output over axis 1.
torch.mean(feature, dim=1).shape

In [None]:
# Re-check the encoder output shape before reshaping it.
feature.shape

In [None]:
# Merge axes 1 and 2 of the 4-D encoder output (the 16x16 grid in the shape
# printed earlier) into a single axis, keeping the first and last axes.
# flatten() derives the sizes from the tensor itself, so neither the batch
# size nor the last-axis width is hard-coded, and it also copes with a
# non-contiguous input.
encoder_out = feature.flatten(start_dim=1, end_dim=2)

In [None]:
# Shape of the reshaped encoder output.
encoder_out.shape

In [17]:
# Number of entries in the loaded vocabulary (len(vocab) from the setup cell).
vocab_size

9948

In [18]:

# Sort the flattened lengths from longest to shortest, keeping the permutation
# that produced the ordering.
flat_lengths = lengths.squeeze(1)
caption_lengths, sort_ind = torch.sort(flat_lengths, dim=0, descending=True)

In [19]:
# One less than each sorted caption length, as a plain Python list.
decode_lengths = torch.sub(caption_lengths, 1).tolist()

In [21]:
# Largest entry of decode_lengths (each entry is a caption length minus one).
max(decode_lengths)

9.0

In [21]:
# Display the caption batch currently bound to `captions`.
captions

tensor([[   1,  367,   36,  460, 1108, 1515,   78,    4, 1774,  601,  162, 3639,
            2],
        [   1,    4,  116,  225,   78,   33,   64,  286,  295,  112,    4,  476,
            2]])

In [38]:
# Display the caption batch again; the loader shuffles, so the contents depend
# on which batch was fetched most recently.
captions

tensor([[   1,    4,  768,  805,   21, 5468, 1983,   78,    4,  272,   19,    2],
        [   1,   33,  455,   53,    3,   33,  540, 2361, 2168,   19,    2,    0]])