
## Step 1: Write the Data Loader


In [18]:
import os
from collections import Counter

import nltk
import numpy as np
import torch
import torch.utils.data as data
from pycocotools.coco import COCO
from torchvision import transforms

from data_loader import get_loader

# Make sure the NLTK "punkt" resource is available locally; the download log
# reports when the package is already up to date.
nltk.download("punkt")


# IPython autoreload: mode 2 re-imports modules before each cell runs, so edits
# to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedpa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Training-time image preprocessing: resize, take a random 224-pixel crop,
# randomly flip horizontally, convert to a tensor, then normalize per channel.
transform_train = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225),
        ),
    ]
)

# Passed to get_loader as vocab_threshold; presumably the minimum number of
# occurrences a word needs to enter the vocabulary (confirm in data_loader.py).
vocab_threshold = 5
batch_size = 10

# Root folder handed to get_loader as cocoapi_loc. Set the COCOAPI_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original location is used unchanged.
cocoapi_dir = os.environ.get(
    "COCOAPI_DIR",
    r"C:/Users/vedpa/OneDrive/Desktop/sampleProjects/image_captioning/",
)

# vocab_from_file=False: the log output shows the captions being tokenized
# again on this call rather than a saved vocabulary being loaded.
data_loader = get_loader(
    transform=transform_train,
    mode="train",
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=False,
    cocoapi_loc=cocoapi_dir,
)


Looking for annotations at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\annotations\captions_train2014.json
Looking for images at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\images\train2014

loading annotations into memory...
Done (t=0.45s)
creating index...
index created!
[0/414113] Tokenizing captions...
[100000/414113] Tokenizing captions...
[200000/414113] Tokenizing captions...
[300000/414113] Tokenizing captions...
[400000/414113] Tokenizing captions...
loading annotations into memory...
Done (t=0.45s)
creating index...
index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:20<00:00, 20077.05it/s]


In [35]:
# Example caption that the following cells convert to token ids step by step.
sample_caption = (
    "A person doing a trick on a rail while riding a skateboard."
)

In [36]:
# Lower-case the caption text, then split it into tokens with NLTK's word tokenizer.
caption_text = str(sample_caption).lower()
sample_tokens = nltk.tokenize.word_tokenize(caption_text)
print(sample_tokens)

['a', 'person', 'doing', 'a', 'trick', 'on', 'a', 'rail', 'while', 'riding', 'a', 'skateboard', '.']


In [37]:
# Start the encoded caption with the integer id that the vocabulary returns
# for its special start word.
start_word = data_loader.dataset.vocab.start_word
print("Special start word:", start_word)

sample_caption = [data_loader.dataset.vocab(start_word)]
print(sample_caption)

Special start word: <start>
[0]


In [38]:
# Map every token to its vocabulary id (in order) and add the ids to the caption list.
token_ids = list(map(data_loader.dataset.vocab, sample_tokens))
sample_caption.extend(token_ids)
print(sample_caption)

[0, 3, 98, 754, 3, 396, 39, 3, 1010, 207, 139, 3, 753, 18]


In [39]:
# Finish the encoded caption with the id of the vocabulary's special end word.
end_word = data_loader.dataset.vocab.end_word
print("Special end word:", end_word)

end_index = data_loader.dataset.vocab(end_word)
sample_caption.extend([end_index])
print(sample_caption)

Special end word: <end>
[0, 3, 98, 754, 3, 396, 39, 3, 1010, 207, 139, 3, 753, 18, 1]


In [40]:
# Turn the list of token ids into an int64 tensor in one step.
# torch.as_tensor with an explicit dtype avoids the float32 intermediate that
# torch.Tensor(...).long() creates, and it also accepts an existing tensor,
# so this cell can be run again after sample_caption has been converted.
sample_caption = torch.as_tensor(sample_caption, dtype=torch.long)
print(sample_caption)

tensor([   0,    3,   98,  754,    3,  396,   39,    3, 1010,  207,  139,    3,
         753,   18,    1])


## Special token IDs
`<start>` : 0

`<end>` : 1

`<unk>` : 2

In [41]:
# Show the first ten word -> index entries of the vocabulary's word2idx mapping.
{
    word: index
    for word, index in list(data_loader.dataset.vocab.word2idx.items())[:10]
}

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 'a': 3,
 'very': 4,
 'clean': 5,
 'and': 6,
 'well': 7,
 'decorated': 8,
 'empty': 9}

We also print the total number of keys.

In [42]:
# Report the vocabulary size, i.e. what len() returns for the vocab object.
num_tokens = len(data_loader.dataset.vocab)
print("Total number of tokens in vocabulary:", num_tokens)

Total number of tokens in vocabulary: 8852


In [None]:
# Build the training loader again with a lower vocabulary threshold so the
# resulting vocabulary size can be compared with the previous run.
vocab_threshold = 4

loader_kwargs = dict(
    cocoapi_loc=cocoapi_dir,
    vocab_from_file=False,
    vocab_threshold=vocab_threshold,
    batch_size=batch_size,
    mode="train",
    transform=transform_train,
)
data_loader = get_loader(**loader_kwargs)


Looking for annotations at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\annotations\captions_train2014.json
Looking for images at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\images\train2014

loading annotations into memory...
Done (t=0.47s)
creating index...
index created!
[0/414113] Tokenizing captions...
[100000/414113] Tokenizing captions...
[200000/414113] Tokenizing captions...
[300000/414113] Tokenizing captions...
[400000/414113] Tokenizing captions...
loading annotations into memory...
Done (t=0.44s)
creating index...
index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:19<00:00, 21338.29it/s]


In [None]:

# Vocabulary size of the loader that was just rebuilt.
token_count = len(data_loader.dataset.vocab)
print(f"Total number of tokens in vocabulary: {token_count}")

Total number of tokens in vocabulary: 9947


In [45]:
# Read the vocabulary's special unknown-word token, then the integer id that
# the vocabulary returns for it.
unk_word = data_loader.dataset.vocab.unk_word
print(f"Special unknown word: {unk_word}")

unk_index = data_loader.dataset.vocab(unk_word)
print(f"All unknown words are mapped to this integer: {unk_index}")

Special unknown word: <unk>
All unknown words are mapped to this integer: 2


In [46]:
# Look up two made-up strings; the printed ids show what the vocabulary
# returns for words it has never seen.
for made_up_token in ("jfkafejw", "ieowoqjf"):
    print(data_loader.dataset.vocab(made_up_token))

2
2


In [47]:
# Id that the vocabulary returns for the "." token.
period_index = data_loader.dataset.vocab(".")
print(period_index)

18


In [None]:
# Create the training loader once more, this time with vocab_from_file=True;
# the log output reports the vocabulary being loaded from vocab.pkl instead of
# the captions being tokenized again.
data_loader = get_loader(
    cocoapi_loc=cocoapi_dir,
    vocab_from_file=True,
    batch_size=batch_size,
    mode="train",
    transform=transform_train,
)


Looking for annotations at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\annotations\captions_train2014.json
Looking for images at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\images\train2014

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.43s)
creating index...
index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:19<00:00, 21234.54it/s]


The next section explains how to use the data loader to obtain batches of training data.


## Step 2: Using the Data Loader to Obtain Batches


In [49]:
# Container type and number of entries of the dataset's stored caption lengths.
caption_lengths = data_loader.dataset.caption_lengths
(type(caption_lengths), len(caption_lengths))

(list, 414113)

In [None]:
# Tally how many captions have each length and list the lengths from the most
# to the least frequent. Counter is imported with the other modules in the
# notebook's first cell; most_common() returns the (value, count) pairs
# already ordered by descending count.
counter = Counter(data_loader.dataset.caption_lengths)
lengths = counter.most_common()
for value, count in lengths:
    print("value: %2d --- count: %5d" % (value, count))

value: 10 --- count: 86302
value: 11 --- count: 79971
value:  9 --- count: 71920
value: 12 --- count: 57653
value: 13 --- count: 37668
value: 14 --- count: 22342
value:  8 --- count: 20742
value: 15 --- count: 12839
value: 16 --- count:  7736
value: 17 --- count:  4845
value: 18 --- count:  3101
value: 19 --- count:  2017
value:  7 --- count:  1594
value: 20 --- count:  1453
value: 21 --- count:   997
value: 22 --- count:   683
value: 23 --- count:   534
value: 24 --- count:   384
value: 25 --- count:   277
value: 26 --- count:   214
value: 27 --- count:   160
value: 28 --- count:   114
value: 29 --- count:    87
value: 30 --- count:    58
value: 31 --- count:    49
value: 32 --- count:    44
value: 34 --- count:    40
value: 37 --- count:    32
value: 35 --- count:    31
value: 33 --- count:    30
value: 36 --- count:    26
value: 38 --- count:    18
value: 39 --- count:    18
value: 43 --- count:    16
value: 44 --- count:    16
value: 48 --- count:    12
value: 45 --- count:    11
v

In [None]:
print(batch_size)

# Ask the dataset for a set of training indices. Presumably these all point at
# captions of one length, since the captions come back as a single rectangular
# tensor (confirm in data_loader.py).
indices = data_loader.dataset.get_train_indices()
print("sampled indices:", indices)

# Replace the batch sampler's inner sampler with one restricted to those
# indices, so the next batch is drawn from them.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler

# Pull a single batch from the loader.
batch_iterator = iter(data_loader)
images, captions = next(batch_iterator)

for label, batch_tensor in (("images.shape:", images), ("captions.shape:", captions)):
    print(label, batch_tensor.shape)


10
sampled indices: [313215, 255458, 348108, 26394, 401770, 160397, 374683, 397941, 346966, 404149]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 15])



## Step 3: Experimenting with the CNN Encoder


In [None]:

from model import EncoderCNN, DecoderRNN

In [54]:
# Use the GPU when PyTorch reports that CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [None]:
# Size of the feature vector the encoder is asked to produce for each image.
image_embed_size = 256

encoder = EncoderCNN(image_embed_size)

# Put the model and the sampled image batch on the same device before the forward pass.
encoder.to(device)
images = images.to(device)

features = encoder(images)

print("type(features):", type(features))
print("features.shape:", features.shape)
print("captions.shape:", captions.shape)

# isinstance (rather than an exact type comparison) also accepts Tensor subclasses.
assert isinstance(features, torch.Tensor), "Encoder output needs to be a PyTorch Tensor."

# One feature vector of length image_embed_size per image in the batch.
assert (features.shape[0] == batch_size) and (
    features.shape[1] == image_embed_size
), "The shape of the encoder output is incorrect."



type(features): <class 'torch.Tensor'>
features.shape: torch.Size([10, 256])
captions.shape: torch.Size([10, 15])



## Step 4: Implementing the RNN Decoder


In [56]:
# Recall the image embedding size that was chosen for the encoder.
embed_size_to_show = image_embed_size
print(embed_size_to_show)

256


In [None]:
hidden_size = 512

# Word embeddings use the same size as the image features produced by the encoder.
word_embed_size = image_embed_size
vocab_size = len(data_loader.dataset.vocab)

decoder = DecoderRNN(word_embed_size, hidden_size, vocab_size)

# Put the decoder and the caption batch on the same device as the encoder features.
decoder.to(device)
captions = captions.to(device)
outputs = decoder(features, captions)


print("type(outputs):", type(outputs))
print("outputs.shape:", outputs.shape)


# isinstance (rather than an exact type comparison) also accepts Tensor subclasses.
assert isinstance(outputs, torch.Tensor), "Decoder output needs to be a PyTorch Tensor."

# Expected layout: (batch_size, caption length, vocab_size).
assert (
    (outputs.shape[0] == batch_size)
    and (outputs.shape[1] == captions.shape[1])
    and (outputs.shape[2] == vocab_size)
), "The shape of the decoder output is incorrect."

type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 15, 9947])
