In [1]:
import sys
sys.path.append('/cocoapi/PythonAPI')
from pycocotools.coco import COCO
!pip install nltk
import nltk
nltk.download('punkt')
from data_loader import get_loader
from torchvision import transforms

# Define a transform to pre-process the training images.
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Set the minimum word count threshold.
vocab_threshold = 5

# Specify the batch size.
batch_size = 10

# Obtain the data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=False)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading annotations into memory...
Done (t=0.94s)
creating index...
index created!
[0/591753] Tokenizing captions...
[100000/591753] Tokenizing captions...
[200000/591753] Tokenizing captions...
[300000/591753] Tokenizing captions...
[400000/591753] Tokenizing captions...
[500000/591753] Tokenizing captions...
loading annotations into memory...
Done (t=0.80s)
creating index...
index created!
Obtaining caption lengths...


100%|███████████████████████████████████████████████████████████████████████| 591753/591753 [00:57<00:00, 10254.32it/s]


In [2]:
sample_caption = 'A person doing a trick on a rail while riding a skateboard.'

In [3]:
import nltk

sample_tokens = nltk.tokenize.word_tokenize(str(sample_caption).lower())
print(sample_tokens)

['a', 'person', 'doing', 'a', 'trick', 'on', 'a', 'rail', 'while', 'riding', 'a', 'skateboard', '.']


In [4]:
sample_caption = []

start_word = data_loader.dataset.vocab.start_word
print('Special start word:', start_word)
sample_caption.append(data_loader.dataset.vocab(start_word))
print(sample_caption)

Special start word: <start>
[0]


In [5]:
sample_caption.extend([data_loader.dataset.vocab(token) for token in sample_tokens])
print(sample_caption)


[0, 3, 145, 495, 3, 727, 49, 3, 681, 265, 106, 3, 147, 12]


In [6]:
end_word = data_loader.dataset.vocab.end_word
print('Special end word:', end_word)

sample_caption.append(data_loader.dataset.vocab(end_word))
print(sample_caption)

Special end word: <end>
[0, 3, 145, 495, 3, 727, 49, 3, 681, 265, 106, 3, 147, 12, 1]


In [7]:
import torch

sample_caption = torch.Tensor(sample_caption).long()
print(sample_caption)

tensor([  0,   3, 145, 495,   3, 727,  49,   3, 681, 265, 106,   3, 147,  12,
          1])


In [8]:
# Preview the word2idx dictionary.
dict(list(data_loader.dataset.vocab.word2idx.items())[:10])

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 'a': 3,
 'bicycle': 4,
 'replica': 5,
 'with': 6,
 'clock': 7,
 'as': 8,
 'the': 9}

In [9]:
# Print the total number of keys in the word2idx dictionary.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))

Total number of tokens in vocabulary: 10321


In [10]:
# Modify the minimum word count threshold.
vocab_threshold = 4

# Obtain the data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=False)

loading annotations into memory...
Done (t=0.75s)
creating index...
index created!
[0/591753] Tokenizing captions...
[100000/591753] Tokenizing captions...
[200000/591753] Tokenizing captions...
[300000/591753] Tokenizing captions...
[400000/591753] Tokenizing captions...
[500000/591753] Tokenizing captions...
loading annotations into memory...
Done (t=0.81s)
creating index...
index created!
Obtaining caption lengths...


100%|████████████████████████████████████████████████████████████████████████| 591753/591753 [00:59<00:00, 9990.85it/s]


In [11]:
# Print the total number of keys in the word2idx dictionary.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))

Total number of tokens in vocabulary: 11543


In [12]:
unk_word = data_loader.dataset.vocab.unk_word
print('Special unknown word:', unk_word)

print('All unknown words are mapped to this integer:', data_loader.dataset.vocab(unk_word))

Special unknown word: <unk>
All unknown words are mapped to this integer: 2


In [13]:
print(data_loader.dataset.vocab('jfkafejw'))
print(data_loader.dataset.vocab('ieowoqjf'))

2
2


In [14]:
# Obtain the data loader (from file). Note that it runs much faster than before!
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_from_file=True)

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.75s)
creating index...
index created!
Obtaining caption lengths...


100%|███████████████████████████████████████████████████████████████████████| 591753/591753 [00:57<00:00, 10294.63it/s]


In [15]:
from collections import Counter

# Tally the total number of training captions with each length.
counter = Counter(data_loader.dataset.caption_lengths)
lengths = sorted(counter.items(), key=lambda pair: pair[1], reverse=True)
for value, count in lengths:
    print('value: %2d --- count: %5d' % (value, count))

value: 10 --- count: 123316
value: 11 --- count: 114543
value:  9 --- count: 102606
value: 12 --- count: 82271
value: 13 --- count: 53723
value: 14 --- count: 31924
value:  8 --- count: 29696
value: 15 --- count: 18511
value: 16 --- count: 11022
value: 17 --- count:  6928
value: 18 --- count:  4382
value: 19 --- count:  2890
value:  7 --- count:  2297
value: 20 --- count:  2047
value: 21 --- count:  1443
value: 22 --- count:   977
value: 23 --- count:   747
value: 24 --- count:   563
value: 25 --- count:   390
value: 26 --- count:   287
value: 27 --- count:   212
value: 28 --- count:   162
value: 29 --- count:   124
value: 30 --- count:    92
value: 31 --- count:    80
value: 32 --- count:    61
value: 34 --- count:    51
value: 33 --- count:    45
value: 37 --- count:    43
value: 35 --- count:    42
value: 36 --- count:    32
value: 38 --- count:    27
value: 39 --- count:    25
value: 43 --- count:    23
value: 44 --- count:    21
value: 40 --- count:    18
value: 45 --- count:    1

In [16]:
import numpy as np
import torch.utils.data as data

# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)

# Create and assign a batch sampler to retrieve a batch with the sampled indices.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler
    
# Obtain the batch.
images, captions = next(iter(data_loader))
    
print('images.shape:', images.shape)
print('captions.shape:', captions.shape)

# (Optional) Uncomment the lines of code below to print the pre-processed images and captions.
print('images:', images)
print('captions:', captions)

sampled indices: [15782, 429033, 132129, 378617, 216521, 244855, 43147, 22409, 54615, 265085]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 11])
images: tensor([[[[-1.9980, -1.9809, -1.8610,  ..., -0.8849, -0.9705, -1.0219],
          [-1.8782, -1.7240, -1.6898,  ..., -0.6623, -0.8507, -0.9534],
          [-1.9124, -1.7412, -1.2274,  ..., -0.4911, -0.5938, -0.7308],
          ...,
          [-0.7479, -0.7822, -0.8335,  ..., -0.1657, -0.1143,  0.0056],
          [-0.5424, -0.7822, -0.8164,  ..., -0.1486, -0.0629, -0.1657],
          [-0.4397, -0.7137, -0.9020,  ...,  0.0056, -0.0287, -0.2684]],

         [[-1.7906, -1.8081, -1.7906,  ..., -0.1275, -0.1450, -0.0574],
          [-1.5980, -1.4755, -1.5455,  ...,  0.3102,  0.0476, -0.0574],
          [-1.6681, -1.4930, -0.9678,  ...,  0.4678,  0.2227,  0.0126],
          ...,
          [-0.4776, -0.4951, -0.5301,  ...,  0.5028,  0.5378,  0.5378],
          [-0.3550, -0.5651, -0.5476,  ...,  0.4153,  0.5728,  0.

In [17]:
# Watch for any changes in model.py, and re-load it automatically.
%load_ext autoreload
%autoreload 2

# Import EncoderCNN and DecoderRNN. 
from model import EncoderCNN, DecoderRNN

In [18]:
device = torch.device("cuda") 
torch.cuda.is_available()

True

In [19]:
# Specify the dimensionality of the image embedding.
embed_size = 256

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Initialize the encoder. (Optional: Add additional arguments if necessary.)
encoder = EncoderCNN(embed_size)

# Move the encoder to GPU if CUDA is available.
encoder.to(device)

# Move last batch of images (from Step 2) to GPU if CUDA is available.   
images = images.to(device)

# Pass the images through the encoder.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

# Check that your encoder satisfies some requirements of the project! :D
assert type(features)==torch.Tensor, "Encoder output needs to be a PyTorch Tensor." 
assert (features.shape[0]==batch_size) & (features.shape[1]==embed_size), "The shape of the encoder output is incorrect."

[autoreload of model failed: Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\tanvi\lib\site-packages\IPython\extensions\autoreload.py", line 261, in check
    superreload(m, reload, self.old_objects)
  File "C:\ProgramData\Anaconda3\envs\tanvi\lib\site-packages\IPython\extensions\autoreload.py", line 459, in superreload
    module = reload(module)
  File "C:\ProgramData\Anaconda3\envs\tanvi\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 613, in _exec
  File "<frozen importlib._bootstrap_external>", line 846, in exec_module
  File "<frozen importlib._bootstrap_external>", line 983, in get_code
  File "<frozen importlib._bootstrap_external>", line 913, in source_to_code
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "D:\student\tanvi\model.py", line 10
    super(EncoderCNN, self).__init__()
TabError: inconsistent use of tabs and spaces in indent

  0%|          | 0.00/170M [00:00<?, ?B/s]

type(features): <class 'torch.Tensor'>
features.shape: torch.Size([10, 256])


In [20]:
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the decoder.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move the decoder to GPU if CUDA is available.
decoder.to(device)
    
# Move last batch of captions (from Step 1) to GPU if CUDA is available 
captions = captions.to(device)

# Pass the encoder output and captions through the decoder.
outputs = decoder(features, captions)

print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)

# Check that your decoder satisfies some requirements of the project! :D
assert type(outputs)==torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect." 

type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 11, 11543])
