# __Long Short Term Memory Decoder__

### __Deep Learning__

#### __Project: Image Captioning with Visual Attention__

In [1]:
import os
# Work from the directory named by the PYTHONPATH environment variable.
# NOTE(review): this raises KeyError when PYTHONPATH is unset and assumes the
# variable holds a single directory (presumably the project root) — confirm.
os.chdir(os.environ["PYTHONPATH"])

import torch
import matplotlib.pyplot as plt
import seaborn as sns

import scripts.data_loading as dl
import scripts.data_processing as dp
from scripts import model

%matplotlib inline
%load_ext autoreload
%autoreload 2


plt.rcParams["figure.figsize"] = (10, 10)
plt.rcParams["image.cmap"] = "plasma"

In [2]:
# Training split of the captions dataset, built from the configured training
# paths, the VGGNet preprocessing pipeline and a fresh text pipeline.
train_dataset_paths = dl.DATASET_PATHS[dl.DatasetType.TRAIN]
image_pipeline = dp.VGGNET_PREPROCESSING_PIPELINE
caption_pipeline = dp.TextPipeline()

coco_train = dl.CocoCaptions(train_dataset_paths, image_pipeline, caption_pipeline)

loading annotations into memory...
Done (t=1.51s)
creating index...
index created!


In [3]:
# Small loader (2 samples per batch, 1 worker) used to walk through the decoder.
coco_loader = dl.CocoLoader(coco_train, batch_size=2, num_workers=1)
it = iter(coco_loader)
# Pull a single batch; each batch unpacks into (images, captions).
image_batch, caption_batch = next(it)

In [4]:
encoder = model.VGG19Encoder()
# Invoke the module itself rather than `.forward(...)`: `nn.Module.__call__`
# runs any registered hooks around `forward`, whereas a direct `.forward`
# call silently skips them.
# Assumes VGG19Encoder is a torch.nn.Module — confirm against `scripts.model`.
feature_maps, feature_mean = encoder(image_batch)

In [5]:
# Decoder with small layer sizes (embedding 8, decoder state 16, attention 4).
# The vocabulary size and the encoder feature width are taken from the data.
vocabulary_size = len(coco_train.target_transform.vocabulary)
encoder_feature_dim = feature_mean.shape[-1]

decoder = model.LSTMDecoder(
    num_embeddings=vocabulary_size,
    embedding_dim=8,
    encoder_dim=encoder_feature_dim,
    decoder_dim=16,
    attention_dim=4,
)

In [6]:
# Map every token id in the batch to its embedding vector (embedding_dim=8).
embeddings = decoder.word_embedding(caption_batch)

In [7]:
# NOTE(review): the first label says "One-hot", but caption_batch[0] is a 1-D
# tensor of integer token ids (one id per caption position), not one-hot
# vectors; the embedding adds a trailing axis of size embedding_dim.
print(f"One-hot encoded caption shape = {caption_batch[0].shape}")
print(f"Embedding shape = {embeddings[0].shape}")

One-hot encoded caption shape = torch.Size([51])
Embedding shape = torch.Size([51, 8])


In [8]:
# First sample of the batch: its token ids, then the matching embedding rows.
print(caption_batch[0], embeddings[0], sep="\n")

tensor([10000,    12,    78,    22,    34,   850,    31,     4,     0,   182,
            1,  1421,    65,     2,     0,     8,   283,  1486,     0,    34,
          204,     9,  1584,    47,    34,   924,     1,    34,  1118,     5,
          133,   167,    62,    34,  1186,     4,   121,     4,    36,     2,
          244,     6,    12,  1186,     1,   206,     6,   314, 10002,   139,
        10001])
tensor([[ 1.0778, -0.0469,  0.1941, -0.2538, -0.8269,  0.1713,  0.5453,  0.1663],
        [ 0.8444, -0.1235,  2.5719,  0.1893, -0.6015, -0.4761, -1.3315,  1.9284],
        [-0.4916, -0.7716,  0.5826, -1.1466,  0.0599, -1.0457,  1.5602, -0.2518],
        [ 0.2387, -0.0527, -0.9759,  0.3773, -0.2878, -1.0109,  0.8171,  1.3285],
        [ 0.1028,  1.3981, -1.0535,  0.6148,  1.0786,  1.3067,  0.4572, -0.3867],
        [-1.2201,  1.2055, -0.3191,  0.2431, -1.0352, -0.7764,  1.2666, -1.3419],
        [-0.5972,  2.3009,  0.4261,  0.1007,  1.6311,  0.9198, -0.7458, -0.7728],
        [-0.8278, -1

In [9]:
# Both initial LSTM states are derived from the same input, `feature_mean`,
# through two separate decoder layers (`init_h` and `init_c`).
h, c = decoder.init_h(feature_mean), decoder.init_c(feature_mean)

print(
    f"Initial h shape = {h.shape}",
    f"Initial c shape = {c.shape}",
    sep="\n",
)

Initial h shape = torch.Size([2, 16])
Initial c shape = torch.Size([2, 16])


In [10]:
# Initial h and c of the LSTM for the first sample, computed from
# `feature_mean` by decoder.init_h / decoder.init_c (described by the
# author as an MLP).
print(h[0], c[0], sep="\n")

tensor([-0.1671, -0.2366,  0.0020,  0.1604,  0.0024, -0.1476,  0.0745,  0.2381,
         0.0619,  0.3908,  0.1211,  0.2242,  0.3448,  0.0006,  0.0996,  0.1740],
       grad_fn=<SelectBackward>)
tensor([ 0.1581,  0.1302,  0.0385, -0.0008, -0.1579,  0.3189,  0.2711, -0.0877,
        -0.0498, -0.0392, -0.0884,  0.1122, -0.1119, -0.2567,  0.0209,  0.3199],
       grad_fn=<SelectBackward>)


In [11]:
# How to get, for every sample in the batch, the word embedding at one
# caption position. Position 50 is the last one: captions here have 51 tokens.
index = 50
embeddings.select(1, index)

tensor([[ 1.0397, -1.2835,  0.1092,  0.3504,  0.5646,  2.0723, -0.0058, -0.2355],
        [ 1.0397, -1.2835,  0.1092,  0.3504,  0.5646,  2.0723, -0.0058, -0.2355]],
       grad_fn=<SelectBackward>)

In [12]:
# Second output of the encoder: a 2-D tensor with one row per batch element.
# Its last dimension is what the decoder was given as `encoder_dim`.
feature_mean

tensor([[0.1412, 0.1195, 0.1757, 0.1956, 0.2014, 0.1619, 0.1296, 0.1212, 0.1625,
         0.2444, 0.2788, 0.3305, 0.3097, 0.3634, 0.2017, 0.1454, 0.0949, 0.0646,
         0.0738, 0.0580, 0.0685, 0.0834, 0.1497, 0.3198, 0.2564, 0.2706, 0.2274,
         0.2668, 0.4305, 0.2778, 0.3106, 0.2719, 0.2167, 0.0923, 0.0908, 0.1187,
         0.3236, 0.6264, 0.3948, 0.4090, 0.3418, 0.3622, 0.3230, 0.1912, 0.2707,
         0.2215, 0.1806, 0.0852, 0.0931, 0.1090, 0.2908, 0.4428, 0.1921, 0.2183,
         0.2172, 0.3397, 0.3572, 0.2309, 0.2297, 0.1774, 0.2011, 0.1361, 0.1507,
         0.1538, 0.2483, 0.2350, 0.1305, 0.1665, 0.2978, 0.4957, 0.3393, 0.1802,
         0.1848, 0.1314, 0.2549, 0.3474, 0.3141, 0.2434, 0.2089, 0.1747, 0.2000,
         0.2210, 0.3853, 0.5794, 0.2008, 0.1367, 0.1783, 0.1296, 0.2880, 0.4784,
         0.5109, 0.4420, 0.3456, 0.3144, 0.3219, 0.3269, 0.3614, 0.5044, 0.2522,
         0.1661, 0.2265, 0.1435, 0.2713, 0.4259, 0.4878, 0.4517, 0.3450, 0.2899,
         0.2741, 0.2950, 0.3

In [13]:
# How to concatenate each sample's word embedding with its context vector
# along the feature axis (`feature_mean` stands in for the context here).
torch.cat((embeddings[:, index], feature_mean), dim=1)

tensor([[ 1.0397, -1.2835,  0.1092,  0.3504,  0.5646,  2.0723, -0.0058, -0.2355,
          0.1412,  0.1195,  0.1757,  0.1956,  0.2014,  0.1619,  0.1296,  0.1212,
          0.1625,  0.2444,  0.2788,  0.3305,  0.3097,  0.3634,  0.2017,  0.1454,
          0.0949,  0.0646,  0.0738,  0.0580,  0.0685,  0.0834,  0.1497,  0.3198,
          0.2564,  0.2706,  0.2274,  0.2668,  0.4305,  0.2778,  0.3106,  0.2719,
          0.2167,  0.0923,  0.0908,  0.1187,  0.3236,  0.6264,  0.3948,  0.4090,
          0.3418,  0.3622,  0.3230,  0.1912,  0.2707,  0.2215,  0.1806,  0.0852,
          0.0931,  0.1090,  0.2908,  0.4428,  0.1921,  0.2183,  0.2172,  0.3397,
          0.3572,  0.2309,  0.2297,  0.1774,  0.2011,  0.1361,  0.1507,  0.1538,
          0.2483,  0.2350,  0.1305,  0.1665,  0.2978,  0.4957,  0.3393,  0.1802,
          0.1848,  0.1314,  0.2549,  0.3474,  0.3141,  0.2434,  0.2089,  0.1747,
          0.2000,  0.2210,  0.3853,  0.5794,  0.2008,  0.1367,  0.1783,  0.1296,
          0.2880,  0.4784,  

In [14]:
# Run the full decoding pass over the batch. The module is invoked directly
# rather than through `.forward(...)` so that `nn.Module.__call__` runs any
# registered hooks around `forward`.
# Assumes LSTMDecoder is a torch.nn.Module — confirm against `scripts.model`.
predictions, attention_scores = decoder(feature_maps, feature_mean, caption_batch)

In [15]:
# (batch_size, caption_length)
caption_batch.shape

torch.Size([2, 51])

The number of predicted words equals `caption_len - 1` (here 51 - 1 = 50): the first axis of `predictions` is the decoding step.

In [16]:
# (caption_length - 1, batch_size, N); N is presumably the vocabulary size.
print(predictions.shape)

torch.Size([50, 2, 10004])


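Random-guess prediction probability: the decoder is untrained, so every word probability should be close to that of a uniform guess, i.e. about 1 / (number of output classes).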

In [17]:
# Normalise the raw decoder scores over the last axis into probabilities.
# NOTE(review): this rebinds `predictions`, so the cell is not idempotent —
# re-running it applies softmax to already-normalised values.
predictions = torch.softmax(predictions, dim=2)

In [18]:
# First 100 entries of the last axis for step 0, sample 0.
predictions[0, 0, :100]

tensor([8.0981e-05, 7.1937e-05, 1.0210e-04, 1.2367e-04, 1.1758e-04, 7.6954e-05,
        1.2097e-04, 1.1994e-04, 9.5205e-05, 1.0197e-04, 1.1928e-04, 1.6873e-04,
        1.2307e-04, 8.8403e-05, 7.0400e-05, 6.1201e-05, 7.2279e-05, 8.3104e-05,
        8.7575e-05, 1.4165e-04, 1.0737e-04, 5.8038e-05, 9.8186e-05, 9.2949e-05,
        1.0736e-04, 8.1271e-05, 1.2535e-04, 1.0711e-04, 1.0878e-04, 1.5737e-04,
        1.2322e-04, 9.5206e-05, 1.2395e-04, 8.1855e-05, 1.5322e-04, 1.1930e-04,
        7.1873e-05, 1.0991e-04, 1.0167e-04, 8.6898e-05, 6.0001e-05, 1.0933e-04,
        1.5482e-04, 6.1983e-05, 1.4360e-04, 6.6817e-05, 1.0841e-04, 9.7156e-05,
        9.6314e-05, 1.2056e-04, 1.4362e-04, 8.9042e-05, 1.1316e-04, 1.2110e-04,
        9.8696e-05, 1.2230e-04, 7.3913e-05, 7.5212e-05, 1.1653e-04, 1.0285e-04,
        1.0164e-04, 6.3000e-05, 1.3507e-04, 1.7154e-04, 7.1862e-05, 8.3223e-05,
        1.0052e-04, 1.0690e-04, 9.3785e-05, 1.2631e-04, 9.4175e-05, 8.3557e-05,
        6.5326e-05, 1.5785e-04, 7.6731e-

In [19]:
# Probability of a uniform random guess over the prediction axis. The size is
# read from `predictions` instead of being hard-coded, so the baseline stays
# correct if the vocabulary changes.
1 / predictions.shape[-1]

9.996001599360256e-05

Attention scores: compare the individual weights with the uniform value 1 / (number of attended positions).

In [20]:
# Same leading axes as `predictions` (step, sample), followed by the
# attention weights for that step.
attention_scores.shape

torch.Size([50, 2, 512])

In [21]:
# Attention weights for step 0, sample 0.
attention_scores[0, 0]

tensor([0.0019, 0.0023, 0.0019, 0.0020, 0.0023, 0.0020, 0.0018, 0.0019, 0.0025,
        0.0018, 0.0019, 0.0019, 0.0017, 0.0021, 0.0010, 0.0018, 0.0015, 0.0020,
        0.0018, 0.0018, 0.0012, 0.0019, 0.0023, 0.0027, 0.0019, 0.0019, 0.0024,
        0.0030, 0.0024, 0.0015, 0.0018, 0.0018, 0.0019, 0.0020, 0.0019, 0.0017,
        0.0031, 0.0018, 0.0018, 0.0019, 0.0019, 0.0025, 0.0019, 0.0018, 0.0020,
        0.0019, 0.0023, 0.0019, 0.0013, 0.0018, 0.0018, 0.0018, 0.0018, 0.0020,
        0.0019, 0.0018, 0.0017, 0.0018, 0.0022, 0.0020, 0.0018, 0.0019, 0.0019,
        0.0018, 0.0025, 0.0019, 0.0018, 0.0018, 0.0016, 0.0022, 0.0024, 0.0019,
        0.0019, 0.0020, 0.0021, 0.0023, 0.0016, 0.0019, 0.0014, 0.0022, 0.0018,
        0.0019, 0.0018, 0.0020, 0.0018, 0.0018, 0.0018, 0.0019, 0.0016, 0.0020,
        0.0015, 0.0018, 0.0027, 0.0023, 0.0019, 0.0019, 0.0017, 0.0017, 0.0022,
        0.0019, 0.0015, 0.0020, 0.0018, 0.0019, 0.0018, 0.0019, 0.0026, 0.0019,
        0.0026, 0.0021, 0.0018, 0.0018, 

In [22]:
# Weight each attended position would get under uniform attention. The count
# is read from `attention_scores` instead of being hard-coded.
1 / attention_scores.shape[-1]

0.001953125