# PyTorch Image Captioning Tutorial

## Resources
#### Video
- https://www.youtube.com/watch?v=y2BaTt1fxJU&list=PL3bRG3rC5WYd517_ZF0GewyFzlsmDag0o&index=12

#### Article 

#### Code 
- **[c1]** https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning
- **[c2]** https://github.com/aladdinpersson/Machine-Learning-Collection/tree/master/ML/Pytorch/more_advanced/image_captioning

![](./archi.png)

[Image From **[c1]**]

In **[c1]** the author uses ResNet-152, so the feature vector at the fc layer is 1 * 1 * 2048. We will use VGG-19 instead, and the feature vector we take at its fc layer is 1 * 1 * 4096.

In [1]:
import torch
from torch import nn, optim
from torchvision import models, transforms

In [2]:
from custom_dataset import get_loader

In [3]:
# Pick the GPU only when CUDA is actually usable, otherwise fall back to CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select "cuda" even on a CPU-only machine.
a_device = "cuda" if torch.cuda.is_available() else "cpu"
print(a_device)
device = torch.device(a_device)
device

cuda


device(type='cuda')

In [4]:
# Preprocessing applied to every image before it reaches the CNN encoder:
# resize, take a random 299x299 crop, convert to a float tensor, then
# normalise each RGB channel.
# The channels are normalised with the ImageNet mean/std that torchvision's
# pretrained weights were trained with; the encoder's convolutional layers are
# not fine-tuned in this notebook, so they need inputs in that range.
# NOTE(review): assumes the encoder in `model.py` uses ImageNet-pretrained
# weights like the VGG explored below — confirm.
transform = transforms.Compose(
    [
        transforms.Resize((356, 356)),
        transforms.RandomCrop((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)

In [5]:
# Locations of the Flickr8k images and of the captions file.
image_folder = "./flickr8k/images"
# NOTE(review): `csv` shares its name with the stdlib `csv` module; harmless as
# long as that module is not imported in this notebook.
csv = "./flickr8k/captions.txt"
# `get_loader` (from custom_dataset) returns the batch loader and the dataset;
# the dataset exposes the vocabulary as `dataset.vocab`.
loader, dataset = get_loader(image_folder, csv, transform=transform)

In [6]:
len(dataset.vocab)

2664

In [7]:
vocab_size = len(dataset.vocab)
vocab_size

2664

In [8]:
# Peek at a single batch to check tensor shapes, then stop iterating.
# In the recorded run the images are (batch, channels, height, width) and the
# captions are (seq_len, batch), i.e. the captions are time-major.
# `x` and `y` stay bound afterwards and are reused by the exploration cells.
for x, y in loader:
    print(x.shape)
    print(y.shape)
    break

torch.Size([32, 3, 299, 299])
torch.Size([23, 32])


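## Loading the pretrained VGG19 model, and breaking down the CNNEncoder
- The code below loads `models.vgg19`; the variable is (confusingly) named `vgg16`.
- We will not use the final fully connected (fc) layer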

In [7]:
# NOTE: despite the variable name, this loads the VGG-19 architecture
# (`models.vgg19`) with pretrained weights. The name `vgg16` is kept because
# later cells refer to it.
vgg16 = models.vgg19(pretrained=True)
print(vgg16)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [42]:
vgg16.features

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (17): ReLU(inplace=True)
  (18): MaxPoo

In [47]:
# Keep every top-level child of the network except the last one.
# For torchvision's VGG the last child is the whole `classifier` block (not a
# single fc layer), so what remains is the convolutional part of the network;
# in the recorded run it yields a (batch, 512, 7, 7) feature map.
vgg16_modules = list(vgg16.children())[:-1]  # drop the final child (the classifier block)
vgg16_feature_extractor1 = nn.Sequential(*vgg16_modules)
vgg16_feature_extractor1

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [160]:
with torch.no_grad():
    feature_x = vgg16_feature_extractor1(x)
# encodings = relu(linear(feature_x))

In [161]:
feature_x.shape

torch.Size([32, 512, 7, 7])

In [56]:
feature_x.view(feature_x.shape[0],-1).shape

torch.Size([32, 25088])

In [57]:
# Take the classifier block but drop its last layer, leaving the
# 25088 -> 4096 -> 4096 stack (Linear/ReLU/Dropout twice) shown below.
# NOTE: this stack contains Dropout layers, which are active unless the
# module is put in eval mode.
no_last_fc =  list(vgg16.classifier.children())[:-1]
vgg16_feature_extractor2 = nn.Sequential(*no_last_fc)
vgg16_feature_extractor2

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
)

In [58]:
with torch.no_grad():
    feature_x = feature_x.view(feature_x.shape[0],-1)
    feature_x = vgg16_feature_extractor2(feature_x)
# encodings = relu(linear(feature_x))
feature_x.shape

torch.Size([32, 4096])

In [162]:
x.shape

torch.Size([32, 3, 224, 224])

In [147]:
emb_size = 120
linear = nn.Linear(4096, emb_size)
relu = nn.ReLU()

In [60]:
feature_x.shape

torch.Size([32, 4096])

In [145]:
# Full encoder pass: conv features -> flatten per sample -> truncated
# classifier, all without tracking gradients.
with torch.no_grad():
    conv_maps = vgg16_feature_extractor1(x)
    flattened = conv_maps.view(conv_maps.shape[0], -1)
    feature_x = vgg16_feature_extractor2(flattened)

# Project the 4096-d image features down to the embedding size; this part
# is outside `no_grad`, so the projection stays differentiable.
projected = linear(feature_x)
encodings = relu(projected)

In [148]:
encodings.shape

torch.Size([32, 120])

## Breaking RNN Decoder

In [149]:
y.shape

torch.Size([20, 32])

In [211]:
vocab_size

2664

In [213]:
# Building blocks of the decoder, explored step by step in the following cells.
embedding_layer = nn.Embedding(vocab_size, emb_size)  # token id -> emb_size vector
lstm_layer = nn.LSTM(emb_size, 50, 1)  # input size emb_size, hidden size 50, 1 layer
# NOTE: this rebinds `linear`, which earlier held the 4096 -> emb_size encoder
# projection; re-running the encoder cell after this one would pick up this layer.
linear = nn.Linear(50, vocab_size)  # hidden state -> one score per vocabulary entry

In [174]:
# Embed every caption time step except the last one. The image encoding is
# prepended in a later cell, which brings the sequence back to the length of `y`.
embds = embedding_layer(y[:-1])  # not passing final word 
embds.shape

torch.Size([19, 32, 120])

In [175]:
encodings.unsqueeze(0).shape

torch.Size([1, 32, 120])

In [176]:
# Prepend the image encoding as the first time step of the sequence:
# (1, batch, emb) concatenated with (seq_len - 1, batch, emb) along dim 0.
final_embds = torch.cat((encodings.unsqueeze(0), embds), dim=0)
final_embds.shape

torch.Size([20, 32, 120])

In [178]:
# Run the LSTM over the whole sequence. Passing None as the initial state
# lets PyTorch start from zero hidden and cell states.
# `o` holds the hidden output for every time step; (h, c) is the final state.
o, (h, c) = lstm_layer(final_embds, None)
o.shape

torch.Size([20, 32, 50])

In [179]:
# Map every time step's hidden output to vocabulary scores; the result has
# one score vector of length vocab_size per (time step, sample).
out = linear(o)
out.shape

torch.Size([20, 32, 2664])

In [108]:
y.shape

torch.Size([20, 32])

In [10]:
from model import VGG16_Encoder, RNN_Decoder, CNNtoRNN

In [11]:
captionModel = CNNtoRNN(256, vocab_size, 256, 1)

In [12]:
captionModel.cnnEncoder.feature_extractor2

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
)

In [177]:
caption_de = captionModel(x, y[:-1])  # not passing final word 
caption_de.shape

torch.Size([20, 32, 2664])

In [180]:
caption_de.shape

torch.Size([20, 32, 2664])

In [182]:
y.shape

torch.Size([20, 32])

In [184]:
caption_de.reshape(-1, caption_de.shape[2]).shape

torch.Size([640, 2664])

In [186]:
w_v_m = caption_de.reshape(-1, caption_de.shape[2])
w_v_m[0].shape

torch.Size([2664])

In [187]:
w_v_m[0]

tensor([ 0.0009, -0.0055, -0.0191,  ..., -0.0325,  0.0148, -0.0369],
       grad_fn=<SelectBackward>)

In [185]:
y.reshape(-1).shape

torch.Size([640])

### How to calculate the loss with "`nn.CrossEntropyLoss()`" using this caption output

In [188]:
# Cross-entropy takes (N, classes) scores and (N,) targets, so collapse the
# time and batch axes of both tensors before calling it.
flat_scores = caption_de.reshape(-1, caption_de.shape[2])
flat_targets = y.reshape(-1)
torch.nn.functional.cross_entropy(flat_scores, flat_targets)

tensor(7.8598, grad_fn=<NllLossBackward>)

### Let's train the model

In [13]:
# Build the full captioning model (CNNtoRNN from model.py) and move it to the
# device selected at the top of the notebook.
captionModel = CNNtoRNN(embedding_size=256, 
                   vocab_size=vocab_size, 
                   hidden_size=256,
                   num_layers=1).to(device)

### Selecting the parameters which we want to train 
- the fc layers of the cnnEncoder 
- the rnnDecoder layer 

In [15]:
# Only the encoder's fully connected parts (`feature_extractor2` and `linear`)
# and the whole decoder are handed to the optimizer.
cnn = captionModel.cnnEncoder
encoder_params = [*cnn.feature_extractor2.parameters(), *cnn.linear.parameters()]
decoder_params = [*captionModel.rnnDecoder.parameters()]

# Decoder parameters first, then the encoder ones.
parameters_to_train = [*decoder_params, *encoder_params]
len(parameters_to_train)

13

### Select loss function and optimizer

In [16]:
# Token-level classification loss, and Adam over the selected parameters only.
# NOTE(review): no `ignore_index` is set, so if the loader pads captions the
# padded positions contribute to the loss — confirm whether that is intended.
lr = 0.01
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(parameters_to_train, lr=lr)

## Now the training part

In [17]:
epochs = 2

In [19]:
# Train for `epochs` full passes over the data.
# Each step feeds the images plus the caption without its last time step; the
# model returns one vocabulary score vector per time step, which is compared
# against the full caption after flattening the time and batch axes.
# The loop previously hit a `break` after the first epoch (guarded by the
# always-true `epoch % 1 == 0`), so `epochs` had no effect; it now runs them all.
print_every = 1  # report the loss every N epochs

for epoch in range(epochs):
    for images, captions in loader:
        images = images.to(device)
        captions = captions.to(device)

        optimizer.zero_grad()

        output = captionModel(images, captions[:-1])
        loss = criterion(
            output.reshape(-1, output.shape[2]),
            captions.reshape(-1),
        )
        loss.backward()

        optimizer.step()

    # `loss` is the loss of the epoch's last batch, not an epoch average.
    if (epoch + 1) % print_every == 0:
        print(f"epochs {epoch+1}, loss {loss.item()}")

RuntimeError: CUDA out of memory. Tried to allocate 700.00 MiB (GPU 0; 3.82 GiB total capacity; 1.30 GiB already allocated; 260.94 MiB free; 2.44 GiB reserved in total by PyTorch)

### Saving the model 

In [None]:
checkpoint_path = "image_captioning.pth.tar"

In [None]:
# Bundle everything needed to resume training into a single dictionary and
# write it to `checkpoint_path`.
checkpoint = {
    # `captionModel` is the network trained in this notebook.
    "state_dict": captionModel.state_dict(),
    "optimizer": optimizer.state_dict(),
    # The training loop keeps no global step counter, so the number of epochs
    # run so far (from the loop variable `epoch`) is stored under this key.
    "step": epoch + 1,
}

torch.save(checkpoint, checkpoint_path)

### Loading the model

In [None]:
loaded_checkpoint = torch.load(checkpoint_path)

In [None]:
# model = 
# optimizer = 

In [None]:
# model.load_state_dict(loaded_checkpoint["state_dict"])
# optimizer.load_state_dict(loaded_checkpoint["optimizer"])
