## Step 1: Training Setup



In [None]:
import math
from model import EncoderCNN, DecoderRNN
from data_loader import get_loader
from data_loader_val import get_loader as val_get_loader
from pycocotools.coco import COCO
from torchvision import transforms
from tqdm.notebook import tqdm
import torch.nn as nn
import torch
import torch.utils.data as data
from collections import defaultdict
import json
import os
import sys
import numpy as np
from nlp_utils import clean_sentence, bleu_score

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Training hyperparameters and run configuration.
batch_size = 128  # mini-batch size passed to get_loader
vocab_threshold = 5  # vocabulary threshold passed to get_loader (presumably a minimum word count; confirm in data_loader)
vocab_from_file = True  # passed to get_loader; presumably reuses the saved vocab.pkl when True (confirm in data_loader)
embed_size = 256  # embedding size given to EncoderCNN and DecoderRNN
hidden_size = 512  # hidden size given to DecoderRNN
num_epochs = 3  # number of training epochs
save_every = 1  # save the model weights every `save_every` epochs
print_every = 20  # print the training statistics every `print_every` steps
log_file = "training_log.txt"  # receives one statistics line per training step

# Folder that contains the `cocoapi` directory (annotations and images).
# The path is machine-specific, so it can be overridden without editing the
# notebook by setting the COCOAPI_DIR environment variable; the original
# location remains the default.
cocoapi_dir = os.environ.get(
    "COCOAPI_DIR",
    r"C:/Users/vedpa/OneDrive/Desktop/sampleProjects/image_captioning/",
)


# Pre-processing applied to every training image.
transform_train = transforms.Compose(
    [
        transforms.Resize(256),  # resize the image to size 256
        transforms.RandomCrop(224),  # take a random 224x224 crop
        transforms.RandomHorizontalFlip(),  # random left-right flip (default probability)
        transforms.ToTensor(),  # convert the image to a tensor
        # Per-channel normalization: first tuple is the mean, second the std
        # (presumably the statistics expected by the pre-trained encoder).
        transforms.Normalize(
            (0.485, 0.456, 0.406),
            (0.229, 0.224, 0.225),
        ),
    ]
)

In [8]:
# Create the training data loader from the configuration cell's settings.
data_loader = get_loader(
    mode="train",
    transform=transform_train,
    cocoapi_loc=cocoapi_dir,
    batch_size=batch_size,
    vocab_from_file=vocab_from_file,
    vocab_threshold=vocab_threshold,
)


Looking for annotations at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\annotations\captions_train2014.json
Looking for images at: C:\Users\vedpa\OneDrive\Desktop\sampleProjects\image_captioning\cocoapi\images\train2014

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.46s)
creating index...
index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:19<00:00, 21277.62it/s]


In [None]:
# Number of entries in the training dataset's vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the encoder/decoder pair and place both on the selected device.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
encoder.to(device)
decoder.to(device)

# Loss function, placed on the same device as the models.
criterion = nn.CrossEntropyLoss().to(device)

# Only the decoder's parameters and the encoder's `embed` layer are optimized.
params = [*decoder.parameters(), *encoder.embed.parameters()]
optimizer = torch.optim.Adam(params, lr=0.001)

# Number of mini-batch steps needed to cover the dataset once.
total_step = math.ceil(
    len(data_loader.dataset) / data_loader.batch_sampler.batch_size
)



In [10]:
# Show how many mini-batch steps the training loop runs per epoch.
print(total_step)

3236


## Step 2: Training the Model



In [None]:
# Create the checkpoint folder up front so the torch.save calls below cannot
# fail on a missing directory after an epoch of training has already run.
os.makedirs("./models", exist_ok=True)

# Opening the log with `with` guarantees the file is closed even when training
# is interrupted (e.g. KeyboardInterrupt) or raises part-way through.
with open(log_file, "w") as f:
    for epoch in range(1, num_epochs + 1):
        for i_step in range(1, total_step + 1):
            # Ask the dataset for a fresh set of training indices and install
            # a sampler over them, so the next batch is drawn from those indices.
            indices = data_loader.dataset.get_train_indices()
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move the batch to the device the models live on.
            images = images.to(device)
            captions = captions.to(device)

            # Clear the gradients left over from the previous step.
            decoder.zero_grad()
            encoder.zero_grad()

            # Forward pass: image features, then caption predictions.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Flatten predictions and targets for the loss, then take one
            # optimizer step.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
            loss.backward()
            optimizer.step()

            # Getting training statistics (perplexity is exp(loss)).
            loss_value = loss.item()
            stats = (
                f"Epoch [{epoch}/{num_epochs}], Step [{i_step}/{total_step}], "
                f"Loss: {loss_value:.4f}, Perplexity: {np.exp(loss_value):.4f}"
            )

            # Print training statistics to file; flush so the log is up to
            # date while training is still running.
            f.write(stats + "\n")
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print("\r" + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(
                decoder.state_dict(), os.path.join("./models", "decoder-%d.pkl" % epoch)
            )
            torch.save(
                encoder.state_dict(), os.path.join("./models", "encoder-%d.pkl" % epoch)
            )

Epoch [1/3], Step [20/3236], Loss: 4.8315, Perplexity: 125.3978
Epoch [1/3], Step [40/3236], Loss: 4.3798, Perplexity: 79.8236


KeyboardInterrupt: 

In [None]:
# Optional: uncomment to save the current weights manually as a final checkpoint.
# torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-final.pkl'))
# torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-final.pkl'))


## Step 3: Validating the Model Using the BLEU Score


In [None]:
from vocabulary import Vocabulary

# Load the saved vocabulary; its length sets the decoder's output size below.
vocabulary = Vocabulary(vocab_file='./models/vocab.pkl', vocab_from_file=True)

# Pre-processing for validation images: resize and normalize only, with no
# random augmentation.
transform_test = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.485, 0.456, 0.406),
            (0.229, 0.224, 0.225),
        ),
    ]
)


# Create the data loader.
val_data_loader = val_get_loader(
    transform=transform_test, mode="valid", cocoapi_loc=cocoapi_dir
)

# Checkpoint files to evaluate, looked up inside ./models.
encoder_file = "encoder-3.pkl"
decoder_file = "decoder-3.pkl"

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocabulary))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location=device remaps the stored tensors onto the device in use, so the
# checkpoints load on CPU-only and GPU machines alike.
encoder.load_state_dict(
    torch.load(os.path.join("./models", encoder_file), map_location=device)
)
decoder.load_state_dict(
    torch.load(os.path.join("./models", decoder_file), map_location=device)
)

# Keep both models on `device`: the inference cell moves every image with
# `.to(device)`, so placing the models elsewhere (e.g. forcing 'cpu' while
# CUDA is available) would raise a device-mismatch error.
encoder.to(device)
decoder.to(device)

# Switch to evaluation mode for inference.
encoder.eval()
decoder.eval()


Vocabulary successfully loaded from vocab.pkl file!
Vocabulary successfully loaded from vocab.pkl file!


DecoderRNN(
  (embed): Embedding(9955, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=9955, bias=True)
)

In [29]:
# Infer one caption for every validation image, keyed by image id.

# Run on whichever device the encoder's weights actually live on, so the input
# tensor always matches the model even if the model was moved after `device`
# was defined.
model_device = next(encoder.parameters()).device

# Index-to-word mapping used to turn predicted ids back into text.
idx2word = val_data_loader.dataset.vocab.idx2word

pred_result = defaultdict(list)
for img_id, img in tqdm(val_data_loader):
    img = img.to(model_device)
    # No gradients are needed for inference.
    with torch.no_grad():
        features = encoder(img).unsqueeze(1)
        output = decoder.sample(features)
    sentence = clean_sentence(output, idx2word)
    pred_result[img_id.item()].append(sentence)

  0%|          | 0/40504 [00:00<?, ?it/s]

In [30]:
# Load the validation captions and group them (lower-cased) by image id.
annotation_path = os.path.join(
    cocoapi_dir, "cocoapi", "annotations/captions_val2014.json"
)

# JSON is UTF-8 text; stating the encoding avoids depending on the platform's
# default (the locale code page on Windows).
with open(annotation_path, "r", encoding="utf-8") as f:
    caption = json.load(f)

valid_annot = caption["annotations"]
valid_result = defaultdict(list)
for annot in valid_annot:
    valid_result[annot["image_id"]].append(annot["caption"].lower())

In [31]:
# Peek at the reference captions of the first three validation images.
[valid_result[image_id] for image_id in list(valid_result)[:3]]

[['a bicycle replica with a clock as the front wheel.',
  'the bike has a clock as a tire.',
  'a black metal bicycle with a clock inside the front wheel.',
  'a bicycle figurine in which the front wheel is replaced with a clock\n',
  'a clock with the appearance of the wheel of a bicycle '],
 ['a black honda motorcycle parked in front of a garage.',
  'a honda motorcycle parked in a grass driveway',
  'a black honda motorcycle with a dark burgundy seat.',
  'ma motorcycle parked on the gravel in front of a garage',
  'a motorcycle with its brake extended standing outside'],
 ['a room with blue walls and a white sink and door.',
  'blue and white color scheme in a small bathroom.',
  'this is a blue and white bathroom with a wall sink and a lifesaver on the wall.',
  'a blue boat themed bathroom with a life preserver on the wall',
  'a bathroom with walls that are painted baby blue.']]

In [32]:
# Peek at the generated captions of the first three predicted images.
[pred_result[image_id] for image_id in list(pred_result)[:3]]

[[' a group of people playing a game of frisbee.'],
 [' a plate of food with a fork and knife'],
 [' a group of cows standing in a field.']]

In [33]:
# Compute the BLEU score of the generated captions against the references.
bleu_score(predicted_sentences=pred_result, true_sentences=valid_result)

0.20911807788961906