## Image Captioning - Training Notebook

Make sure to set up the project environment and data according to the README before running this notebook.

In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
from pycocotools.coco import COCO
from imcaption.data_loader import get_transform, get_loader
from imcaption.model import EncoderCNN, DecoderRNN
import math
import torch.utils.data as data
import numpy as np
import os
import sys
import requests
import time

# --- Data / vocabulary ---
batch_size = 16                   # batch size
vocab_threshold = 20              # minimum word count threshold
vocab_from_file = False           # if True, load existing vocab file

# --- Model ---
embed_size = 256                  # dimensionality of image and word embeddings
hidden_size = 512                 # number of features in hidden state of the RNN decoder

# --- Optimization ---
num_epochs = 5                    # number of training epochs
learning_rate = 0.001             # learning rate passed to the optimizer

# --- Logging / checkpointing ---
print_every = 2000                # keep a printed line of training statistics every `print_every` steps
save_every = 1                    # save the model weights every `save_every` epochs
saved_model_dir = "saved_models"  # folder containing the saved model weights

In [2]:
# Set up nltk: download the 'punkt' package.
import nltk
nltk.download('punkt')

# Define the image transform used for the training data.
transform_train = get_transform()

# Build the training data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Select the compute device once (GPU if CUDA is available, CPU otherwise)
# and place the models on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function and place it on the same device as the models,
# instead of repeating the CUDA availability check.
criterion = nn.CrossEntropyLoss().to(device)

# Specify the learnable parameters of the model.
# NOTE(review): only the decoder's parameters are handed to the optimizer, so
# any trainable layers inside the encoder are never updated even though the
# encoder is checkpointed alongside the decoder — confirm this is intended.
params = decoder.parameters()

# Define the optimizer.
optimizer = torch.optim.Adam(params, lr=learning_rate)

[nltk_data] Downloading package punkt to /home/houssam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading annotations into memory...
Done (t=1.59s)
creating index...
index created!
[0/414113] Tokenizing captions...
[100000/414113] Tokenizing captions...
[200000/414113] Tokenizing captions...
[300000/414113] Tokenizing captions...
[400000/414113] Tokenizing captions...
loading annotations into memory...
Done (t=0.68s)
creating index...


  0%|          | 792/414113 [00:00<00:52, 7917.91it/s]

index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:49<00:00, 8348.27it/s]


In [None]:
# Create the checkpoint folder if needed. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(saved_model_dir, exist_ok=True)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

# Width of the widest status line written so far. Each status line is padded
# to this width so that rewriting the line with '\r' fully overwrites a longer
# previous line instead of leaving its trailing characters behind.
status_width = 0

for epoch in range(1, num_epochs+1):
    for i_step in range(1, total_step+1):
        # Randomly sample a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler

        # Obtain the batch (a fresh iterator is built so the new sampler is used).
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)

        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()

        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)

        # Calculate the batch loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        # Backward pass.
        loss.backward()

        # Update the parameters in the optimizer.
        optimizer.step()

        # Get training statistics. Read the loss value once and reuse it for
        # both the loss and the perplexity.
        loss_value = loss.item()
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss_value, np.exp(loss_value))

        # Pad to the widest line seen so far so no stale characters remain.
        status_width = max(status_width, len(stats))
        padded_stats = stats.ljust(status_width)

        # Print training statistics (on same line).
        print('\r' + padded_stats, end="")
        sys.stdout.flush()

        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + padded_stats)

    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join(saved_model_dir, 'decoder-%d.pkl' % epoch))
        torch.save(encoder.state_dict(), os.path.join(saved_model_dir, 'encoder-%d.pkl' % epoch))


Epoch [1/5], Step [2000/25883], Loss: 3.6123, Perplexity: 37.0494
Epoch [1/5], Step [4000/25883], Loss: 2.7713, Perplexity: 15.9792
Epoch [1/5], Step [6000/25883], Loss: 2.2727, Perplexity: 9.70529
Epoch [1/5], Step [8000/25883], Loss: 2.2720, Perplexity: 9.69854
Epoch [1/5], Step [10000/25883], Loss: 2.4542, Perplexity: 11.6373
Epoch [1/5], Step [12000/25883], Loss: 2.0725, Perplexity: 7.94450
Epoch [1/5], Step [14000/25883], Loss: 2.4636, Perplexity: 11.7468
Epoch [1/5], Step [16000/25883], Loss: 2.3179, Perplexity: 10.1546
Epoch [1/5], Step [18000/25883], Loss: 2.5545, Perplexity: 12.8646
Epoch [1/5], Step [20000/25883], Loss: 2.2813, Perplexity: 9.78967
Epoch [1/5], Step [22000/25883], Loss: 2.0412, Perplexity: 7.70005
Epoch [1/5], Step [24000/25883], Loss: 2.0952, Perplexity: 8.12696
Epoch [2/5], Step [2000/25883], Loss: 2.0445, Perplexity: 7.725482
Epoch [2/5], Step [4000/25883], Loss: 2.1870, Perplexity: 8.90866
Epoch [2/5], Step [5137/25883], Loss: 2.3620, Perplexity: 10.6125