## Step 1: Training Setup

In [1]:
import torch
import torch.nn as nn
import math
import torch.utils.data as data
import numpy as np
import os
import requests
import time
import sys

from torchvision import transforms
from pycocotools.coco import COCO

from utils.data_loader import get_loader
from utils.model import EncoderCNN, DecoderRNN

In [2]:
## Training configuration: choose values for the settings below.

# Data / vocabulary
batch_size = 32                        # batch size
vocab_threshold = 5                    # minimum word count threshold
vocab_from_file = True                 # True -> load the existing vocab file

# Model dimensions
embed_size = 512                       # dimensionality of image and word embeddings
hidden_size = 512                      # number of features in the RNN decoder's hidden state

# Training schedule
num_epochs = 3                         # number of training epochs
print_every = 100                      # step interval for printing statistics on a new line
save_every = 100                       # step interval for saving model weights

# Logging
log_file = 'logs/training_log_2.txt'   # file that receives the training loss and perplexity

In [3]:
# Extra training settings.
num_layers = 3     # layer count handed to DecoderRNN
clip_value = 2     # the maximum gradient value for clipping

In [4]:
# Image preprocessing pipeline applied to training images.
transform_train = transforms.Compose([
    # Resize so that the smaller edge of the image is 256.
    transforms.Resize(size=256),
    # Take a 224x224 crop from a random location.
    transforms.RandomCrop(size=224),
    # Flip the image horizontally with probability 0.5.
    transforms.RandomHorizontalFlip(p=0.5),
    # Convert the PIL Image to a tensor.
    transforms.ToTensor(),
    # Normalize the image for the pre-trained model.
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

In [5]:
# Create the training data loader from the transform and vocabulary settings.
data_loader = get_loader(
    mode='train',
    transform=transform_train,
    batch_size=batch_size,
    vocab_from_file=vocab_from_file,
    vocab_threshold=vocab_threshold,
)

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=1.37s)
creating index...
index created!
Obtaining caption lengths...


HBox(children=(FloatProgress(value=0.0, max=414113.0), HTML(value='')))




In [6]:
# Vocabulary size = number of entries in the vocabulary held by the loader's dataset.
caption_vocab = data_loader.dataset.vocab
vocab_size = len(caption_vocab)

In [7]:
# Initialize the encoder and decoder.
# EncoderCNN is built from the embedding size alone; DecoderRNN is built from the
# embedding size, hidden size, vocabulary size and layer count defined in earlier cells.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

In [8]:
# Load the trained weights.
# map_location='cpu' deserializes every tensor onto the CPU, so checkpoints that were
# saved from a GPU still load on a machine without CUDA. The models are moved to the
# selected device afterwards with .to(device).
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
encoder.load_state_dict(torch.load('./models/encoder-3.pkl', map_location='cpu'))
decoder.load_state_dict(torch.load('./models/decoder-3.pkl', map_location='cpu'))

<All keys matched successfully>

In [9]:
# Select the GPU when CUDA is available (CPU otherwise) and move both models there.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
encoder.to(device)
decoder.to(device)

DecoderRNN(
  (embedding): Embedding(9955, 512)
  (lstm): LSTM(512, 512, num_layers=3, batch_first=True)
  (fc): Linear(in_features=512, out_features=9955, bias=True)
)

In [10]:
# Cross-entropy loss, moved to the GPU when CUDA is available.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    criterion = criterion.cuda()

In [11]:
# Learnable parameters: all decoder parameters followed by the encoder's `embed` parameters.
params = [*decoder.parameters(), *encoder.embed.parameters()]

In [12]:
# Adam optimizer over the learnable parameters.
optimizer = torch.optim.Adam(params, lr=3e-3)

In [13]:
# Training steps per epoch: number of caption lengths divided by the batch size, rounded up.
num_captions = len(data_loader.dataset.caption_lengths)
sampler_batch_size = data_loader.batch_sampler.batch_size
total_step = math.ceil(num_captions / sampler_batch_size)

## Step 2: Train your Model

In [14]:
# Open the training log for writing (mode 'w' truncates an existing file).
# Create the parent directory first so a fresh checkout without it does not
# fail with FileNotFoundError; skip when log_file has no directory component.
log_dir = os.path.dirname(log_file)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)
f = open(log_file, 'w')

In [15]:
# Train the encoder/decoder, logging statistics for every step and periodically
# saving the weights.
# NOTE(review): the starting epoch is hard-coded to 3 — presumably to continue from the
# encoder-3/decoder-3 checkpoints loaded earlier; confirm before changing num_epochs.
try:
    for epoch in range(3, num_epochs+1):

        for i_step in range(1, total_step+1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch (a fresh iterator is needed because the sampler was just replaced).
            images, captions = next(iter(data_loader))

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Clip each decoder gradient element to [-clip_value, clip_value].
            # NOTE(review): only decoder gradients are clipped, although the
            # encoder's embed parameters are optimized as well.
            torch.nn.utils.clip_grad_value_(decoder.parameters(), clip_value)

            # Update the parameters in the optimizer.
            optimizer.step()

            # Read the scalar loss once and reuse it for both statistics.
            loss_value = loss.item()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss_value, np.exp(loss_value))

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

            # Save the weights every `save_every` steps and also at the last step of the
            # epoch, so the trailing steps are not lost when total_step is not a
            # multiple of save_every.
            if i_step % save_every == 0 or i_step == total_step:
                torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
                torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))
finally:
    # Close the training log file, even when training fails or is interrupted.
    f.close()

Epoch [3/3], Step [100/12942], Loss: 2.1846, Perplexity: 8.8870
Epoch [3/3], Step [200/12942], Loss: 2.1471, Perplexity: 8.56004
Epoch [3/3], Step [300/12942], Loss: 2.2987, Perplexity: 9.96091
Epoch [3/3], Step [400/12942], Loss: 2.5213, Perplexity: 12.4451
Epoch [3/3], Step [500/12942], Loss: 2.1132, Perplexity: 8.27493
Epoch [3/3], Step [600/12942], Loss: 2.3283, Perplexity: 10.2602
Epoch [3/3], Step [700/12942], Loss: 2.1786, Perplexity: 8.83360
Epoch [3/3], Step [800/12942], Loss: 2.4615, Perplexity: 11.7222
Epoch [3/3], Step [900/12942], Loss: 2.4042, Perplexity: 11.0695
Epoch [3/3], Step [1000/12942], Loss: 1.8481, Perplexity: 6.3477
Epoch [3/3], Step [1100/12942], Loss: 2.2758, Perplexity: 9.73606
Epoch [3/3], Step [1200/12942], Loss: 2.2271, Perplexity: 9.27321
Epoch [3/3], Step [1300/12942], Loss: 2.1168, Perplexity: 8.30464
Epoch [3/3], Step [1400/12942], Loss: 2.3126, Perplexity: 10.1006
Epoch [3/3], Step [1500/12942], Loss: 2.1319, Perplexity: 8.43117
Epoch [3/3], Step [16