
# Notebook 3: Training the CNN-RNN model


<a id='step1'></a>
## 1: Training Preparation

* ``params``: a Python list containing the learnable parameters of the model.
* Only the weights of the encoder's embedding layer and the weights of the decoder are trained.

In [None]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
sys.path.append('/opt/cocoapi/PythonAPI')
from pycocotools.coco import COCO
from data_loader import get_loader
from model import EncoderCNN, DecoderRNN
import math

# ---- Hyperparameters and run configuration --------------------------------
batch_size = 128           # batch size passed to the data loader
vocab_threshold = 4        # minimum word count threshold
vocab_from_file = True     # forwarded to get_loader; presumably reuses a saved vocabulary -- confirm in data_loader
embed_size = 256           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 3             # number of training epochs
save_every = 1             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss

# ---- Image pre-processing applied to every training image -----------------
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The decoder's output dimension is the number of entries in the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# ---- Models ---------------------------------------------------------------
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Pick the device once and move everything (models and loss) with the same call,
# so there is a single source of truth for where computation happens.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# ---- Loss and optimizer ---------------------------------------------------
criterion = nn.CrossEntropyLoss().to(device)

# Parameters handed to the optimizer: every decoder parameter plus the
# parameters of the encoder's `embed` layer. Nothing else in the encoder is optimized.
params = list(decoder.parameters()) + list(encoder.embed.parameters()) 
optimizer = torch.optim.Adam(params, lr=0.001) 

# Number of steps per epoch: ceil(len(caption_lengths) / batch size).
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

loading annotations into memory...
Done (t=1.30s)
creating index...
index created!
[0/414113] Tokenizing captions...
[100000/414113] Tokenizing captions...
[200000/414113] Tokenizing captions...
[300000/414113] Tokenizing captions...
[400000/414113] Tokenizing captions...
loading annotations into memory...
Done (t=0.81s)
creating index...
index created!
Obtaining caption lengths...


100%|████████████████████████████████████████████████████████████████████████| 414113/414113 [00:48<00:00, 8512.66it/s]


## 2: Training the Model


In [None]:
import torch.utils.data as data
import numpy as np
import os


# Create the checkpoint directory up front: without it, torch.save would raise
# only after an entire epoch of training has already been spent.
os.makedirs('./models', exist_ok=True)

# Train for `num_epochs` epochs of `total_step` steps each; one batch is drawn
# and one optimizer update is applied per step.
for epoch in range(1, num_epochs+1):
    for i_step in range(1, total_step+1):
        
        # Randomly sample a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler
        
        # Obtain the batch.
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)
        
        # Clear gradients left over from the previous step.
        decoder.zero_grad()
        encoder.zero_grad()

        # Forward pass: image features -> decoder scores.
        features = encoder(images)
        outputs = decoder(features, captions)

        # Flatten scores to (-1, vocab_size) and targets to (-1) for the loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
            
        # Get training statistics. Read the scalar loss once and reuse it.
        loss_value = loss.item()
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss_value, np.exp(loss_value))
        
        # Overwrite the current console line with the latest statistics.
        print('\r' + stats, end="")
        
        # Every `print_every` steps, end the line so the statistics stay in the log.
        if i_step % print_every == 0:
            print('\r' + stats)
            
    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
        torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))


Epoch [1/3], Step [100/3236], Loss: 3.7985, Perplexity: 44.6321
Epoch [1/3], Step [200/3236], Loss: 3.5927, Perplexity: 36.33101
Epoch [1/3], Step [300/3236], Loss: 3.1750, Perplexity: 23.9261
Epoch [1/3], Step [400/3236], Loss: 3.2486, Perplexity: 25.7532
Epoch [1/3], Step [500/3236], Loss: 3.3328, Perplexity: 28.01535
Epoch [1/3], Step [600/3236], Loss: 2.8008, Perplexity: 16.4580
Epoch [1/3], Step [700/3236], Loss: 3.0129, Perplexity: 20.3461
Epoch [1/3], Step [800/3236], Loss: 2.7014, Perplexity: 14.9004
Epoch [1/3], Step [900/3236], Loss: 2.8401, Perplexity: 17.1173
Epoch [1/3], Step [1000/3236], Loss: 2.4991, Perplexity: 12.1720
Epoch [1/3], Step [1100/3236], Loss: 2.8928, Perplexity: 18.0430
Epoch [1/3], Step [1200/3236], Loss: 2.3459, Perplexity: 10.4429
Epoch [1/3], Step [1300/3236], Loss: 2.4772, Perplexity: 11.9076
Epoch [1/3], Step [1400/3236], Loss: 2.4438, Perplexity: 11.5164
Epoch [1/3], Step [1500/3236], Loss: 2.3213, Perplexity: 10.1889
Epoch [1/3], Step [1600/3236], L