<a id='step1'></a>
## Step 1: Training Setup

In [None]:
# Colab only: fetch the project sources and move them into /content
# (presumably so the `datasets` and `model` imports in the setup cell resolve).
# IF YOU ARE NOT USING COLAB THEN COMMENT OUT THE NEXT TWO LINES AND RUN THE REST.
! git clone https://github.com/tojiboyevf/image_captioning.git
! mv  -v /content/image_captioning/* /content/

# Fetch and build the COCO Python API in place; the setup cell adds
# cocoapi/PythonAPI to sys.path. Each `!` line runs in its own subshell, so no
# `cd ..` is needed afterwards.
! git clone https://github.com/cocodataset/cocoapi.git
! cd cocoapi/PythonAPI && make
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pycocotools
! mkdir -p models cocoapi/images

# Download the 2014 annotations and validation images.
# `wget -c` resumes/reuses an existing archive instead of saving a `.zip.1` copy
# when the cell is re-run.
! wget -c http://images.cocodataset.org/annotations/annotations_trainval2014.zip
! wget -c http://images.cocodataset.org/zips/val2014.zip
# `unzip -n` never overwrites (so a re-run does not stop at an overwrite prompt);
# `-q` keeps the per-file listing out of the notebook output.
! unzip -n -q annotations_trainval2014.zip -d cocoapi/
! unzip -n -q val2014.zip -d cocoapi/images

# Read it!!!
### The train dataset contains about 82k images. To get started faster, you can either reduce the number of files in the train dataset, or train on the validation dataset instead (change the image folder name and the caption file name to their `val` versions).

In [None]:
# Download the 2014 training images (`-c` resumes a partial download on re-run).
! wget -c http://images.cocodataset.org/zips/train2014.zip
# Extract quietly without overwriting existing files, and delete the archive
# ONLY if extraction succeeded, so a failed unzip (e.g. disk full) does not
# throw away the download.
! unzip -n -q train2014.zip -d cocoapi/images && rm train2014.zip

In [None]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
sys.path.append('cocoapi/PythonAPI')
from pycocotools.coco import COCO
from datasets.coco_loader import get_loader
from model import EncoderCNN, DecoderRNN
import math


# --- Hyperparameters and run configuration (edit these to tune the run) ---
batch_size = 128           # batch size
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = False    # if True, load existing vocab file
embed_size = 300           # dimensionality of image and word embeddings
hidden_size = 128          # number of features in hidden state of the RNN decoder
num_epochs = 1             # number of training epochs
save_every = 1             # save model weights every `save_every` epochs
print_every = 100          # keep a stats line on screen every `print_every` steps
log_file = 'training_log.txt'       # name of file with saved training loss and perplexity

# --- Image preprocessing applied to every training image ---
transform_train = transforms.Compose([
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # random 224x224 crop
    transforms.RandomHorizontalFlip(),               # random left-right flip
    transforms.ToTensor(),                           # PIL image -> tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # per-channel mean ...
                         (0.229, 0.224, 0.225))])    # ... and std (the values torchvision documents for its pretrained models)

# --- Data loader (get_loader comes from datasets/coco_loader, not shown here) ---
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# --- Model: initialize the encoder and decoder ---
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# --- Loss function ---
# Placed via the same `device` as the models instead of repeating the CUDA check.
criterion = nn.CrossEntropyLoss().to(device)

# --- Learnable parameters ---
# Only the decoder and the encoder's `embed` submodule are handed to the
# optimizer, so no other encoder parameters are updated by it.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# --- Optimizer ---
optimizer = torch.optim.Adam(params, lr = 0.001, weight_decay=1e-5)

# Set the total number of training steps per epoch:
# number of captions divided by the batch size, rounded up.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

<a id='step2'></a>
## Step 2: Train your Model

In [None]:
import torch.utils.data as data
import numpy as np
import os
import time
from torch.utils.tensorboard import SummaryWriter

# Train the encoder/decoder for `num_epochs` epochs of `total_step` steps each.
# Every step is logged to `log_file` and to TensorBoard; weights are saved to
# ./models every `save_every` epochs.

# Checkpoints are written to ./models below; create it here so this cell does
# not depend on an earlier `mkdir` having been run.
os.makedirs('./models', exist_ok=True)

writer = SummaryWriter(log_dir='logs/resnetlstm')

try:
    # Open the training log file. The context manager closes it even if a step
    # raises or the run is interrupted.
    with open(log_file, 'w') as f:

        for epoch in range(1, num_epochs+1):

            for i_step in range(1, total_step+1):

                # Ask the dataset for this step's training indices (per the original
                # notebook: indices of captions sharing one randomly sampled length;
                # get_train_indices is defined in datasets/coco_loader, not shown here).
                indices = data_loader.dataset.get_train_indices()
                # Create and assign a batch sampler to retrieve a batch with the sampled indices.
                new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
                data_loader.batch_sampler.sampler = new_sampler

                # Obtain the batch. A fresh iterator is needed each step because the
                # sampler was just replaced.
                images, captions = next(iter(data_loader))

                # Move batch of images and captions to GPU if CUDA is available.
                images = images.to(device)
                captions = captions.to(device)

                # Zero the gradients.
                decoder.zero_grad()
                encoder.zero_grad()

                # Pass the inputs through the CNN-RNN model.
                features = encoder(images)
                outputs = decoder(features, captions)

                # Calculate the batch loss over the flattened predictions and targets.
                loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

                # Backward pass.
                loss.backward()

                # Update the parameters in the optimizer.
                optimizer.step()

                # Read the scalar loss once and derive perplexity from it.
                loss_value = loss.item()
                perplexity = np.exp(loss_value)

                # Get training statistics.
                stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss_value, perplexity)

                # Use a step counter that keeps increasing across epochs; using
                # i_step alone would make every epoch overwrite the same x-range
                # in TensorBoard.
                global_step = (epoch - 1) * total_step + i_step
                writer.add_scalar("Train loss", loss_value, global_step=global_step)
                writer.add_scalar("Perplexity", perplexity, global_step=global_step)

                # Print training statistics (on same line).
                print('\r' + stats, end="")
                sys.stdout.flush()

                # Print training statistics to file.
                f.write(stats + '\n')
                f.flush()

                # Print training statistics (on different line).
                if i_step % print_every == 0:
                    print('\r' + stats)

            # Save the weights.
            if epoch % save_every == 0:
                torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
                torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))
finally:
    # Flush pending TensorBoard events and release the event file, whether
    # training finished or failed.
    writer.close()