<a id='step1'></a>
## Step 1: Get Data Loader for Test Dataset

In [None]:
import sys
sys.path.append('cocoapi/PythonAPI')
from pycocotools.coco import COCO
from data.coco_loader import get_loader
from torchvision import transforms

# Pre-processing applied to every evaluation image: resize to 224x224,
# convert to a tensor, then normalize each channel with the mean/std below.
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)
preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]
transform_test = transforms.Compose(preprocess_steps)

# Build the data loader with the transform above.
# NOTE(review): the section heading says "Test Dataset" but the loader is
# created with mode='val' -- confirm this is the intended split.
data_loader = get_loader(transform=transform_test, mode='val')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Pull one batch from the loader: the image before and after pre-processing.
sample_batch = next(iter(data_loader))
orig_image, image = sample_batch

# Display the un-processed image (singleton dimensions squeezed out first).
raw_view = np.squeeze(orig_image)
plt.imshow(raw_view)
plt.title('example image')
plt.show()

<a id='step2'></a>
## Step 2: Load Trained Models

In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
# Watch for any changes in model.py, and re-load it automatically.
% load_ext autoreload
% autoreload 2

import os
import torch
from model import EncoderCNN, DecoderRNN

# 2: Specify the saved models to load (file names inside ./models).
encoder_file = 'encoder-3.pkl'
decoder_file = 'decoder-3.pkl'

# 3: Select appropriate values for the Python variables below.
# NOTE(review): presumably these must match the values used when the
# checkpoints were trained, otherwise load_state_dict will report
# size mismatches -- confirm against the training notebook.
embed_size = 300
hidden_size = 128

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder, and set each to inference mode.
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

# Load the trained weights.
# map_location=device remaps the stored tensors onto the device selected
# earlier, so checkpoints saved on a GPU also load on a CPU-only machine.
# torch.load unpickles the file: only load checkpoints from a trusted source.
encoder_path = os.path.join('./models', encoder_file)
decoder_path = os.path.join('./models', decoder_file)
encoder.load_state_dict(torch.load(encoder_path, map_location=device))
decoder.load_state_dict(torch.load(decoder_path, map_location=device))

# Move models to GPU if CUDA is available.
encoder.to(device)
decoder.to(device)

<a id='step3'></a>
## Step 3: Sampler
After training, the model receives only an image as input, not the ground-truth caption. That is why the decoder provides a `sample` method, which generates a caption from the image features alone.

In [None]:
# Move the pre-processed image onto the same device as the models.
image = image.to(device)

# Inference only: disabling autograd avoids building a computation graph
# that would never be used and keeps memory usage down.
with torch.no_grad():
    # Obtain the embedded image features (a length-1 dimension is inserted at index 1).
    features = encoder(image).unsqueeze(1)

    # Pass the embedded image features through the model to get a predicted caption.
    output = decoder.sample(features)
print('example output:', output)

# Sanity-check the contract of decoder.sample: a plain list of int token ids
# that all exist in the vocabulary.
assert (type(output)==list), "Output needs to be a Python list" 
assert all(type(x)==int for x in output), "Output should be a list of integers." 
assert all(x in data_loader.dataset.vocab.idx2word for x in output), "Each entry in the output needs to correspond to an integer that indicates a token in the vocabulary."

<a id='step4'></a>
## Step 4: Clean up the Captions

In the code cell below, the `clean_sentence` function takes a list of integers (corresponding to the variable `output` in **Step 3**) as input and returns the corresponding predicted sentence (as a single Python string).

In [None]:
def clean_sentence(output):
    """Convert a list of vocabulary indices into a caption string.

    Each index is looked up in ``data_loader.dataset.vocab.idx2word``. The
    start word is dropped, and decoding stops at the first end word so that
    tokens the decoder may emit after the end marker do not leak into the
    caption.

    Args:
        output: iterable of integer token ids (e.g. the result of
            ``decoder.sample``).

    Returns:
        The caption as a single space-separated string ('' for empty input).

    Raises:
        KeyError: if an id is not present in ``idx2word``.
    """
    vocab = data_loader.dataset.vocab
    words = []
    for idx in output:
        word = vocab.idx2word[idx]
        if word == vocab.end_word:
            # Everything after the end marker is not part of the sentence.
            break
        if word != vocab.start_word:
            words.append(word)
    # Join once instead of concatenating and stripping on every iteration.
    return ' '.join(words).strip()

In [None]:
# Turn the sampled token ids into text and show the result.
sentence = clean_sentence(output)
print('example sentence:', sentence)

# The cleaned caption has to be a plain Python string.
assert type(sentence) is str, 'Sentence needs to be a Python string!'

<a id='step5'></a>
## Step 5: Generate Predictions

In the code cell below, we have written a function (`get_prediction`) that you can use to loop over images in the test dataset and print your model's predicted caption.

In [None]:
def get_prediction():
    """Show one image from ``data_loader`` and print the model's caption for it.

    Pulls a batch via ``next(iter(data_loader))``, plots the un-processed
    image, runs the pre-processed image through ``encoder`` and
    ``decoder.sample``, and prints the cleaned caption. Returns ``None``.

    NOTE(review): a new iterator is created on every call, so whether
    successive calls show different images depends on how the loader
    samples -- confirm against ``get_loader``.
    """
    orig_image, image = next(iter(data_loader))
    plt.imshow(np.squeeze(orig_image))
    plt.title('Sample Image')
    plt.show()
    image = image.to(device)
    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        features = encoder(image).unsqueeze(1)
        output = decoder.sample(features)
    sentence = clean_sentence(output)
    print(sentence)

Run the code cell below (multiple times, if you like!) to test how this function works.

In [None]:
# Plot one image pulled from data_loader and print its predicted caption.
get_prediction()

In [None]:
# Run again: pulls a batch via next(iter(data_loader)) and prints its caption.
get_prediction()

In [None]:
# Run again: pulls a batch via next(iter(data_loader)) and prints its caption.
get_prediction()

In [None]:
# Run again: pulls a batch via next(iter(data_loader)) and prints its caption.
get_prediction()

In [None]:
# Run again: pulls a batch via next(iter(data_loader)) and prints its caption.
get_prediction()

In [None]:
# TODO: compute BLEU on the model's predicted captions; for now this cell is
# only a sanity check of the metric on a toy example.

from torchtext.data.metrics import bleu_score

# bleu_score takes a corpus of candidates (one token list per sentence) and,
# for each candidate, a *list* of reference token lists. The references are
# therefore nested one level deeper than the candidates.
candidate = [['this', 'is', 'small', 'test']]
reference = [[['this', 'is', 'small', 'test']]]

# Cumulative n-gram BLEU: uniform weights over the orders 1..n. Setting
# max_n=n keeps higher-order n-grams (weight 0) from influencing the score.
for n in range(1, 5):
    score = bleu_score(candidate, reference, max_n=n, weights=[1.0 / n] * n)
    print('Cumulative %d-gram: %f' % (n, score))