In [16]:
#Get data loader for test mode
from data_set_loader import get_data_set_loader
from torchvision import transforms

# Evaluation-time preprocessing: resize to 224x224, convert to a tensor, then
# normalize each channel with the mean/std pairs below.
# NOTE(review): these values match the ImageNet statistics commonly used with
# torchvision pretrained models -- confirm they match what training used.
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)
transform_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Build the loader in 'test' mode with the preprocessing pipeline above.
data_set_loader = get_data_set_loader(transform=transform_test, mode='test')

Vocabulary completely loaded..


In [11]:
# Imports for this cell, grouped stdlib -> third-party -> project-local.
# (`torch` was previously imported twice, with `device` defined in between.)
import os

import numpy as np
import torch
from PIL import Image

from pipeline_models import EncoderCNN, DecoderRNN

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#embedding size, number of hidden units and batch size
embed_size = 512
hidden_size = 512
batch_size= 512

#Size of the vocabulary created while training the CNN-RNN model
vocab_size = len(data_set_loader.dataset.vocabulary)

# Build both networks and switch them to evaluation mode; the trained weights
# are loaded later from the checkpoint files.
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, batch_size)
decoder.eval()

# This file contains the captions for the test images. We will evaluate our model performance using these captions
import json

# Use a context manager so the file handle is closed once parsing finishes,
# and read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open('captions_val2014.json', encoding='utf-8') as f:
    original_data = json.load(f)

# Each annotation carries one caption for one image. We maintain a dictionary
# with the image file name as key and the list of that image's captions as value.
original_data_anns = {}
ann_list = original_data["annotations"]
for ele in ann_list:
    # File names embed the image id left-padded with zeros to 12 characters,
    # e.g. image_id 42 -> "COCO_val2014_000000000042.jpg".
    img_key = "COCO_val2014_" + str(ele["image_id"]).zfill(12) + ".jpg"
    # setdefault creates the list on first sight of the key, then appends.
    original_data_anns.setdefault(img_key, []).append(ele["caption"])

In [8]:
#Get the words from vocabulary file using indices and then clean the sentence
def clean_sentence(output, idx2word=None):
    """Convert a sequence of vocabulary indices into a space-joined sentence.

    Args:
        output: iterable of vocabulary indices produced by the decoder.
        idx2word: optional index -> token mapping (dict or list). When omitted,
            the mapping is read from ``data_set_loader.dataset.vocabulary.idx2word``,
            which is what the function always used before this parameter existed.

    Returns:
        The tokens joined by single spaces with the first and last token
        dropped. The text is returned as-is (not capitalized), matching the
        value this function has always returned.
    """
    if idx2word is None:
        idx2word = data_set_loader.dataset.vocabulary.idx2word

    word_list = [idx2word[index] for index in output]

    # Drop the first and last tokens -- presumably the start/end markers
    # emitted by the decoder; TODO confirm against DecoderRNN.sample.
    return ' '.join(word_list[1:-1])

In [9]:
#Calculate BLEU score for the model.
def get_bleu_score(encoder_file,decoder_file):
    """Caption every image in ``val2014/`` and report the corpus unigram BLEU.

    Loads the given checkpoints into the module-level ``encoder`` and
    ``decoder``, generates one caption per image file found in ``val2014/``,
    and scores those captions against the reference captions stored in
    ``original_data_anns`` using NLTK's ``corpus_bleu`` with weights
    ``(1, 0, 0, 0)``.

    Args:
        encoder_file: path to the saved encoder state dict.
        decoder_file: path to the saved decoder state dict.

    Returns:
        The BLEU score as returned by ``corpus_bleu``. The score is also
        printed, as before.

    Raises:
        KeyError: if an image listed in ``original_data_anns`` has no
            generated caption (e.g. its file is missing from ``val2014/``).
    """
    # Import up front so a missing nltk install fails before the long
    # captioning loop rather than after it.
    from nltk.translate.bleu_score import corpus_bleu

    #Load encoder and decoder models.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    encoder.load_state_dict(torch.load(encoder_file,map_location=torch.device('cpu')))
    decoder.load_state_dict(torch.load(decoder_file,map_location=torch.device('cpu')))

    encoder.to(device)
    decoder.to(device)
    # Make sure both networks are in evaluation mode regardless of what ran
    # in the notebook before this call.
    encoder.eval()
    decoder.eval()

    dir_src = "val2014/"

    # The preprocessing pipeline does not depend on the image, so build it once
    # instead of once per file.
    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    predictions = {}
    #Get predictions for each image in the test set.
    # Inference only: disable autograd so no graph is built per image.
    with torch.no_grad():
        for filename in os.listdir(dir_src):
            # Context manager releases the underlying file handle; convert()
            # returns an independent in-memory image we can keep using.
            with Image.open(os.path.join(dir_src, filename)) as image_file:
                PIL_image = image_file.convert('RGB')

            # Add the batch dimension first, then move to the target device.
            # (Re-wrapping an on-device tensor with torch.Tensor(...) is
            # unnecessary and fragile when the device is CUDA.)
            image = transform_test(PIL_image).unsqueeze(0).to(device)

            #Get features from the encoder model.
            features = encoder(image).unsqueeze(1)

            #Get word tokens from decoder model.
            output = decoder.sample(features)

            #Get the text from word token indices
            predictions[filename] = clean_sentence(output)

    references = []
    candidates = []
    #Format input original captions and predicted captions to calculate BLEU score.
    for key, refs in original_data_anns.items():
        if key not in predictions:
            raise KeyError(
                "no generated caption for annotated image %r; "
                "is the file missing from %r?" % (key, dir_src)
            )
        references.append([ref.split() for ref in refs])
        candidates.append(predictions[key].split())

    #Calculating BLEU using unigrams
    score = corpus_bleu(references, candidates, weights=(1,0,0,0))
    print(score)
    return score

In [16]:
# Evaluate the saved encoder/decoder checkpoints on the validation images.
get_bleu_score(encoder_file="encoderCNN.pkl", decoder_file="decoderRNN.pkl")

0.5927661081245713
