# Evaluation of results
In this notebook we will evaluate the performance of our image captioning system using BLEU as our evaluation metric.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import pickle
import yaml
import pandas as pd
from utilities import get_dataset, print_examples
from pytorch_lightning.callbacks import ModelCheckpoint
from pl_model import CaptionGenerator, CocoDataModule
import os
from bleu import BLEU

## Automatic evaluation: BLEU 
This section of the notebook will generate the model's BLEU score.

In [2]:
# To speed up inference, the transform, the two datasets and the preprocessed
# test images were saved as .pkl files under ../data/; unpickling them is
# faster than rebuilding them.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load pickle files that were produced by this project.
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


transform = _unpickle('../data/transform.pkl')
dataset = _unpickle('../data/dataset.pkl')
test = _unpickle('../data/test.pkl')

# The processed testing images in tensor form (this file is about 7GB)
imgs = _unpickle('../data/images_inference.pkl')

In [3]:
# # Slower way of loading the necessary objects.

# transform = transforms.Compose(
#         [
#             transforms.Resize((224, 224)),
#             transforms.ToTensor(),
#             transforms.Normalize(mean = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225)),
#         ]
#     )
    
# dataset, _ = get_dataset(
#                         "../data/images",
#                         "../data/Captiones.tsv",
#                         transform)

# test, _ = get_dataset(
#                         "../data/images",
#                         "../data/testing_captions.tsv",
#                         transform)

# To save our objects

# with open('../data/transform.pkl', 'wb') as input:
#     pickle.dump(transform, input)
    
# with open('../data/dataset.pkl', 'wb') as input:
#     pickle.dump(dataset, input)
    
# with open('../data/test.pkl', 'wb') as input:
#     pickle.dump(test, input)

In [4]:
# Select the training run to evaluate; both the checkpoint and the saved
# hyper-parameters live under that run's lightning_logs directory.
version_number = '14'
run_dir = 'lightning_logs/version_' + version_number

# Restore the model from its checkpoint and call eval() on it
# (the trailing semicolon keeps the notebook from echoing the returned object).
model = CaptionGenerator.load_from_checkpoint(
    checkpoint_path=run_dir + '/checkpoints/epoch=48.ckpt'
)
model.eval();

# Read the run's hyper-parameters and display them
with open(run_dir + '/hparams.yaml') as hparams_file:
    parameters = yaml.load(hparams_file, Loader=yaml.FullLoader)
print(parameters)

{'embed_size': 200, 'hidden_size': 200, 'num_layers': 1, 'vocab_size': 10209}


In [5]:
# # Generating the file with all our model's predictions
# # SLOW!
# # Getting unique image file names
#file_names = np.unique(np.asarray(test.df['image'])) 

# # We also saved a version of the transformed images in order to make inference faster.

# imgs = []
# for name in file_names:
#     path = '../data/images/'+name
#     imgs.append(transform(Image.open(path).convert('RGB')).unsqueeze(0))

In [6]:
# # Generating captions for every image
# #SLOW!
# captions = [" ".join(model.caption_image(image, dataset.vocab)[1:-1]) for image in imgs]

# # Putting the file names and their corresponding captions together in a DataFrame to then save as .tsv
# df = pd.DataFrame(data = {'image':file_names, 'caption':captions})
# df.to_csv('../data/version_'+version_number+'_outputs.tsv', index = False, sep = '\t')

In [7]:
# Score the saved predictions of the selected run with BLEU and report the
# result to three decimal places.
outputs_path = '../data/version_' + version_number + '_outputs.tsv'
evaluation = BLEU(outputs_path)
bleu_score = evaluation.get_bleu_score()
print('BLEU-4 average (rounded) score: ' + '{:.3f}'.format(bleu_score))

BLEU-4 average (rounded) score: 0.692


## (Our) Human evaluation
The __print_examples__ function allows us to get a quick glance at the model's performance by generating captions for the same 5 images every time.

In [8]:
# Print the reference ("CORRECT") and generated ("OUTPUT") captions for the
# fixed set of example images, as a quick qualitative check of the model.
print_examples(model, dataset)

Example 1 CORRECT: A man talking on his phone in the public
Example 1 OUTPUT: <SOS> a dogs stop a tracks pans with near box . <EOS>
Example 2 CORRECT: A giraffe walking in the grass near a fence
Example 2 OUTPUT: <SOS> a clock street with a bread zebra day with vase little . <EOS>
Example 3 CORRECT: A group of women in a small kitchen.
Example 3 OUTPUT: <SOS> a that riding street with couple of a zebras . <EOS>
Example 4 CORRECT: A group of stuffed animals are lined up on a bed.
Example 4 OUTPUT: <SOS> a flowers glass forest , tree of a cutting . <EOS>
Example 5 CORRECT: A bowl filled with vegetables and noodles on a table.
Example 5 OUTPUT: <SOS> a standing of sits zebra poster grass poster . <EOS>
