# __MS COCO Server Evaluation__

### __Deep Learning__

#### __Project: Image Captioning with Visual Attention__

This notebook generates JSON result files that are ready to be uploaded to the MS COCO evaluation server.

In [None]:
import os
os.chdir(os.environ["PYTHONPATH"])

import json

import torch
from PIL import Image

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

import scripts.data_loading as dl
import scripts.data_processing as dp
from scripts import model

%load_ext autoreload
%autoreload 2

In [None]:
# All paths below are relative; they resolve against the working directory
# selected by the os.chdir call at the top of the notebook.

# Checkpoint file from which the decoder weights are restored.
MODEL_PATH = "./models/checkpoints/decoder_lr_3e-04_dropout_0.5_lambda_0.0.pth"

# Output directory and the caption JSON files written into it.
RESULT_DIRECTORY = "./results"
VALIDATION_DATASET_CAPTIONS_JSON = os.path.join(RESULT_DIRECTORY, "val2014_cap.json")
TEST_DATASET_CAPTIONS_JSON = os.path.join(RESULT_DIRECTORY, "test2014_cap.json")

# val2014 split: image directory and its captions annotation file.
COCO_VAL14_DATASET_IMAGES = "./data/validation/val2014"
COCO_VAL14_DATASET_ANNOTATIONS = "./data/validation/captions_val2014.json"

# test2014 split: image directory and its image-info file.
COCO_TEST14_DATASET_IMAGES = "./data/test/test2014"
COCO_TEST14_DATASET_ANNOTATIONS = "./data/test/image_info_test2014.json"

In [None]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

def load_decoder(state_path: str) -> model.LSTMDecoder:
    """Build the LSTM decoder and restore its weights from a checkpoint.

    Args:
        state_path: Path to a checkpoint file whose ``"decoder"`` entry holds
            the decoder's state dict.

    Returns:
        The decoder with restored weights, moved to ``device`` and switched
        to evaluation mode.
    """
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only machine.
    decoder_state = torch.load(state_path, map_location=device)

    # These hyperparameters have to match the ones the checkpoint was trained
    # with; otherwise load_state_dict rejects the state dict.
    decoder = model.LSTMDecoder(
        num_embeddings=10_004,
        embedding_dim=128,
        encoder_dim=196,
        decoder_dim=512,
        attention_dim=256,
    )
    decoder.load_state_dict(decoder_state["decoder"])
    decoder.to(device)
    # The decoder is only used for caption generation here, so disable
    # training-time behaviour such as dropout.
    decoder.eval()

    return decoder

In [None]:
# Instantiate the image encoder and restore the trained caption decoder,
# both placed on the selected device.
encoder = model.VGG19Encoder()
encoder.to(device)
decoder = load_decoder(MODEL_PATH)

# This notebook only runs inference, so put both networks in evaluation mode
# (turns off training-only layers such as dropout).
encoder.eval()
decoder.eval()

In [None]:
# Generate one caption per image of the selected COCO split and dump the
# results as a JSON list of {"image_id": ..., "caption": ...} records.
vocabulary = dp.Vocabulary()
beam_size = 3
num_sequences = 5

images_directory = COCO_VAL14_DATASET_IMAGES
ann_file = COCO_VAL14_DATASET_ANNOTATIONS
out_file = VALIDATION_DATASET_CAPTIONS_JSON

coco = COCO(ann_file)

# The special-token indices do not change between images; look them up once.
start_token_index = vocabulary.word2idx("<SOS>")
end_token_index = vocabulary.word2idx("<EOS>")

# create json file
evaluation_results = []
# Inference only: disabling autograd avoids building a computation graph for
# every image, which saves memory and time over the whole dataset.
with torch.no_grad():
    for image_id in coco.imgs:
        img_filename = coco.loadImgs(image_id)[0]["file_name"]
        img_path = os.path.join(images_directory, img_filename)

        # The context manager closes the underlying file as soon as the
        # RGB copy has been created.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        image = dp.VGGNET_PREPROCESSING_PIPELINE(image)
        image = image.to(device)

        # unsqueeze(0) adds the leading batch dimension for the single image.
        feature_maps, feature_mean = encoder(image.unsqueeze(0))
        captions = decoder.beam_search(
            feature_maps=feature_maps,
            feature_mean=feature_mean,
            start_token_index=start_token_index,
            end_token_index=end_token_index,
            beam_size=beam_size,
            num_sequences=num_sequences,
            max_length=100,
        )

        # Decode every candidate to text and keep its score alongside.
        decoded_captions = [
            (dp.TextPipeline.decode_caption(vocabulary, sequence), score)
            for sequence, score in captions
        ]

        # Keep the highest-scoring candidate; on ties max() returns the first
        # one, the same element a stable descending sort would put first.
        best_caption = max(decoded_captions, key=lambda pair: pair[1])[0]

        evaluation_results.append({
            "image_id": image_id,
            "caption": best_caption,
        })


os.makedirs(RESULT_DIRECTORY, exist_ok=True)
with open(out_file, 'w') as out_json:
    json.dump(evaluation_results, out_json)

In [None]:
# Load the generated validation captions as a COCO results object, using the
# annotations already held by `coco`.
coco_val_results = coco.loadRes(VALIDATION_DATASET_CAPTIONS_JSON)

# Pair the ground-truth annotations with the generated captions.
cocoEval = COCOEvalCap(coco, coco_val_results)
# Evaluate only the image ids that are present in the results file.
cocoEval.params['image_id'] = coco_val_results.getImgIds()

# Run the caption evaluation.
cocoEval.evaluate()