# __Bilingual Evaluation Understudy (BLEU) score___

### __Deep Learning__

#### __Project: Image Captioning with Visual Attention__

In [1]:
import os
os.chdir(os.environ["PYTHONPATH"])

import torch
import torchvision

import nltk.translate.bleu_score as bleu

import scripts.data_loading as dl
import scripts.data_processing as dp
from scripts import model

%load_ext autoreload
%autoreload 2

In [2]:
root = dl.DATASET_PATHS[dl.DatasetType.VALIDATION].images
ann_json = dl.DATASET_PATHS[dl.DatasetType.VALIDATION].captions_json

vocabulary = dp.Vocabulary()

coco_val = torchvision.datasets.CocoCaptions(root, ann_json, dp.VGGNET_PREPROCESSING_PIPELINE)

loading annotations into memory...
Done (t=0.09s)
creating index...
index created!


In [3]:
img, captions = coco_val[123]

In [4]:
captions

['A crowd of people standing around each other.',
 'Someone is holding a large teddy bear in the crowd.',
 'A group of men and women gathered together outside. ',
 'A large crowd of people walking around while one owns a teddy bear.',
 'A large group of people on a city street.']

In [5]:
preprocessed_captions = []
for caption in captions:
    preprocessed_caption = dp.TextPipeline.normalize(caption).split()
    preprocessed_captions.append(preprocessed_caption)
    print(preprocessed_caption)

['a', 'crowd', 'of', 'people', 'standing', 'around', 'each', 'other']
['someone', 'is', 'holding', 'a', 'large', 'teddy', 'bear', 'in', 'the', 'crowd']
['a', 'group', 'of', 'men', 'and', 'women', 'gathered', 'together', 'outside']
['a', 'large', 'crowd', 'of', 'people', 'walking', 'around', 'while', 'one', 'owns', 'a', 'teddy', 'bear']
['a', 'large', 'group', 'of', 'people', 'on', 'a', 'city', 'street']


In [6]:
encoder = model.VGG19Encoder()
decoder = model.LSTMDecoder(
    num_embeddings=len(vocabulary),
    embedding_dim=8,
    encoder_dim=196,
    decoder_dim=16,
    attention_dim=4
)

In [7]:
feature_maps, feature_mean = encoder.forward(img.unsqueeze(0))

In [8]:
sequence, _ = decoder.greedy_decoding(
    feature_maps=feature_maps,
    feature_mean=feature_mean,
    start_token_index=vocabulary.word2idx("<SOS>"),
    end_token_index=vocabulary.word2idx("<EOS>"),
    max_length=10
)

In [9]:
print(sequence)

[7415, 4186, 6536, 1821, 9678, 3774, 9767, 5687, 2204, 9904]


In [10]:
sequence = dp.TextPipeline.decode_caption(vocabulary, sequence)
sequence

'toasters swimmers mango bin nuclear docking shoeless chaise mushroom suggesting'

In [11]:
sequence = ['people', 'people', 'people', 'people', 'people']

In [12]:
bleu.modified_precision(preprocessed_captions, sequence, n=1)

Fraction(1, 5)

In [13]:
bleu.brevity_penalty(8, len(sequence))

0.5488116360940264

In [14]:
bleu.sentence_bleu(references=preprocessed_captions, hypothesis=sequence, weights=(1.0, 0.0, 0.0, 0.0))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.10976232721880529

In [15]:
hyp = ["the"] * 7
hyp

['the', 'the', 'the', 'the', 'the', 'the', 'the']

In [16]:
ref = [["the", "cat", "is", "on", "the", "mat"], ["there", "is", "a", "cat", "on", "the", "mat"]]
ref

[['the', 'cat', 'is', 'on', 'the', 'mat'],
 ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]

In [17]:
bleu.sentence_bleu(references=ref, hypothesis=hyp, weights=(1, 0, 0, 0))

0.2857142857142857