In [1]:
import nltk
import numpy as np
import json
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score as meteor

import os
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /home/guest/thdaryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# !ls /scratch2/thdaryan/data/fine_tuned/refCOCO/test/labels

In [3]:
# Input locations for the evaluation. Both can be overridden through
# environment variables so the notebook is not tied to one machine; when the
# variables are unset the original hard-coded locations are used.
labels_folder_path = os.environ.get(
    'REFCOCO_LABELS_DIR',
    '/scratch2/thdaryan/data/fine_tuned/refCOCO/test/labels',
)  # folder containing the lab_<idx>.json label files
result_folder_path = os.environ.get('RSA_RESULT_DIR', 'result')  # folder containing the result_*_caption_*.json chunks

In [4]:
# Inspect one label file to see what the reference expressions look like.
sample_label_path = os.path.join(labels_folder_path, 'lab_0.json')
with open(sample_label_path) as f:
    sample_label = json.load(f)

ref = sample_label['ref_sents']
print(ref)

# Lowercase and tokenize every reference expression; `exp` is the reference
# list handed to sentence_bleu in the following sanity-check cell.
exp = []
for sentence in ref:
    exp.append(word_tokenize(sentence.lower()))
print(exp)

['guy petting elephant', 'foremost person', 'green shirt']
[['guy', 'petting', 'elephant'], ['foremost', 'person'], ['green', 'shirt']]


In [5]:
# Sanity check: BLEU with all weight on unigrams for a one-token hypothesis
# scored against the tokenized sample references in `exp`.
unigram_weights = (1, 0, 0, 0)
hypothesis_tokens = ['foremost']
print(sentence_bleu(exp, hypothesis_tokens, weights=unigram_weights))

0.36787944117144233


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [6]:
# Peek at the first result chunk: print the raw entry for key '0', then its
# first element with the first and last character removed.
first_chunk_path = os.path.join(result_folder_path, 'result_pragmatic_caption_0_1000.json')
with open(first_chunk_path) as f:
    result_0_1000 = json.load(f)

first_entry = result_0_1000['0']
print(first_entry)
wrapped_caption = first_entry[0]
print(wrapped_caption[1:-1])

['^man is sitting on the ground$', '-5.629921482260508']
man is sitting on the ground


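## Calculate BLEU Score

Unigram BLEU (`weights=(1,0,0,0)`) of each generated pragmatic expression against all of its tokenized reference expressions; entries marked `ERROR` are skipped.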

In [7]:
# Unigram BLEU of every generated pragmatic expression against all of its
# reference expressions. One score per scored sample is appended to
# `bleu_scores`.
bleu_scores = []
for file in tqdm(range(0, 5000, 1000)):
    print(file, file+1000)
    with open(os.path.join(result_folder_path, f'result_pragmatic_caption_{file}_{file+1000}.json')) as f:
        list_result = json.load(f)
    for idx, result in list_result.items():
        # Entries flagged with 'ERROR' in place of a caption are not scored.
        if result[0] == 'ERROR':
            continue

        with open(os.path.join(labels_folder_path, f'lab_{idx}.json')) as f:
            sample_label = json.load(f)

        ref = [word_tokenize(r.lower()) for r in sample_label['ref_sents']]

        # Drop the first and last character of the stored caption (the sample
        # entry printed earlier is wrapped as '^...$').
        # NOTE(review): assumes exactly one marker character on each side of
        # every caption - confirm against the generator output.
        generated_exp = result[0][1:-1]

        # Lowercase the hypothesis as well: the references are lowercased
        # above and BLEU compares tokens case-sensitively, so a differently
        # cased hypothesis token would never match.
        hypothesis = word_tokenize(generated_exp.lower())
        score = sentence_bleu(ref, hypothesis, weights=(1,0,0,0))
        bleu_scores.append(score)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0 1000


 20%|██        | 1/5 [00:00<00:01,  2.71it/s]

1000 2000


 40%|████      | 2/5 [00:00<00:01,  2.74it/s]

2000 3000


 60%|██████    | 3/5 [00:01<00:00,  2.76it/s]

3000 4000


 80%|████████  | 4/5 [00:01<00:00,  2.79it/s]

4000 5000


100%|██████████| 5/5 [00:01<00:00,  2.80it/s]


In [8]:
# Number of BLEU scores collected, i.e. samples that were not skipped as 'ERROR'.
len(bleu_scores)

4984

In [9]:
# Unweighted mean of the per-sample unigram BLEU scores.
np.mean(bleu_scores)

0.168943800017137

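## Calculate ROUGE Score

ROUGE-1 and ROUGE-L recall (with stemming) of each generated pragmatic expression, averaged over its reference expressions.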

In [10]:
from rouge_score import rouge_scorer
# Scorer producing ROUGE-1 and ROUGE-L, with the stemmer switched on.
rouge_types = ['rouge1', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

In [11]:
# ROUGE-1 and ROUGE-L recall of every generated pragmatic expression.
# For each scored sample the recall is computed against every reference
# expression and the mean over references is appended to the matching list.
rouge1_scores = []
rougeL_scores = []
for file in tqdm(range(0, 5000, 1000)):
    print(file, file+1000)
    with open(os.path.join(result_folder_path, f'result_pragmatic_caption_{file}_{file+1000}.json')) as f:
        list_result = json.load(f)
    for idx, result in list_result.items():
        # Entries flagged with 'ERROR' in place of a caption are not scored.
        if result[0] == 'ERROR':
            continue

        with open(os.path.join(labels_folder_path, f'lab_{idx}.json')) as f:
            sample_label = json.load(f)

        ref = sample_label['ref_sents']
        # Drop the first and last character of the stored caption.
        generated_exp = result[0][1:-1]

        # Score each reference exactly once; a single scorer result already
        # carries both 'rouge1' and 'rougeL', so scoring twice is redundant.
        per_reference = [scorer.score(target, generated_exp) for target in ref]
        rouge1_scores.append(np.average([s['rouge1'].recall for s in per_reference]))
        rougeL_scores.append(np.average([s['rougeL'].recall for s in per_reference]))

  0%|          | 0/5 [00:00<?, ?it/s]

0 1000


 20%|██        | 1/5 [00:00<00:02,  1.36it/s]

1000 2000


 40%|████      | 2/5 [00:01<00:02,  1.37it/s]

2000 3000


 60%|██████    | 3/5 [00:02<00:01,  1.37it/s]

3000 4000


 80%|████████  | 4/5 [00:02<00:00,  1.38it/s]

4000 5000


100%|██████████| 5/5 [00:03<00:00,  1.39it/s]


In [12]:
# Mean and standard deviation of the per-sample ROUGE-1 recall.
rouge1_mean = np.mean(rouge1_scores)
rouge1_std = np.std(rouge1_scores)
print(rouge1_mean, rouge1_std)

0.14807628272710655 0.17988326157351756


In [13]:
# Mean and standard deviation of the per-sample ROUGE-L recall.
rougeL_mean = np.mean(rougeL_scores)
rougeL_std = np.std(rougeL_scores)
print(rougeL_mean, rougeL_std)

0.14305185991986696 0.17403894462041697


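## Calculate METEOR Score

METEOR of each generated pragmatic expression against its lowercased reference expressions; entries marked `ERROR` are skipped.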

In [14]:
# METEOR of every generated pragmatic expression against its lowercased
# reference expressions; one score per scored sample lands in `meteor_scores`.
# NOTE(review): references and hypothesis are handed to NLTK as raw strings.
# Recent NLTK releases appear to require pre-tokenized input for meteor_score -
# confirm against the installed version before re-running.
meteor_scores = []
for chunk_start in tqdm(range(0, 5000, 1000)):
    chunk_end = chunk_start + 1000
    print(chunk_start, chunk_end)

    chunk_name = f'result_pragmatic_caption_{chunk_start}_{chunk_end}.json'
    with open(os.path.join(result_folder_path, chunk_name)) as chunk_file:
        chunk_results = json.load(chunk_file)

    for sample_id, entry in chunk_results.items():
        raw_caption = entry[0]
        # Entries flagged with 'ERROR' in place of a caption are not scored.
        if raw_caption == 'ERROR':
            continue

        with open(os.path.join(labels_folder_path, f'lab_{sample_id}.json')) as label_file:
            label = json.load(label_file)

        references = list(map(str.lower, label['ref_sents']))
        # Drop the first and last character of the stored caption.
        hypothesis = raw_caption[1:-1]
        meteor_scores.append(meteor(references, hypothesis))

  0%|          | 0/5 [00:00<?, ?it/s]

0 1000


 20%|██        | 1/5 [00:01<00:07,  1.79s/it]

1000 2000


 40%|████      | 2/5 [00:02<00:04,  1.45s/it]

2000 3000


 60%|██████    | 3/5 [00:03<00:02,  1.20s/it]

3000 4000


 80%|████████  | 4/5 [00:03<00:01,  1.02s/it]

4000 5000


100%|██████████| 5/5 [00:04<00:00,  1.17it/s]


In [15]:
# Mean and standard deviation of the per-sample METEOR scores.
meteor_mean = np.mean(meteor_scores)
meteor_std = np.std(meteor_scores)
print(meteor_mean, meteor_std)

0.13534073040601782 0.15401653259086828


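## Dump expressions to JSON files

Collect the pragmatic and the literal expressions from all result chunks into one dict each (entries marked `ERROR` become empty strings) and write each dict to its own JSON file.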

In [18]:
def _load_generated_expressions(caption_kind, start, stop):
    """Read one result chunk and return its expressions keyed by index.

    Args:
        caption_kind: 'pragmatic' or 'literal'; selects the file
            ``result_<caption_kind>_caption_<start>_<stop>.json`` inside
            ``result_folder_path``.
        start: first number in the chunk's file name.
        stop: second number in the chunk's file name.

    Returns:
        dict mapping every key of the chunk to the first element of its
        entry with the first and last character removed, or to '' when
        that element is 'ERROR'.
    """
    chunk_path = os.path.join(result_folder_path, f'result_{caption_kind}_caption_{start}_{stop}.json')
    with open(chunk_path) as f:
        list_result = json.load(f)
    return {
        idx: '' if result[0] == 'ERROR' else result[0][1:-1]
        for idx, result in list_result.items()
    }


# Merge all chunks into one dict per caption kind. The same helper serves both
# kinds, so the pragmatic and literal handling cannot drift apart.
recurrentRSA_generated_pragmatic_exps = {}
recurrentRSA_generated_literal_exps = {}
for file in tqdm(range(0, 5000, 1000)):
    print(file, file+1000)
    recurrentRSA_generated_pragmatic_exps.update(
        _load_generated_expressions('pragmatic', file, file+1000)
    )
    recurrentRSA_generated_literal_exps.update(
        _load_generated_expressions('literal', file, file+1000)
    )

100%|██████████| 5/5 [00:00<00:00, 301.73it/s]

0 1000
1000 2000
2000 3000
3000 4000
4000 5000





In [19]:
# Display the collected pragmatic expressions (index string -> expression).
# NOTE(review): this shows the whole dict as cell output; consider previewing
# only a few items to keep the notebook small.
recurrentRSA_generated_pragmatic_exps

{'0': 'man is sitting on the ground',
 '1': 'zebra in a cat',
 '2': 'van is on the street',
 '3': 'laptop on the desk',
 '4': 'white cow in field',
 '5': 'fence behind the fence',
 '6': 'man wearing black pants',
 '7': 'food on the table',
 '8': 'young elephant walking on grass',
 '9': 'man holding a cell phone',
 '10': 'young elephant with tusks',
 '11': 'man holding a banana',
 '12': 'kid is wearing blue jeans',
 '13': 'x shiny stripes on the shirt',
 '14': 'water behind player holding baseball bat',
 '15': 'bx pants on the person',
 '16': 'chairs around the table',
 '17': 'zipper on bike',
 '18': 'child with blonde hair',
 '19': 'quarted cow in front of a horse',
 '20': 'elephant walking on the ground',
 '21': '^asian cake on the table',
 '22': 'woman is wearing a black jacket',
 '23': 'zebra shirt on a man',
 '24': 'man wearing a blue shirt',
 '25': 'kids standing in grass',
 '26': 'information blue shirt',
 '27': 'x shiny black shirt',
 '28': 'one elephant is standing in the grass

In [20]:
# Display the collected literal expressions (index string -> expression).
# NOTE(review): this shows the whole dict as cell output; consider previewing
# only a few items to keep the notebook small.
recurrentRSA_generated_literal_exps

{'0': 'man is sitting on the ground',
 '1': 'the cat is black',
 '2': 'the car is black',
 '3': 'a laptop on a table',
 '4': 'a brown cow standing in grass',
 '5': 'a fence on the ground',
 '6': 'a pair of black pants',
 '7': 'a black bowl on the table',
 '8': 'a large grey elephant',
 '9': 'the person is holding a cell phone',
 '10': 'the elephant is gray',
 '11': 'man holding a banana',
 '12': 'a blue jeans on a man',
 '13': 'the shirt is black',
 '14': 'the pants are white',
 '15': 'baseball player wearing white pants',
 '16': 'a blue chair in front of the couch',
 '17': 'the bikes are black',
 '18': 'boy wearing a blue shirt',
 '19': 'the cow is brown',
 '20': 'the elephant is gray',
 '21': 'the donut has a spot',
 '22': 'woman is wearing a black jacket',
 '23': 'a black and white shirt',
 '24': 'a man wearing a blue shirt',
 '25': 'two giraffes standing in a field',
 '26': 'the baseball bats are white',
 '27': 'a black shirt on a man',
 '28': 'an elephant walking in the grass',
 '

In [21]:
# Write the pragmatic expressions to disk as a single JSON object.
with open('recurrentRSA_generated_pragmatic_exps.json', 'w') as out_file:
    out_file.write(json.dumps(recurrentRSA_generated_pragmatic_exps))

In [23]:
# Write the literal expressions to disk as a single JSON object.
with open('recurrentRSA_generated_literal_exps.json', 'w') as out_file:
    out_file.write(json.dumps(recurrentRSA_generated_literal_exps))