# Caption Accuracy Metric with BLEU (Multi Edition)

In [5]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
import pandas as pd
from data_loader import BLEUDataLoader
import time

In [6]:
# Locations of the reference captions (single JSON file) and of the directory
# holding the hypothesis captions to be scored.
ref_path = "../captions/ref/gpt4v_llava_10k_test.json"
hypo_dir = "../captions/hypo"

# NOTE(review): the meaning of `ref_repeat=4` and `split=True` is defined by
# BLEUDataLoader, which is not visible here; judging by the printed sample the
# captions come back already split into word tokens -- confirm in data_loader.py.
data_loader = BLEUDataLoader(
    ref_path=ref_path,
    ref_repeat=4,
    hypo_dir=hypo_dir,
    split=True,
)


def _show_size_and_first(samples):
    """Print the number of entries in `samples` followed by its first entry."""
    print(len(samples), samples[0])


# Sanity check: the references, then one of the loaded hypothesis sets.
_show_size_and_first(data_loader.ref)
_show_size_and_first(data_loader.hypo_dict["anti-0.01"])


1023 [['The', 'image', 'is', 'a', 'black', 'and', 'white', 'illustration', 'that', 'depicts', 'a', 'man', 'and', 'a', 'woman', 'in', 'vintage', 'attire', 'The', 'man', 'is', 'wearing', 'a', 'dark', 'suit', 'with', 'a', 'vest', 'a', 'white', 'shirt', 'and', 'a', 'tie', 'He', 'also', 'has', 'a', 'top', 'hat', 'on', 'his', 'head', 'His', 'posture', 'is', 'slightly', 'bent', 'towards', 'the', 'woman', 'and', 'he', 'appears', 'to', 'be', 'in', 'motion', 'The', 'woman', 'is', 'wearing', 'a', 'dark', 'dress', 'with', 'a', 'layered', 'skirt', 'and', 'a', 'fringed', 'shawl', 'or', 'scarf', 'draped', 'around', 'her', 'shoulders', 'Her', 'hair', 'is', 'styled', 'up', 'and', 'her', 'expression', 'is', 'neutral', 'She', 'seems', 'to', 'be', 'pointing', 'to', 'something', 'out', 'of', 'the', 'frame', 'with', 'her', 'right', 'hand', 'Both', 'individuals', 'are', 'depicted', 'with', 'a', 'sense', 'of', 'movement', 'as', 'if', 'they', 'are', 'walking', 'or', 'dancing', 'The', 'illustration', 'style', '

In [7]:
# N-gram weightings passed to corpus_bleu: the first four put all weight on a
# single n-gram order (1-gram .. 4-gram), the last spreads it evenly over all four.
weights_list = [
    (1, 0, 0, 0),
    (0, 1, 0, 0),
    (0, 0, 1, 0),
    (0, 0, 0, 1),
    (0.25, 0.25, 0.25, 0.25),
]


def _corpus_bleu_row(ngram_weights):
    """Build one result row for a given weighting.

    The row starts with the weighting itself under "weights", followed by one
    entry per key of `data_loader.hypo_dict` holding the corpus-level BLEU of
    that hypothesis set against `data_loader.ref`. Smoothing and automatic
    re-weighting are both switched off.
    """
    row = {"weights": ngram_weights}
    row.update(
        {
            name: corpus_bleu(
                data_loader.ref,
                data_loader.hypo_dict[name],
                weights=ngram_weights,
                smoothing_function=None,
                auto_reweigh=False,
            )
            for name in data_loader.hypo_dict
        }
    )
    return row


# One row per weighting, one column per hypothesis set.
data = [_corpus_bleu_row(ngram_weights) for ngram_weights in weights_list]

df_bleu = pd.DataFrame(data)
df_bleu

Unnamed: 0,weights,untuned,0,hypo-0.001-13b,hypo-0.0005,hypo-0.0001,anti-0.001-13b,anti-0.01,untuned-13b,anti-0.0005,anti-0.0001,0-13b,hypo-0.001,anti-0.001-3ep,hypo-0.01
0,"(1, 0, 0, 0)",0.284334,0.358166,0.352819,0.351847,0.353417,0.357865,0.361096,0.284178,0.354126,0.357595,0.360394,0.355769,0.356978,0.354741
1,"(0, 1, 0, 0)",0.074488,0.122576,0.120953,0.119259,0.120555,0.122915,0.123796,0.07706,0.119456,0.121313,0.124217,0.119957,0.120624,0.120276
2,"(0, 0, 1, 0)",0.018539,0.046329,0.045379,0.043784,0.045335,0.04702,0.047127,0.02008,0.043638,0.045803,0.047759,0.044961,0.045249,0.044844
3,"(0, 0, 0, 1)",0.005824,0.021664,0.021142,0.020099,0.021109,0.022253,0.022363,0.00646,0.019991,0.021486,0.022546,0.020728,0.020946,0.02082
4,"(0.25, 0.25, 0.25, 0.25)",0.038887,0.081474,0.079991,0.077953,0.079909,0.082366,0.082848,0.041054,0.077941,0.080833,0.083324,0.079414,0.079927,0.079445


In [8]:
# Persist the corpus-level BLEU table (one row per weighting, one column per
# hypothesis set). The DataFrame index is written as the first, unnamed column.
csv_path = "../results/bleu_metric.csv"
df_bleu.to_csv(path_or_buf=csv_path)

In [9]:
# Per-sentence BLEU (uniform 1- to 4-gram weights) for every hypothesis set.
# For each set this writes ../results/<attr>.csv with one row per caption
# (score, hypothesis tokens, reference token lists) and prints the mean score.
#
# Smoothing is deliberately left off, matching the corpus-level cell. As a
# consequence NLTK warns when a hypothesis shares no n-grams of some order with
# its references, and that sentence's score collapses to (near) zero; such
# sentences still count toward the printed mean.
# NOTE(review): consider a SmoothingFunction if individual sentence scores are
# going to be compared -- this would change the reported numbers.
references = data_loader.ref

# Defined up front so the final display does not depend on a variable leaked
# from the loop (NameError / stale frame when there are no hypothesis sets).
df = None

for attr in data_loader.hypo_dict:
    hypotheses = data_loader.hypo_dict[attr]

    # Scores are only meaningful when hypothesis i is paired with reference i.
    # Fail loudly on a length mismatch instead of raising a bare IndexError
    # (too few hypotheses) or silently ignoring the surplus (too many).
    if len(hypotheses) != len(references):
        raise ValueError(
            f"{attr}: got {len(hypotheses)} hypotheses "
            f"for {len(references)} references"
        )

    data = []
    for ref, hypo in zip(references, hypotheses):
        bleu = sentence_bleu(
            ref,
            hypo,
            weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=None,
            auto_reweigh=False,
        )
        # Surface hypotheses of at most four tokens: they have at most one
        # 4-gram, so their BLEU-4 score is almost always degenerate.
        if len(hypo) <= 4:
            print(hypo)
        data.append({"bleu": bleu, "hypo": hypo, "ref": ref})

    df = pd.DataFrame(data)
    df.to_csv(f"../results/{attr}.csv")
    print(f"{attr}: {df['bleu'].mean()}")

# Shows the per-sentence table of the last hypothesis set processed.
df

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


untuned: 0.018658549908503262
0: 0.06808678431038513
hypo-0.001-13b: 0.0667147969689883
hypo-0.0005: 0.0637530539956609
hypo-0.0001: 0.06571584122036635
anti-0.001-13b: 0.06826203036817373
anti-0.01: 0.06766483253567397
untuned-13b: 0.020197908357929277
anti-0.0005: 0.06473226404356133
anti-0.0001: 0.0663643909895492
0-13b: 0.06978556659069314
hypo-0.001: 0.0654593111428434
anti-0.001-3ep: 0.065798691862908
hypo-0.01: 0.06468060129811007


Unnamed: 0,bleu,hypo,ref
0,1.162839e-01,"[The, image, is, a, black, and, white, illustr...","[[The, image, is, a, black, and, white, illust..."
1,9.569315e-02,"[The, image, displays, a, floor, plan, of, a, ...","[[The, image, displays, a, detailed, architect..."
2,9.956186e-02,"[The, image, displays, a, white, traditional, ...","[[The, image, displays, a, white, two, seater,..."
3,8.712053e-02,"[The, image, displays, a, single, shoe, agains...","[[The, image, shows, a, pair, of, brightly, co..."
4,1.378318e-01,"[The, image, is, a, photograph, of, a, granite...","[[This, image, depicts, a, headstone, with, in..."
...,...,...,...
1018,1.894427e-78,"[The, image, appears, to, be, a, photograph, c...","[[The, image, is, a, photograph, depicting, a,..."
1019,2.244350e-78,"[The, image, depicts, a, scene, of, a, car, ac...","[[The, image, is, a, color, photograph, depict..."
1020,1.316715e-01,"[The, image, is, a, color, photograph, featuri...","[[The, image, is, a, color, photograph, featur..."
1021,6.867714e-02,"[The, image, shows, a, close, up, of, a, pedal...","[[The, image, is, a, close, up, photograph, of..."
