In [None]:
# Install the third-party metric packages that the import cell below relies on
# (bert_score, rouge_score, pycocoevalcap).
# NOTE(review): nltk and pandas are imported below but not installed here --
# presumably they are preinstalled in the runtime; confirm.
! pip install bert-score rouge-score pycocoevalcap

In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score as nltk_meteor_score
from nltk.metrics.distance import edit_distance

from bert_score import score as bert_scorer
from rouge_score import rouge_scorer as rouge_calculator 
from pycocoevalcap.cider.cider import Cider 

In [6]:
# 1. Exact Match Accuracy
def exact_match_accuracy(gt_list, pred_list):
    """Return the fraction of predictions that exactly match their ground truth.

    Each value is converted with ``str``, stripped of surrounding whitespace
    and lower-cased before the comparison, so ``" Yes"`` matches ``"yes"``.

    Args:
        gt_list: Iterable of ground-truth answers.
        pred_list: Iterable of predicted answers, aligned one-to-one with
            ``gt_list``.

    Returns:
        float: Number of matching pairs divided by the number of pairs, or
        ``0.0`` when there are no pairs.

    Raises:
        ValueError: If the two inputs do not have the same length. Pairing
            them with ``zip`` would silently drop the surplus items and the
            resulting accuracy would be computed over the wrong denominator.
    """
    # Materialise so generators and other one-shot iterables are supported.
    gts = list(gt_list)
    preds = list(pred_list)

    if len(gts) != len(preds):
        raise ValueError(
            f"gt_list and pred_list must have the same length "
            f"(got {len(gts)} and {len(preds)})"
        )
    if not gts:
        return 0.0

    def _normalize(value):
        return str(value).strip().lower()

    matches = sum(1 for gt, pred in zip(gts, preds) if _normalize(gt) == _normalize(pred))
    return matches / len(gts)

# 2. BERTScore (using bert-score library)
def calculate_bert_score(gt_list, pred_list):
    """Return the mean BERTScore F1 between predictions and ground truths.

    Args:
        gt_list: Iterable of ground-truth answers (used as references).
        pred_list: Iterable of predicted answers (used as candidates),
            aligned one-to-one with ``gt_list``.

    Returns:
        float: Mean F1 over all pairs as computed by ``bert_score.score``
        with the ``bert-base-uncased`` model, or ``0.0`` when both inputs
        are empty.
    """
    # Ensure inputs are lists of strings
    gt_list_str = [str(s) for s in gt_list]
    pred_list_str = [str(s) for s in pred_list]

    # Match the sibling metrics, which all report 0 for an empty evaluation
    # set, instead of handing an empty batch to the scorer.
    if not gt_list_str and not pred_list_str:
        return 0.0

    # The scorer takes candidates first, references second. Only F1 is
    # reported, so precision and recall are discarded.
    _, _, f1 = bert_scorer(
        pred_list_str,
        gt_list_str,
        lang="en",
        verbose=False,
        model_type="bert-base-uncased",
    )
    return f1.mean().item()

# 3. BLEU Score (specifically BLEU-1)
def calculate_bleu_scores(gt_list, pred_list):
    """Average sentence-level BLEU-1 over aligned (ground truth, prediction) pairs.

    Both texts are converted with ``str``, lower-cased and tokenized with
    ``word_tokenize``. Each pair is scored with ``sentence_bleu`` using
    weights ``(1, 0, 0, 0)`` and ``SmoothingFunction().method1``.
    A prediction that tokenizes to nothing scores 0.0.

    Args:
        gt_list: Iterable of ground-truth answers (one reference per pair).
        pred_list: Iterable of predicted answers.

    Returns:
        The mean of the per-pair scores, or 0 when there are no pairs.
    """
    smoother = SmoothingFunction().method1
    unigram_only = (1, 0, 0, 0)

    def _pair_bleu1(reference, hypothesis):
        # sentence_bleu expects a list of tokenized references.
        references = [word_tokenize(str(reference).lower())]
        hyp_tokens = word_tokenize(str(hypothesis).lower())
        if len(hyp_tokens) == 0:
            return 0.0
        return sentence_bleu(
            references,
            hyp_tokens,
            weights=unigram_only,
            smoothing_function=smoother,
        )

    per_pair = [_pair_bleu1(gt, pred) for gt, pred in zip(gt_list, pred_list)]
    if not per_pair:
        return 0
    return sum(per_pair) / len(per_pair)

# 4. ROUGE-L Score (using rouge-score library)
def calculate_rouge_l_score(gt_list, pred_list):
    """Average ROUGE-L F-measure over aligned (ground truth, prediction) pairs.

    Each pair is scored by a ``RougeScorer`` configured for ``'rougeL'``
    with stemming enabled; the ground truth is passed as the first argument
    and the prediction as the second.

    Args:
        gt_list: Iterable of ground-truth answers.
        pred_list: Iterable of predicted answers.

    Returns:
        The mean ``fmeasure`` across pairs, or 0 when there are no pairs.
    """
    rouge = rouge_calculator.RougeScorer(['rougeL'], use_stemmer=True)
    f_values = [
        rouge.score(str(reference), str(candidate))['rougeL'].fmeasure
        for reference, candidate in zip(gt_list, pred_list)
    ]
    if not f_values:
        return 0
    return sum(f_values) / len(f_values)

# 5. METEOR Score (using NLTK)
def calculate_meteor_score(gt_list, pred_list):
    """Average METEOR score over aligned (ground truth, prediction) pairs.

    Both texts are converted with ``str``, lower-cased and tokenized with
    ``word_tokenize``; the ground truth is supplied as the single reference.
    A prediction that tokenizes to nothing scores 0.0.

    Args:
        gt_list: Iterable of ground-truth answers.
        pred_list: Iterable of predicted answers.

    Returns:
        The mean of the per-pair scores, or 0 when there are no pairs.
    """
    def _pair_meteor(reference, hypothesis):
        ref_tokens = word_tokenize(str(reference).lower())
        hyp_tokens = word_tokenize(str(hypothesis).lower())
        if len(hyp_tokens) == 0:
            return 0.0
        return nltk_meteor_score([ref_tokens], hyp_tokens)

    per_pair = [_pair_meteor(gt, pred) for gt, pred in zip(gt_list, pred_list)]
    if not per_pair:
        return 0
    return sum(per_pair) / len(per_pair)


# 6. CIDEr Score (using pycocoevalcap)
def calculate_cider_score(gt_list, pred_list):
    """Return the corpus-level CIDEr score from pycocoevalcap's ``Cider``.

    Every other metric in this notebook compares lower-cased text, so both
    sides are lower-cased here as well; otherwise "Yes" and "yes" would be
    counted as different n-grams by this metric only.

    Args:
        gt_list: Iterable of ground-truth answers (one reference per item).
        pred_list: Iterable of predicted answers, aligned one-to-one with
            ``gt_list``.

    Returns:
        float: The average score returned by ``Cider.compute_score``, or
        ``0.0`` when either input is empty.
    """
    def _normalize(text):
        return str(text).lower()

    # The scorer takes {id: [sentences]} mappings for references and results.
    gts = {i: [_normalize(gt)] for i, gt in enumerate(gt_list)}
    res = {i: [_normalize(pred)] for i, pred in enumerate(pred_list)}

    if not gts or not res:  # Handle empty lists
        return 0.0

    cider_scorer = Cider()
    avg_cider_score, _ = cider_scorer.compute_score(gts, res)
    # Convert to a plain Python float so the return type matches the early exit.
    return float(avg_cider_score)

# 7. Normalized Levenshtein Similarity (1 - (edit_distance / max_len))
def normalized_levenshtein_similarity(gt_list, pred_list):
    """Average normalized Levenshtein similarity over aligned pairs.

    Each text is converted with ``str``, lower-cased and stripped. The
    similarity of a pair is ``1 - edit_distance / max(len(gt), len(pred))``.

    Args:
        gt_list: Iterable of ground-truth answers.
        pred_list: Iterable of predicted answers.

    Returns:
        float: Mean similarity across pairs, or ``0.0`` when there are no
        pairs. Identical strings (including two empty strings) score 1.0;
        a pair in which exactly one side is empty scores 0.0.
    """
    similarities = []
    for gt, pred in zip(gt_list, pred_list):
        s_gt = str(gt).lower().strip()
        s_pred = str(pred).lower().strip()

        if s_gt == s_pred:
            # Distance is 0 by definition; this also covers the both-empty
            # case and avoids a needless edit-distance computation.
            similarities.append(1.0)
        elif not s_gt or not s_pred:
            # Exactly one side is empty: nothing in common.
            similarities.append(0.0)
        else:
            # Both strings are non-empty here, so the divisor is >= 1.
            longest = max(len(s_gt), len(s_pred))
            similarities.append(1.0 - (edit_distance(s_gt, s_pred) / longest))

    return sum(similarities) / len(similarities) if similarities else 0.0

In [17]:
csv_file = "/kaggle/input/pred-vr/fintuned_pred.csv"

# Read every cell as literal text. With pandas' defaults, answers such as
# "None", "NA" or "null" are parsed as missing values (and later stringified
# as "nan"), and numeric-looking answers may be re-typed (e.g. "2" -> 2.0),
# which would corrupt string-based metrics. Empty cells are kept as "".
df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

gt_answers = df['ground_truth'].tolist()
pred_answers = df['pred'].tolist()

# Values are already strings after the read above; the conversion is kept so
# the metric functions always receive plain ``str`` objects.
gt_answers_str = [str(ans) for ans in gt_answers]
pred_answers_str = [str(ans) for ans in pred_answers]

print(f"Ground Truths found: {len(gt_answers_str)}\n")
print(f"Predictions found: {len(pred_answers_str)}\n")


print("--- VQA Metrics ---")

# Each metric is computed over the same aligned answer lists and reported
# to four decimal places.
em_accuracy = exact_match_accuracy(gt_answers_str, pred_answers_str)
print(f"1. Exact Match Accuracy: {em_accuracy:.4f}")

avg_bert_f1 = calculate_bert_score(gt_answers_str, pred_answers_str)
print(f"2. Average BERTScore F1: {avg_bert_f1:.4f}")

avg_bleu1 = calculate_bleu_scores(gt_answers_str, pred_answers_str)
print(f"3. Average BLEU-1 Score: {avg_bleu1:.4f}")

avg_rouge_l = calculate_rouge_l_score(gt_answers_str, pred_answers_str)
print(f"4. Average ROUGE-L (F1): {avg_rouge_l:.4f}")

avg_meteor = calculate_meteor_score(gt_answers_str, pred_answers_str)
print(f"5. Average METEOR Score: {avg_meteor:.4f}")

avg_cider = calculate_cider_score(gt_answers_str, pred_answers_str)
print(f"6. Average CIDEr-D Score: {avg_cider:.4f}")

norm_lev_sim = normalized_levenshtein_similarity(gt_answers_str, pred_answers_str)
print(f"7. Normalized Levenshtein Similarity: {norm_lev_sim:.4f}")

Ground Truths found: 25740

Predictions found: 25740

--- VQA Metrics ---
1. Exact Match Accuracy: 0.3180
2. Average BERTScore F1: 0.7410
3. Average BLEU-1 Score: 0.3180
4. Average ROUGE-L (F1): 0.3270
5. Average METEOR Score: 0.1673
6. Average CIDEr-D Score: 0.7951
7. Normalized Levenshtein Similarity: 0.4352
