In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV paths
# used by the cells below are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets bert_score rouge_score nltk

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas

In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
import pandas as pd

# Locations of the two description CSVs on the mounted Drive.
# Edit these to point at your own files.
csv1_path = '/content/drive/MyDrive/research/BDSLW60GPT/sortedoutput/human2.csv'
csv2_path = '/content/drive/MyDrive/research/BDSLW60GPT/sortedoutput/VideoToUnstructuredText_sign_descriptions_With_LR_hand_info_GPT41 - VideoToUnstructuredText_sign_descriptions_With_LR_hand_info_GPT41.csv'

# Read both files (first the one at csv1_path, then the one at csv2_path).
df1, df2 = (pd.read_csv(path) for path in (csv1_path, csv2_path))

# Join on (folder_name, file_name) so every resulting row pairs the entries
# that both files hold for the same item; columns present in both files are
# disambiguated with the suffixes _1 (from df1) and _2 (from df2).
merged = df1.merge(df2, on=['folder_name', 'file_name'], suffixes=('_1', '_2'))

print(f"Number of matched rows: {len(merged)}")

Number of matched rows: 60


In [None]:
from rouge_score import rouge_scorer
import bert_score
import evaluate
import nltk
nltk.download('wordnet')  # WordNet data is needed by METEOR

# Initialize scorers
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

# Per-row score containers; entry i belongs to row i of `merged`.
rouge1_list = []
rouge2_list = []
rougeL_list = []
bleu_list = []
meteor_list = []

for _, row in merged.iterrows():
    # description_1 (first CSV) is the reference, description_2 (second CSV)
    # is the hypothesis. str() keeps a missing (NaN) cell from crashing the
    # tokenizers; such a cell is then scored as the literal text "nan".
    ref = str(row['description_1'])
    hyp = str(row['description_2'])

    # ROUGE: score(target, prediction); keep the F-measure of each variant.
    scores = rouge.score(ref, hyp)
    rouge1_list.append(scores['rouge1'].fmeasure)
    rouge2_list.append(scores['rouge2'].fmeasure)
    rougeL_list.append(scores['rougeL'].fmeasure)

    # BLEU: the metric is given raw strings; every prediction is paired with
    # a *list* of references, hence the nested list.
    bleu_score = bleu.compute(predictions=[hyp], references=[[ref]])['bleu']
    bleu_list.append(bleu_score)

    # METEOR: raw strings, one reference per prediction.
    meteor_score = meteor.compute(predictions=[hyp], references=[ref])['meteor']
    meteor_list.append(meteor_score)

# BERTScore is computed in one batched call over all rows (candidates first,
# references second). The same str coercion as in the loop is applied.
refs = merged['description_1'].astype(str).tolist()
hyps = merged['description_2'].astype(str).tolist()

P, R, F1 = bert_score.score(hyps, refs, lang='en', rescale_with_baseline=True)
bert_f1_list = F1.tolist()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# numpy is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
import numpy as np

# Print the mean of every per-row metric with three decimal places.
for label, values in [
    ("ROUGE-1 F1", rouge1_list),
    ("ROUGE-2 F1", rouge2_list),
    ("ROUGE-L F1", rougeL_list),
    ("BLEU", bleu_list),
    ("METEOR", meteor_list),
    ("BERTScore F1", bert_f1_list),
]:
    print(f"Average {label}: {np.mean(values):.3f}")



Average ROUGE-1 F1: 0.527517336501077
Average ROUGE-2 F1: 0.2197827585442175
Average ROUGE-L F1: 0.33458774760437965
Average BLEU: 0.14945841044606786
Average METEOR: 0.41588469173608283
Average BERTScore F1: 0.3943711852033933


In [None]:
# numpy is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
import numpy as np

# Attach the per-row scores to `merged` as new columns, rounded to three
# decimals. Dict order fixes the column order in the exported file.
for column, values in {
    'rouge1_f1': rouge1_list,
    'rouge2_f1': rouge2_list,
    'rougeL_f1': rougeL_list,
    'bleu': bleu_list,
    'meteor': meteor_list,
    'bertscore_f1': bert_f1_list,
}.items():
    merged[column] = np.round(values, 3)

# Export to CSV (without the DataFrame index)
merged.to_csv('/content/drive/MyDrive/metrics_comparison_results_GPT41_NO_PIJ.csv', index=False)


# Batch processing: score every model-output CSV against the human reference

In [None]:
!pip install datasets bert_score rouge_score nltk evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
import pandas as pd
import os
from rouge_score import rouge_scorer
import evaluate
import bert_score
import nltk
import numpy as np
from google.colab import drive
drive.mount('/content/drive')
nltk.download('wordnet')  # WordNet data is needed by METEOR

# === Paths ===
reference_path = '/content/drive/MyDrive/research/BDSLW60GPT/sortedoutput/human2.csv'
csv_folder = '/content/drive/MyDrive/research/BDSLW60GPT/sortedoutput/'

# All CSVs in the folder except those whose name starts with 'human2' (the
# reference file). os.listdir returns entries in arbitrary order, so the names
# are sorted to give the result table a reproducible row order.
all_csvs = [os.path.join(csv_folder, f) for f in sorted(os.listdir(csv_folder))
            if f.endswith('.csv') and not f.startswith('human2')]

# Load reference
ref_df = pd.read_csv(reference_path)

# Init scorers
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
# Build the BERTScore model once and reuse it for every file; calling
# bert_score.score() inside the loop reloads the model on each iteration.
bert_scorer = bert_score.BERTScorer(lang='en', rescale_with_baseline=True)

# One summary dict per successfully processed file
results = []

for csv_path in all_csvs:
    # Best-effort: a file that cannot be read, merged or scored is reported
    # and skipped so the remaining files are still processed.
    try:
        compare_df = pd.read_csv(csv_path)
        # Pair rows of the reference and the candidate file that share
        # (folder_name, file_name); overlapping columns get _1 / _2 suffixes.
        merged = pd.merge(ref_df, compare_df, on=['folder_name', 'file_name'], suffixes=('_1', '_2'))

        if merged.empty:
            print(f"No matching rows with {os.path.basename(csv_path)}")
            continue

        # Containers for per-row scores
        rouge1_list, rouge2_list, rougeL_list = [], [], []
        bleu_list, meteor_list = [], []

        for _, row in merged.iterrows():
            # str() keeps a missing (NaN) cell from crashing the tokenizers;
            # such a cell is then scored as the literal text "nan".
            ref = str(row['description_1'])
            hyp = str(row['description_2'])

            # ROUGE: score(target, prediction); keep the F-measure of each variant
            scores = rouge.score(ref, hyp)
            rouge1_list.append(scores['rouge1'].fmeasure)
            rouge2_list.append(scores['rouge2'].fmeasure)
            rougeL_list.append(scores['rougeL'].fmeasure)

            # BLEU: raw strings; each prediction takes a list of references
            bleu_score = bleu.compute(predictions=[hyp], references=[[ref]])['bleu']
            bleu_list.append(bleu_score)

            # METEOR: raw strings, one reference per prediction
            meteor_score = meteor.compute(predictions=[hyp], references=[ref])['meteor']
            meteor_list.append(meteor_score)

        # BERTScore: one batched call per file (candidates first, references second)
        refs = merged['description_1'].astype(str).tolist()
        hyps = merged['description_2'].astype(str).tolist()
        _, _, F1 = bert_scorer.score(hyps, refs)
        bert_f1_list = F1.tolist()

        # Mean scores rounded to three decimals
        results.append({
            'File': os.path.basename(csv_path),
            'ROUGE-1': round(np.mean(rouge1_list), 3),
            'ROUGE-2': round(np.mean(rouge2_list), 3),
            'ROUGE-L': round(np.mean(rougeL_list), 3),
            'BLEU': round(np.mean(bleu_list), 3),
            'METEOR': round(np.mean(meteor_list), 3),
            'BERTScore F1': round(np.mean(bert_f1_list), 3)
        })
    except Exception as e:
        print(f"Error processing {csv_path}: {e}")

# Display the final table
result_df = pd.DataFrame(results)
print(result_df.to_string(index=False))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream ta

                                                                                                                                                                       File  ROUGE-1  ROUGE-2  ROUGE-L  BLEU  METEOR  BERTScore F1
                                                                                                                                                                  human.csv    0.969    0.949    0.956 0.944   0.959         0.963
        VideoToImageToStructuredText_sign_descriptions_parameter_injection_ChatGPT_4_1 - VideoToImageToStructuredText_sign_descriptions_parameter_injection_ChatGPT_4_1.csv    0.540    0.255    0.364 0.180   0.473         0.425
                                VideoToSstructuredText_sign_descriptions_parameter_injection - VideoToSstructuredText_sign_descriptions_parameter_injection_GEMINIflash.csv    0.502    0.250    0.365 0.168   0.361         0.455
                      VideoToStructuredText_sign_descriptions_parameter_injection_GEMINI25pr