In [1]:
from rouge_score import rouge_scorer
import json
import os

def evaluate_responses_with_rouge(data):
    """
    Evaluates responses using ROUGE-L against gold answers.
    Returns the average ROUGE-L F1 score for the file.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    total_rouge_l = 0
    total_entries = len(data)

    for entry in data:
        response = entry["response"].strip()
        gold_answers = [answer.strip() for answer in entry["gold_answer"]]
        
        # Compute ROUGE-L for each gold answer
        rouge_scores = [scorer.score(response, gold)['rougeL'].fmeasure for gold in gold_answers]
        max_rouge = max(rouge_scores)
        total_rouge_l += max_rouge

    # Average ROUGE-L score across all entries
    average_rouge_l = total_rouge_l / total_entries if total_entries > 0 else 0
    return average_rouge_l, total_entries

def process_folder_with_rouge(folder_path: str, output_file: str):
    """
    Processes all JSON files in a folder and evaluates ROUGE-L scores.
    Writes the results to an output file.
    """
    file_scores = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}")

            # Load the JSON data
            with open(file_path, "r") as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    print(f"Error reading JSON from {file_name}. Skipping.")
                    continue

            # Evaluate ROUGE scores
            average_rouge_l, total_entries = evaluate_responses_with_rouge(data)

            # Record the score for this file
            file_scores.append({
                "file_name": file_name,
                "average_rouge_l": average_rouge_l,
                "total_entries": total_entries
            })

    # Write the scores to the output file
    with open(output_file, "w") as f:
        json.dump(file_scores, f, indent=4)

    print(f"ROUGE scores saved to {output_file}")


In [2]:
# Parameters
folder_path = "../Responses"  # Replace with the path to your folder
output_file = "rouge_scores.json"  # Replace with your desired output file name

# Process the folder and calculate ROUGE-L scores
process_folder_with_rouge(folder_path, output_file)

Processing file: open_source_1_2_top_100_response.json
Processing file: tf_idf_bm25_open_1_1_top_100_combined_response.json
Processing file: tf_idf_bm25_open_1_1_top_100_combined_both_response.json
Processing file: tf-idf_1_2_top_100_modified_response.json
Processing file: vision_1_1_top_100_modified_response.json
Processing file: bm25_1_2_top_100_modified_response.json
Processing file: BOW_1_1_top_100_response.json
Processing file: BOW_1_2_top_100_response.json
Processing file: bm25_1_1_top_100_response.json
Processing file: BOW_1_0_top_100_modified_response.json
Processing file: ZeroShot_response.json
Processing file: open_source_1_0_top_100_response.json
Processing file: tf_idf_bm25_open_1_2_top_100_combined_response.json
Processing file: bm25_1_0_top_100_response.json
Processing file: tf_idf_bm25_open_1_1_top_100_combined_modified_response.json
Processing file: LlamaAgent_response.json
Processing file: bm25_1_2_top_100_response.json
Processing file: reranked_best_answers_1_1.json
P