In [1]:
from bert_score import score
import json
import os


def evaluate_responses_with_bertscore(data, model_type='bert-base-uncased'):
    """
    Evaluates responses using BERTScore against gold answers.
    Returns the average BERTScore precision, recall, and F1 score for the file.
    """
    responses = []
    gold_answers = []

    for entry in data:
        responses.append(entry["response"].strip())
        gold_answers.append(" ||| ".join(entry["gold_answer"]).strip())  # Combine all gold answers

    # Compute BERTScore
    P, R, F1 = score(responses, gold_answers, model_type=model_type, lang="en", verbose=True)

    # Average scores
    average_precision = P.mean().item()
    average_recall = R.mean().item()
    average_f1 = F1.mean().item()
    
    return average_precision, average_recall, average_f1, len(data)

def process_folder_with_bertscore(folder_path: str, output_file: str, model_type='bert-base-uncased'):
    """
    Processes all JSON files in a folder and evaluates BERTScore.
    Writes the results to an output file.
    """
    file_scores = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}")

            # Load the JSON data
            with open(file_path, "r") as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    print(f"Error reading JSON from {file_name}. Skipping.")
                    continue

            # Evaluate BERTScore
            precision, recall, f1, total_entries = evaluate_responses_with_bertscore(data, model_type=model_type)

            # Record the score for this file
            file_scores.append({
                "file_name": file_name,
                "average_precision": precision,
                "average_recall": recall,
                "average_f1": f1,
                "total_entries": total_entries
            })

    # Write the scores to the output file
    with open(output_file, "w") as f:
        json.dump(file_scores, f, indent=4)

    print(f"BERTScore results saved to {output_file}")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parameters
folder_path = "../Responses" 
output_file = "bert_scores.json" 

# Process the folder and calculate BERTScore
process_folder_with_bertscore(folder_path, output_file, model_type="bert-base-uncased")

Processing file: open_source_1_2_top_100_response.json




calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:12<00:00,  3.21s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.85it/s]


done in 12.92 seconds, 7.74 sentences/sec
Processing file: tf_idf_bm25_open_1_1_top_100_combined_response.json




calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:12<00:00,  3.08s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 30.93it/s]


done in 12.40 seconds, 8.07 sentences/sec
Processing file: tf_idf_bm25_open_1_1_top_100_combined_both_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:17<00:00,  4.31s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.32it/s]


done in 17.35 seconds, 5.76 sentences/sec
Processing file: tf-idf_1_2_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.68s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 29.72it/s]


done in 14.81 seconds, 6.75 sentences/sec
Processing file: vision_1_1_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:13<00:00,  3.28s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.76it/s]


done in 13.22 seconds, 7.57 sentences/sec
Processing file: bm25_1_2_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:13<00:00,  3.29s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 24.92it/s]


done in 13.26 seconds, 7.54 sentences/sec
Processing file: BOW_1_1_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:13<00:00,  4.46s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 26.07it/s]


done in 13.48 seconds, 7.42 sentences/sec
Processing file: BOW_1_2_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:13<00:00,  4.48s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 26.86it/s]


done in 13.54 seconds, 7.39 sentences/sec
Processing file: bm25_1_1_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.61s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 30.66it/s]


done in 14.53 seconds, 6.88 sentences/sec
Processing file: BOW_1_0_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:13<00:00,  4.54s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.99it/s]


done in 13.71 seconds, 7.30 sentences/sec
Processing file: ZeroShot_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:18<00:00,  4.59s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  9.07it/s]


done in 18.58 seconds, 5.38 sentences/sec
Processing file: open_source_1_0_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.57s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 31.21it/s]


done in 14.38 seconds, 6.95 sentences/sec
Processing file: tf_idf_bm25_open_1_2_top_100_combined_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:13<00:00,  3.45s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 31.38it/s]


done in 13.88 seconds, 7.20 sentences/sec
Processing file: bm25_1_0_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:13<00:00,  3.44s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 30.11it/s]


done in 13.86 seconds, 7.22 sentences/sec
Processing file: tf_idf_bm25_open_1_1_top_100_combined_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:13<00:00,  3.48s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 25.96it/s]


done in 14.02 seconds, 7.13 sentences/sec
Processing file: LlamaAgent_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.59s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 29.36it/s]


done in 14.46 seconds, 6.92 sentences/sec
Processing file: bm25_1_2_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.73s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.67it/s]


done in 15.00 seconds, 6.67 sentences/sec
Processing file: reranked_best_answers_1_1.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:16<00:00,  4.18s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  9.41it/s]


done in 16.97 seconds, 5.89 sentences/sec
Processing file: vision_1_2_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.18it/s]


done in 15.73 seconds, 6.36 sentences/sec
Processing file: WikiAgent_response_modified.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:17<00:00,  4.41s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  7.36it/s]


done in 17.92 seconds, 5.58 sentences/sec
Processing file: vision_1_0_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:15<00:00,  5.19s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 29.31it/s]


done in 15.65 seconds, 6.39 sentences/sec
Processing file: tf_idf_bm25_open_1_0_top_100_combined_both_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.71s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.31it/s]


done in 14.94 seconds, 6.69 sentences/sec
Processing file: tf-idf_1_2_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.80s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 31.98it/s]


done in 15.28 seconds, 6.55 sentences/sec
Processing file: BOW_1_1_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:14<00:00,  4.84s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 26.95it/s]


done in 14.61 seconds, 6.84 sentences/sec
Processing file: WikiAgent_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  7.12it/s]


done in 15.25 seconds, 6.56 sentences/sec
Processing file: vision_1_1_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.71s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 30.60it/s]


done in 14.92 seconds, 6.70 sentences/sec
Processing file: LlamaAgent_response_modified.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.61s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 23.99it/s]


done in 14.54 seconds, 6.88 sentences/sec
Processing file: BOW_1_2_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:14<00:00,  4.83s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 28.81it/s]


done in 14.58 seconds, 6.86 sentences/sec
Processing file: tf_idf_bm25_open_1_2_top_100_combined_both_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.64s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 32.46it/s]


done in 14.63 seconds, 6.84 sentences/sec
Processing file: reranked_best_answers_1_0.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.74s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  9.28it/s]


done in 15.21 seconds, 6.57 sentences/sec
Processing file: reranked_best_answers_1_2.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  7.55it/s]


done in 16.38 seconds, 6.11 sentences/sec
Processing file: bm25_1_0_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 25.60it/s]


done in 16.19 seconds, 6.18 sentences/sec
Processing file: open_source_1_0_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  4.00s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 24.17it/s]


done in 16.09 seconds, 6.21 sentences/sec
Processing file: tf-idf_1_0_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 27.70it/s]


done in 15.52 seconds, 6.44 sentences/sec
Processing file: open_source_1_1_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.92s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 26.99it/s]


done in 15.77 seconds, 6.34 sentences/sec
Processing file: vision_1_0_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.67s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 30.65it/s]


done in 14.75 seconds, 6.78 sentences/sec
Processing file: open_source_1_1_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.58s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 29.53it/s]


done in 14.42 seconds, 6.94 sentences/sec
Processing file: tf-idf_1_0_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:14<00:00,  3.70s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 23.32it/s]


done in 14.92 seconds, 6.70 sentences/sec
Processing file: tf_idf_bm25_open_1_0_top_100_combined_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:16<00:00,  4.14s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 26.84it/s]


done in 16.65 seconds, 6.01 sentences/sec
Processing file: ZeroShot_response_modified.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:19<00:00,  4.88s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00,  5.96it/s]


done in 19.88 seconds, 5.03 sentences/sec
Processing file: vision_1_2_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:16<00:00,  4.04s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 21.41it/s]


done in 16.29 seconds, 6.14 sentences/sec
Processing file: tf-idf_1_1_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.99s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 21.97it/s]


done in 16.08 seconds, 6.22 sentences/sec
Processing file: tf-idf_1_1_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 23.45it/s]


done in 16.34 seconds, 6.12 sentences/sec
Processing file: tf_idf_bm25_open_1_0_top_100_combined_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 20.26it/s]


done in 15.70 seconds, 6.37 sentences/sec
Processing file: bm25_1_1_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.88s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 19.54it/s]


done in 15.64 seconds, 6.39 sentences/sec
Processing file: open_source_1_2_top_100_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 22.16it/s]


done in 15.57 seconds, 6.42 sentences/sec
Processing file: BOW_1_0_top_100_response.json
calculating scores...
computing bert embedding.


100%|██████████| 3/3 [00:15<00:00,  5.10s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 22.37it/s]


done in 15.41 seconds, 6.49 sentences/sec
Processing file: tf_idf_bm25_open_1_2_top_100_combined_modified_response.json
calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:15<00:00,  3.79s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 23.25it/s]

done in 15.27 seconds, 6.55 sentences/sec
BERTScore results saved to bert_scores.json



