In [2]:
%pip install evaluate nltk rouge_score absl-py bert_score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import evaluate
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Setup paths and load original data
original_quiz_path = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + 'processed_data/gpt5/processed_test.jsonl'
generated_data_dir = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + 'generated_data_gpt5/'

# Find all .json files in the generated_data_gpt5 directory (excluding automatic_evaluation files)
json_files = [f for f in os.listdir(generated_data_dir) if f.endswith('.json') and not f.startswith('automatic_evaluation')]
print(f"Found {len(json_files)} JSON files to process:")
for file in json_files:
    print(f"  - {file}")
print(f"Original quiz path: {original_quiz_path}")

# Get original quizzes (load once, use for all files)
original_quizzes = []
test_data = []
for line in open(original_quiz_path):
    data = json.loads(line)
    original_quizzes.append(data['completion'].split("\n###")[0].replace("\n"," ").strip())
    test_data.append(data)

Found 4 JSON files to process:
  - a1_test.json
  - a2_full.json
  - a1.json
  - gpt3_output.json
Original quiz path: /Users/vh19/Documents/Works/personal-projects/EduQuiz-vh/processed_data/gpt5/processed_test.jsonl


In [11]:
def evaluate_json_file(json_file, original_quizzes, generated_data_dir):
    """
    Evaluate a single JSON file with BLEU, ROUGE, METEOR, and BERTScore metrics
    """
    print(f"\n{'='*60}")
    print(f"Processing: {json_file}")
    print(f"{'='*60}")
    
    generated_quiz_path = os.path.join(generated_data_dir, json_file)
    file_basename = os.path.splitext(json_file)[0]
    
    # Get generated quizzes for current file
    generated_quizzes = []
    with open(generated_quiz_path) as f:
        generated_quizzes_dict = json.load(f)

    for key in generated_quizzes_dict:
        generated_quizzes.append(generated_quizzes_dict[key].replace("\n"," ").strip())
    
    # Prepare predictions and references
    predictions = []
    gold_references = []
    
    max_items = min(len(generated_quizzes), len(original_quizzes))
    for i in range(max_items):
        predictions.append(generated_quizzes[i])
        gold_references.append([original_quizzes[i]])
    
    print(f"Processing {len(predictions)} quiz pairs")
    
    # Load evaluation metrics
    bleu = evaluate.load('bleu')
    rouge = evaluate.load('rouge')
    bertscore = evaluate.load('bertscore')
    
    # Compute BLEU and ROUGE
    bleu.add_batch(predictions=predictions, references=gold_references)
    rouge.add_batch(predictions=predictions, references=gold_references)
    
    final_bleu = bleu.compute()
    final_rouge = rouge.compute()
    
    # Compute BERTScore
    flat_references = [ref[0] for ref in gold_references]
    bertscore_results = bertscore.compute(predictions=predictions, references=flat_references, lang="en")
    bertscore_f1 = sum(bertscore_results['f1']) / len(bertscore_results['f1'])
    
    # Compute METEOR
    meteor_path = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + ''
    
    with open(meteor_path + "predictions.txt", 'w') as f:
        for prediction in predictions:
            f.write(str(prediction) + '\n')
            
    with open(meteor_path + "ground_truth.txt", 'w') as f:
        for reference in gold_references:
            f.write(str(reference) + '\n')
    
    wd = os.getcwd()
    os.chdir(meteor_path)
    output = os.popen("java -Xmx2G -jar meteor-*.jar predictions.txt ground_truth.txt -l en -norm").read()
    os.remove(meteor_path + "predictions.txt")
    os.remove(meteor_path + "ground_truth.txt")
    os.chdir(wd)
    
    meteor_score = round(float(output.split("Final score:")[1].strip()) * 100, 2)
    
    # Print results
    print(f"Results for {json_file}:")
    print("BLEU: ", str(round(final_bleu['bleu'] * 100, 2)))
    print(f"ROUGE-L: {final_rouge['rougeL'] * 100:.2f}")
    print("METEOR: ", str(meteor_score))
    print(f"BERTScore-F1: {bertscore_f1 * 100:.2f}")
    
    # Save results to file
    result_filename = f'automatic_evaluation_{file_basename}.txt'
    result_path = os.path.join(generated_data_dir, result_filename)
    
    with open(result_path, 'w') as f:
        f.write("BLEU: " + str(round(final_bleu['bleu'] * 100, 2)))
        f.write('\n')
        f.write("ROUGE-L: " + str(round(final_rouge['rougeL'] * 100, 2)))
        f.write('\n')
        f.write("METEOR: " + str(meteor_score))
        f.write('\n')
        f.write("BERTScore-F1: " + str(round(bertscore_f1 * 100, 2)))
    
    print(f"Saved results to: {result_filename}")
    
    return {
        'bleu': round(final_bleu['bleu'] * 100, 2),
        'rouge_l': round(final_rouge['rougeL'] * 100, 2),
        'meteor': meteor_score,
        'bertscore_f1': round(bertscore_f1 * 100, 2)
    }

In [12]:
# Loop through all JSON files and evaluate them
all_results = {}

for json_file in json_files:
    results = evaluate_json_file(json_file, original_quizzes, generated_data_dir)
    all_results[json_file] = results

print(f"\n{'='*60}")
print("SUMMARY OF ALL EVALUATIONS")
print(f"{'='*60}")
for json_file, results in all_results.items():
    print(f"\n{json_file}:")
    print(f"  BLEU: {results['bleu']}")
    print(f"  ROUGE-L: {results['rouge_l']}")
    print(f"  METEOR: {results['meteor']}")
    print(f"  BERTScore-F1: {results['bertscore_f1']}")


Processing: a1_test.json
Processing 5 quiz pairs


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for a1_test.json:
BLEU:  15.93
ROUGE-L: 24.66
METEOR:  23.83
BERTScore-F1: 88.64
Saved results to: automatic_evaluation_a1_test.txt

Processing: a2_full.json
Processing 939 quiz pairs


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for a2_full.json:
BLEU:  11.97
ROUGE-L: 23.20
METEOR:  23.83
BERTScore-F1: 86.25
Saved results to: automatic_evaluation_a2_full.txt

Processing: a1.json
Processing 949 quiz pairs


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for a1.json:
BLEU:  16.47
ROUGE-L: 27.01
METEOR:  23.83
BERTScore-F1: 88.15
Saved results to: automatic_evaluation_a1.txt

Processing: gpt3_output.json
Processing 950 quiz pairs


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for gpt3_output.json:
BLEU:  24.57
ROUGE-L: 36.16
METEOR:  23.83
BERTScore-F1: 89.97
Saved results to: automatic_evaluation_gpt3_output.txt

SUMMARY OF ALL EVALUATIONS

a1_test.json:
  BLEU: 15.93
  ROUGE-L: 24.66
  METEOR: 23.83
  BERTScore-F1: 88.64

a2_full.json:
  BLEU: 11.97
  ROUGE-L: 23.2
  METEOR: 23.83
  BERTScore-F1: 86.25

a1.json:
  BLEU: 16.47
  ROUGE-L: 27.01
  METEOR: 23.83
  BERTScore-F1: 88.15

gpt3_output.json:
  BLEU: 24.57
  ROUGE-L: 36.16
  METEOR: 23.83
  BERTScore-F1: 89.97


In [None]:
def generate_human_evaluation_file(json_file, test_data, generated_data_dir):
    """
    Generate human evaluation file for a single JSON file
    """
    print(f"Generating human evaluation file for: {json_file}")
    
    generated_quiz_path = os.path.join(generated_data_dir, json_file)
    file_basename = os.path.splitext(json_file)[0]
    
    # Get quizzes for current file
    quizzes = []
    with open(generated_quiz_path) as f:
        gen_quizzes = json.load(f)

    for key in gen_quizzes:
        quizzes.append(gen_quizzes[key])

    print(f"Test data length: {len(test_data)}")
    print(f"Generated quizzes length: {len(quizzes)}")

    # Create human evaluation file
    count = 0
    output_dir = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + 'human_evaluation/EEQG/'
    os.makedirs(output_dir, exist_ok=True)

    human_eval_filename = f'{file_basename}_results.txt'
    with open(os.path.join(output_dir, human_eval_filename), 'w') as f:
        max_samples = min(len(test_data), len(quizzes), 100)
        step_size = max(1, len(quizzes) // max_samples) if len(quizzes) > max_samples else 1
        
        for i in range(0, len(quizzes), step_size):
            if count >= max_samples:
                break
                
            f.write('Excel row: ' + str(count+1) + ' Test instance: ' + str(i+1) + '\n\n')
            f.write(test_data[i]['prompt'].split("\n\n###")[0] + '\n\n')
            f.write('Generated quiz:\n')
            f.write(quizzes[i].strip() + '\n\n')
            f.write('----------------------------------------------------------------------------------------' + '\n\n')
            count+=1

    print(f"Created human evaluation file: {human_eval_filename} with {count} samples")
    return human_eval_filename

In [None]:
# Loop through all JSON files and generate human evaluation files
# print(f"\n{'='*60}")
# print("GENERATING HUMAN EVALUATION FILES")
# print(f"{'='*60}")

# human_eval_files = []
# for json_file in json_files:
#     filename = generate_human_evaluation_file(json_file, test_data, generated_data_dir)
#     human_eval_files.append(filename)

# print(f"\nGenerated {len(human_eval_files)} human evaluation files:")
# for filename in human_eval_files:
#     print(f"  - {filename}")


GENERATING HUMAN EVALUATION FILES
Generating human evaluation file for: a1_test.json
Test data length: 950
Generated quizzes length: 5
Created human evaluation file: a1_test_results.txt with 5 samples
Generating human evaluation file for: a2_full.json
Test data length: 950
Generated quizzes length: 939
Created human evaluation file: a2_full_results.txt with 100 samples
Generating human evaluation file for: a1.json
Test data length: 950
Generated quizzes length: 950
Created human evaluation file: a1_results.txt with 100 samples
Generating human evaluation file for: gpt3_output.json
Test data length: 950
Generated quizzes length: 950
Created human evaluation file: gpt3_output_results.txt with 100 samples

Generated 4 human evaluation files:
  - a1_test_results.txt
  - a2_full_results.txt
  - a1_results.txt
  - gpt3_output_results.txt
