In [18]:
%pip install evaluate nltk rouge_score absl-py


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
import evaluate
import json
import os

In [21]:
# Change these variables to perform evalution for you task
original_quiz_path = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + 'processed_data/gpt5/processed_test.jsonl'
generated_data_dir = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + 'generated_data_gpt5/'

# Find all .json files in the generated_data_gpt5 directory (excluding automatic_evaluation files)
json_files = [f for f in os.listdir(generated_data_dir) if f.endswith('.json') and not f.startswith('automatic_evaluation')]
print(f"Found {len(json_files)} JSON files to process:")
for file in json_files:
    print(f"  - {file}")
print(f"Original quiz path: {original_quiz_path}")

# Get original quizzes (load once, use for all files)
original_quizzes = []
for line in open(original_quiz_path):
    original_quizzes.append(json.loads(line)['completion'].split("\n###")[0].replace("\n"," ").strip())

Found 1 JSON files to process:
  - a1_test.json
Original quiz path: /Users/vh19/Documents/Works/personal-projects/EduQuiz-vh/processed_data/gpt5/processed_test.jsonl


In [22]:
# Process each JSON file sequentially
for json_file in json_files:
    print(f"\n{'='*60}")
    print(f"Processing: {json_file}")
    print(f"{'='*60}")
    
    generated_quiz_path = os.path.join(generated_data_dir, json_file)
    file_basename = os.path.splitext(json_file)[0]  # Remove .json extension for output files
    
    # Get generated quizzes for current file
    generated_quizzes = []
    with open(generated_quiz_path) as f:
        generated_quizzes_dict = json.load(f)

    for key in generated_quizzes_dict:
        generated_quizzes.append(generated_quizzes_dict[key].replace("\n"," ").strip())


Processing: a1_test.json


In [23]:
# Put the predictions and gold references in lists
predictions = []
gold_references = []

# Use the length of generated quizzes and ensure we don't exceed original quizzes length
max_items = min(len(generated_quizzes), len(original_quizzes))
for i in range(max_items):
    predictions.append(generated_quizzes[i])
    gold_references.append([original_quizzes[i]])  # BLEU expects list of reference strings

print(f"Processing {len(predictions)} quiz pairs")
print("Sample prediction:", predictions[0] if predictions else "None")
print("Sample reference:", gold_references[0] if gold_references else "None")

Processing 5 quiz pairs
Sample prediction: Question: Why did the narrator arrange for Santa to bring an extra gift for Alice? True answer: To give her the doll she never received as a child in 1925. False answer: To surprise her with a trip to visit her childhood home. False answer: Because Katie asked Santa to bring a present for her grandmother. False answer: To replace a doll Alice had recently lost.
Sample reference: ["Question: Why couldn't Alice get a doll as a child? True answer: Because her family was very poor. False answer: Because her mother died quite early. False answer: Because her family disliked her. False answer: Because Alice didn't love dolls."]


In [24]:
# Load evaluation metrics
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

In [25]:
# Add batches for evaluation
bleu.add_batch(predictions=predictions, references=gold_references)
rouge.add_batch(predictions=predictions, references=gold_references)

In [26]:
# Compute final scores
final_bleu = bleu.compute()
final_rouge = rouge.compute()

In [27]:
# Define path of downloaded meteor from https://www.cs.cmu.edu/~alavie/METEOR/
meteor_path = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + '/meteor-1.5/'

# Move results to meteor directory
with open(meteor_path + "predictions.txt", 'w') as f:
    for i in range(len(predictions)):
        f.write(str(predictions[i]) + '\n')
        
with open(meteor_path + "ground_truth.txt", 'w') as f:
    for i in range(len(gold_references)):
        f.write(str(gold_references[i]) + '\n')

# Run the meteor command from the meteor directory and remove result files again   
wd = os.getcwd()
os.chdir(meteor_path)
output = os.popen("java -Xmx2G -jar meteor-*.jar predictions.txt ground_truth.txt -l en -norm").read()
os.remove(meteor_path + "predictions.txt")
os.remove(meteor_path + "ground_truth.txt")
os.chdir(wd)

# Get the score from the output
meteor_score = round(float(output.split("Final score:")[1].strip()) * 100, 2)

In [28]:
# Print results for current file
print(f"Results for {json_file}:")
print("BLEU: ", str(round(final_bleu['bleu'] * 100, 2)))
print(f"ROUGE-L: {final_rouge['rougeL'] * 100:.2f}")
print("METEOR: ", str(meteor_score))

Results for a1_test.json:
BLEU:  15.93
ROUGE-L: 24.66
METEOR:  21.68


In [29]:
# Create result files with file-specific names
result_filename = f'automatic_evaluation_{file_basename}.txt'
result_path = os.path.join(generated_data_dir, result_filename)

with open(result_path, 'w') as f:
    f.write("BLEU: " + str(round(final_bleu['bleu'] * 100, 2)))
    f.write('\n')
    f.write("ROUGE-L: " + str(round(final_rouge['rougeL'] * 100, 2)))
    f.write('\n')
    f.write("METEOR: " + str(meteor_score))

print(f"Saved results to: {result_filename}")

Saved results to: automatic_evaluation_a1_test.txt


In [30]:
# Get all the target prompts and completions (load once outside loop, use test_data from earlier)
test_data = []
for line in open(original_quiz_path):
    test_data.append((json.loads(line)))

# Get quizzes for current file
quizzes = []
with open(generated_quiz_path) as f:
    gen_quizzes = json.load(f)

for key in gen_quizzes:
    quizzes.append(gen_quizzes[key])

print(f"Test data length: {len(test_data)}")
print(f"Generated quizzes length: {len(quizzes)}")

# Create human evaluation file - handle small datasets
count = 0
output_dir = os.path.abspath(os.getcwd()).split('gpt5_evaluation_scripts')[0] + 'human_evaluation/EEQG/'
os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist

human_eval_filename = f'{file_basename}_results.txt'
with open(os.path.join(output_dir, human_eval_filename), 'w') as f:
    # Use the smaller of the two datasets to avoid index errors
    max_samples = min(len(test_data), len(quizzes), 100)
    step_size = max(1, len(quizzes) // max_samples) if len(quizzes) > max_samples else 1
    
    for i in range(0, len(quizzes), step_size):
        if count >= max_samples:
            break
            
        f.write('Excel row: ' + str(count+1) + ' Test instance: ' + str(i+1) + '\n\n')
        f.write(test_data[i]['prompt'].split("\n\n###")[0] + '\n\n')
        f.write('Generated quiz:\n')
        f.write(quizzes[i].strip() + '\n\n')
        f.write('----------------------------------------------------------------------------------------' + '\n\n')
        count+=1

print(f"Created human evaluation file: {human_eval_filename} with {count} samples")
print(f"\nCompleted processing all {len(json_files)} files!")


Test data length: 950
Generated quizzes length: 5
Created human evaluation file: a1_test_results.txt with 5 samples

Completed processing all 1 files!
