In [None]:
import torch
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import nltk
# nltk.word_tokenize (used for BLEU below) needs the Punkt tokenizer data.
# Newer NLTK releases look it up under 'punkt_tab' instead of 'punkt', so
# fetch both to keep the notebook working across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Locations of the evaluation inputs
test_dataset_path = "../data/processed/test_dataset"
model_path = "./models/deepseek-quiz-flashcard-generator"

# Read the test split from disk and show its size plus one record
test_dataset = load_from_disk(test_dataset_path)
num_test_examples = len(test_dataset)
print(f"Test examples: {num_test_examples}")
print("Sample test example:", test_dataset[0])

# Load the fine-tuned tokenizer and model; weights are loaded in half
# precision and device placement is delegated to device_map="auto"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model_load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs)

In [None]:
def generate_quiz_flashcard(input_text, max_length=512):
    """Generate quiz/flashcard content using the fine-tuned model.

    Args:
        input_text: Source text placed in the user turn of the chat prompt.
        max_length: Passed to ``model.generate`` as ``max_length``, i.e. the
            cap on the total sequence length (prompt tokens plus generated
            tokens). A prompt that is already this long leaves no room for
            a response.

    Returns:
        The assistant's reply as a stripped string, cut at the first
        ``<|im_end|>`` marker (or at the tokenizer's EOS token, if any).
    """
    prompt = f"<|im_start|>user\n{input_text}\n<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Sampling-based decoding. The attention mask is passed explicitly
    # because pad_token_id is set to the EOS id, so generate() cannot
    # reliably infer the mask from the input ids on its own.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. Slicing off the prompt by
    # token count is robust even when input_text itself contains the
    # "<|im_start|>assistant" marker or the model emits it again.
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=False)

    # Special tokens are kept so the chat end marker can be located; cut the
    # reply at that marker and at the EOS token so neither leaks into the
    # text that gets scored.
    stop_markers = ["<|im_end|>"]
    if tokenizer.eos_token:
        stop_markers.append(tokenizer.eos_token)
    for marker in stop_markers:
        response = response.split(marker)[0]

    return response.strip()

In [None]:
# Initialize metrics calculators
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Unsmoothed sentence-level BLEU collapses to ~0 (with a warning) as soon as
# one n-gram order has no overlap, which is common for single sentences.
# The bleu_score submodule is already loaded by the imports cell.
bleu_smoothing = nltk.translate.bleu_score.SmoothingFunction().method1

# Generation samples tokens (do_sample=True), so fix the seed to make the
# reported metrics reproducible across runs.
torch.manual_seed(42)

# Run evaluation on test set
results = []
failed_examples = []  # (index, error message) for examples that raised

for i, example in enumerate(tqdm(test_dataset)):
    input_text = example["content"]  # Adjust field name if needed
    reference = example["quiz_flashcard"]  # Adjust field name if needed

    try:
        # Generate quiz/flashcard
        generated = generate_quiz_flashcard(input_text)

        # ROUGE: score(target, prediction)
        rouge_scores = rouge_scorer_obj.score(reference, generated)

        # BLEU on lower-cased word tokens, with smoothing
        reference_tokens = nltk.word_tokenize(reference.lower())
        generated_tokens = nltk.word_tokenize(generated.lower())
        bleu_score = sentence_bleu(
            [reference_tokens],
            generated_tokens,
            smoothing_function=bleu_smoothing,
        )

        results.append({
            "input": input_text,
            "reference": reference,
            "generated": generated,
            "rouge1": rouge_scores["rouge1"].fmeasure,
            "rouge2": rouge_scores["rouge2"].fmeasure,
            "rougeL": rouge_scores["rougeL"].fmeasure,
            "bleu": bleu_score
        })

        # Show first few examples
        if i < 5:
            print(f"Example {i+1}:")
            print(f"Input: {input_text[:100]}...")
            print(f"Reference: {reference[:100]}...")
            print(f"Generated: {generated[:100]}...")
            print("-" * 80)

    except Exception as e:
        # Best effort: one bad example must not abort the whole run, but
        # keep a record so silent mass failure is visible afterwards.
        failed_examples.append((i, str(e)))
        print(f"Error processing example {i}: {e}")

if failed_examples:
    print(f"{len(failed_examples)} of {len(test_dataset)} examples failed and were excluded from the results.")

In [None]:
# Create DataFrame from results
results_df = pd.DataFrame(results)

# With no rows the frame has no metric columns, and the lookups below would
# fail with an opaque KeyError; report the real cause instead.
if results_df.empty:
    raise ValueError(
        "No evaluation results to summarise: the test set is empty or every example failed."
    )

# Display name -> results_df column
metric_columns = {
    "ROUGE-1": "rouge1",
    "ROUGE-2": "rouge2",
    "ROUGE-L": "rougeL",
    "BLEU": "bleu",
}

# Calculate average metrics (as plain floats so they serialise cleanly)
average_metrics = {
    label: float(results_df[column].mean())
    for label, column in metric_columns.items()
}

print("\nAverage metrics:")
for metric, value in average_metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Set up the plot styles
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(10, 6))

# Create bar chart of average metrics
metric_names = list(average_metrics.keys())
metric_values = list(average_metrics.values())
bars = ax.bar(
    metric_names,
    metric_values,
    color=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
)

# Add value labels on top of bars
for bar in bars:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        height + 0.01,
        f'{height:.3f}',
        ha='center',
        fontsize=11
    )

ax.set_title("Average Evaluation Metrics", fontsize=15)
ax.set_ylabel("Score", fontsize=12)
# Leave headroom for the labels; fall back to 1.0 when every metric is zero
# (or NaN), which would otherwise give a degenerate axis range.
y_top = max(metric_values) * 1.2
ax.set_ylim(0, y_top if y_top > 0 else 1.0)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Save the figure. The output directory is created here because this cell
# runs before the cell that saves the CSV/JSON results.
os.makedirs("./results", exist_ok=True)
fig.savefig("./results/evaluation_metrics.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Create histograms of score distributions (one panel per metric, 2x2 grid)
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
metrics = ["rouge1", "rouge2", "rougeL", "bleu"]
titles = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU"]

# axs.flat walks the grid row by row, matching the order of `metrics`
for ax, metric, title in zip(axs.flat, metrics, titles):
    ax.hist(results_df[metric], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f"{title} Score Distribution")
    ax.set_xlabel("Score")
    ax.set_ylabel("Frequency")
    ax.grid(alpha=0.3)

fig.tight_layout()
# The output directory may not exist yet on a fresh run; create it before saving.
os.makedirs("./results", exist_ok=True)
fig.savefig("./results/score_distributions.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Make sure the output directory exists before writing anything into it
os.makedirs("./results", exist_ok=True)

# Per-example rows (inputs, references, generations, scores) as CSV
results_df.to_csv("./results/evaluation_results.csv", index=False)

# Averaged metrics as pretty-printed JSON
summary_json = json.dumps(average_metrics, indent=4)
with open("./results/metrics_summary.json", "w") as summary_file:
    summary_file.write(summary_json)

print("Results saved to ./results/")

In [None]:
# Sample random examples for qualitative analysis. A fixed random_state keeps
# the selection stable across Restart & Run All.
sample_size = min(10, len(results_df))
sample_results = results_df.sample(n=sample_size, random_state=42)

# Sort samples by ROUGE-L score to see best and worst examples
sample_results = sample_results.sort_values(by="rougeL", ascending=False)

print("\nQualitative Analysis - Best and Worst Examples:")
print("\n--- TOP PERFORMING EXAMPLES ---")
for i, row in enumerate(sample_results.head(3).itertuples()):
    print(f"Example {i+1} (ROUGE-L: {row.rougeL:.4f}, BLEU: {row.bleu:.4f}):")
    print(f"Input: {row.input[:150]}...")
    # Show the reference here too, as the other listings do, so the
    # generation can be judged against its target.
    print(f"Reference: {row.reference[:150]}...")
    print(f"Generated: {row.generated[:150]}...")
    print("-" * 80)

print("\n--- WORST PERFORMING EXAMPLES ---")
for i, row in enumerate(sample_results.tail(3).itertuples()):
    print(f"Example {i+1} (ROUGE-L: {row.rougeL:.4f}, BLEU: {row.bleu:.4f}):")
    print(f"Input: {row.input[:150]}...")
    print(f"Reference: {row.reference[:150]}...")
    print(f"Generated: {row.generated[:150]}...")
    print("-" * 80)

In [None]:
# Absolute gap between ROUGE-L and BLEU per example, stored as a new column
# on results_df
rouge_bleu_gap = results_df['rougeL'] - results_df['bleu']
results_df['metric_diff'] = rouge_bleu_gap.abs()

# The five examples where the two metrics disagree the most
ranked_by_gap = results_df.sort_values(by='metric_diff', ascending=False)
interesting_cases = ranked_by_gap.head(5)

print("\nInteresting Cases (Large Metric Discrepancies):")
for case_number, case in enumerate(interesting_cases.itertuples(), start=1):
    print(f"Case {case_number} (ROUGE-L: {case.rougeL:.4f}, BLEU: {case.bleu:.4f}, Diff: {case.metric_diff:.4f}):")
    print(f"Input: {case.input[:100]}...")
    print(f"Reference: {case.reference[:100]}...")
    print(f"Generated: {case.generated[:100]}...")
    print("-" * 80)