In [None]:
# Reddit Post Classification Model Evaluation
# ==========================================
# This notebook evaluates the performance of our fine-tuned Llama 4 Scout model 
# on the task of classifying Reddit posts into six categories:
# Entertainment, Health, Comedy, Profession, Travel, and Education

# Make sure you are using a Python 3.13.13 .venv environment

# Install required packages
# Note: the module is imported as `sklearn`, but the package to install is named `scikit-learn`.
# %pip install scikit-learn pandas numpy matplotlib seaborn torch transformers bert-score rouge-score nltk tqdm

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from bert_score import score as bert_score
from rouge_score import rouge_scorer
import json
from tqdm.notebook import tqdm
import random
from collections import Counter
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

In [None]:
# Seed every random number generator the notebook uses so that repeated runs
# draw the same values. A single named constant keeps the four calls in sync.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seeds the generators of all CUDA devices, not only the current one.
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Configuration
# Target labels; metrics, plots and reports iterate over them in this order.
CATEGORIES = ["Entertainment", "Health", "Comedy", "Profession", "Travel", "Education"]
MODEL_PATH = "path/to/finetuned/model"              # Update with MODEL PATH
# Hub repo id of the checkpoint used for the zero-shot comparison. The shortened
# id "meta-llama/Llama-4-Scout" cannot be resolved by from_pretrained, which
# made load_model() fail and the baseline fall back to random predictions.
# NOTE(review): use the "-Instruct" variant instead if that is the checkpoint
# the fine-tuned model was derived from - confirm with the training setup.
BASELINE_MODEL_PATH = "meta-llama/Llama-4-Scout-17B-16E"
TEST_DATA_PATH = "path/to/test/data.csv"            # Update with TEST DATA PATH
RESULTS_DIR = "evaluation_results"                  # Reports and figures are written here
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Make sure the output directory exists before any report or figure is saved.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Show which device was selected in the configuration cell.
print(f"Running on device: {DEVICE}")

In [None]:
# 1. Data Loading and Preprocessing
# ================================

def load_test_data(file_path):
    """
    Load the test dataset and preprocess it for evaluation.

    Format expected:
    - Each row contains: post title, post body, top 5 comments, subreddit name, and topic label

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        DataFrame in which every 'comments' cell is a list and every
        'category' value is one of CATEGORIES (other rows are dropped).

    Raises:
        ValueError: If one of the required columns is missing.
    """
    import ast  # function-scope import: only the non-JSON fallback needs it

    def _parse_comments(value):
        """Turn one raw 'comments' cell into a list of comments."""
        if isinstance(value, (list, tuple)):
            return list(value)
        if not isinstance(value, str):
            # Empty CSV cells are read back as NaN; treat them as "no comments".
            return [] if pd.isna(value) else [value]
        if not value.strip():
            return []
        try:
            parsed = json.loads(value)
        except ValueError:
            # A list column written by DataFrame.to_csv is stored as its Python
            # repr (single quotes), which is not valid JSON.
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError):
                # Plain text: keep it as a single comment instead of failing.
                return [value]
        return list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]

    df = pd.read_csv(file_path)

    # Verify columns exist
    required_columns = ['post_title', 'post_body', 'comments', 'subreddit', 'category']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in dataset")

    # Normalise every row on its own. Inspecting only the first row raised
    # IndexError on an empty file and ignored mixed or missing cells.
    df['comments'] = df['comments'].apply(_parse_comments)

    # Verify that all categories are valid
    invalid_categories = set(df['category']) - set(CATEGORIES)
    if invalid_categories:
        print(f"Warning: Found invalid categories in dataset: {invalid_categories}")
        # Filter out invalid categories; copy so later column assignments do
        # not operate on a slice of the original frame.
        df = df[df['category'].isin(CATEGORIES)].copy()

    print(f"Loaded test dataset with {len(df)} samples")
    print(f"Category distribution: {df['category'].value_counts().to_dict()}")

    return df

def format_input_for_model(row):
    """
    Render one dataset row as the prompt layout used during fine-tuning:

    Post title: <title>
    Post body: <body>
    Comment 1: <comment>
    ...
    Category:

    At most the first five entries of row['comments'] are included, and the
    prompt ends with the bare "Category:" cue.
    """
    # Collect the pieces in order and join once at the end.
    pieces = [f"Post title: {row['post_title']}\nPost body: {row['post_body']}\n"]
    pieces.extend(
        f"Comment {position}: {text}\n"
        for position, text in enumerate(row['comments'][:5], start=1)
    )
    pieces.append("Category:")
    return "".join(pieces)

# Load test data
try:
    test_df = load_test_data(TEST_DATA_PATH)
    print("Successfully loaded test data")
except Exception as e:
    print(f"Error loading test data: {e}")
    # Any loading failure lands here: the notebook continues with synthetic
    # rows, so every metric computed afterwards describes placeholder data.
    print("Creating sample test dataset...")

    #=========================
    # EXAMPLE DATA FOR TESTING
    #=========================
    # Draw the 50 labels first (one random.choice call per entry, in order),
    # then expand each label into a full record.
    sampled_categories = [random.choice(CATEGORIES) for _ in range(50)]
    sample_data = [
        {
            'post_title': f"Sample post about {category}",
            'post_body': f"This is a sample post body about {category}",
            'comments': [f"Comment {number} about {category}" for number in range(1, 6)],
            'subreddit': f"r/{category.lower()}",
            'category': category
        }
        for category in sampled_categories
    ]

    test_df = pd.DataFrame(sample_data)
    print("Created sample test dataset")

In [None]:
# 2. Model Loading
# ===============

def load_model(model_path):
    """
    Load a causal language model together with its tokenizer.

    Weights are loaded as bfloat16 when CUDA is available and float32
    otherwise; device placement is delegated to device_map="auto".

    Returns:
        (model, tokenizer) on success. On any exception the error is printed
        and (None, None) is returned instead of raising.
    """
    try:
        weights_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=weights_dtype,
            device_map="auto"
        )
    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        return None, None
    return model, tokenizer

# Helpers shared by the fine-tuned and the baseline prediction functions.

def _extract_category(continuation):
    """
    Map the text generated after the prompt to one of CATEGORIES.

    The exact category name that occurs earliest in the continuation wins. If
    none occurs, the first word that equals a category name ignoring case is
    used. If that fails as well, CATEGORIES[0] is returned as the default.
    """
    # Choose by position in the text, not by order in CATEGORIES: for an
    # output such as "Travel\nPost title: Health ..." the answer is "Travel".
    best_position, best_category = None, None
    for category in CATEGORIES:
        position = continuation.find(category)
        if position != -1 and (best_position is None or position < best_position):
            best_position, best_category = position, category
    if best_category is not None:
        return best_category

    # Case-insensitive whole-word fallback. A regular expression is used so
    # this rarely-taken path does not depend on downloaded tokenizer data.
    by_lowercase = {category.lower(): category for category in CATEGORIES}
    for word in re.findall(r"\w+", continuation.lower()):
        if word in by_lowercase:
            return by_lowercase[word]

    # NOTE(review): unparseable outputs are counted as the first category,
    # which skews its precision - consider tracking them separately.
    return CATEGORIES[0]


def _generate_continuation(prompt, model, tokenizer):
    """Greedy-decode up to 10 new tokens and return only the generated text."""
    # device_map="auto" decides where the model lives, so follow the model
    # instead of assuming it sits on DEVICE.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,  # We only need a short output for the category
            do_sample=False,    # Greedy decoding; temperature would be ignored
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # token count: the decoded prompt is not guaranteed to match the original
    # string character for character, so slicing the text is unreliable.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


# Load the fine-tuned model
print("Loading fine-tuned model...")
ft_model, ft_tokenizer = load_model(MODEL_PATH)
if ft_model is None:
    print("Failed to load fine-tuned model. Check the model path.")
    # For demonstration, we'll create placeholder prediction functions
    def predict_category_ft(*args, **kwargs):
        """Placeholder: return a random category because no model is loaded."""
        return random.choice(CATEGORIES)
else:
    print("Successfully loaded fine-tuned model")

    def predict_category_ft(input_text, model=ft_model, tokenizer=ft_tokenizer):
        """Generate category prediction using the fine-tuned model"""
        return _extract_category(_generate_continuation(input_text, model, tokenizer))

# Load the baseline model for zero-shot comparison
print("Loading baseline model for zero-shot evaluation...")
baseline_model, baseline_tokenizer = load_model(BASELINE_MODEL_PATH)
if baseline_model is None:
    # The comparison still runs, but against random predictions.
    print("Failed to load baseline model. Zero-shot comparison will use random placeholder predictions.")
    # Placeholder prediction function for baseline
    def predict_category_baseline(*args, **kwargs):
        """Placeholder: return a random category because no model is loaded."""
        return random.choice(CATEGORIES)
else:
    print("Successfully loaded baseline model")

    def predict_category_baseline(input_text, model=baseline_model, tokenizer=baseline_tokenizer):
        """Generate zero-shot category prediction using the baseline model"""
        # For zero-shot, the prompt lists all possible categories. The list is
        # built from CATEGORIES so the two cannot drift apart.
        # NOTE(review): input_text already ends with "Category:", so the cue
        # appears twice in this prompt - kept as is to leave the baseline
        # prompt unchanged.
        zero_shot_prompt = (
            f"{input_text} (Choose one category from: {', '.join(CATEGORIES)})\nCategory:"
        )
        return _extract_category(_generate_continuation(zero_shot_prompt, model, tokenizer))

In [None]:
# 3. Evaluation Functions
# =====================

def calculate_basic_metrics(y_true, y_pred):
    """
    Calculate accuracy, precision, recall, and F1 scores.

    Args:
        y_true: True category labels.
        y_pred: Predicted category labels, aligned with y_true.

    Returns:
        dict with 'accuracy', 'macro_precision', 'macro_recall', 'macro_f1'
        and 'class_metrics' (one entry per category holding 'precision',
        'recall', 'f1' and 'support'). All numbers are built-in float/int so
        the dict can be passed to json.dump unchanged.
    """
    # Overall accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Per-class precision, recall, and F1. zero_division=0 yields the same 0.0
    # as the default for undefined scores without emitting a warning.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=CATEGORIES, average=None, zero_division=0
    )

    # Macro averages over the same label set, so they equal the mean of the
    # per-class values reported below.
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=CATEGORIES, average='macro', zero_division=0
    )

    # Format results as a dictionary
    results = {
        'accuracy': float(accuracy),
        'macro_precision': float(macro_precision),
        'macro_recall': float(macro_recall),
        'macro_f1': float(macro_f1),
        'class_metrics': {}
    }

    for i, category in enumerate(CATEGORIES):
        results['class_metrics'][category] = {
            'precision': float(precision[i]),
            'recall': float(recall[i]),
            'f1': float(f1[i]),
            # support is a numpy integer, which json.dump rejects
            'support': int(support[i])
        }

    return results

def calculate_bert_score(references, predictions):
    """
    Compute mean BERTScore precision, recall and F1 for the predictions.

    Any exception raised while scoring is printed and replaced by an all-zero
    result, so the surrounding evaluation keeps running.
    """
    metric_names = ('precision', 'recall', 'f1')
    try:
        P, R, F1 = bert_score(predictions, references, lang='en', model_type='microsoft/deberta-xlarge-mnli')
        # Each result is averaged and converted to a plain Python float
        return {name: values.mean().item() for name, values in zip(metric_names, (P, R, F1))}
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
    return dict.fromkeys(metric_names, 0.0)

def calculate_rouge_score(references, predictions):
    """
    Compute the mean ROUGE-L F-measure over aligned reference/prediction pairs.

    Returns:
        dict with the single key 'rouge_l'.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    # One F-measure per (reference, prediction) pair, averaged at the end
    f_measures = [
        scorer.score(ref, pred)['rougeL'].fmeasure
        for ref, pred in zip(references, predictions)
    ]
    return {'rouge_l': np.mean(f_measures)}

def create_confusion_matrix(y_true, y_pred):
    """
    Build the confusion matrix over CATEGORIES, draw it as an annotated
    heatmap, save the figure to RESULTS_DIR and display it.

    Returns:
        The confusion matrix computed by sklearn's confusion_matrix.
    """
    cm = confusion_matrix(y_true, y_pred, labels=CATEGORIES)

    # Explicit figure/axes instead of the implicit pyplot state machine
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=CATEGORIES,
        yticklabels=CATEGORIES,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Category')
    ax.set_xlabel('Predicted Category')
    fig.tight_layout()
    fig.savefig(f"{RESULTS_DIR}/confusion_matrix.png")
    plt.show()

    return cm

def analyze_misclassifications(test_df, predictions):
    """
    Analyze patterns in misclassified examples.

    Args:
        test_df: DataFrame with 'category', 'post_title' and 'subreddit'
            columns. It is not modified.
        predictions: Predicted categories, one per row of test_df.

    Returns:
        {"error_analysis": "No misclassifications found"} when every
        prediction is correct. Otherwise a dict with
        'error_rates_by_category', 'most_common_misclassifications' (at most
        5 pairs) and 'sample_misclassified' (at most 10 rows).
    """
    # assign() works on a copy, so the caller's frame does not gain columns.
    evaluated = test_df.assign(predicted_category=predictions)
    evaluated['is_correct'] = evaluated['category'] == evaluated['predicted_category']

    # Identify the most commonly misclassified categories
    misclassified = evaluated[~evaluated['is_correct']]

    if len(misclassified) == 0:
        return {"error_analysis": "No misclassifications found"}

    # Calculate error rates by true category; categories without samples are
    # left out. float() turns the numpy scalar into a plain number.
    error_by_category = {}
    for category in CATEGORIES:
        category_samples = evaluated[evaluated['category'] == category]
        if len(category_samples) > 0:
            error_by_category[category] = float(1 - category_samples['is_correct'].mean())

    # Count (true, predicted) pairs straight from the two columns
    misclass_pairs = Counter(
        zip(misclassified['category'], misclassified['predicted_category'])
    )
    most_common_errors = misclass_pairs.most_common(5)

    # Sample misclassified examples for qualitative analysis
    sample_size = min(10, len(misclassified))
    sample_errors = misclassified.sample(sample_size)

    error_analysis = {
        "error_rates_by_category": error_by_category,
        "most_common_misclassifications": [
            {
                "true_category": true_cat,
                "predicted_category": pred_cat,
                "count": count
            }
            for (true_cat, pred_cat), count in most_common_errors
        ],
        "sample_misclassified": [
            {
                "post_title": row['post_title'],
                "true_category": row['category'],
                "predicted_category": row['predicted_category'],
                "subreddit": row['subreddit']
            }
            for _, row in sample_errors.iterrows()
        ]
    }

    return error_analysis

def generate_evaluation_report(basic_metrics, bert_scores, rouge_scores, error_analysis, test_set_size=None):
    """
    Generate a comprehensive evaluation report.

    Writes evaluation_report.json and evaluation_summary.md into RESULTS_DIR.

    Args:
        basic_metrics: dict with 'accuracy', 'macro_f1' and 'class_metrics'
            (per category: 'precision', 'recall', 'f1', 'support').
        bert_scores: dict containing at least 'f1'.
        rouge_scores: dict containing at least 'rouge_l'.
        error_analysis: dict that may contain 'error_rates_by_category' and
            'most_common_misclassifications'; missing keys give empty sections.
        test_set_size: Number of evaluated samples shown in the report.
            Defaults to the sum of the per-category supports, so the function
            does not depend on a global DataFrame.

    Returns:
        The Markdown report as a string.
    """
    def _json_default(value):
        """Convert numpy scalars/arrays that json cannot encode natively."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    if test_set_size is None:
        test_set_size = sum(
            int(metrics['support']) for metrics in basic_metrics['class_metrics'].values()
        )

    report = {
        "basic_metrics": basic_metrics,
        "semantic_similarity": {
            "bert_score": bert_scores,
            "rouge_score": rouge_scores
        },
        "error_analysis": error_analysis
    }

    with open(f"{RESULTS_DIR}/evaluation_report.json", 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=_json_default)

    # Create a summary markdown report. Lines inside the literal start at
    # column 0 on purpose: a heading indented by four spaces is rendered as a
    # code block.
    markdown_report = f"""# Reddit Classification Model Evaluation Report

## Overview

- **Model**: Fine-tuned Llama 4 Scout
- **Task**: Classify Reddit posts into {len(CATEGORIES)} categories
- **Test Set Size**: {test_set_size} samples

## Performance Metrics

### Classification Accuracy

- **Overall Accuracy**: {basic_metrics['accuracy']:.4f} (Target: ≥0.85)
- **Macro F1 Score**: {basic_metrics['macro_f1']:.4f} (Target: ≥0.82)

### Per-Category Performance

| Category | Precision | Recall | F1 Score | Support |
|----------|-----------|--------|----------|---------|
"""

    for category in CATEGORIES:
        metrics = basic_metrics['class_metrics'][category]
        markdown_report += f"| {category} | {metrics['precision']:.4f} | {metrics['recall']:.4f} | {metrics['f1']:.4f} | {metrics['support']} |\n"

    markdown_report += f"""
### Semantic Similarity Metrics

- **BERTScore F1**: {bert_scores['f1']:.4f} (Target: ≥0.88)
- **ROUGE-L Score**: {rouge_scores['rouge_l']:.4f} (Target: ≥0.75)

## Error Analysis

### Most Challenging Categories

The categories with the highest error rates:

"""

    # Sort categories by error rate (descending)
    error_rates = error_analysis.get("error_rates_by_category", {})
    sorted_error_rates = sorted(error_rates.items(), key=lambda x: x[1], reverse=True)

    for category, error_rate in sorted_error_rates[:3]:  # Top 3 most challenging
        markdown_report += f"- **{category}**: {error_rate:.4f} error rate\n"
    if not sorted_error_rates:
        # Avoid an intro sentence followed by nothing
        markdown_report += "- No misclassified samples\n"

    markdown_report += """
### Common Misclassifications

The most common category confusions:

"""

    common_errors = error_analysis.get("most_common_misclassifications", [])[:3]
    for error in common_errors:
        markdown_report += f"- **{error['true_category']}** misclassified as **{error['predicted_category']}**: {error['count']} instances\n"
    if not common_errors:
        markdown_report += "- No misclassified samples\n"

    markdown_report += f"""
## Conclusion

The model {'meets' if basic_metrics['accuracy'] >= 0.85 else 'does not meet'} our target accuracy benchmark of ≥0.85.
The BERTScore F1 {'meets' if bert_scores['f1'] >= 0.88 else 'does not meet'} our target of ≥0.88.
The ROUGE-L score {'meets' if rouge_scores['rouge_l'] >= 0.75 else 'does not meet'} our target of ≥0.75.

"""

    # The report contains "≥", which the default encoding on some platforms
    # cannot represent; write UTF-8 explicitly.
    with open(f"{RESULTS_DIR}/evaluation_summary.md", 'w', encoding='utf-8') as f:
        f.write(markdown_report)

    return markdown_report

In [None]:
# 4. Evaluation Execution
# =====================

def evaluate_model(model_type="fine-tuned"):
    """
    Run full evaluation for the specified model type.

    Args:
        model_type: "fine-tuned" selects predict_category_ft and additionally
            produces the confusion matrix and the error analysis. Any other
            value is evaluated with predict_category_baseline.

    Returns:
        (results, error_analysis). results holds the model type, the metric
        dicts and the list of predictions; error_analysis is {} unless the
        fine-tuned model was evaluated.

    Raises:
        ValueError: If the test dataset contains no rows.
    """
    def _json_default(value):
        """Convert numpy scalars/arrays that json cannot encode natively."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    print(f"Starting evaluation of {model_type} model...")

    # Row-wise apply on an empty frame does not return a Series, so fail early
    # with a clear message instead of an AttributeError further down.
    if len(test_df) == 0:
        raise ValueError("Test dataset is empty; nothing to evaluate")

    # Format inputs
    inputs = test_df.apply(format_input_for_model, axis=1).tolist()
    true_categories = test_df['category'].tolist()

    # Make predictions based on model type
    is_fine_tuned = model_type == "fine-tuned"
    predict_fn = predict_category_ft if is_fine_tuned else predict_category_baseline

    # Run predictions
    print(f"Making predictions with {model_type} model...")
    predictions = [predict_fn(input_text) for input_text in tqdm(inputs)]

    # Calculate metrics
    print("Calculating evaluation metrics...")
    basic_metrics = calculate_basic_metrics(true_categories, predictions)

    # For BERTScore and ROUGE, pass the category names as simple text
    bert_scores = calculate_bert_score(true_categories, predictions)
    rouge_scores = calculate_rouge_score(true_categories, predictions)

    # Plots and misclassification analysis are produced for the fine-tuned
    # model only.
    error_analysis = {}
    if is_fine_tuned:
        print("Creating confusion matrix...")
        create_confusion_matrix(true_categories, predictions)

        print("Analyzing misclassifications...")
        error_analysis = analyze_misclassifications(test_df.copy(), predictions)

    # Save results
    results = {
        "model_type": model_type,
        "basic_metrics": basic_metrics,
        "bert_scores": bert_scores,
        "rouge_scores": rouge_scores,
        "predictions": predictions
    }

    with open(f"{RESULTS_DIR}/{model_type}_results.json", 'w', encoding='utf-8') as f:
        # Convert numpy values to float for JSON serialization
        json_results = {
            "model_type": results["model_type"],
            "basic_metrics": results["basic_metrics"],
            "bert_scores": {k: float(v) for k, v in results["bert_scores"].items()},
            "rouge_scores": {k: float(v) for k, v in results["rouge_scores"].items()},
            "predictions": results["predictions"]
        }
        # basic_metrics can still carry numpy integers (the per-class support
        # counts), which json rejects without a default handler.
        json.dump(json_results, f, indent=2, default=_json_default)

    print(f"Evaluation of {model_type} model completed")
    return results, error_analysis

In [None]:
# 5. Compare Fine-tuned vs. Baseline Performance
# ===========================================

def compare_models(ft_results, baseline_results):
    """
    Compare performance between fine-tuned and baseline models.

    Saves two bar charts (per-category F1 and overall accuracy / macro F1)
    and a Markdown comparison report into RESULTS_DIR.

    Args:
        ft_results: Result dict of the fine-tuned model, as returned by
            evaluate_model.
        baseline_results: Result dict of the baseline model, same layout.

    Returns:
        (comparison, comparison_report): a dict of fine-tuned minus baseline
        differences and the Markdown report as a string.
    """
    ft_metrics = ft_results["basic_metrics"]
    baseline_metrics = baseline_results["basic_metrics"]

    comparison = {
        "accuracy_improvement": ft_metrics["accuracy"] - baseline_metrics["accuracy"],
        "f1_improvement": ft_metrics["macro_f1"] - baseline_metrics["macro_f1"],
        "bert_score_improvement": ft_results["bert_scores"]["f1"] - baseline_results["bert_scores"]["f1"],
        "rouge_improvement": ft_results["rouge_scores"]["rouge_l"] - baseline_results["rouge_scores"]["rouge_l"]
    }

    # Per-category F1, one column per model, rows in CATEGORIES order
    comparison_df = pd.DataFrame(
        {
            "Fine-tuned": {c: ft_metrics["class_metrics"][c]["f1"] for c in CATEGORIES},
            "Baseline": {c: baseline_metrics["class_metrics"][c]["f1"] for c in CATEGORIES},
        },
        index=CATEGORIES,
    )

    # Plot F1 scores by category. The chart is drawn on explicitly created
    # axes; previously an extra, empty figure was opened first and shown
    # alongside the real one.
    fig, ax = plt.subplots(figsize=(12, 6))
    comparison_df.plot(kind='bar', ax=ax)
    ax.set_title('F1 Score Comparison by Category')
    ax.set_ylabel('F1 Score')
    ax.set_xlabel('Category')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig(f"{RESULTS_DIR}/f1_comparison_by_category.png")
    plt.show()

    # Plot overall metrics comparison
    ft_values = [ft_metrics["accuracy"], ft_metrics["macro_f1"]]
    baseline_values = [baseline_metrics["accuracy"], baseline_metrics["macro_f1"]]

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(ft_values))
    width = 0.35

    ax.bar(x - width/2, ft_values, width, label='Fine-tuned')
    ax.bar(x + width/2, baseline_values, width, label='Baseline')

    ax.set_xticks(x)
    ax.set_xticklabels(['Accuracy', 'Macro F1'])
    ax.set_ylabel('Score')
    ax.set_title('Performance Comparison: Fine-tuned vs. Baseline')
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.savefig(f"{RESULTS_DIR}/overall_metrics_comparison.png")
    plt.show()

    # Generate comparison report
    comparison_report = f"""# Model Comparison: Fine-tuned vs. Baseline

## Overall Improvement

- **Accuracy Improvement**: {comparison["accuracy_improvement"]:.4f} 
  - Fine-tuned: {ft_results["basic_metrics"]["accuracy"]:.4f}
  - Baseline: {baseline_results["basic_metrics"]["accuracy"]:.4f}

- **Macro F1 Improvement**: {comparison["f1_improvement"]:.4f}
  - Fine-tuned: {ft_results["basic_metrics"]["macro_f1"]:.4f}
  - Baseline: {baseline_results["basic_metrics"]["macro_f1"]:.4f}

- **BERTScore F1 Improvement**: {comparison["bert_score_improvement"]:.4f}
  - Fine-tuned: {ft_results["bert_scores"]["f1"]:.4f}
  - Baseline: {baseline_results["bert_scores"]["f1"]:.4f}

- **ROUGE-L Improvement**: {comparison["rouge_improvement"]:.4f}
  - Fine-tuned: {ft_results["rouge_scores"]["rouge_l"]:.4f}
  - Baseline: {baseline_results["rouge_scores"]["rouge_l"]:.4f}

## Per-Category F1 Score Comparison

| Category | Fine-tuned | Baseline | Improvement |
|----------|------------|----------|-------------|
"""

    for category in CATEGORIES:
        ft_f1 = ft_metrics["class_metrics"][category]["f1"]
        base_f1 = baseline_metrics["class_metrics"][category]["f1"]
        improvement = ft_f1 - base_f1
        comparison_report += f"| {category} | {ft_f1:.4f} | {base_f1:.4f} | {improvement:.4f} |\n"

    with open(f"{RESULTS_DIR}/model_comparison.md", 'w', encoding='utf-8') as f:
        f.write(comparison_report)

    return comparison, comparison_report

In [None]:
# Main execution
if __name__ == "__main__":
    print("Starting model evaluation...")

    # Evaluate both models; only the fine-tuned run produces an error analysis
    ft_results, error_analysis = evaluate_model("fine-tuned")
    baseline_results, _ = evaluate_model("baseline")

    # Compare models
    comparison, comparison_report = compare_models(ft_results, baseline_results)

    # Generate full report for fine-tuned model
    report = generate_evaluation_report(
        basic_metrics=ft_results["basic_metrics"],
        bert_scores=ft_results["bert_scores"],
        rouge_scores=ft_results["rouge_scores"],
        error_analysis=error_analysis,
    )

    # Print the summary and the report locations line by line
    for line in (
        "\nEvaluation Summary:",
        f"Fine-tuned model accuracy: {ft_results['basic_metrics']['accuracy']:.4f}",
        f"Baseline model accuracy: {baseline_results['basic_metrics']['accuracy']:.4f}",
        f"Improvement: {comparison['accuracy_improvement']:.4f}",
        f"\nFine-tuned model F1: {ft_results['basic_metrics']['macro_f1']:.4f}",
        f"Baseline model F1: {baseline_results['basic_metrics']['macro_f1']:.4f}",
        f"Improvement: {comparison['f1_improvement']:.4f}",
        "\nDetailed reports saved to:",
        f"- {RESULTS_DIR}/evaluation_report.json",
        f"- {RESULTS_DIR}/evaluation_summary.md",
        f"- {RESULTS_DIR}/model_comparison.md",
        "\nEvaluation complete!",
    ):
        print(line)