# IPV Detection Evaluation Pipeline

This notebook evaluates LLM predictions for Intimate Partner Violence (IPV) detection.

## Structure
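1. **Setup**: Import libraries and define evaluation functions
2. **Evaluation**:
   - **2.1 Single Evaluation**: Test one prediction file at a time
   - **2.2 Bulk Evaluation**: Compare all prediction files and generate a summary
   - **2.3 Best-Prompt ROC Curve**: Plot the ROC curve for the best binary prompt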

## Outputs
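All plots are saved to the `./eval_figs/` directory:
- `ROC_curve.png`: Receiver Operating Characteristic curve (overwritten by each binary evaluation run)
- `PR_curve.png`: Precision-Recall curve (overwritten by each binary evaluation run)
- `best_binary_roc.png`: ROC curve for the best binary prompt (Section 2.3)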

## Function Reference

| Function | Input | Output |
|----------|-------|--------|
| `load_results(path)` | File path (JSON/CSV) | DataFrame with `id` column |
| `load_groundtruth(csv_path)` | CSV file path | DataFrame with normalized abuse columns and binary `label` |
| `evaluate_binary(df_pred, df_truth)` | Two DataFrames | Dict of metrics + arrays `(y_true, y_pred)` |
| `evaluate_multilabel(df_pred, df_truth)` | Two DataFrames | Dict with F1_macro, F1_micro, ExactMatchAcc |
| `plot_roc_curve(y_true, y_scores, model_name, out_dir, filename)` | Arrays, model name, paths | ROC AUC + saves ROC curve plot |
| `plot_pr_curve(y_true, y_scores, out_dir)` | Arrays + directory path | PR AUC + saves PR curve plot |
| `plot_binary(y_true, y_pred, out_dir)` | Arrays + directory path | Dict with ROC_AUC, PR_AUC + saves plots |
| `run_evaluation(results_path, groundtruth_path, task, out_dir)` | File paths, task type, output dir | Dict of all metrics + prints results |

In [2]:
# ============================================================
# FINAL EVALUATION PIPELINE (CLEAN + FLEXIBLE)
# ============================================================
from pathlib import Path
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    precision_recall_curve, roc_curve, auc
)

# ============================================================
# LOADERS
# ============================================================

def load_results(path: str):
    """Load a model-output file (JSON or CSV) into a DataFrame.

    Args:
        path: Location of the results file (``str`` or ``pathlib.Path``).
            The file extension selects the parser: ``.json`` is read with
            ``json.load`` and passed to ``pd.DataFrame``; ``.csv`` is read
            with ``pd.read_csv``. The extension check is case-insensitive.

    Returns:
        DataFrame with the file's records. If the file has no ``id`` column,
        one is added containing the positional row number (0..n-1).

    Raises:
        ValueError: If the extension is neither ``.json`` nor ``.csv``.
    """
    path = Path(path)
    suffix = path.suffix.lower()  # accept ".JSON" / ".Csv" as well
    if suffix == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        df = pd.DataFrame(data)
    elif suffix == ".csv":
        df = pd.read_csv(path)
    else:
        raise ValueError("Results file must be .json or .csv")

    # Downstream code expects an id column; fall back to the row position.
    if "id" not in df.columns:
        df["id"] = np.arange(len(df))
    return df


def load_groundtruth(csv_path: str):
    """Load the ground-truth CSV and normalize it for IPV detection.

    The columns ``Physical Abuse`` / ``Emotional Abuse`` / ``Sexual Abuse``
    are renamed to ``Physical`` / ``Emotional`` / ``Sexual`` and converted to
    booleans. A cell counts as True only when its text, after trimming
    whitespace and upper-casing, equals ``TRUE``; every other value
    (including missing cells) becomes False.

    Args:
        csv_path: Path to the ground-truth CSV file.

    Returns:
        DataFrame with boolean ``Physical``, ``Emotional`` and ``Sexual``
        columns, a boolean ``IPV`` column (True when any of the three is
        True), a string ``label`` column (``"IPV"`` / ``"NOT_IPV"``), and an
        ``id`` column (taken from the row index when the file has none).

    Raises:
        KeyError: If any of the three abuse columns is absent after renaming.
    """
    df = pd.read_csv(csv_path)

    if "id" not in df.columns:
        df["id"] = df.index

    # Shorten the abuse-type column names to the ones used by the evaluators.
    rename_map = {
        "Physical Abuse": "Physical",
        "Emotional Abuse": "Emotional",
        "Sexual Abuse": "Sexual"
    }
    df = df.rename(columns=rename_map)

    abuse_cols = ["Physical", "Emotional", "Sexual"]
    missing = [c for c in abuse_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Ground truth file is missing abuse column(s): {missing}")

    # Convert TRUE/FALSE text (or already-parsed booleans) to a real bool
    # column. Stripping whitespace keeps " true " from silently becoming
    # False, and .eq() yields bool dtype directly, with no NaN fill step.
    for c in abuse_cols:
        df[c] = df[c].astype(str).str.strip().str.upper().eq("TRUE")

    # Binary target: a row is IPV when any abuse type is flagged.
    df["IPV"] = df[abuse_cols].any(axis=1)
    df["label"] = df["IPV"].map({True: "IPV", False: "NOT_IPV"})
    return df


# ============================================================
# EVALUATION FUNCTIONS
# ============================================================

def evaluate_binary(df_pred: pd.DataFrame, df_truth: pd.DataFrame):
    """Score binary IPV / NOT_IPV predictions against the ground truth.

    Predictions and truth are compared row by row in their existing order;
    no join on ``id`` is performed here.

    Args:
        df_pred: Predictions with an ``extracted_label`` column and,
            optionally, a numeric ``y_score`` column.
        df_truth: Ground truth with a ``label`` column holding ``"IPV"`` or
            ``"NOT_IPV"``.

    Returns:
        A 3-tuple ``(metrics, y_true, y_pred)``. ``metrics`` is a dict with
        ``Accuracy``, ``F1`` and ``ROC_AUC``; ``ROC_AUC`` is NaN when there
        is no ``y_score`` column or the AUC cannot be computed. ``y_true``
        and ``y_pred`` hold the 0/1 encoded labels.
    """
    label_to_int = {"IPV": 1, "NOT_IPV": 0}
    y_true = df_truth["label"].map(label_to_int).values

    # Normalize the raw model output: upper-case, trim, and treat blank or
    # placeholder answers as the negative class.
    cleaned = df_pred["extracted_label"].astype(str).str.upper().str.strip()
    cleaned = cleaned.replace({"": "NOT_IPV", "NONE": "NOT_IPV", "N/A": "NOT_IPV"})
    # Any label still unrecognized after cleaning also counts as NOT_IPV (0).
    y_pred = cleaned.map(label_to_int).fillna(0).astype(int)

    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred, zero_division=0),
        "ROC_AUC": np.nan,
    }

    # A score-based AUC is only possible when the model emitted scores.
    if "y_score" in df_pred.columns:
        y_score = df_pred["y_score"].fillna(0.5).values
        try:
            metrics["ROC_AUC"] = roc_auc_score(y_true, y_score)
        except Exception:
            # Best effort: leave the AUC undefined rather than abort the run.
            metrics["ROC_AUC"] = np.nan

    return metrics, y_true, y_pred


def evaluate_multilabel(df_pred: pd.DataFrame, df_truth: pd.DataFrame):
    """Score multilabel (Physical, Emotional, Sexual) predictions.

    Predictions and truth are compared row by row in their existing order.
    ``df_pred`` is not modified.

    Args:
        df_pred: Predictions. If it has an ``extracted_labels`` column, each
            cell is read as either a comma-separated string or a list of
            label names; names are trimmed and capitalized before matching.
            Cells of any other type yield no labels. Without that column,
            ``df_pred`` must already carry ``Physical`` / ``Emotional`` /
            ``Sexual`` indicator columns.
        df_truth: Ground truth with ``Physical`` / ``Emotional`` / ``Sexual``
            columns castable to int.

    Returns:
        Dict with ``F1_macro``, ``F1_micro`` and ``ExactMatchAcc`` (the
        fraction of rows where all three labels match).
    """
    cols = ["Physical", "Emotional", "Sexual"]

    def parse_labels(x):
        """Return the set of normalized label names contained in one cell."""
        if isinstance(x, str):
            return {lbl.strip().capitalize() for lbl in x.split(",")}
        if isinstance(x, list):
            return {lbl.strip().capitalize() for lbl in x}
        return set()

    if "extracted_labels" in df_pred.columns:
        # Parse each row once and build the indicator matrix locally, so the
        # caller's DataFrame (possibly a slice of another frame) is never
        # written to.
        label_sets = [parse_labels(x) for x in df_pred["extracted_labels"]]
        y_pred = np.array(
            [[int(c in labels) for c in cols] for labels in label_sets],
            dtype=int,
        ).reshape(-1, len(cols))  # keep shape (n, 3) even when n == 0
    else:
        y_pred = df_pred[cols].astype(int).values

    y_true = df_truth[cols].astype(int).values

    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
    exact_acc = (y_true == y_pred).all(axis=1).mean()

    return {"F1_macro": f1_macro, "F1_micro": f1_micro, "ExactMatchAcc": exact_acc}


# ============================================================
# VISUALIZATIONS
# ============================================================

def plot_roc_curve(y_true, y_scores, model_name="Model", out_dir="./eval_figs", filename="ROC_curve.png"):
    """
    Draw and save the ROC curve for a binary classifier.

    Args:
        y_true: True binary labels (0 or 1)
        y_scores: Prediction scores or probabilities; hard 0/1 predictions
            are accepted too
        model_name: Model name shown at the start of the plot title
        out_dir: Directory the image is written to (created if missing)
        filename: File name of the saved image inside out_dir

    Returns:
        Area under the plotted ROC curve
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Curve points and the area underneath them
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}', linewidth=2)
    # Diagonal reference line = performance of random guessing
    ax.plot([0, 1], [0, 1], '--', color='gray', alpha=0.5, label='Random Classifier')
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title(f'{model_name} ROC Curve', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right', fontsize=11)
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(target_dir / filename, dpi=150, bbox_inches='tight')
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)

    return roc_auc


def plot_pr_curve(y_true, y_scores, out_dir="./eval_figs", model_name=None, filename="PR_curve.png"):
    """
    Plot the Precision-Recall curve for binary evaluation and save it.

    Args:
        y_true: True binary labels (0 or 1)
        y_scores: Prediction scores or probabilities; hard 0/1 predictions
            are accepted too
        out_dir: Directory the image is written to (created if missing)
        model_name: Optional model name placed at the start of the plot
            title. With the default (None) the title is just
            "Precision-Recall Curve".
        filename: File name of the saved image inside out_dir; pass a
            distinct name per model to avoid overwriting earlier plots.

    Returns:
        Area under the precision-recall curve, computed with auc(rec, prec)
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    prec, rec, _ = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(rec, prec)

    if model_name is None:
        title = "Precision-Recall Curve"
    else:
        title = f"{model_name} Precision-Recall Curve"

    plt.figure()
    plt.plot(rec, prec, label=f"PR AUC={pr_auc:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_dir / filename, dpi=150)
    plt.close()

    return pr_auc


def plot_binary(y_true, y_pred, out_dir="./eval_figs"):
    """Plot ROC + PR curves for binary evaluation (legacy function for compatibility).

    The hard 0/1 predictions are used as the scores, so each curve is built
    from a single decision threshold rather than a ranking of examples.

    Args:
        y_true: True binary labels (0 or 1)
        y_pred: Predicted binary labels (0 or 1), used as scores
        out_dir: Directory that receives ROC_curve.png and PR_curve.png

    Returns:
        Dict with ROC_AUC and PR_AUC, the areas reported by the two plotting
        helpers.
    """
    # plot_roc_curve already computes and returns the AUC of the curve it
    # draws, so it is not recomputed here.
    roc_auc = plot_roc_curve(
        y_true, y_pred, model_name="Model", out_dir=out_dir, filename="ROC_curve.png"
    )
    pr_auc = plot_pr_curve(y_true, y_pred, out_dir)

    return {"ROC_AUC": roc_auc, "PR_AUC": pr_auc}


# ============================================================
# DRIVER
# ============================================================

def run_evaluation(
    results_path: str,
    groundtruth_path: str,
    task: str = "binary",
    out_dir: str = "./eval_figs"
):
    """Main entry point: evaluate one model output file.

    Loads predictions and ground truth, aligns them by row position
    (truncating both to the shorter length, with a warning, when they
    differ), computes the metrics for the requested task, and prints them.

    Args:
        results_path: Path to the prediction file (.json or .csv).
        groundtruth_path: Path to the ground-truth CSV.
        task: "binary" or "multilabel".
        out_dir: Directory for the ROC / PR plots (binary task only).

    Returns:
        Dict of metric name -> value. For the binary task: Accuracy, F1,
        ROC_AUC, PR_AUC. For the multilabel task: F1_macro, F1_micro,
        ExactMatchAcc.

    Raises:
        ValueError: If task is neither "binary" nor "multilabel".
    """
    df_pred = load_results(results_path)
    df_truth = load_groundtruth(groundtruth_path)

    if len(df_pred) != len(df_truth):
        print(f"⚠️ Warning: Mismatch in length ({len(df_pred)} vs {len(df_truth)})")
        n = min(len(df_pred), len(df_truth))
        df_pred = df_pred.head(n)
        df_truth = df_truth.head(n)

    if task == "binary":
        metrics, y_true, y_pred = evaluate_binary(df_pred, df_truth)
        curves = plot_binary(y_true, y_pred, out_dir)
        # evaluate_binary reports a score-based ROC_AUC when the predictions
        # include y_score. The value from plot_binary is derived from hard
        # 0/1 labels, so it only fills in when no score-based AUC exists.
        score_auc = metrics.get("ROC_AUC", np.nan)
        metrics.update(curves)
        if not pd.isna(score_auc):
            metrics["ROC_AUC"] = score_auc
    elif task == "multilabel":
        metrics = evaluate_multilabel(df_pred, df_truth)
    else:
        raise ValueError("Task must be 'binary' or 'multilabel'")

    print(f"\n{task.upper()} Evaluation Results")
    for k, v in metrics.items():
        print(f"{k:>12}: {v:.3f}")

    return metrics


# 2. Actual Evaluation

## Running Evaluations

You can evaluate either:
- **Single file**: Specify one prediction file (useful for testing/debugging)
- **All files**: Loop through all prediction files in a directory (useful for comparison)

## 2.1 Single Evaluation

In [3]:
# Evaluate the zero-shot binary predictions against the ground-truth CSV.
# The call is the cell's last expression, so the notebook also displays the
# returned metrics dict below the printed report.
run_evaluation(
    "../1_LLM_Eval/test_results/binary_zeroshot.json",
    "../Dataset/617points.csv",
    task="binary",
)


BINARY Evaluation Results
    Accuracy: 0.728
          F1: 0.656
     ROC_AUC: 0.742
      PR_AUC: 0.874


{'Accuracy': 0.7281553398058253,
 'F1': 0.6557377049180327,
 'ROC_AUC': 0.7419741154718884,
 'PR_AUC': 0.8735301406197031}

## 2.2 Bulk Evaluation

In [4]:
from pathlib import Path

results_dir = Path("../1_LLM_Eval/test_results")
gt_path = "../Dataset/617points.csv"

records = []

# sorted(): glob order depends on the filesystem, so sort to keep the
# summary table's row order reproducible between runs and machines.
for fp in sorted(results_dir.glob("*.json")):
    # The task type is inferred from the file name.
    task = "multilabel" if "multilabel" in fp.name.lower() else "binary"
    # Use default ./eval_figs directory (same as single evaluation).
    # NOTE: every binary run writes ROC_curve.png / PR_curve.png there, so
    # only the plots of the last binary file remain after the loop.
    metrics = run_evaluation(
        results_path=fp,
        groundtruth_path=gt_path,
        task=task
    )
    metrics["file"] = fp.name
    metrics["task"] = task
    records.append(metrics)

# Summary table
import pandas as pd
df_summary = pd.DataFrame(records)
print(df_summary)

# Identify best binary prompt.
# With no result files the summary has no "task" column, so skip the filter
# instead of raising KeyError.
if records:
    df_binary = df_summary[df_summary["task"] == "binary"]
else:
    df_binary = df_summary

if not df_binary.empty:
    # Sort by F1 score (or ROC_AUC as tiebreaker)
    df_binary_sorted = df_binary.sort_values(["F1", "ROC_AUC"], ascending=False)
    best_binary_file = df_binary_sorted.iloc[0]["file"]
    best_binary_f1 = df_binary_sorted.iloc[0]["F1"]

    print(f"\n🏆 Best Binary Prompt: {best_binary_file}")
    print(f"   F1 Score: {best_binary_f1:.3f}")
    print(f"   Available at: {results_dir / best_binary_file}")
else:
    print(f"\nNo binary prediction files found in {results_dir}")



BINARY Evaluation Results
    Accuracy: 0.853
          F1: 0.843
     ROC_AUC: 0.859
      PR_AUC: 0.923

BINARY Evaluation Results
    Accuracy: 0.856
          F1: 0.846
     ROC_AUC: 0.862
      PR_AUC: 0.925

BINARY Evaluation Results
    Accuracy: 0.806
          F1: 0.782
     ROC_AUC: 0.814
      PR_AUC: 0.899

BINARY Evaluation Results
    Accuracy: 0.728
          F1: 0.656
     ROC_AUC: 0.742
      PR_AUC: 0.874

BINARY Evaluation Results
    Accuracy: 0.893
          F1: 0.890
     ROC_AUC: 0.898
      PR_AUC: 0.944
   Accuracy        F1   ROC_AUC    PR_AUC                         file    task
0  0.852751  0.842832  0.858822  0.922790             binary_meta.json  binary
1  0.855987  0.846287  0.862068  0.925491  binary_selfconsistency.json  binary
2  0.805825  0.781818  0.814344  0.899471              binary_cot.json  binary
3  0.728155  0.655738  0.741974  0.873530         binary_zeroshot.json  binary
4  0.893204  0.890000  0.897523  0.944470          binary_fewshot.json  binary

## 2.3 Plot ROC Curve for Best Binary Prompt


In [5]:
# Reload the predictions of the best binary prompt (chosen in the bulk
# evaluation cell) together with the ground truth.
best_binary_path = results_dir / best_binary_file
df_pred_best, df_truth_best = load_results(best_binary_path), load_groundtruth(gt_path)

# Only the label arrays are needed here; the metrics dict is discarded.
_, y_true_best, y_pred_best = evaluate_binary(df_pred_best, df_truth_best)

# The hard 0/1 predictions (cast to float) serve as the scores, so the curve
# has a single operating point between (0, 0) and (1, 1).
auc_score = plot_roc_curve(
    y_true_best,
    y_pred_best.astype(float),
    "Qwen2.5-7B-Instruct",   # model_name, shown in the plot title
    "./eval_figs",           # out_dir
    "best_binary_roc.png",   # filename
)

print(f"\n✅ ROC curve saved with AUC = {auc_score:.3f}")



✅ ROC curve saved with AUC = 0.898
