In [1]:
import matplotlib.pyplot as plt
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    confusion_matrix, roc_curve, precision_recall_curve
)
from eval_llm_pipeline import (
    compute_binary_metrics_detailed,
    compute_multitype_metrics_per_subgroup,
    plot_confusion_matrix,
    plot_roc_curve_binary,
    plot_precision_recall_curve_binary,
    plot_per_class_f1_bar_chart,
    append_binary_results_to_json,
    append_multitype_results_to_json,
)

# Aggregated result logs, one JSON file per task type, both under results/.
BINARY_JSON_PATH = Path("results") / "binary_results.json"
MULTITYPE_JSON_PATH = Path("results") / "multitype_results.json"

In [2]:
# =====================================
# Paths
# =====================================
# Ground-truth CSV read by the evaluation below.
GT_PATH = "../Dataset/reddit_data.csv"

# Prediction files are read from, and plots are written to, the same folder.
RESULTS_DIR = "w4/chatgpt"
PLOT_DIR = RESULTS_DIR

# Metrics log that every evaluated prompt appends an entry to.
BINARY_RESULTS_JSON = "results/binary_results.json"

# Model label stored with each appended result entry.
MODEL_NAME = "ChatGPT 5.1"

# Make sure the plot folder exists before any figure is saved.
os.makedirs(PLOT_DIR, exist_ok=True)

# =====================================
# Helper Functions
# =====================================
def compute_binary_metrics_detailed(y_true, y_pred, y_score=None):
    """Compute headline metrics for a binary classifier.

    NOTE: this definition replaces the function of the same name imported
    from ``eval_llm_pipeline`` in the first cell.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (this notebook passes 0/1 integers).
    y_pred : array-like
        Predicted hard labels, same encoding as ``y_true``.
    y_score : array-like, optional
        Continuous scores or probabilities for the positive class. When
        provided, ROC_AUC and PR_AUC are computed from these scores. When
        omitted, the hard labels in ``y_pred`` are used instead, so both
        "AUC" values summarise a single operating point rather than a full
        ranking and should be read as coarse indicators only.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1"``, ``"ROC_AUC"`` and ``"PR_AUC"`` mapped
        to plain Python floats.
    """
    # Fall back to the hard predictions when no ranking scores are available.
    ranking_input = y_pred if y_score is None else y_score

    # float(...) keeps the dict free of NumPy scalar types so it prints and
    # serialises the same way regardless of the installed NumPy version.
    return {
        "Accuracy": float(accuracy_score(y_true, y_pred)),
        "F1": float(f1_score(y_true, y_pred)),
        "ROC_AUC": float(roc_auc_score(y_true, ranking_input)),
        "PR_AUC": float(average_precision_score(y_true, ranking_input)),
    }

def append_binary_results_to_json(json_path, model_name, prompt_version, metrics, notes=""):
    """Append one evaluation record to a JSON log holding a list of records.

    NOTE: this definition replaces the function of the same name imported
    from ``eval_llm_pipeline`` in the first cell.

    Parameters
    ----------
    json_path : str or Path
        Location of the JSON log. The file and any missing parent
        directories are created on first use.
    model_name : str
        Stored under ``"model_name"``.
    prompt_version : str
        Stored under ``"prompt_version"``.
    metrics : dict
        JSON-serialisable metrics, stored under ``"metrics"``.
    notes : str, optional
        Free-text remark stored under ``"notes"``.

    Raises
    ------
    ValueError
        If the existing file holds JSON that is not a list.
    """
    target = Path(json_path)

    entry = {
        "model_name": model_name,
        "prompt_version": prompt_version,
        "metrics": metrics,
        "notes": notes,
        "date_tested": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    # A missing or empty file both mean "no records yet".
    data = []
    if target.exists():
        raw = target.read_text(encoding="utf-8")
        if raw.strip():
            data = json.loads(raw)

    if not isinstance(data, list):
        raise ValueError(
            f"{target} must contain a JSON list of result entries, "
            f"found {type(data).__name__}"
        )

    data.append(entry)

    # The log directory is not created anywhere else, so create it here;
    # otherwise the very first write fails with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def plot_confusion_matrix(y_true, y_pred, ax, title):
    """Draw a 2x2 NOT_IPV / IPV confusion matrix on ``ax``.

    NOTE: this definition replaces the function of the same name imported
    from ``eval_llm_pipeline`` in the first cell.

    Parameters
    ----------
    y_true, y_pred : array-like
        Binary labels where 0 is NOT_IPV and 1 is IPV.
    ax : matplotlib Axes
        Axes to draw on.
    title : str
        Axes title.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # fixed tick labels below, even when one class is absent from the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    ax.imshow(cm, cmap="Blues")

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(["NOT_IPV", "IPV"])
    ax.set_yticklabels(["NOT_IPV", "IPV"])

    # Rows of cm are actual classes (y axis), columns are predictions (x axis).
    for (i, j), val in np.ndenumerate(cm):
        ax.text(j, i, val, ha="center", va="center")


def plot_roc_curve_binary(y_true, y_pred, ax, label):
    """Plot the ROC curve for ``y_pred`` against ``y_true`` on ``ax``.

    A dashed gray diagonal is added as the chance-level reference, and the
    curve is listed in the legend under ``label``.

    NOTE: this definition replaces the function of the same name imported
    from ``eval_llm_pipeline`` in the first cell.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred)

    ax.plot(false_pos_rate, true_pos_rate, lw=2, label=label)
    # Chance-level diagonal.
    ax.plot((0, 1), (0, 1), linestyle="--", color="gray")

    ax.set(
        title="ROC Curve",
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
    )
    ax.legend()


def plot_precision_recall_curve_binary(y_true, y_pred, ax, label):
    """Plot precision (y axis) against recall (x axis) on ``ax``.

    The curve is listed in the legend under ``label``.

    NOTE: this definition replaces the function of the same name imported
    from ``eval_llm_pipeline`` in the first cell.
    """
    prec_values, rec_values, _thresholds = precision_recall_curve(y_true, y_pred)

    ax.plot(rec_values, prec_values, lw=2, label=label)
    ax.set(
        title="Precision-Recall Curve",
        xlabel="Recall",
        ylabel="Precision",
    )
    ax.legend()


# =====================================
# Load Ground Truth
# =====================================
# Build the ground-truth frame in one chain:
#   - label_true is 1 when any of the three abuse columns is truthy, else 0
#   - the row position becomes the "id" column used to join predictions
df_gt = (
    pd.read_csv(GT_PATH)
    .assign(
        label_true=lambda frame: frame[
            ["Physical Abuse", "Emotional Abuse", "Sexual Abuse"]
        ].any(axis=1).astype(int)
    )
    .reset_index()
    .rename(columns={"index": "id"})
)

# =====================================
# Process ALL binary JSON files
# =====================================
# Every per-prompt prediction file is named binary_<prompt_type>.json.
binary_files = sorted(
    f for f in os.listdir(RESULTS_DIR)
    if f.startswith("binary_") and f.endswith(".json")
)
print("Found files:", binary_files)

# NOTE(review): the recorded run of this cell printed identical metrics for
# the fewshot and zeroshot files — confirm the two prediction files differ.

for file in binary_files:
    json_path = os.path.join(RESULTS_DIR, file)

    # Strip only the leading "binary_" and trailing ".json"; str.replace would
    # also remove those substrings from the middle of the prompt name.
    prompt_type = file[len("binary_"):-len(".json")]

    print(f"\n=== Evaluating {file} ({prompt_type}) ===")

    # Load predictions
    with open(json_path, "r") as f:
        preds = json.load(f)
    df_pred = pd.DataFrame(preds)

    # Without an explicit id, predictions are matched to ground truth by position.
    if "id" not in df_pred.columns:
        df_pred = df_pred.reset_index().rename(columns={"index": "id"})

    merged = df_gt.merge(df_pred, on="id", how="inner")
    if merged.empty:
        # Fail here with a clear message instead of inside the metric functions.
        raise ValueError(f"No rows of {file} matched the ground truth on 'id'")

    merged["y_true"] = merged["label_true"]
    # Normalise before comparing: surrounding whitespace and casing must not
    # turn an IPV prediction into NOT_IPV. Missing or non-string labels count as 0.
    merged["y_pred"] = (
        merged["extracted_label"]
        .astype(str)
        .str.strip()
        .str.upper()
        .eq("IPV")
        .astype(int)
    )

    # Compute metrics
    metrics = compute_binary_metrics_detailed(merged["y_true"], merged["y_pred"])
    print(metrics)

    # Save results (append). Re-running this cell adds another entry per file.
    append_binary_results_to_json(
        json_path=BINARY_RESULTS_JSON,
        model_name=MODEL_NAME,
        prompt_version=prompt_type,
        metrics=metrics,
        notes=f"Evaluation of {prompt_type} prompt"
    )

    # ===== Plotting =====
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    plot_confusion_matrix(merged["y_true"], merged["y_pred"], ax=axes[0],
                          title=f"{prompt_type} — Confusion Matrix")

    plot_roc_curve_binary(merged["y_true"], merged["y_pred"], ax=axes[1],
                          label=prompt_type)

    plot_precision_recall_curve_binary(merged["y_true"], merged["y_pred"], ax=axes[2],
                                       label=prompt_type)

    # Work on the figure object directly rather than pyplot's "current figure".
    fig.suptitle(f"Binary Evaluation — {prompt_type}", fontsize=14)
    fig.tight_layout()

    # Save plot
    save_path = os.path.join(PLOT_DIR, f"{prompt_type}_eval.png")
    fig.savefig(save_path, dpi=300)
    plt.close(fig)  # release the figure so repeated iterations do not pile up

    print(f"Saved plot → {save_path}")


print("\nAll results saved to:", BINARY_RESULTS_JSON)

Found files: ['binary_fewshot.json', 'binary_zeroshot.json']

=== Evaluating binary_fewshot.json (fewshot) ===
{'Accuracy': 0.6456310679611651, 'F1': 0.5249457700650759, 'ROC_AUC': 0.6616102193461636, 'PR_AUC': 0.6643896025395817}
Saved plot → w4/chatgpt/fewshot_eval.png

=== Evaluating binary_zeroshot.json (zeroshot) ===
{'Accuracy': 0.6456310679611651, 'F1': 0.5249457700650759, 'ROC_AUC': 0.6616102193461636, 'PR_AUC': 0.6643896025395817}
Saved plot → w4/chatgpt/zeroshot_eval.png

All results saved to: results/binary_results.json
