<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-DecisionAgentRetrieveResults.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score
from collections import Counter

CKPT_PATH = "/content/drive/MyDrive/PHD/2025/holdout_eval_checkpoint.json"

# --------------------------------------------------
# 1. Load checkpoint
# --------------------------------------------------
# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# runtime's locale default.
with open(CKPT_PATH, "r", encoding="utf-8") as f:
    ckpt = json.load(f)

print("✅ Checkpoint loaded")

# --------------------------------------------------
# 2. Extract arrays
# --------------------------------------------------
# Each checkpoint entry becomes a NumPy array bound to a module-level name
# identical to its key: the two label arrays first, then the transformer
# flags, then the decision-agent flags.
(
    y_warn_true,
    y_fail_true,
    transformer_warn_flags,
    transformer_fail_flags,
    decision_warn_flags,
    decision_fail_flags,
) = (
    np.array(ckpt[key])
    for key in (
        "y_warn_true",
        "y_fail_true",
        "transformer_warn_flags",
        "transformer_fail_flags",
        "decision_warn_flags",
        "decision_fail_flags",
    )
)

# Sample count is taken from the warning labels.
n = len(y_warn_true)

print(f"Samples in checkpoint: {n}")

# --------------------------------------------------
# 3. Sanity checks (lengths must match)
# --------------------------------------------------
# Raise explicitly rather than `assert`: asserts are stripped under
# `python -O`, and a bare AssertionError gives no hint about which array is
# off. Each tuple is ordered (labels, transformer flags, decision flags).
warn_lengths = (
    len(y_warn_true),
    len(transformer_warn_flags),
    len(decision_warn_flags),
)
if len(set(warn_lengths)) != 1:
    raise ValueError(
        "Checkpoint WARNING arrays differ in length "
        f"(labels, transformer, decision): {warn_lengths}"
    )

fail_lengths = (
    len(y_fail_true),
    len(transformer_fail_flags),
    len(decision_fail_flags),
)
if len(set(fail_lengths)) != 1:
    raise ValueError(
        "Checkpoint FAILURE arrays differ in length "
        f"(labels, transformer, decision): {fail_lengths}"
    )

print("✅ Length consistency check passed")

# --------------------------------------------------
# 4. Recompute metrics (should match printed results)
# --------------------------------------------------
def metrics(y_true, y_pred):
    """Score predictions against labels with recall, precision and F1.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to each scorer.
        y_pred: Predicted labels/flags, forwarded unchanged to each scorer.

    Returns:
        A dict with keys ``"recall"``, ``"precision"`` and ``"f1"`` (in that
        order), each holding the value of the matching scikit-learn scorer
        called with its default settings.
    """
    scorers = (
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Report recall/precision/F1 for each (model, target) pair so the numbers can
# be compared by eye with the originally printed results.
print("\n=== METRIC VERIFICATION ===")

print("\n🟦 TRANSFORMER — WARNING")
print(metrics(y_warn_true, transformer_warn_flags))

print("\n🟩 DECISION AGENT — WARNING")
print(metrics(y_warn_true, decision_warn_flags))

print("\n🟥 TRANSFORMER — FAILURE")
print(metrics(y_fail_true, transformer_fail_flags))

print("\n🟧 DECISION AGENT — FAILURE")
print(metrics(y_fail_true, decision_fail_flags))

# --------------------------------------------------
# 5. Bucket reconstruction (WARNING logic)
# --------------------------------------------------
bucket_A, bucket_B, bucket_C, bucket_D = [], [], [], []

# Lookup keyed by (transformer flagged, decision agent flagged):
#   A = both, B = transformer only, C = decision agent only, D = neither.
bucket_for = {
    (True, True): bucket_A,
    (True, False): bucket_B,
    (False, True): bucket_C,
    (False, False): bucket_D,
}

# A flag counts as raised only when it equals 1.
for i, (t_flag, d_flag) in enumerate(
    zip(transformer_warn_flags, decision_warn_flags)
):
    bucket_for[bool(t_flag == 1), bool(d_flag == 1)].append(i)

def summarize(indices, labels=None):
    """Count the class labels found at the given sample positions.

    Args:
        indices: Sized collection of integer positions into ``labels``.
        labels: Indexable collection of per-sample class labels, where
            0 = normal, 1 = warning and 2 = failure. When omitted, the
            module-level ``y_hold`` is used, which must already exist in
            the session.

    Returns:
        A dict with the total ``"count"``, the per-class counts
        ``"normal (0)"``, ``"warning (1)"`` and ``"failure (2)"``, and the
        fractions ``"warn_rate"`` and ``"fail_rate"`` (both ``0.0`` when
        ``indices`` is empty).

    Raises:
        NameError: If ``labels`` is not given and ``y_hold`` is undefined.
    """
    if labels is None:
        try:
            labels = y_hold
        except NameError:
            # Same exception type as an unguarded lookup, but actionable.
            raise NameError(
                "summarize() needs the hold-out labels: pass `labels=...` "
                "or define `y_hold` before calling it"
            ) from None

    counts = Counter(labels[i] for i in indices)
    total = len(indices)
    warnings_seen = counts.get(1, 0)
    failures_seen = counts.get(2, 0)
    return {
        "count": total,
        "normal (0)": counts.get(0, 0),
        "warning (1)": warnings_seen,
        "failure (2)": failures_seen,
        # Guard the division so an empty bucket reports 0.0 instead of raising.
        "warn_rate": warnings_seen / total if total else 0.0,
        "fail_rate": failures_seen / total if total else 0.0,
    }

# Label breakdown per bucket. The key suffix encodes the two WARNING flags:
# T = transformer, D = decision agent, 1 = flagged, 0 = not flagged.
bucket_stats = {
    "A_T1_D1": summarize(bucket_A),
    "B_T1_D0": summarize(bucket_B),
    "C_T0_D1": summarize(bucket_C),
    "D_T0_D0": summarize(bucket_D),
}

print("\n=== BUCKET BREAKDOWN ===")
print(json.dumps(bucket_stats, indent=2))

print("\n✅ Checkpoint integrity VERIFIED")



=== Prototype Scoring (Validation, th=0.3) ===
              precision    recall  f1-score   support

           0     1.0000    0.7396    0.8503       553
           1     0.0710    1.0000    0.1325        11

    accuracy                         0.7447       564
   macro avg     0.5355    0.8698    0.4914       564
weighted avg     0.9819    0.7447    0.8363       564

[[409 144]
 [  0  11]]


In [None]:
# Mount Google Drive at /content/drive. CKPT_PATH in the verification cell
# points under this mount point, so in a fresh runtime this cell has to be
# run before that one even though it appears later in the notebook.
from google.colab import drive
drive.mount('/content/drive')