In [1]:
import pandas as pd
import re
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load
df = pd.read_csv("dob_review_board_sample.csv")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Column names (exact)
TEXT_COL = "input"
GT_COL = "ground_truth"

BASELINE_RAW_COL = "R1_meta-llama/Llama-3.2-3B-Instruct"
BOARD_RAW_COLS = [
    "R2_meta-llama/Llama-3.2-3B-Instruct",
    "R2_Qwen/Qwen3-4B-Instruct-2507",
    "R2_google/gemma-3-4b-it",
]

def extract_yesno(text: str):
    """Extract YES/NO from messy model output."""
    if not isinstance(text, str):
        return None
    t = text.lower()
    # Look for JSON-style field if present
    m = re.search(r"decarbonization_project\"?\s*:\s*\"?(yes|no)", t)
    if m:
        return m.group(1).upper()
    # Otherwise, try to find explicit YES/NO tokens
    tokens = re.findall(r"\b(yes|no)\b", t)
    if tokens:
        return tokens[-1].upper()
    return None

def majority_vote(vals):
    vals = [v for v in vals if v in ("YES", "NO")]
    if not vals:
        return None
    return Counter(vals).most_common(1)[0][0]

# Extract GT and preds
df["gt"] = df[GT_COL].apply(extract_yesno)
df["baseline_pred"] = df[BASELINE_RAW_COL].apply(extract_yesno)

for c in BOARD_RAW_COLS:
    df[c + "_yn"] = df[c].apply(extract_yesno)

df["board_pred"] = df[[c + "_yn" for c in BOARD_RAW_COLS]].apply(
    lambda r: majority_vote(r.tolist()), axis=1
)

# Hallucination checks
EQUIP_TERMS = [
    "heat pump",
    "ev charger",
    "electric vehicle",
    "charging station",
    "solar",
    "photovoltaic",
    "vrf",
    "mini-split",
    "electrification",
    "air source heat pump",
    "as hp",  # sometimes abbreviated
]

def hallucination_flag(model_text: str, job_desc: str):
    """Flag if output claims equipment not present in input; evidence_quote mismatch when detectable."""
    if not isinstance(model_text, str) or not isinstance(job_desc, str):
        return 0
    mt = model_text.lower()
    jd = job_desc.lower()

    # Equipment claimed but not present in input text
    for term in EQUIP_TERMS:
        if term in mt and term not in jd:
            return 1

    # Evidence quote substring check if JSON-like evidence_quote exists
    m = re.search(r"evidence_quote\"?\s*:\s*\"([^\"]+)\"", model_text, flags=re.IGNORECASE)
    if m:
        quote = m.group(1).strip()
        if quote and quote.lower() not in jd:
            return 1

    return 0

df["halluc_baseline"] = df.apply(lambda r: hallucination_flag(r[BASELINE_RAW_COL], r[TEXT_COL]), axis=1)

agent_hall_cols = []
for c in BOARD_RAW_COLS:
    hcol = c + "_hall"
    df[hcol] = df.apply(lambda r: hallucination_flag(r[c], r[TEXT_COL]), axis=1)
    agent_hall_cols.append(hcol)

# Conservative: board hallucination if ANY agent hallucinated
df["hall_board_any"] = df[agent_hall_cols].max(axis=1)

# Metrics helper
def compute_metrics(eval_df, pred_col):
    y_true = eval_df["gt"]
    y_pred = eval_df[pred_col]
    acc = accuracy_score(y_true, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", pos_label="YES")
    return acc, p, r, f1

# Evaluate only where GT and prediction exist
eval_base = df.dropna(subset=["gt", "baseline_pred"])
eval_board = df.dropna(subset=["gt", "board_pred"])

base_acc, base_p, base_r, base_f1 = compute_metrics(eval_base, "baseline_pred")
board_acc, board_p, board_r, board_f1 = compute_metrics(eval_board, "board_pred")

base_hall = df["halluc_baseline"].mean()
board_hall = df["hall_board_any"].mean()

results = pd.DataFrame([
    {
        "system": "baseline_single_model (R1 Llama)",
        "accuracy": base_acc,
        "precision_yes": base_p,
        "recall_yes": base_r,
        "f1_yes": base_f1,
        "hallucination_rate": base_hall,
        "n_eval": len(eval_base),
    },
    {
        "system": "review_board_majority_vote (R2 Llama+Qwen+Gemma)",
        "accuracy": board_acc,
        "precision_yes": board_p,
        "recall_yes": board_r,
        "f1_yes": board_f1,
        "hallucination_rate": board_hall,
        "n_eval": len(eval_board),
    },
])

print(results)

# Export promised artifacts
results.to_csv("dob_tiny_results_table.csv", index=False)

export_cols = [
    TEXT_COL,
    "gt",
    "baseline_pred",
    "board_pred",
    "halluc_baseline",
    "hall_board_any",
    GT_COL,
    BASELINE_RAW_COL,
] + BOARD_RAW_COLS

out = df[export_cols].copy()
out.to_csv("dob_review_board_results.csv", index=False)

print("Saved dob_tiny_results_table.csv and dob_review_board_results.csv")
out.head(2)


Shape: (5000, 9)
Columns: ['input', 'ground_truth', 'history_raw', 'R1_meta-llama/Llama-3.2-3B-Instruct', 'R1_Qwen/Qwen3-4B-Instruct-2507', 'R1_google/gemma-3-4b-it', 'R2_meta-llama/Llama-3.2-3B-Instruct', 'R2_Qwen/Qwen3-4B-Instruct-2507', 'R2_google/gemma-3-4b-it']


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                             system  accuracy  precision_yes  \
0                  baseline_single_model (R1 Llama)       NaN            0.0   
1  review_board_majority_vote (R2 Llama+Qwen+Gemma)       NaN            0.0   

   recall_yes  f1_yes  hallucination_rate  n_eval  
0         0.0     0.0              0.0112       0  
1         0.0     0.0              0.0110       0  
Saved dob_tiny_results_table.csv and dob_review_board_results.csv


Unnamed: 0,input,gt,baseline_pred,board_pred,halluc_baseline,hall_board_any,ground_truth,R1_meta-llama/Llama-3.2-3B-Instruct,R2_meta-llama/Llama-3.2-3B-Instruct,R2_Qwen/Qwen3-4B-Instruct-2507,R2_google/gemma-3-4b-it
0,The Mechanical work fo ra Business occupancy b...,,,,0,0,Mechanical Systems,"Based on the provided job description, I would...",Based on the job description provided and the ...,,
1,INTERIOR RENOVATION INCLUDING RENOVATED KITCHE...,,,,0,0,Plumbing,"Based on the job description, I would classify...","Based on the job description ""INTERIOR RENOVAT...",,
