In [2]:
import openai, sys
# Report the library and interpreter versions this notebook was executed with.
for label, value in (
    ("openai version:", openai.__version__),
    ("python version:", sys.version),
):
    print(label, value)

openai version: 1.108.0
python version: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]


In [3]:
import os
from getpass import getpass

# Never hardcode the key in the notebook: the source (and its outputs) get shared
# and committed. Prefer a key already exported in the environment; otherwise
# prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ").strip()

# Cheap sanity check on the key's prefix before any request is made.
assert os.environ.get("OPENAI_API_KEY","").startswith("sk-"), "API key missing or malformed."
print("API key set ✔")


API key set ✔


In [4]:
from openai import OpenAI
# Shared client used by the evaluation helpers further down.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Minimal round-trip to confirm the key and model are usable.
hello_messages = [{"role": "user", "content": "Say hello in exactly 7 words."}]
resp = client.responses.create(model="gpt-4o-mini", input=hello_messages, max_output_tokens=30)
print(resp.output_text)

Hello! How are you doing today?


In [5]:
def _find_matching_brace(s: str, start: int) -> int:
    """Return the index of the '}' closing the '{' at `start`, or -1 if it never closes."""
    depth = 0
    in_str = False
    esc = False
    for i in range(start, len(s)):
        c = s[i]
        if in_str:
            # Inside a double-quoted string braces are literal text; only track escapes.
            if esc:
                esc = False
            elif c == "\\":
                esc = True
            elif c == '"':
                in_str = False
        elif c == '"':
            in_str = True
        elif c == "{":
            depth += 1
        elif c == "}":
            depth -= 1
            if depth == 0:
                return i
    return -1


def extract_first_json_object(s: str) -> str:
    """
    Returns the first top-level JSON object found in s.

    Brace-balanced spans are scanned left to right and the first one that
    actually parses as JSON is returned, so prose such as "use {placeholders}"
    ahead of the real payload no longer hides it. If no balanced span parses,
    the first balanced span is returned unchanged so the caller's JSON parser
    reports the precise syntax error.

    Args:
        s: Raw model output, possibly with text around the JSON object.

    Returns:
        The substring of s holding the selected object, braces included.

    Raises:
        ValueError: if s contains no '{', or the first '{' is never closed.
    """
    import json  # local import: this cell runs before the notebook's own `import json`

    start = s.find("{")
    if start == -1:
        raise ValueError("No '{' found in model output.")

    first_balanced = None
    pos = start
    while pos != -1:
        end = _find_matching_brace(s, pos)
        if end == -1:
            break
        candidate = s[pos:end + 1]
        if first_balanced is None:
            first_balanced = candidate
        try:
            json.loads(candidate)
        except ValueError:
            # Not JSON (e.g. a templating placeholder); try the next top-level span.
            pos = s.find("{", end + 1)
        else:
            return candidate

    if first_balanced is None:
        raise ValueError("Unbalanced JSON braces in model output.")
    return first_balanced


In [6]:
# Prompt skeleton shown to every simulated evaluator; the three {placeholders}
# are filled in with str.format below.
EVAL_USER_TEMPLATE = """\
Task: {task_type}

Input features (optional): {input_brief}

Model's EXPLANATION for its prediction (XAI):
\"\"\"{xai_explanation}\"\"\"

Objective (Simulatability):
1) Guess the model's LABEL from the explanation (use only the valid label set).
2) Give confidence 0–1.
3) Short rationale.

Subjective (1–5): clarity, helpfulness, trust, fairness, confidence_after.
Also: one short free-text comment.

Binary flags: suspected_proxy_bias, overfitting_smell, leakage_risk.

Return JSON only with the fields we ask for.
"""

# Example inputs — replace with your real task & explanation
task_type = "sentiment classification"
valid_labels = ["POSITIVE","NEGATIVE"]
input_brief = "Short hotel review about cleanliness and service."
xai_explanation = " ".join([
    "The model focused on 'spotless room', 'friendly staff', and 'would definitely return',",
    "pushing toward POSITIVE. 'small lobby' had low weight.",
])

# The valid label set is folded into the task line so the evaluator sees it.
label_list = ", ".join(valid_labels)
template_fields = {
    "task_type": f"{task_type} (valid labels: {label_list})",
    "input_brief": input_brief,
    "xai_explanation": xai_explanation,
}
user_prompt = EVAL_USER_TEMPLATE.format(**template_fields)

# Preview only the first 400 characters of the rendered prompt.
preview = user_prompt[:400] + " ..."
print(preview)

Task: sentiment classification (valid labels: POSITIVE, NEGATIVE)

Input features (optional): Short hotel review about cleanliness and service.

Model's EXPLANATION for its prediction (XAI): 
"""The model focused on 'spotless room', 'friendly staff', and 'would definitely return', pushing toward POSITIVE. 'small lobby' had low weight."""

Objective (Simulatability):
1) Guess the model's LABEL from ...


In [7]:
import json
from copy import deepcopy
from jsonschema import validate, ValidationError

# === Our schema (same as before, plus explicit numeric ranges) ===
# OUTPUT_SCHEMA wraps the JSON Schema with "name"/"strict" metadata; only the inner
# "schema" (aliased as JSON_SCHEMA_INNER below) is used for local validation.
# The field names promise ranges (0..1 confidence, 1..5 ratings); the bounds are
# enforced here so an out-of-range score fails validation instead of silently
# skewing the persona averages.
OUTPUT_SCHEMA = {
    "name": "xai_eval_result",
    "schema": {
        "type": "object",
        "properties": {
            "persona_id": {"type": "string"},
            "objective": {
                "type": "object",
                "properties": {
                    "simulated_label": {"type": "string"},
                    "confidence_0to1": {"type": "number", "minimum": 0, "maximum": 1},
                    "rationale": {"type": "string"}
                },
                "required": ["simulated_label", "confidence_0to1", "rationale"],
                "additionalProperties": False
            },
            "subjective": {
                "type": "object",
                "properties": {
                    "clarity_1to5": {"type": "integer", "minimum": 1, "maximum": 5},
                    "helpfulness_1to5": {"type": "integer", "minimum": 1, "maximum": 5},
                    "trust_1to5": {"type": "integer", "minimum": 1, "maximum": 5},
                    "fairness_1to5": {"type": "integer", "minimum": 1, "maximum": 5},
                    "confidence_after_1to5": {"type": "integer", "minimum": 1, "maximum": 5},
                    "free_text_feedback": {"type": "string"}
                },
                "required": [
                    "clarity_1to5","helpfulness_1to5","trust_1to5",
                    "fairness_1to5","confidence_after_1to5","free_text_feedback"
                ],
                "additionalProperties": False
            },
            "flags": {
                "type": "object",
                "properties": {
                    "suspected_proxy_bias": {"type": "boolean"},
                    "overfitting_smell": {"type": "boolean"},
                    "leakage_risk": {"type": "boolean"}
                },
                "required": ["suspected_proxy_bias","overfitting_smell","leakage_risk"],
                "additionalProperties": False
            }
        },
        "required": ["persona_id","objective","subjective","flags"],
        "additionalProperties": False
    },
    "strict": True
}

# The bare JSON Schema, without the name/strict wrapper.
JSON_SCHEMA_INNER = OUTPUT_SCHEMA["schema"]

# === A concrete JSON *template* the model can mimic exactly ===
# eval_once serializes this dict with json.dumps and appends it to the user prompt,
# so its keys and nesting have to mirror the schema the reply is validated against.
# NOTE(review): the concrete values below ("POSITIVE", 0.85, ratings of 3-4) are shown
# to the model verbatim and may anchor its answers — consider neutral placeholders
# and confirm before reading much into the resulting scores.
JSON_TEMPLATE_EXAMPLE = {
    "persona_id": "SAMPLE_PERSONA_ID",
    "objective": {
        "simulated_label": "POSITIVE",
        "confidence_0to1": 0.85,
        "rationale": "Brief reason here."
    },
    "subjective": {
        "clarity_1to5": 4,
        "helpfulness_1to5": 4,
        "trust_1to5": 3,
        "fairness_1to5": 4,
        "confidence_after_1to5": 4,
        "free_text_feedback": "One short sentence of feedback."
    },
    "flags": {
        "suspected_proxy_bias": False,
        "overfitting_smell": False,
        "leakage_risk": False
    }
}

def coerce_common_near_misses(d: dict) -> dict:
    """
    Maps common 'near-miss' outputs into our schema shape.

    A fresh dict with the "objective", "subjective" and "flags" sections is
    always built (the input is never returned as-is), reading each field from
    either the top level or its nested section. Handles cases like:
      LABEL / label -> objective.simulated_label
      confidence -> objective.confidence_0to1 (numeric strings become floats)
      rationale -> objective.rationale
      comment -> subjective.free_text_feedback
      subjective keys without _1to5 suffix (values coerced to int)
      flags at top-level, or given as strings such as "false"

    When the same field appears in several places, canonical key names win
    over aliases and nested values win over top-level ones. Unknown keys are
    dropped; "persona_id" is carried over when present.

    Args:
        d: Parsed model output. A non-dict value is treated as empty.

    Returns:
        A new dict in schema shape. Fields that could not be recovered stay
        None (flags default to False, feedback to ""), so a later schema
        validation still reports what is missing.
    """
    def as_int(x):
        # Best-effort integer coercion; None marks "could not convert".
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None

    def as_number(x):
        # Leave real numbers untouched; parse numeric strings such as "0.8".
        if isinstance(x, (int, float)):
            return x
        try:
            return float(x)
        except (TypeError, ValueError):
            return x  # keep the raw value so validation names the offender

    def as_bool(x):
        # bool("false") is True, so spell out the usual textual negatives.
        if isinstance(x, str):
            return x.strip().lower() not in {"", "false", "no", "n", "0", "none", "null"}
        return bool(x)

    if not isinstance(d, dict):
        d = {}

    def nested(name):
        # A section only counts when it is a dict; anything else is ignored.
        section = d.get(name)
        return section if isinstance(section, dict) else {}

    obj_in = nested("objective")
    subj_in = nested("subjective")
    flags_in = nested("flags")

    fixed = {}
    if "persona_id" in d:
        fixed["persona_id"] = d["persona_id"]
    fixed["objective"] = {"simulated_label": None, "confidence_0to1": None, "rationale": None}
    fixed["subjective"] = {
        "clarity_1to5": None,
        "helpfulness_1to5": None,
        "trust_1to5": None,
        "fairness_1to5": None,
        "confidence_after_1to5": None,
        "free_text_feedback": ""
    }
    fixed["flags"] = {
        "suspected_proxy_bias": False,
        "overfitting_smell": False,
        "leakage_risk": False
    }

    # Objective: aliases listed from lowest to highest priority; later hits overwrite.
    for dst, aliases, convert in [
        ("simulated_label", ("LABEL", "label", "simulated_label"), None),
        ("confidence_0to1", ("confidence", "confidence_0to1"), as_number),
        ("rationale", ("rationale",), None),
    ]:
        for key in aliases:
            for src in (d, obj_in):
                if key in src:
                    value = src[key]
                    fixed["objective"][dst] = convert(value) if convert else value

    # Subjective ratings: accept keys with or without the _1to5 suffix.
    for k_src, k_dst in [
        ("clarity", "clarity_1to5"),
        ("helpfulness", "helpfulness_1to5"),
        ("trust", "trust_1to5"),
        ("fairness", "fairness_1to5"),
        ("confidence_after", "confidence_after_1to5"),
    ]:
        for key in (k_src, k_dst):
            for src in (d, subj_in):
                if key in src:
                    rating = as_int(src[key])
                    # An unparsable duplicate must not erase a value already recovered.
                    if rating is not None:
                        fixed["subjective"][k_dst] = rating

    # Free-text feedback sometimes called 'comment'
    for key in ("comment", "free_text_feedback"):
        for src in (d, subj_in):
            if key in src:
                fixed["subjective"]["free_text_feedback"] = str(src[key])

    # Flags show up top-level sometimes; nested values take precedence.
    for src in (d, flags_in):
        for k in ("suspected_proxy_bias", "overfitting_smell", "leakage_risk"):
            if k in src:
                fixed["flags"][k] = as_bool(src[k])

    return fixed

def eval_once(system_prompt, user_prompt, persona_id, temperature=0.6, model="gpt-4o-mini"):
    """
    Have a single persona judge one explanation and return the parsed JSON verdict.

    The user prompt is extended with formatting rules plus a serialized copy of
    JSON_TEMPLATE_EXAMPLE so the model can mirror the exact keys. The first JSON
    object in the reply is parsed, stamped with `persona_id`, and validated
    against JSON_SCHEMA_INNER. If that validation fails, the parsed dict is run
    through coerce_common_near_misses and validated once more; a second failure
    propagates to the caller.
    """
    format_rules = (
        "\n\nIMPORTANT:\n"
        "- Return ONLY a valid JSON object.\n"
        "- Use EXACTLY these keys and nesting.\n"
        "- Example (fill with your own values):\n"
    )
    composed_user = user_prompt + format_rules + json.dumps(JSON_TEMPLATE_EXAMPLE, indent=2)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": composed_user},
    ]
    response = client.responses.create(
        model=model,
        input=messages,
        temperature=temperature,
        max_output_tokens=700,
    )

    reply_text = response.output_text.strip()
    parsed = json.loads(extract_first_json_object(reply_text))

    # The persona id always comes from us, never from the model's reply.
    parsed["persona_id"] = persona_id

    try:
        validate(instance=parsed, schema=JSON_SCHEMA_INNER)
    except ValidationError:
        # Second chance: reshape common near-misses, then validate again.
        repaired = coerce_common_near_misses(parsed)
        repaired["persona_id"] = persona_id
        validate(instance=repaired, schema=JSON_SCHEMA_INNER)  # raises if still invalid
        return repaired
    return parsed

# ---- Try once ----
# Single evaluation with one persona before running the whole crowd.
skeptical = "You are a risk-averse compliance auditor. You look for leakage, proxy bias, and overclaiming."
res1 = eval_once(system_prompt=skeptical, user_prompt=user_prompt, persona_id="skeptical_auditor")
res1


{'persona_id': 'skeptical_auditor',
 'objective': {'simulated_label': 'POSITIVE',
  'confidence_0to1': 0.9,
  'rationale': 'The presence of strong positive phrases indicates an overall favorable impression.'},
 'subjective': {'clarity_1to5': 5,
  'helpfulness_1to5': 5,
  'trust_1to5': 4,
  'fairness_1to5': 4,
  'confidence_after_1to5': 5,
  'free_text_feedback': 'The explanation clearly supports the positive label.'},
 'flags': {'suspected_proxy_bias': False,
  'overfitting_smell': False,
  'leakage_risk': False}}

In [8]:
# Simulated evaluators as (persona_id, system_prompt) pairs.
# simulate_crowd unpacks each pair: the id is stored on every result and the
# prompt is sent as the system message for that persona's runs.
PERSONAS = [
    ("skeptical_auditor",
     "You are a risk-averse compliance auditor. You look for leakage, proxy bias, and overclaiming."),
    ("busy_clinician",
     "You are a time-pressured clinician; value concise, actionable, trustworthy explanations that fit workflow."),
    ("fairness_advocate",
     "You are a fairness-focused advocate; scrutinize demographic harms and proxies."),
    ("enthusiastic_engineer",
     "You are a pragmatic ML engineer; emphasize faithfulness and implementation feasibility.")
]

def simulate_crowd(personas, user_prompt, runs_per_persona=3, temperature=0.7, model="gpt-4o-mini"):
    """
    Evaluate one prompt with every persona, `runs_per_persona` times each.

    `personas` is an iterable of (persona_id, system_prompt) pairs. Each result
    returned by eval_once gets a "meta" entry recording the repetition index,
    temperature and model. Results are returned as a list, grouped by persona
    in input order.
    """
    results = []
    for persona_id, persona_prompt in personas:
        for rep in range(runs_per_persona):
            evaluation = eval_once(
                system_prompt=persona_prompt,
                user_prompt=user_prompt,
                persona_id=persona_id,
                temperature=temperature,
                model=model,
            )
            evaluation["meta"] = dict(rep=rep, temperature=temperature, model=model)
            results.append(evaluation)
    return results

# Run the crowd simulation
crowd = simulate_crowd(personas=PERSONAS, user_prompt=user_prompt, runs_per_persona=3)

# Print all results nicely: a header per result, the JSON body, then a blank line.
print(f"Total results: {len(crowd)}\n")
for position, result in enumerate(crowd, start=1):
    header = f"=== Result {position} ({result['persona_id']}, rep {result['meta']['rep']}) ==="
    print(header)
    print(json.dumps(result, indent=2), end="\n\n")



Total results: 12

=== Result 1 (skeptical_auditor, rep 0) ===
{
  "persona_id": "skeptical_auditor",
  "objective": {
    "simulated_label": "POSITIVE",
    "confidence_0to1": 0.9,
    "rationale": "The presence of positive phrases strongly outweighs any negative aspects mentioned."
  },
  "subjective": {
    "clarity_1to5": 5,
    "helpfulness_1to5": 5,
    "trust_1to5": 4,
    "fairness_1to5": 5,
    "confidence_after_1to5": 4,
    "free_text_feedback": "The explanation clearly supports the positive sentiment."
  },
  "flags": {
    "suspected_proxy_bias": false,
    "overfitting_smell": false,
    "leakage_risk": false
  },
  "meta": {
    "rep": 0,
    "temperature": 0.7,
    "model": "gpt-4o-mini"
  }
}

=== Result 2 (skeptical_auditor, rep 1) ===
{
  "persona_id": "skeptical_auditor",
  "objective": {
    "simulated_label": "POSITIVE",
    "confidence_0to1": 0.9,
    "rationale": "The review highlights key positive aspects like cleanliness and friendly service."
  },
  "subjecti

In [10]:
import pandas as pd

# Reference label that each simulated label is compared against further down
# to compute simulatability accuracy.
TRUE_MODEL_LABEL = "POSITIVE"  # replace with your model's actual prediction

def flatten(rows):
    """
    Flatten nested evaluation results into one DataFrame row per result.

    Args:
        rows: Iterable of result dicts with "persona_id", "objective",
            "subjective" and "flags" sections. A "meta" section is optional;
            when it is absent (e.g. a result taken straight from eval_once),
            "model_used" is left empty.

    Returns:
        A DataFrame with a fixed column set. The columns are present even when
        `rows` is empty, so downstream column lookups keep working.
    """
    columns = [
        "persona_id", "simulated_label", "conf",
        "clarity", "helpfulness", "trust", "fairness", "conf_after",
        "comment", "proxy_bias", "overfit", "leakage", "model_used",
    ]
    flat = []
    for r in rows:
        objective, subjective, flags = r["objective"], r["subjective"], r["flags"]
        meta = r.get("meta") or {}  # "meta" is only added by simulate_crowd
        flat.append({
            "persona_id": r["persona_id"],
            "simulated_label": objective["simulated_label"],
            "conf": objective["confidence_0to1"],
            "clarity": subjective["clarity_1to5"],
            "helpfulness": subjective["helpfulness_1to5"],
            "trust": subjective["trust_1to5"],
            "fairness": subjective["fairness_1to5"],
            "conf_after": subjective["confidence_after_1to5"],
            "comment": subjective["free_text_feedback"],
            "proxy_bias": flags["suspected_proxy_bias"],
            "overfit": flags["overfitting_smell"],
            "leakage": flags["leakage_risk"],
            "model_used": meta.get("model"),
        })
    # Explicit columns keep the frame's shape stable for empty input.
    return pd.DataFrame(flat, columns=columns)

df = flatten(crowd)

# Normalize BOTH sides of the comparison (case and surrounding whitespace);
# upper-casing only the simulated label would silently score 0% if
# TRUE_MODEL_LABEL were written in lower case or with stray spaces.
expected_label = TRUE_MODEL_LABEL.strip().upper()
df["simulatability_correct"] = df["simulated_label"].str.strip().str.upper().eq(expected_label)

print("Overall simulatability accuracy:", float(df["simulatability_correct"].mean()))
print("\nPersona averages:")
display(df.groupby("persona_id")[["clarity","helpfulness","trust","fairness","conf_after"]].mean().round(2))

df

Overall simulatability accuracy: 1.0

Persona averages:


Unnamed: 0_level_0,clarity,helpfulness,trust,fairness,conf_after
persona_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
busy_clinician,5.0,5.0,4.0,4.0,4.67
enthusiastic_engineer,5.0,5.0,4.0,4.33,5.0
fairness_advocate,5.0,5.0,4.0,4.0,5.0
skeptical_auditor,4.33,4.33,4.0,4.33,4.0


Unnamed: 0,persona_id,simulated_label,conf,clarity,helpfulness,trust,fairness,conf_after,comment,proxy_bias,overfit,leakage,model_used,simulatability_correct
0,skeptical_auditor,POSITIVE,0.9,5,5,4,5,4,The explanation clearly supports the positive ...,False,False,False,gpt-4o-mini,True
1,skeptical_auditor,POSITIVE,0.9,4,4,4,4,4,The explanation aligns well with the positive ...,False,False,False,gpt-4o-mini,True
2,skeptical_auditor,POSITIVE,0.9,4,4,4,4,4,The explanation effectively highlights positiv...,False,False,False,gpt-4o-mini,True
3,busy_clinician,POSITIVE,0.9,5,5,4,4,5,The explanation is clear and directly tied to ...,False,False,False,gpt-4o-mini,True
4,busy_clinician,POSITIVE,0.9,5,5,4,4,4,The explanation clearly supports the prediction.,False,False,False,gpt-4o-mini,True
5,busy_clinician,POSITIVE,0.9,5,5,4,4,5,The explanation effectively highlights key pos...,False,False,False,gpt-4o-mini,True
6,fairness_advocate,POSITIVE,0.9,5,5,4,4,5,The explanation clearly highlights key positiv...,False,False,False,gpt-4o-mini,True
7,fairness_advocate,POSITIVE,0.9,5,5,4,4,5,The explanation clearly aligns with the sentim...,False,False,False,gpt-4o-mini,True
8,fairness_advocate,POSITIVE,0.9,5,5,4,4,5,The explanation clearly supports the positive ...,False,False,False,gpt-4o-mini,True
9,enthusiastic_engineer,POSITIVE,0.9,5,5,4,4,5,The explanation is clear and well-supported.,False,False,False,gpt-4o-mini,True
