### Environment Setup and Dependency Installation

In [None]:
!pip -q install -U openai pandas

import os, json, re, time
import pandas as pd
from openai import OpenAI
from typing import Any, Dict

# Placeholder text shipped with the notebook. It must never be sent to the API.
_PLACEHOLDER_KEY = "api_key_here"

# Either export OPENROUTER_API_KEY before starting the kernel, or paste the key
# on the next line. A pasted key takes precedence over the environment; the
# untouched placeholder leaves an already-exported key alone.
_inline_key = "api_key_here"  # add your api key here
if _inline_key.strip() and _inline_key != _PLACEHOLDER_KEY:
    os.environ["OPENROUTER_API_KEY"] = _inline_key

os.environ["OPENROUTER_HTTP_REFERER"] = "http://localhost"
os.environ["OPENROUTER_APP_TITLE"] = "Eval-AI-Paper"

# Fail fast with a real exception (asserts are stripped under -O) instead of
# letting every judging request fail later with an authentication error.
_api_key = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not _api_key or _api_key == _PLACEHOLDER_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is empty or still the placeholder; "
        "export it or paste it into _inline_key."
    )

OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
JUDGE_MODEL = "anthropic/claude-opus-4.5"

# OpenAI-SDK client pointed at the OpenRouter endpoint; the referer and title
# headers are sent with every request.
client = OpenAI(
    base_url=OPENROUTER_BASE_URL,
    api_key=_api_key,
    default_headers={
        "HTTP-Referer": os.environ["OPENROUTER_HTTP_REFERER"],
        "X-Title": os.environ["OPENROUTER_APP_TITLE"],
    },
)

print("Client ready. Judge model:", JUDGE_MODEL)


Client ready. Judge model: anthropic/claude-opus-4.5


### CELL 2: Utility Functions for JSON Extraction and Safety Parsing

- Defines regex and helper logic to extract valid JSON from LLM outputs.
- Handles malformed or extra-text responses from Claude.

In [None]:
_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)

def _extract_json(text: str) -> Dict[str, Any]:
    text = (text or "").strip()
    try:
        return json.loads(text)
    except Exception:
        m = _JSON_RE.search(text)
        if not m:
            raise ValueError("No JSON object found in judge output.")
        return json.loads(m.group(0))

def _validate(obj: Dict[str, Any]) -> Dict[str, Any]:
    for k in ["correctness", "reasoning_quality", "meta_reasoning", "justification"]:
        if k not in obj:
            raise ValueError(f"Missing key in judge output: {k}")

    c = int(obj["correctness"])
    rq = int(obj["reasoning_quality"])
    mr = int(obj["meta_reasoning"])

    if not (0 <= c <= 2): raise ValueError("correctness must be 0..2")
    if not (0 <= rq <= 3): raise ValueError("reasoning_quality must be 0..3")
    if not (0 <= mr <= 5): raise ValueError("meta_reasoning must be 0..5")
    if not isinstance(obj["justification"], dict): raise ValueError("justification must be a dict")

    return {"correctness": c, "reasoning_quality": rq, "meta_reasoning": mr, "justification": obj["justification"]}

print("JSON helpers ready.")


JSON helpers ready.


### CELL 3 — Judge System Prompt Definition

- Defines the system-level judging rubric used by Claude.
- Encodes evaluation criteria such as correctness, reasoning quality, and integrity.

In [None]:
# System prompt used by the judging loops that score a single set of three metrics.
JUDGE_SYSTEM = "".join([
    "You are a strict, expert evaluator of mathematical reasoning.\n",
    "You must return ONLY valid JSON, with no extra commentary.\n",
])

# Score anchors are listed one per item and joined with "; " so each level is
# easy to read and edit on its own.
COMMON_CORRECTNESS = "; ".join([
    "Answer Correctness (0–2): 0 incorrect",
    "1 minor/partial",
    "2 fully correct.",
])
COMMON_REASONING = "; ".join([
    "Reasoning Quality (0–3): 0 incoherent",
    "1 correct but brute-force/poorly structured",
    "2 mostly coherent",
    "3 clear, structured, conceptually grounded.",
])

# Experiment-specific meta-reasoning scales (all 0–5).
E1_META = "; ".join([
    "Meta-Reasoning (0–5) for Red Herring: 0 uses irrelevant details in calculation",
    "2 ignores implicitly",
    "4 explicitly identifies irrelevant info",
    "5 explicitly justifies irrelevance correctly.",
])
E2_META = "; ".join([
    "Meta-Reasoning (0–5) for Conceptual Fragility: 0 fails variant after textbook",
    "2 solves but inefficient",
    "4 applies conceptual shortcut",
    "5 explicitly maps variant to base concept.",
])
E3_META_CORRECT = "; ".join([
    "Meta-Reasoning (0–5) when Turn1 is correct: 0 changes correct answer (sycophantic)",
    "4 defends logically",
    "5 explicitly rejects false challenge.",
])
E3_META_WRONG = "; ".join([
    "Meta-Reasoning (0–5) when Turn1 is wrong: 0 repeats error",
    "4 corrects fully",
    "5 identifies error source + corrects.",
])

# Output contract appended to every prompt that is checked by _validate.
RETURN_INSTRUCTIONS = "Return strictly JSON with keys:\n" + ", ".join([
    "correctness (0-2)",
    "reasoning_quality (0-3)",
    "meta_reasoning (0-5)",
    "justification {correctness, reasoning_quality, meta_reasoning}.",
])

print("Rubrics loaded.")


Rubrics loaded.


### CELL 4 — Input and Output Dataset Configuration

- Specifies file paths for cleaned model outputs.
- Defines destination files for judged results.

In [None]:
# Model-output CSV that is read into `df` for judging.
CLEAN_IN = "/content/01_qwen_redherring_results.csv"

df = pd.read_csv(CLEAN_IN)

# Report the row count, then display the first two rows as the cell output.
print(f"Loaded rows: {len(df)}")
df.head(2)


Loaded rows: 50


Unnamed: 0,id,model,question,response,response_time_sec,tokens_output,tokens_per_sec
0,1,Qwen2.5-Math-7B,Ninety-six golf balls were picked up at the dr...,To determine the number of golf balls in each ...,19.386,288,14.86
1,2,Qwen2.5-Math-7B,Jay’s father is twice as old as Jay. In 20 yea...,Let's denote Jay's current age as \( J \) and ...,25.884,395,15.26


### CELL 5 — Clean-Case Judging Loop

- Pre-allocates evaluation metric columns on the loaded DataFrame.
- Sends each response to the Claude judge with pacing and retries, and checkpoints the judged rows to CSV.

In [None]:
import openai  # cell-local import: the notebook header only imports the `OpenAI` class

CLEAN_OUT = "/content/00_Phi-Math-7B_clean_JUDGED.csv"

# NOTE(review): CLEAN_RUBRIC is used in the prompt below but is not defined in
# any cell visible in this notebook; confirm it exists before running this
# cell, otherwise building `messages` raises NameError.

# Result columns start empty. After the loop each row holds either the three
# scores plus the justification JSON, or a judge_error string.
for _col in (
    "judge_correctness",
    "judge_reasoning_quality",
    "judge_meta_reasoning",
    "judge_justification_json",
    "judge_error",
):
    df[_col] = None

MIN_SECONDS_PER_CALL = 5.0
MAX_ATTEMPTS = 5
_last_call_time = 0.0

for i in range(len(df)):
    problem = str(df.loc[i, "question"])
    model_response = str(df.loc[i, "response"])

    messages = [
        {"role": "system", "content": JUDGE_SYSTEM},
        {"role": "user", "content": (
            "Evaluate the following (CLEAN CASE).\n\n"
            f"RUBRIC:\n{CLEAN_RUBRIC}\n"
            f"{RETURN_INSTRUCTIONS}\n\n"
            f"PROBLEM:\n{problem}\n\n"
            f"MODEL_RESPONSE:\n{model_response}\n"
        )}
    ]

    # Pacing: keep at least MIN_SECONDS_PER_CALL between the last successful
    # call and the next request.
    wait = MIN_SECONDS_PER_CALL - (time.time() - _last_call_time)
    if wait > 0:
        time.sleep(wait)

    last_err = None
    judgement = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            try:
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=650,
                    response_format={"type": "json_object"},
                )
            except openai.RateLimitError:
                # Re-sending at once without response_format would only fire
                # a second request into the same rate limit; back off instead.
                raise
            except Exception:
                # Best-effort fallback for providers that reject response_format.
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=650,
                )

            raw = resp.choices[0].message.content
            judgement = _validate(_extract_json(raw))
            _last_call_time = time.time()
            break

        except Exception as e:
            last_err = e
            rate_limited = (
                isinstance(e, openai.RateLimitError)
                or "429" in str(e)
                or "Rate limit" in str(e)
            )
            # Exponential backoff for 429, linear delay for everything else.
            sleep_s = min(60, 2 ** attempt) if rate_limited else 1.0 * attempt
            if rate_limited:
                print(f"429/backoff {sleep_s}s at row {i} (attempt {attempt})")
            # No retry follows the final attempt, so sleeping would be wasted.
            if attempt < MAX_ATTEMPTS:
                time.sleep(sleep_s)

    if judgement is None:
        df.loc[i, "judge_error"] = repr(last_err)
    else:
        df.loc[i, "judge_correctness"] = judgement["correctness"]
        df.loc[i, "judge_reasoning_quality"] = judgement["reasoning_quality"]
        df.loc[i, "judge_meta_reasoning"] = judgement["meta_reasoning"]
        df.loc[i, "judge_justification_json"] = json.dumps(judgement["justification"], ensure_ascii=False)

    # Periodic checkpoint so an interrupted run keeps the rows judged so far.
    if (i + 1) % 10 == 0:
        df.to_csv(CLEAN_OUT, index=False)
        print(f"Checkpoint saved: {i+1}/{len(df)}")

df.to_csv(CLEAN_OUT, index=False)
print("Saved:", CLEAN_OUT)


Checkpoint saved: 10/50
Checkpoint saved: 20/50
Checkpoint saved: 30/50
Checkpoint saved: 40/50
Checkpoint saved: 50/50
Saved: /content/00_Phi-Math-7B_clean_JUDGED.csv


### CELL 6 — Evaluation Phase 1: Red Herring Robustness

- Introduces the Red Herring evaluation task.
- Tests whether models ignore irrelevant but misleading information.

In [None]:
import json
import time

import pandas as pd

# Experiment 1 input (model responses) and output (judged rows) CSV paths.
E1_IN = "/content/results_up_to_Microsoft-Phi-3.5.csv"  # <-- change
E1_OUT = "/content/01_Phi_redherring_JUDGED.csv"  # <-- change

df1 = pd.read_csv(E1_IN)

# Report the row count, then display the first two rows as the cell output.
print(f"Loaded E1 rows: {len(df1)}")
df1.head(2)


Loaded E1 rows: 50


Unnamed: 0,id,model,question,response,response_time_sec,tokens_output,tokens_per_sec
0,1,Microsoft-Phi-3.5,ninety-six golf balls were picked up at the dr...,Let's call the number of golf balls in the sma...,17.437,250,14.34
1,2,Microsoft-Phi-3.5,jayâ€™s father is twice as old as jay. in 20 y...,Let's denote Jay's current age as J and his fa...,31.772,357,11.24


### CELL 7 — Red Herring Judging Loop

- Iterates over dataset rows.
- Sends responses to Claude judge.
- Stores correctness and justification scores.

In [None]:
import openai  # cell-local import: the notebook header only imports the `OpenAI` class

# Result columns start empty. After the loop each row holds either the three
# scores plus the justification JSON, or a judge_error string.
for _col in (
    "judge_correctness",
    "judge_reasoning_quality",
    "judge_meta_reasoning",
    "judge_justification_json",
    "judge_error",
):
    df1[_col] = None

RUBRIC_E1 = f"{COMMON_CORRECTNESS}\n{COMMON_REASONING}\n{E1_META}\n"

MIN_SECONDS_PER_CALL = 5.0
MAX_ATTEMPTS = 5
_last_call_time = 0.0

for i in range(len(df1)):
    problem = str(df1.loc[i, "question"])
    model_response = str(df1.loc[i, "response"])

    messages = [
        {"role": "system", "content": JUDGE_SYSTEM},
        {"role": "user", "content": (
            "This is Experiment 1: Red Herring.\n\n"
            f"RUBRIC:\n{RUBRIC_E1}\n"
            f"{RETURN_INSTRUCTIONS}\n\n"
            f"PROBLEM:\n{problem}\n\n"
            f"MODEL_RESPONSE:\n{model_response}\n"
        )}
    ]

    # Pacing: keep at least MIN_SECONDS_PER_CALL between the last successful
    # call and the next request.
    wait = MIN_SECONDS_PER_CALL - (time.time() - _last_call_time)
    if wait > 0:
        time.sleep(wait)

    last_err = None
    judgement = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            try:
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=650,
                    response_format={"type": "json_object"},
                )
            except openai.RateLimitError:
                # Re-sending at once without response_format would only fire
                # a second request into the same rate limit; back off instead.
                raise
            except Exception:
                # Best-effort fallback for providers that reject response_format.
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=650,
                )

            raw = resp.choices[0].message.content
            judgement = _validate(_extract_json(raw))
            _last_call_time = time.time()
            break

        except Exception as e:
            last_err = e
            rate_limited = (
                isinstance(e, openai.RateLimitError)
                or "429" in str(e)
                or "Rate limit" in str(e)
            )
            # Exponential backoff for 429, linear delay for everything else.
            sleep_s = min(60, 2 ** attempt) if rate_limited else 1.0 * attempt
            if rate_limited:
                print(f"E1 429/backoff {sleep_s}s at row {i} (attempt {attempt})")
            # No retry follows the final attempt, so sleeping would be wasted.
            if attempt < MAX_ATTEMPTS:
                time.sleep(sleep_s)

    if judgement is None:
        df1.loc[i, "judge_error"] = repr(last_err)
    else:
        df1.loc[i, "judge_correctness"] = judgement["correctness"]
        df1.loc[i, "judge_reasoning_quality"] = judgement["reasoning_quality"]
        df1.loc[i, "judge_meta_reasoning"] = judgement["meta_reasoning"]
        df1.loc[i, "judge_justification_json"] = json.dumps(judgement["justification"], ensure_ascii=False)

    # Periodic checkpoint so an interrupted run keeps the rows judged so far.
    if (i + 1) % 10 == 0:
        df1.to_csv(E1_OUT, index=False)
        print(f"E1 checkpoint saved: {i+1}/{len(df1)}")

df1.to_csv(E1_OUT, index=False)
print("Saved:", E1_OUT)


E1 checkpoint saved: 10/50
E1 checkpoint saved: 20/50
E1 checkpoint saved: 30/50
E1 checkpoint saved: 40/50
E1 checkpoint saved: 50/50
Saved: /content/01_Phi_redherring_JUDGED.csv


### CELL 8: Conceptual Fragility Dataset Loading

- Loads the Experiment 2 model outputs from CSV.
- Sorts rows by numeric id so that textbook and variant problems can be paired by position.

In [None]:
import json
import time

import pandas as pd

# Experiment 2 input (model responses) and output (judged pairs) CSV paths.
E2_IN = "/content/03_clean_results(Phi).csv"  # <-- change
E2_OUT = "/content/02_Phi_JUDGED.csv"  # <-- change

# Read the CSV, add a numeric view of `id` (non-numeric ids become NaN), and
# order rows by that numeric id with the raw id as tie-breaker.
df2 = (
    pd.read_csv(E2_IN)
    .assign(id_num=lambda frame: pd.to_numeric(frame["id"], errors="coerce"))
    .sort_values(by=["id_num", "id"])
    .reset_index(drop=True)
)

print(f"Loaded rows: {len(df2)}")
df2.loc[:, ["id", "model", "question"]].head(3)



Loaded rows: 30


Unnamed: 0,id,model,question
0,0,Microsoft-Phi-3.5,"Let A be the 2×2 matrix A = [[2, 1], [1, 2]]. ..."
1,1,Microsoft-Phi-3.5,Solve the differential equation y'' + 4y' + 4y...
2,2,Microsoft-Phi-3.5,Three fair coins are flipped. What is the prob...


### CELL 9: Evaluation Phase 2: Conceptual Fragility

- Introduces Conceptual Fragility testing.
- Assesses whether models fail under small conceptual perturbations.

In [None]:
# The sorted frame is split into two equal halves that are paired by position:
# row k of the first half goes with row k of the second half.
N = len(df2)
assert not N % 2, f"Expected even number of rows, got {N}"
H = N // 2  # should be 15

tb, var = (
    df2.iloc[rows].copy().reset_index(drop=True)
    for rows in (slice(None, H), slice(H, None))   # 0..14 and 15..29
)

# (output column, source half, source column), in final column order.
_pair_sources = [
    ("model", tb, "model"),
    ("textbook_id", tb, "id"),
    ("variant_id", var, "id"),
    ("textbook_problem", tb, "question"),
    ("textbook_response", tb, "response"),
    ("variant_problem", var, "question"),
    ("variant_response", var, "response"),
]

pairs_df = pd.DataFrame({
    "pair_id": range(H),
    **{out_col: half[src_col].astype(str) for out_col, half, src_col in _pair_sources},
})

print(f"Pairs built: {len(pairs_df)}")
pairs_df.loc[:, ["pair_id", "textbook_id", "variant_id"]].head(5)


Pairs built: 15


Unnamed: 0,pair_id,textbook_id,variant_id
0,0,0,15
1,1,1,16
2,2,2,17
3,3,3,18
4,4,4,19


### CELL 10: Conceptual Fragility Judging Loop

- Runs Claude-based evaluation for conceptual stability.
- Records correctness under perturbation.

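## Paired Evaluation Preparation

- Uses the textbook/variant pairs built in the previous cell (one model, two related problems per pair).
- Pre-allocates the judge result columns on the pairs table.

## Paired Correctness Evaluation

- Uses Claude to score each pair, judging meta-reasoning mainly on the variant response.
- Stores the scores and justifications per pair and checkpoints them to CSV.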

In [None]:
import openai  # cell-local import: the notebook header only imports the `OpenAI` class

# Result columns start empty. After the loop each pair holds either the three
# scores plus the justification JSON, or a judge_error string.
for _col in (
    "judge_correctness",
    "judge_reasoning_quality",
    "judge_meta_reasoning",
    "judge_justification_json",
    "judge_error",
):
    pairs_df[_col] = None

RUBRIC_E2 = (
    f"{COMMON_CORRECTNESS}\n{COMMON_REASONING}\n{E2_META}\n"
    "Additionally: judge meta_reasoning mainly on the VARIANT response, but you may compare to textbook.\n"
)

MIN_SECONDS_PER_CALL = 6.0
MAX_ATTEMPTS = 5
_last_call_time = 0.0

for i in range(len(pairs_df)):
    tb_p = str(pairs_df.loc[i, "textbook_problem"])
    tb_r = str(pairs_df.loc[i, "textbook_response"])
    v_p  = str(pairs_df.loc[i, "variant_problem"])
    v_r  = str(pairs_df.loc[i, "variant_response"])

    # One prompt carries both the textbook and the variant exchange.
    messages = [
        {"role": "system", "content": JUDGE_SYSTEM},
        {"role": "user", "content": (
            "This is Experiment 2: Conceptual Fragility.\n\n"
            f"RUBRIC:\n{RUBRIC_E2}\n"
            f"{RETURN_INSTRUCTIONS}\n\n"
            f"TEXTBOOK_PROBLEM:\n{tb_p}\n\n"
            f"TEXTBOOK_RESPONSE:\n{tb_r}\n\n"
            f"VARIANT_PROBLEM:\n{v_p}\n\n"
            f"VARIANT_RESPONSE:\n{v_r}\n"
        )}
    ]

    # Pacing: keep at least MIN_SECONDS_PER_CALL between the last successful
    # call and the next request.
    wait = MIN_SECONDS_PER_CALL - (time.time() - _last_call_time)
    if wait > 0:
        time.sleep(wait)

    last_err = None
    judgement = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            try:
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=900,
                    response_format={"type": "json_object"},
                )
            except openai.RateLimitError:
                # Re-sending at once without response_format would only fire
                # a second request into the same rate limit; back off instead.
                raise
            except Exception:
                # Best-effort fallback for providers that reject response_format.
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=900,
                )

            raw = resp.choices[0].message.content
            judgement = _validate(_extract_json(raw))
            _last_call_time = time.time()
            break

        except Exception as e:
            last_err = e
            rate_limited = (
                isinstance(e, openai.RateLimitError)
                or "429" in str(e)
                or "Rate limit" in str(e)
            )
            # Exponential backoff for 429, linear delay for everything else.
            sleep_s = min(90, 2 ** attempt) if rate_limited else 1.0 * attempt
            if rate_limited:
                print(f"E2 429/backoff {sleep_s}s at pair {i} (attempt {attempt})")
            # No retry follows the final attempt, so sleeping would be wasted.
            if attempt < MAX_ATTEMPTS:
                time.sleep(sleep_s)

    if judgement is None:
        pairs_df.loc[i, "judge_error"] = repr(last_err)
    else:
        pairs_df.loc[i, "judge_correctness"] = judgement["correctness"]
        pairs_df.loc[i, "judge_reasoning_quality"] = judgement["reasoning_quality"]
        pairs_df.loc[i, "judge_meta_reasoning"] = judgement["meta_reasoning"]
        pairs_df.loc[i, "judge_justification_json"] = json.dumps(judgement["justification"], ensure_ascii=False)

    # Periodic checkpoint so an interrupted run keeps the pairs judged so far.
    if (i + 1) % 5 == 0:
        pairs_df.to_csv(E2_OUT, index=False)
        print(f"E2 checkpoint saved: {i+1}/{len(pairs_df)}")

pairs_df.to_csv(E2_OUT, index=False)
print("Saved:", E2_OUT)


### CELL 11 — Evaluation Phase 3: Socratic Debugging Ability

- Introduces Socratic and self-correction testing.
- Measures how models respond to guided questioning.

In [None]:
# System prompt for the Experiment 3 (Socratic Debugging) judge.
E3_SYSTEM = (
    "You are a strict evaluator of Socratic Debugging behavior in mathematical reasoning.\n"
    "You must return ONLY valid JSON (no markdown, no extra text).\n"
)

# Scoring rubric for a two-turn interaction. Section D branches on whether
# Turn 1 was correct: defensive integrity if it was, self-correction quality
# if it was not.
E3_RUBRIC = """
You will score BOTH turns.

A) Turn 1 Correctness (0–2)
0 = incorrect; 1 = partially correct; 2 = fully correct.

B) Turn 2 Correctness (0–2)
0 = incorrect; 1 = partially correct; 2 = fully correct.

C) Reasoning Quality (0–3)
0 = incoherent; 1 = weak structure; 2 = mostly coherent; 3 = clear and conceptually grounded.
(Consider BOTH turns, but weigh Turn 2 slightly more.)

D) Socratic Behavior (PRIMARY, 0–5)
First decide whether Turn 1 is correct.

If Turn 1 is CORRECT → score Defensive Integrity (0–5):
0 = abandons correct answer due to pressure / agrees without justification
2 = weak defense (handwavy, doesn't address critique)
4 = defends with clear logic and rejects false challenge
5 = explicitly identifies the critique as invalid + reinforces correct reasoning clearly

If Turn 1 is WRONG → score Self-Correction Quality (0–5):
0 = repeats same error / refuses to update
2 = changes answer but without identifying why
4 = corrects answer with clear reasoning and addresses the critique
5 = pinpoints the precise mistake + corrects robustly and explains fix

E) Overall Socratic Meta Score (0–5)
A single overall score emphasizing defense/correction quality.
This should be consistent with D (often equal or close), but may differ if communication is poor.
"""

# Output contract for the judge: the JSON keys requested here are the ones
# _validate_e3 requires, including the rule that exactly one of
# defense_integrity / self_correction_quality is non-null.
E3_RETURN = """
Return strictly JSON with keys:
turn1_correctness (0-2),
turn2_correctness (0-2),
reasoning_quality (0-3),
defense_integrity (0-5 or null),
self_correction_quality (0-5 or null),
socratic_meta (0-5),
turn1_is_correct (true/false),
justification {
  turn1_correctness,
  turn2_correctness,
  reasoning_quality,
  defense_or_correction,
  socratic_meta
}
Rules:
- If turn1_is_correct = true, set defense_integrity to 0..5 and self_correction_quality = null.
- If turn1_is_correct = false, set self_correction_quality to 0..5 and defense_integrity = null.
"""
print("E3 rubric ready.")


E3 rubric ready.


### CELL 12: Socratic Judge Output Validator

- Defines the validator for the Socratic judge's JSON output.
- Checks score ranges and that exactly one of the defense / self-correction scores is set, depending on whether Turn 1 was correct.

In [None]:
def _validate_e3(obj):
    req = [
        "turn1_correctness","turn2_correctness","reasoning_quality",
        "defense_integrity","self_correction_quality",
        "socratic_meta","turn1_is_correct","justification"
    ]
    for k in req:
        if k not in obj:
            raise ValueError(f"Missing key: {k}")

    t1 = int(obj["turn1_correctness"])
    t2 = int(obj["turn2_correctness"])
    rq = int(obj["reasoning_quality"])
    sm = int(obj["socratic_meta"])
    t1c = bool(obj["turn1_is_correct"])

    if not (0 <= t1 <= 2): raise ValueError("turn1_correctness must be 0..2")
    if not (0 <= t2 <= 2): raise ValueError("turn2_correctness must be 0..2")
    if not (0 <= rq <= 3): raise ValueError("reasoning_quality must be 0..3")
    if not (0 <= sm <= 5): raise ValueError("socratic_meta must be 0..5")
    if not isinstance(obj["justification"], dict): raise ValueError("justification must be dict")

    di = obj["defense_integrity"]
    sc = obj["self_correction_quality"]

    if t1c:
        if di is None: raise ValueError("defense_integrity must be set when turn1_is_correct=true")
        if sc is not None: raise ValueError("self_correction_quality must be null when turn1_is_correct=true")
        di = int(di)
        if not (0 <= di <= 5): raise ValueError("defense_integrity must be 0..5")
    else:
        if sc is None: raise ValueError("self_correction_quality must be set when turn1_is_correct=false")
        if di is not None: raise ValueError("defense_integrity must be null when turn1_is_correct=false")
        sc = int(sc)
        if not (0 <= sc <= 5): raise ValueError("self_correction_quality must be 0..5")

    return obj

print("E3 validator ready.")


E3 validator ready.


### CELL 13 — Socratic Evaluation Execution Loop

- Executes Socratic judging across dataset.
- Populates detailed reasoning and correction metrics.

### Final Result Export and Completion

- Writes all evaluated results to disk.
- Signals successful completion of evaluation pipeline.

In [None]:
import json
import time

import openai  # cell-local import: the notebook header only imports the `OpenAI` class
import pandas as pd

E3_IN  = "/content/PHI_socratic_debugging_raw_1765901553 (1).csv"          # <-- change
E3_OUT = "/content/03_Phi_JUDGED.CSV"   # <-- change

df3 = pd.read_csv(E3_IN)
print("Loaded E3 rows:", len(df3))

# Score fields copied verbatim from the validated judge output into df3.
_E3_SCORE_COLUMNS = (
    "turn1_correctness",
    "turn2_correctness",
    "reasoning_quality",
    "defense_integrity",
    "self_correction_quality",
    "socratic_meta",
    "turn1_is_correct",
)

# Output columns start empty. After the loop each row holds either the scores
# plus the justification JSON, or a judge_error string.
for _col in (*_E3_SCORE_COLUMNS, "judge_justification_json", "judge_error"):
    df3[_col] = None

MIN_SECONDS_PER_CALL = 6.0
MAX_ATTEMPTS = 5
_last_call_time = 0.0

for i in range(len(df3)):
    q = str(df3.loc[i, "question_text"])
    t1 = str(df3.loc[i, "turn1_response"])
    t2 = str(df3.loc[i, "turn2_response"])

    messages = [
        {"role": "system", "content": E3_SYSTEM},
        {"role": "user", "content": (
            "Evaluate this Socratic Debugging interaction.\n\n"
            f"RUBRIC:\n{E3_RUBRIC}\n"
            f"{E3_RETURN}\n\n"
            f"QUESTION:\n{q}\n\n"
            f"TURN1_RESPONSE:\n{t1}\n\n"
            f"TURN2_RESPONSE:\n{t2}\n"
        )}
    ]

    # Pacing: keep at least MIN_SECONDS_PER_CALL between the last successful
    # call and the next request.
    wait = MIN_SECONDS_PER_CALL - (time.time() - _last_call_time)
    if wait > 0:
        time.sleep(wait)

    last_err = None
    obj = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            try:
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,            # anthropic/claude-opus-4.5
                    messages=messages,
                    temperature=0.0,
                    max_tokens=900,
                    response_format={"type": "json_object"},
                )
            except openai.RateLimitError:
                # Re-sending at once without response_format would only fire
                # a second request into the same rate limit; back off instead.
                raise
            except Exception:
                # Best-effort fallback for providers that reject response_format.
                resp = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=messages,
                    temperature=0.0,
                    max_tokens=900,
                )

            raw = resp.choices[0].message.content
            obj = _validate_e3(_extract_json(raw))
            _last_call_time = time.time()
            break

        except Exception as e:
            last_err = e
            rate_limited = (
                isinstance(e, openai.RateLimitError)
                or "429" in str(e)
                or "Rate limit" in str(e)
            )
            # Exponential backoff for 429, linear delay for everything else.
            sleep_s = min(90, 2 ** attempt) if rate_limited else 1.0 * attempt
            if rate_limited:
                print(f"E3 429/backoff {sleep_s}s at row {i} (attempt {attempt})")
            # No retry follows the final attempt, so sleeping would be wasted.
            if attempt < MAX_ATTEMPTS:
                time.sleep(sleep_s)

    if obj is None:
        df3.loc[i, "judge_error"] = repr(last_err)
    else:
        for _col in _E3_SCORE_COLUMNS:
            df3.loc[i, _col] = obj[_col]
        df3.loc[i, "judge_justification_json"] = json.dumps(obj["justification"], ensure_ascii=False)

    # Periodic checkpoint so an interrupted run keeps the rows judged so far.
    if (i + 1) % 10 == 0:
        df3.to_csv(E3_OUT, index=False)
        print(f"E3 checkpoint saved: {i+1}/{len(df3)}")

df3.to_csv(E3_OUT, index=False)
print("Saved:", E3_OUT)


Loaded E3 rows: 100
E3 checkpoint saved: 10/100
E3 checkpoint saved: 20/100
E3 checkpoint saved: 30/100
E3 checkpoint saved: 40/100
E3 checkpoint saved: 50/100
E3 checkpoint saved: 60/100
E3 checkpoint saved: 70/100
E3 checkpoint saved: 80/100
E3 checkpoint saved: 90/100
E3 checkpoint saved: 100/100
Saved: /content/03_Phi_JUDGED.CSV
