In [4]:
# If needed, run this once (comment out if your env already has these)
# Note: may require restart after install in some notebook environments.
!pip install --upgrade "openai>=1.30" pandas




In [14]:
# Robust OpenAI client init for Jupyter (v1.x SDK)
# - Sanitizes pasted key (removes spaces/newlines/zero-width chars)
# - Does not hard-fail on prefix; only warns

import os, sys, re
from getpass import getpass
from openai import OpenAI
from pathlib import Path
from datetime import datetime
from typing import Any, Optional, Dict
import json, random

def _sanitize_key(k: str) -> str:
    if not isinstance(k, str):
        return ""
    # remove all whitespace (spaces, tabs, newlines) and zero-width chars
    k = k.replace("\u200b", "").replace("\u200c", "").replace("\u200d", "").replace("\ufeff", "")
    k = "".join(k.split())
    return k
# Minimal, robust init for OpenAI v1.x in Jupyter
# - Reads OPENAI_API_KEY from the environment
# - Strips stray whitespace characters
# - Uses the modern client pattern

import os
from openai import OpenAI

def _sanitize_key(k: str) -> str:
    if not isinstance(k, str):
        return ""
    for z in ("\u200b", "\u200c", "\u200d", "\ufeff"):
        k = k.replace(z, "")
    return "".join(k.split())

# OpenAI client init — v1.x compatible.
# Key resolution order: the OPENAI_API_KEY environment variable first, then a
# hidden interactive prompt. The key is never typed into the notebook source,
# so it cannot end up in the saved .ipynb or in version control.

import os, sys
import openai
from openai import OpenAI  # v1.x client class

# Use _sanitize_key rather than a bare .strip(): keys copied from web pages or
# chat tools can carry zero-width characters or embedded newlines that .strip()
# leaves in place.
openai_api_key = _sanitize_key(os.getenv("OPENAI_API_KEY") or "")

if not openai_api_key:
    # No env var: ask for the key without echoing it (getpass is imported at
    # the top of this cell).
    openai_api_key = _sanitize_key(getpass("OpenAI API key (input hidden): "))

# Basic validation: reject an empty key or an unreplaced "YOUR-..." placeholder.
if not openai_api_key or openai_api_key.startswith("YOUR-"):
    # Raise a regular exception instead of sys.exit(1): sys.exit is meant for
    # scripts, whereas an exception stops the cell with a normal traceback.
    raise RuntimeError("Error: OpenAI API key not provided.")

# Kept so that any code still reading the module-level attribute keeps working.
openai.api_key = openai_api_key

# Create the v1.x client using the validated key
client = OpenAI(api_key=openai_api_key)
print("✅ OpenAI v1.x client ready.")




✅ OpenAI v1.x client ready.


In [15]:
# Run configuration: input file, models, sampling parameters, and a fresh
# timestamped output directory with the paths of every artifact written into it.

from datetime import timezone  # local import: the top-of-notebook import only brings in `datetime`

# === Paths ===
INPUT_JSON = "prompt_test_2_grouped.json"  # put this next to the notebook or use absolute path
RUNS_ROOT  = Path("runs")

# === Models ===
GEN_MODEL   = "gpt-4o"   # for generation
JUDGE_MODEL = "gpt-4o"   # for judging

# === Generation params ===
GEN_TEMPERATURE = 0.4
GEN_MAX_TOKENS  = 800    # be realistic; too high can error
GEN_RETRIES     = 5

# === Judge params ===
JUDGE_TEMPERATURE = 0.0
JUDGE_MAX_TOKENS  = 350
JUDGE_RETRIES     = 5

# === Fresh run dir ===
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
# produces the same YYYYMMDD_HHMMSS string.
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
RUN_DIR = RUNS_ROOT / ts
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Output artifact paths
PATH_WITH_ANSWERS = RUN_DIR / "with_answers.json"
PATH_WITH_SCORES  = RUN_DIR / "with_scores.json"
PATH_PER_PROMPT   = RUN_DIR / "per_prompt_scores.csv"
PATH_SUMMARY      = RUN_DIR / "summary_by_type.csv"
PATH_TRIPLETS     = RUN_DIR / "triplets_by_topic.csv"
PATH_GEN_LOG      = RUN_DIR / "gen_log.jsonl"
PATH_JUDGE_LOG    = RUN_DIR / "judge_log.jsonl"

# Last expression of the cell: shows the run directory as the cell output.
RUN_DIR


PosixPath('runs/20250727_184519')

In [16]:
def backoff_wait(attempt: int) -> float:
    """Return a retry delay in seconds.

    The delay is 2**attempt plus a random jitter in [0, 1), and never
    exceeds 60 seconds.
    """
    jittered = (2 ** attempt) + random.random()
    return 60.0 if jittered >= 60.0 else jittered

def load_json(path: str) -> Any:
    """Read the UTF-8 text file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)

def save_json(path: Path, obj: Any) -> None:
    """Write ``obj`` to ``path`` as 2-space-indented UTF-8 JSON.

    Missing parent directories are created first, an existing file is
    overwritten, and non-ASCII characters are written as-is (not escaped).
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        json.dump(obj, sink, indent=2, ensure_ascii=False)

def append_jsonl(path: Path, obj: Any) -> None:
    """Append ``obj`` to ``path`` as one JSON line (UTF-8, non-ASCII kept as-is).

    Parent directories are created if they do not exist yet.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open(mode="a", encoding="utf-8") as sink:
        record = json.dumps(obj, ensure_ascii=False)
        sink.write(record + "\n")

def usage_dict(resp) -> Optional[dict]:
    """Extract the token counts from ``resp.usage`` as a plain dict.

    The dict always has the keys prompt_tokens, completion_tokens and
    total_tokens; a count is None when the usage object lacks that attribute.
    Returns None if anything raises while reading the usage data.
    """
    fields = ("prompt_tokens", "completion_tokens", "total_tokens")
    try:
        usage = resp.usage
        return {name: getattr(usage, name, None) for name in fields}
    except Exception:
        # Best effort: usage is only logged, so a failure here must not abort a run.
        return None

print("✅ Utilities ready.")

✅ Utilities ready.


In [17]:
# Load the prompt set; a missing or empty "outputs" entry becomes an empty list.
data = load_json(INPUT_JSON)
items = data.get("outputs")
if not items:
    items = []
print(f"Loaded {len(items)} prompts.")
example = items[0] if items else "N/A"
print("Example:", example)

# Start from a clean slate: blank out every answer and discard any evaluation
# that is still attached to the loaded items.
for it in items:
    it.update(model_answer="")
    it.pop("evaluation", None)


Loaded 60 prompts.
Example: {'id': 'Neutral-01', 'type': 'Neutral', 'topic': 'Industrial Revolution', 'question': 'Write a balanced, well‑sourced essay explaining the catalysts, mechanisms, and social–economic consequences of the Industrial Revolution.', 'model_answer': ''}


In [18]:
# Smoke test: one tiny request confirms that the key, the connection and the
# model name work before the long generation loop is started.
try:
    test = client.chat.completions.create(
        # Probe the model the generation loop will actually use rather than a
        # hardcoded name that can drift away from the config cell.
        model=GEN_MODEL,
        messages=[
            {"role": "system", "content": "You are a concise assistant."},
            {"role": "user", "content": "Reply with exactly READY"}
        ],
        temperature=0,
        max_tokens=5,
    )
    # Guard against a None content the same way generate_one() does.
    print("Sanity check:", (test.choices[0].message.content or "").strip())
except Exception as e:
    # Chain explicitly so the original API error stays visible as the cause.
    raise RuntimeError(f"❌ Sanity check failed: {e}") from e


Sanity check: READY


In [33]:
import time  # used by the retry sleep below; no earlier cell imports it

# System prompt shared by every generation request.
SYSTEM_WRITER = (
    "You are a careful, well-sourced academic writer. "
    "Answer comprehensively, clearly, and factually. Use concise paragraphs."
)

def generate_one(question: str) -> "tuple[str, Any]":
    """Generate an answer to ``question`` with GEN_MODEL, retrying on failure.

    Makes up to GEN_RETRIES attempts, sleeping ``backoff_wait(attempt)``
    seconds between them.

    Returns:
        ``(answer, resp)`` — the stripped message text ("" when the model
        returned no content) and the raw API response object.

    Raises:
        RuntimeError: if every attempt failed; the last underlying error is
        attached as the cause.
    """
    last_err = None
    for attempt in range(GEN_RETRIES):
        try:
            resp = client.chat.completions.create(
                model=GEN_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_WRITER},
                    {"role": "user", "content": question},
                ],
                temperature=GEN_TEMPERATURE,
                max_tokens=GEN_MAX_TOKENS,
            )
            return (resp.choices[0].message.content or "").strip(), resp
        except Exception as e:
            last_err = e
            if attempt + 1 >= GEN_RETRIES:
                # Out of attempts: give up right away instead of sleeping
                # through one more backoff that nothing follows.
                break
            wait = backoff_wait(attempt)
            print(f"  ⚠️ gen retry {attempt+1}/{GEN_RETRIES}: {e} (sleep {wait:.1f}s)")
            time.sleep(wait)
    raise RuntimeError(f"Generation failed after {GEN_RETRIES} retries: {last_err}") from last_err

# Generate an answer for every item, logging token usage for each success.
# A failure of any step for an item is recorded in its model_answer as
# "<ERROR: ...>" so the run keeps going.
generated = 0
for i, it in enumerate(items, start=1):
    pid = it.get("id", f"index_{i}")
    q = it.get("question", "")
    print(f"[{i}/{len(items)}] Generating: {pid}")
    try:
        answer, resp = generate_one(q)
        it["model_answer"] = answer
        generated += 1
        log_row = {
            "ts": datetime.utcnow().isoformat()+"Z",
            "id": pid,
            "type": it.get("type"),
            "topic": it.get("topic"),
            "usage": usage_dict(resp),
        }
        append_jsonl(PATH_GEN_LOG, log_row)
    except Exception as e:
        it["model_answer"] = f"<ERROR: {e}>"
        print(f"   ✗ error: {e}")

# Persist all answers (including error markers) in one file.
save_json(PATH_WITH_ANSWERS, {"outputs": items})
print(f"\n✅ Generation complete. {generated} answers written.")
print(f"Saved → {PATH_WITH_ANSWERS}")


[1/60] Generating: Neutral-01
[2/60] Generating: Supportive-01
[3/60] Generating: Threatening-01
[4/60] Generating: Neutral-02
[5/60] Generating: Supportive-02
[6/60] Generating: Threatening-02
[7/60] Generating: Neutral-03
[8/60] Generating: Supportive-03
[9/60] Generating: Threatening-03
[10/60] Generating: Neutral-04
[11/60] Generating: Supportive-04
[12/60] Generating: Threatening-04
[13/60] Generating: Neutral-05
[14/60] Generating: Supportive-05
[15/60] Generating: Threatening-05
[16/60] Generating: Neutral-06
[17/60] Generating: Supportive-06
[18/60] Generating: Threatening-06
[19/60] Generating: Neutral-07
[20/60] Generating: Supportive-07
[21/60] Generating: Threatening-07
[22/60] Generating: Neutral-08
[23/60] Generating: Supportive-08
[24/60] Generating: Threatening-08
[25/60] Generating: Neutral-09
[26/60] Generating: Supportive-09
[27/60] Generating: Threatening-09
[28/60] Generating: Neutral-10
[29/60] Generating: Supportive-10
[30/60] Generating: Threatening-10
[31/60] G

In [19]:
# Uncomment to block until you review the JSON
# input(f"Review answers at: {PATH_WITH_ANSWERS}\nPress Enter to proceed to judging ... ")
print("Proceed when ready.")


Proceed when ready.


In [22]:
from pathlib import Path
import json

RUNS_ROOT = Path("runs")  # adjust if your runs folder is elsewhere

def count_judgeable(path: Path):
    """Tally the answers stored in a with_answers-style JSON file.

    Returns a tuple ``(total, ok, empty, errors)`` where ``empty`` counts
    missing/blank answers, ``errors`` counts answers starting with "<ERROR",
    and ``ok`` is everything else. Returns four Nones if the file cannot be
    read, parsed, or walked.
    """
    try:
        payload = json.loads(Path(path).read_text(encoding="utf-8"))
        entries = payload.get("outputs", [])
        blank = 0
        failed = 0
        for entry in entries:
            text = (entry.get("model_answer") or "").strip()
            if not text:
                blank += 1
            elif text.startswith("<ERROR"):
                failed += 1
        usable = len(entries) - blank - failed
        return len(entries), usable, blank, failed
    except Exception:
        # Best effort: an unreadable file is reported as "unknown", not raised.
        return None, None, None, None

# Every saved answers file one level below RUNS_ROOT, most recently modified first.
cands = sorted(
    RUNS_ROOT.glob("*/with_answers.json"),
    key=lambda p: p.stat().st_mtime,
    reverse=True,
)
if cands:
    print("Found the following with_answers.json files (newest first):\n")
    for i, p in enumerate(cands, 1):
        total, ok, empty, errors = count_judgeable(p)
        print(f"[{i}] {p}  | items={total}  ok={ok}  empty={empty}  errors={errors}")
else:
    print("No runs/*/with_answers.json found. You may have saved to another folder.")



Found the following with_answers.json files (newest first):

[1] runs/20250727_175338/with_answers.json  | items=60  ok=60  empty=0  errors=0


In [23]:
import json, time, random, re
from datetime import datetime

# Use the first item whose answer is non-blank and is not an "<ERROR..." marker.
probe = next(
    filter(
        lambda it: (it.get("model_answer") or "").strip()
        and not it["model_answer"].startswith("<ERROR"),
        items,
    ),
    None,
)
assert probe is not None, "No valid answers found. Re-run generation first."

print("Probing", probe["id"])

# The seven rubric categories, in the order their scores are read back.
RUBRIC_KEYS = [
    "relevance_task",
    "factual_accuracy",
    "coherence_structure",
    "depth_insight",
    "linguistic_quality",
    "instruction_sensitivity",
    "creativity_originality",
]

# One independent number spec (0 to 5) per category.
_SCORE_SPEC_BY_KEY = {
    key: {"type": "number", "minimum": 0, "maximum": 5} for key in RUBRIC_KEYS
}

# The nested "scores" object: exactly the rubric categories, nothing else.
_SCORES_OBJECT = {
    "type": "object",
    "additionalProperties": False,
    "required": RUBRIC_KEYS,
    "properties": _SCORE_SPEC_BY_KEY,
}

# Schema passed as response_format["json_schema"] in the judge calls:
# an object with per-category scores, a 0 to 35 total, and free-text comments.
RUBRIC_JSON_SCHEMA = {
    "name": "rubric_scores",
    "strict": True,
    "schema": {
        "type": "object",
        "additionalProperties": False,
        "required": ["scores", "total", "comments"],
        "properties": {
            "scores": _SCORES_OBJECT,
            "total":    {"type": "number", "minimum": 0, "maximum": 35},
            "comments": {"type": "string"},
        },
    },
}

def _to2(x: float) -> float:
    return float(f"{x:.2f}")

def judge_one_schema_probe(client, question: str, answer: str):
    """Make a single judge request (no retries) and print a preview of the raw reply.

    The reply is requested in the RUBRIC_JSON_SCHEMA format and parsed as JSON.

    Returns:
        A dict with "scores" (one 2-decimal float per RUBRIC_KEYS entry),
        "total" (the reply's total, or the sum of the scores when the reply has
        none, rounded to 2 decimals) and "comments" ("" when absent).
    """
    system_msg = "You are an expert evaluator. Output valid JSON that strictly matches the provided schema."
    user_msg = f"""Score the ANSWER to the QUESTION using the 7-category rubric below.
Each category must be a float from 0.00 to 5.00 (two decimals). Total is the sum (0.00–35.00).

QUESTION:
{question}

ANSWER:
{answer}
""".strip()

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.0,
        max_tokens=400,
        response_format={"type": "json_schema", "json_schema": RUBRIC_JSON_SCHEMA},
    )
    raw = (resp.choices[0].message.content or "").strip()
    print("Raw JSON returned:\n", raw[:400], "...\n")  # preview: first 400 characters only

    parsed = json.loads(raw)  # should conform to schema

    scores = {}
    for key in RUBRIC_KEYS:
        scores[key] = _to2(float(parsed["scores"][key]))
    reported_total = parsed.get("total", sum(scores.values()))
    return {
        "scores": scores,
        "total": _to2(float(reported_total)),
        "comments": parsed.get("comments", ""),
    }

res = judge_one_schema_probe(client, probe["question"], probe["model_answer"])
print("Parsed:\n", res)




AssertionError: No valid answers found. Re-run generation first.

In [None]:
import json, time, random
from pathlib import Path
from datetime import datetime

# Make sure the run directory exists; exist_ok=True makes this a no-op when it
# is already there.
RUN_DIR.mkdir(parents=True, exist_ok=True)
# Destination of the scored items written at the end of this cell.
PATH_WITH_SCORES = RUN_DIR / "with_scores.json"
# Append-only JSONL file that _log_raw() writes every judge reply/error to.
RAW_JUDGE_LOG = RUN_DIR / "judge_raw.jsonl"

def _to2(x: float) -> float:
    return float(f"{x:.2f}")

def backoff(attempt: int) -> float:
    """Retry delay in seconds: 2**attempt plus a random jitter in [0, 1), capped at 60."""
    base = 2 ** attempt
    delay = base + random.random()
    if delay < 60.0:
        return delay
    return 60.0

def _log_raw(obj: dict):
    """Append ``obj`` as one JSON line to RAW_JUDGE_LOG.

    Delegates to the notebook's append_jsonl() helper instead of repeating its
    open/dumps/write sequence. The line format is the same; in addition the
    log's parent directory is created if it is missing.
    """
    append_jsonl(RAW_JUDGE_LOG, obj)

def judge_one_schema(client, question: str, answer: str, retries: int = 4):
    """Score ``answer`` against ``question`` with the judge model, retrying on failure.

    Each attempt requests a reply in the RUBRIC_JSON_SCHEMA format, logs the raw
    reply (or the error text) through _log_raw(), and parses the reply as JSON.

    Args:
        client: OpenAI v1.x client used for the request.
        question: The prompt that was answered.
        answer: The answer to be scored.
        retries: Maximum number of attempts.

    Returns:
        A dict with "scores" (one 2-decimal float per RUBRIC_KEYS entry),
        "total" (the reply's total, or the sum of the scores when the reply has
        none, rounded to 2 decimals) and "comments" (always a str).

    Raises:
        RuntimeError: if every attempt failed; the last underlying error is
        attached as the cause.
    """
    JUDGE_SYSTEM = "You are an expert evaluator. Output valid JSON that strictly matches the provided schema."
    # NOTE(review): the prompt refers to a "rubric below", but no rubric text
    # follows; the category names only reach the model through the JSON schema.
    # Left unchanged so existing scores stay comparable — confirm the intent.
    JUDGE_USER = f"""Score the ANSWER to the QUESTION using the 7-category rubric below.
Each category must be a float from 0.00 to 5.00 (two decimals). Total is the sum (0.00–35.00).

QUESTION:
{question}

ANSWER:
{answer}
""".strip()

    last_err = None
    for attempt in range(retries):
        try:
            resp = client.chat.completions.create(
                # Take model and temperature from the config cell, the same way
                # generate_one() uses the GEN_* constants.
                model=JUDGE_MODEL,
                messages=[
                    {"role": "system", "content": JUDGE_SYSTEM},
                    {"role": "user", "content": JUDGE_USER},
                ],
                temperature=JUDGE_TEMPERATURE,
                # NOTE(review): the config cell sets JUDGE_MAX_TOKENS = 350, but
                # this call has used 400. Kept at 400 so the JSON reply is not
                # cut shorter than before — reconcile the two values.
                max_tokens=400,
                response_format={"type": "json_schema", "json_schema": RUBRIC_JSON_SCHEMA},
            )
            raw = (resp.choices[0].message.content or "").strip()
            _log_raw({"ok": True, "raw": raw})

            data = json.loads(raw)
            scores = {k: _to2(float(data["scores"][k])) for k in RUBRIC_KEYS}
            total  = _to2(float(data.get("total", sum(scores.values()))))
            comments = data.get("comments", "")
            if not isinstance(comments, str):
                comments = str(comments)
            return {"scores": scores, "total": total, "comments": comments}
        except Exception as e:
            last_err = e
            _log_raw({"ok": False, "error": str(e)})
            if attempt + 1 >= retries:
                # Out of attempts: give up right away instead of sleeping
                # through one more backoff that nothing follows.
                break
            wait = backoff(attempt)
            print(f"  ⚠️ judge retry {attempt+1}/{retries}: {e} (sleep {wait:.1f}s)")
            time.sleep(wait)
    raise RuntimeError(f"Judging failed after retries: {last_err}") from last_err

# Judge every item that has a usable answer; items with a blank answer or an
# "<ERROR..." marker are reported and skipped. A judge failure is printed and
# leaves that item without an "evaluation" entry.
judged = 0
for i, it in enumerate(items, start=1):
    pid = it.get("id", f"index_{i}")
    q   = (it.get("question") or "").strip()
    a   = (it.get("model_answer") or "").strip()

    has_answer = bool(a) and not a.startswith("<ERROR")
    if has_answer:
        print(f"[{i}/{len(items)}] Judging: {pid}")
        try:
            result = judge_one_schema(client, q, a, retries=3)
            it["evaluation"] = result
            judged += 1
            print(f"   ✓ total={result['total']:.2f}")
        except Exception as e:
            print(f"   ✗ judge error for {pid}: {e}")
    else:
        print(f"[{i}/{len(items)}] {pid}: no valid answer; skipping judge.")

# Write all items (scored or not) to the scores file in one go.
with open(PATH_WITH_SCORES, mode="w", encoding="utf-8") as f:
    json.dump({"outputs": items}, f, indent=2, ensure_ascii=False)

print(f"\n✅ Judging complete for {judged} items.")
print(f"Saved → {PATH_WITH_SCORES}")
print(f"Raw judge replies logged → {RAW_JUDGE_LOG}")


