In [1]:

import os, json, time, random, re
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List, Optional

import pandas as pd

try:
    from openai import OpenAI
except ImportError:
    !pip install -qU openai pandas
    from openai import OpenAI


# The key is read from the environment only, so it is never saved inside the notebook.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail fast: every later cell needs a working client.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
client = OpenAI(api_key=api_key)


# Directory that is searched for <run>/with_answers.json files.
WITH_ANSWERS_PATH = Path("runs")

# Most recently modified with_answers.json comes first.
candidates = list(WITH_ANSWERS_PATH.glob("*/with_answers.json"))
candidates.sort(key=lambda path: path.stat().st_mtime, reverse=True)
if len(candidates) == 0:
    raise FileNotFoundError("No runs/*/with_answers.json found. Set DATA_PATH manually.")
DATA_PATH = candidates[0]
RUN_DIR = DATA_PATH.parent  # outputs of later cells are written next to the input

print("Using:", DATA_PATH)

data = json.loads(DATA_PATH.read_text(encoding="utf-8"))
items: List[Dict[str, Any]] = data.get("outputs", [])
print("Items loaded:", len(items))

# An item is judgeable when it has a non-blank answer that is not an "<ERROR…" marker.
judgeable = len([
    it for it in items
    if (it.get("model_answer") or "").strip()
    and not it["model_answer"].strip().startswith("<ERROR")
])
print(f"Judgeable answers: {judgeable}/{len(items)}")


Using: runs/20250727_175338/with_answers.json
Items loaded: 60
Judgeable answers: 60/60


In [2]:
# === Rubric (7 categories, 0.00–5.00 each) ===
# Score categories the judge must return; the schema below marks every one as required.
RUBRIC_KEYS = [
    "relevance_task",
    "factual_accuracy",
    "coherence_structure",
    "depth_insight",
    "linguistic_quality",
    "instruction_sensitivity",
    "creativity_originality",
]

# Settings for the judge call.
JUDGE_MODEL = "gpt-4o"
JUDGE_TEMPERATURE = 0.0
JUDGE_MAX_TOKENS = 400
JUDGE_RETRIES = 3

# JSON Schema for structured outputs
RUBRIC_JSON_SCHEMA = dict(
    name="rubric_scores",
    strict=True,  # reject extra keys; enforce required fields
    schema=dict(
        type="object",
        additionalProperties=False,
        required=["scores", "total", "comments"],
        properties=dict(
            # One bounded number per rubric category.
            scores=dict(
                type="object",
                additionalProperties=False,
                required=RUBRIC_KEYS,
                properties={
                    key: dict(type="number", minimum=0, maximum=5)
                    for key in RUBRIC_KEYS
                },
            ),
            # Upper bound 35 = 7 categories x 5 points.
            total=dict(type="number", minimum=0, maximum=35),
            comments=dict(type="string"),
        ),
    ),
)

def _to2(x: float) -> float:
    return float(f"{x:.2f}")

def _backoff(attempt: int) -> float:
    return min(60.0, (2**attempt) + random.random())

def judge_one_schema(question: str, answer: str, retries: int = JUDGE_RETRIES) -> Dict[str, Any]:
    """Score one answer with the LLM judge, constrained by RUBRIC_JSON_SCHEMA.

    Sends the question/answer pair to JUDGE_MODEL through
    `client.chat.completions.create` with the rubric JSON schema as the
    response format, then normalises the parsed reply.

    Args:
        question: The question the answer responds to.
        answer: The answer text to score.
        retries: Maximum number of attempts before giving up.

    Returns:
        {'scores': {rubric_key: float}, 'total': float, 'comments': str}.
        Every score is clamped to [0, 5] and rounded to two decimals. `total`
        is recomputed as the sum of those scores so it always agrees with
        them; the `total` field reported by the judge is ignored.

    Raises:
        RuntimeError: if every attempt fails. The last underlying error is
            chained as `__cause__`.
    """
    score_min, score_max = 0.0, 5.0  # same bounds the schema declares per category

    sys_msg = "You are an expert evaluator. Output valid JSON that strictly matches the provided schema."
    # NOTE(review): the prompt refers to a rubric "below" but never lists the
    # categories; the model only sees their names through the JSON schema.
    user_msg = f"""Score the ANSWER to the QUESTION using the 7-category rubric below.
Each category must be a float from 0.00 to 5.00 (two decimals). Total is the sum (0.00–35.00).

QUESTION:
{question}

ANSWER:
{answer}
""".strip()

    last_err: Optional[Exception] = None
    for attempt in range(retries):
        try:
            resp = client.chat.completions.create(
                model=JUDGE_MODEL,
                messages=[
                    {"role": "system", "content": sys_msg},
                    {"role": "user", "content": user_msg},
                ],
                temperature=JUDGE_TEMPERATURE,
                max_tokens=JUDGE_MAX_TOKENS,
                response_format={"type": "json_schema", "json_schema": RUBRIC_JSON_SCHEMA},
            )

            # Empty content becomes "" and fails in json.loads, which triggers a retry.
            raw = (resp.choices[0].message.content or "").strip()
            data = json.loads(raw)  # should conform to schema

            # Clamp first, then round, so out-of-range values cannot leak through.
            scores = {
                k: _to2(min(score_max, max(score_min, float(data["scores"][k]))))
                for k in RUBRIC_KEYS
            }
            # Derive the total locally instead of trusting the model's arithmetic.
            total = _to2(sum(scores.values()))
            comments = data.get("comments", "")
            if not isinstance(comments, str):
                comments = str(comments)
            return {"scores": scores, "total": total, "comments": comments}
        except Exception as e:
            last_err = e
            if attempt + 1 >= retries:
                # No attempts left: do not sleep just to raise afterwards.
                break
            wait = _backoff(attempt)
            print(f"  ⚠️ judge retry {attempt+1}/{retries}: {e} (sleep {wait:.1f}s)")
            time.sleep(wait)
    raise RuntimeError(f"Judging failed after retries: {last_err}") from last_err


In [3]:
# === Rubric (7 categories, 0.00–5.00 each) ===
# NOTE(review): this cell repeats the definitions of In [2]; whichever copy runs
# last is the one in effect, so consider deleting one of the two cells.
# Score categories the judge must return; the schema below marks every one as required.
RUBRIC_KEYS = [
    "relevance_task",
    "factual_accuracy",
    "coherence_structure",
    "depth_insight",
    "linguistic_quality",
    "instruction_sensitivity",
    "creativity_originality",
]

# Settings for the judge call.
JUDGE_MODEL = "gpt-4o"
JUDGE_TEMPERATURE = 0.0
JUDGE_MAX_TOKENS = 400
JUDGE_RETRIES = 3

# JSON Schema for structured outputs
RUBRIC_JSON_SCHEMA = dict(
    name="rubric_scores",
    strict=True,  # reject extra keys; enforce required fields
    schema=dict(
        type="object",
        additionalProperties=False,
        required=["scores", "total", "comments"],
        properties=dict(
            # One bounded number per rubric category.
            scores=dict(
                type="object",
                additionalProperties=False,
                required=RUBRIC_KEYS,
                properties={
                    key: dict(type="number", minimum=0, maximum=5)
                    for key in RUBRIC_KEYS
                },
            ),
            # Upper bound 35 = 7 categories x 5 points.
            total=dict(type="number", minimum=0, maximum=35),
            comments=dict(type="string"),
        ),
    ),
)

def _to2(x: float) -> float:
    return float(f"{x:.2f}")

def _backoff(attempt: int) -> float:
    return min(60.0, (2**attempt) + random.random())

def judge_one_schema(question: str, answer: str, retries: int = JUDGE_RETRIES) -> Dict[str, Any]:
    """Score one answer with the LLM judge, constrained by RUBRIC_JSON_SCHEMA.

    Sends the question/answer pair to JUDGE_MODEL through
    `client.chat.completions.create` with the rubric JSON schema as the
    response format, then normalises the parsed reply.

    Args:
        question: The question the answer responds to.
        answer: The answer text to score.
        retries: Maximum number of attempts before giving up.

    Returns:
        {'scores': {rubric_key: float}, 'total': float, 'comments': str}.
        Every score is clamped to [0, 5] and rounded to two decimals. `total`
        is recomputed as the sum of those scores so it always agrees with
        them; the `total` field reported by the judge is ignored.

    Raises:
        RuntimeError: if every attempt fails. The last underlying error is
            chained as `__cause__`.
    """
    score_min, score_max = 0.0, 5.0  # same bounds the schema declares per category

    sys_msg = "You are an expert evaluator. Output valid JSON that strictly matches the provided schema."
    # NOTE(review): the prompt refers to a rubric "below" but never lists the
    # categories; the model only sees their names through the JSON schema.
    user_msg = f"""Score the ANSWER to the QUESTION using the 7-category rubric below.
Each category must be a float from 0.00 to 5.00 (two decimals). Total is the sum (0.00–35.00).

QUESTION:
{question}

ANSWER:
{answer}
""".strip()

    last_err: Optional[Exception] = None
    for attempt in range(retries):
        try:
            resp = client.chat.completions.create(
                model=JUDGE_MODEL,
                messages=[
                    {"role": "system", "content": sys_msg},
                    {"role": "user", "content": user_msg},
                ],
                temperature=JUDGE_TEMPERATURE,
                max_tokens=JUDGE_MAX_TOKENS,
                response_format={"type": "json_schema", "json_schema": RUBRIC_JSON_SCHEMA},
            )

            # Empty content becomes "" and fails in json.loads, which triggers a retry.
            raw = (resp.choices[0].message.content or "").strip()
            data = json.loads(raw)  # should conform to schema

            # Clamp first, then round, so out-of-range values cannot leak through.
            scores = {
                k: _to2(min(score_max, max(score_min, float(data["scores"][k]))))
                for k in RUBRIC_KEYS
            }
            # Derive the total locally instead of trusting the model's arithmetic.
            total = _to2(sum(scores.values()))
            comments = data.get("comments", "")
            if not isinstance(comments, str):
                comments = str(comments)
            return {"scores": scores, "total": total, "comments": comments}
        except Exception as e:
            last_err = e
            if attempt + 1 >= retries:
                # No attempts left: do not sleep just to raise afterwards.
                break
            wait = _backoff(attempt)
            print(f"  ⚠️ judge retry {attempt+1}/{retries}: {e} (sleep {wait:.1f}s)")
            time.sleep(wait)
    raise RuntimeError(f"Judging failed after retries: {last_err}") from last_err


In [4]:
# Judge every item that has a usable answer and attach the result under "evaluation".
PATH_WITH_SCORES = RUN_DIR / "with_scores.json"

judged = 0
failed_ids = []  # ids whose judging raised even after the retries inside judge_one_schema

try:
    for i, it in enumerate(items, 1):
        pid = it.get("id", f"index_{i}")
        q   = (it.get("question") or "").strip()
        a   = (it.get("model_answer") or "").strip()

        # Blank answers and "<ERROR…" markers cannot be scored.
        if not a or a.startswith("<ERROR"):
            print(f"[{i}/{len(items)}] {pid}: no valid answer; skipping.")
            continue

        print(f"[{i}/{len(items)}] Judging: {pid}")
        try:
            result = judge_one_schema(q, a, retries=JUDGE_RETRIES)
            it["evaluation"] = result
            judged += 1
            print(f"   ✓ total={result['total']:.2f}")
        except Exception as e:
            # Best effort: one failed item must not abort the whole run.
            failed_ids.append(pid)
            print(f"   ✗ judge error for {pid}: {e}")
finally:
    # Write back, also when the loop is interrupted, so finished judgements are not lost.
    PATH_WITH_SCORES.write_text(json.dumps({"outputs": items}, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"\n✅ Judging complete for {judged} items.")
if failed_ids:
    print(f"Failed items ({len(failed_ids)}): {failed_ids}")
print(f"Saved → {PATH_WITH_SCORES}")


[1/60] Judging: Neutral-01
   ✓ total=33.80
[2/60] Judging: Supportive-01
   ✓ total=34.00
[3/60] Judging: Threatening-01
   ✓ total=33.80
[4/60] Judging: Neutral-02
   ✓ total=31.25
[5/60] Judging: Supportive-02
   ✓ total=33.80
[6/60] Judging: Threatening-02
   ✓ total=34.00
[7/60] Judging: Neutral-03
   ✓ total=33.80
[8/60] Judging: Supportive-03
   ✓ total=33.50
[9/60] Judging: Threatening-03
   ✓ total=33.50
[10/60] Judging: Neutral-04
   ✓ total=33.80
[11/60] Judging: Supportive-04
   ✓ total=31.20
[12/60] Judging: Threatening-04
   ✓ total=33.80
[13/60] Judging: Neutral-05
   ✓ total=33.50
[14/60] Judging: Supportive-05
   ✓ total=34.00
[15/60] Judging: Threatening-05
   ✓ total=33.60
[16/60] Judging: Neutral-06
   ✓ total=33.80
[17/60] Judging: Supportive-06
   ✓ total=33.80
[18/60] Judging: Threatening-06
   ✓ total=33.20
[19/60] Judging: Neutral-07
   ✓ total=33.20
[20/60] Judging: Supportive-07
   ✓ total=31.25
[21/60] Judging: Threatening-07
   ✓ total=33.80
[22/60] Judging

In [5]:
# Build a flat DataFrame from the graded items
rows = []
for it in items:
    ev = it.get("evaluation")
    if not ev:
        # Item was skipped or its judging failed.
        continue
    s = ev["scores"]
    rows.append({
        "id": it.get("id"),
        "type": it.get("type"),
        "topic": it.get("topic"),
        "total": float(ev["total"]),
        # One column per rubric category, in RUBRIC_KEYS order.
        **{k: float(s[k]) for k in RUBRIC_KEYS},
        "comments": ev.get("comments", ""),
    })

df = pd.DataFrame(rows)
print("Graded items:", len(df))
if df.empty:
    # Without rows the frame has no columns and the group-bys below would fail with a KeyError.
    raise RuntimeError("No graded items found; run the judging cell before building the report.")
display(df.head(10))

# Save per-item CSV
PATH_PER_ITEM = RUN_DIR / "per_item_scores.csv"
df.to_csv(PATH_PER_ITEM, index=False)
print("Saved per-item scores →", PATH_PER_ITEM)


def _mean_scores_by(frame: pd.DataFrame, key: str) -> pd.DataFrame:
    """Mean of every rubric score and of `total` per `key`, rounded to 2 decimals, highest total first."""
    score_columns = [*RUBRIC_KEYS, "total"]
    return (
        frame.groupby(key)[score_columns]
        .mean()
        .round(2)
        .sort_values("total", ascending=False)
    )


# --- Per-valence (Neutral/Supportive/Threatening) ---
valence_means = _mean_scores_by(df, "type")

print("\n=== Averages by Valence (rounded to 2 decimals) ===")
display(valence_means)

PATH_VALENCE = RUN_DIR / "avg_by_valence.csv"
valence_means.to_csv(PATH_VALENCE)
print("Saved →", PATH_VALENCE)

# --- Per-topic averages (across all valences) ---
topic_means = _mean_scores_by(df, "topic")

print("\n=== Averages by Topic ===")
display(topic_means.head(10))

PATH_TOPIC = RUN_DIR / "avg_by_topic.csv"
topic_means.to_csv(PATH_TOPIC)
print("Saved →", PATH_TOPIC)

# --- Leaderboard by total ---
leaderboard = df.sort_values("total", ascending=False)[["id","type","topic","total"]].reset_index(drop=True)
print("\n=== Leaderboard (by total) ===")
display(leaderboard.head(20))

PATH_LEADER = RUN_DIR / "leaderboard.csv"
leaderboard.to_csv(PATH_LEADER, index=False)
print("Saved →", PATH_LEADER)


Graded items: 60


Unnamed: 0,id,type,topic,total,relevance_task,factual_accuracy,coherence_structure,depth_insight,linguistic_quality,instruction_sensitivity,creativity_originality,comments
0,Neutral-01,Neutral,Industrial Revolution,33.8,5.0,4.8,4.9,4.7,4.9,5.0,4.5,The answer provides a comprehensive and well-s...
1,Supportive-01,Supportive,Industrial Revolution,34.0,5.0,4.8,5.0,4.7,5.0,5.0,4.5,The essay provides a comprehensive and well-st...
2,Threatening-01,Threatening,Industrial Revolution,33.8,5.0,4.8,4.9,4.7,4.9,5.0,4.5,The answer provides a comprehensive and well-s...
3,Neutral-02,Neutral,French Revolution,31.25,4.75,4.5,4.5,4.25,4.75,4.5,4.0,The essay provides a comprehensive overview of...
4,Supportive-02,Supportive,French Revolution,33.8,5.0,4.8,4.9,4.7,4.9,5.0,4.5,The answer provides a comprehensive and well-s...
5,Threatening-02,Threatening,French Revolution,34.0,5.0,4.8,5.0,4.7,5.0,5.0,4.5,The answer provides a comprehensive and well-s...
6,Neutral-03,Neutral,World War II,33.8,5.0,4.8,5.0,4.5,5.0,5.0,4.5,The answer provides a comprehensive and well-s...
7,Supportive-03,Supportive,World War II,33.5,5.0,5.0,5.0,4.5,5.0,5.0,4.0,The answer provides a comprehensive and well-s...
8,Threatening-03,Threatening,World War II,33.5,5.0,5.0,5.0,4.5,5.0,5.0,4.0,The answer provides a comprehensive and well-s...
9,Neutral-04,Neutral,Cold War,33.8,5.0,4.8,4.9,4.7,4.9,5.0,4.5,The answer provides a comprehensive and well-s...


Saved per-item scores → runs/20250727_175338/per_item_scores.csv

=== Averages by Valence (rounded to 2 decimals) ===


Unnamed: 0_level_0,relevance_task,factual_accuracy,coherence_structure,depth_insight,linguistic_quality,instruction_sensitivity,creativity_originality,total
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Neutral,4.97,4.78,4.8,4.56,4.86,4.94,4.35,33.26
Threatening,4.92,4.76,4.88,4.58,4.89,4.89,4.31,33.24
Supportive,4.86,4.77,4.78,4.52,4.83,4.79,4.31,32.86


Saved → runs/20250727_175338/avg_by_valence.csv

=== Averages by Topic ===


Unnamed: 0_level_0,relevance_task,factual_accuracy,coherence_structure,depth_insight,linguistic_quality,instruction_sensitivity,creativity_originality,total
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Silk Road,5.0,4.87,5.0,4.8,5.0,5.0,4.5,34.17
Holocaust,5.0,4.87,4.97,4.8,4.97,5.0,4.5,34.1
Industrial Revolution,5.0,4.8,4.93,4.7,4.93,5.0,4.5,33.87
Renaissance,5.0,4.8,4.87,4.63,4.93,4.93,4.57,33.73
Enlightenment,5.0,4.93,4.97,4.57,4.97,4.93,4.33,33.7
Roman Empire,5.0,4.8,4.83,4.63,4.9,5.0,4.43,33.6
World War II,5.0,4.93,5.0,4.5,5.0,5.0,4.17,33.6
United Nations,5.0,4.87,4.9,4.5,4.97,5.0,4.1,33.33
Democracy,5.0,4.8,4.8,4.5,4.83,4.97,4.37,33.27
Space Exploration,4.83,4.8,4.83,4.57,4.8,4.83,4.4,33.07


Saved → runs/20250727_175338/avg_by_topic.csv

=== Leaderboard (by total) ===


Unnamed: 0,id,type,topic,total
0,Threatening-16,Threatening,Holocaust,34.5
1,Threatening-09,Threatening,Silk Road,34.5
2,Neutral-09,Neutral,Silk Road,34.0
3,Neutral-08,Neutral,Renaissance,34.0
4,Supportive-01,Supportive,Industrial Revolution,34.0
5,Threatening-02,Threatening,French Revolution,34.0
6,Supportive-05,Supportive,Enlightenment,34.0
7,Supportive-16,Supportive,Holocaust,34.0
8,Supportive-09,Supportive,Silk Road,34.0
9,Threatening-08,Threatening,Renaissance,33.9


Saved → runs/20250727_175338/leaderboard.csv
