In [None]:
# %% Imports

import os
import json
import ast
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# seaborn is optional: when it cannot be imported or configured, `sns` is set
# to None instead of aborting the notebook.
try:
    import seaborn as sns
    sns.set(style="whitegrid")
except Exception:
    # `Exception` (not a bare `except:`) keeps the best-effort fallback while
    # still letting KeyboardInterrupt / SystemExit propagate.
    sns = None

# Widen pandas' console display limits for the previews shown below.
pd.set_option("display.max_rows", 200)
pd.set_option("display.width", 200)


In [None]:
# %% Part 0 – Paths

# Identifier of the run to analyse; replace it with the actual run_id
# produced by your run_experiment invocation.
RUN_ID = "YOUR_RUN_ID_HERE"

# Root directory under which the per-run logs and evaluation results live.
WORKSPACE = Path("/workspace")

# JSONL log of the LLM calls recorded for this run.
LLM_LOG_PATH = WORKSPACE.joinpath("agent_logs", RUN_ID, "llm_calls.jsonl")

# HumanEval problem definitions (JSONL).
HUMANEVAL_PATH = Path("/app/maswe/eval/humaneval.jsonl")

# JSONL file with the evaluation results for this run.
EVAL_RESULTS_PATH = WORKSPACE.joinpath("humaneval_results", f"{RUN_ID}_eval.jsonl")


In [None]:
# %% Load llm_calls.jsonl

def load_llm_calls(path: Path) -> pd.DataFrame:
    """Load an ``llm_calls.jsonl`` log into a DataFrame with derived text columns.

    Each non-blank line of the file is parsed as one JSON object. Blank lines
    (for example a trailing newline at the end of the file) are skipped.

    Parameters
    ----------
    path : Path
        Location of the JSONL log file.

    Returns
    -------
    pd.DataFrame
        One row per logged call, with every logged field plus:

        - ``completion_text``: ``response_code`` with missing values replaced by ``""``
        - ``raw_text``: ``response_raw`` with missing values replaced by ``""``
        - ``completion_char_len``: character length of ``completion_text``
        - ``completion_token_approx``: ``completion_char_len // 4``

    Raises
    ------
    ValueError
        If any of the required fields is absent from the log.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    required = ["task_id", "sample_id", "prompt", "response_raw", "response_code"]

    rows = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises, so blank lines must be skipped explicitly.
                continue
            rows.append(json.loads(line))
    df = pd.DataFrame(rows)

    # Report every missing field at once, together with what was actually found.
    missing = [key for key in required if key not in df.columns]
    if missing:
        raise ValueError(
            f"Missing key in llm_calls.jsonl: {', '.join(missing)} "
            f"(found columns: {list(df.columns)})"
        )

    df["completion_text"] = df["response_code"].fillna("")
    df["raw_text"] = df["response_raw"].fillna("")
    df["completion_char_len"] = df["completion_text"].astype(str).str.len()
    # Integer floor division avoids the float round-trip of `/ 4` + astype(int).
    df["completion_token_approx"] = (df["completion_char_len"] // 4).astype(int)

    return df

# Load the LLM call log for the configured run and preview its first rows.
llm_df = load_llm_calls(LLM_LOG_PATH)
llm_df.head()



In [None]:
def load_humaneval(path=HUMANEVAL_PATH):
    """Load a HumanEval JSONL file into a DataFrame.

    Each non-blank line is parsed as one JSON object and becomes one row.
    Blank lines (for example a trailing newline) are skipped rather than
    crashing the JSON parser.

    Parameters
    ----------
    path : path-like, optional
        Location of the JSONL file; defaults to ``HUMANEVAL_PATH``.

    Returns
    -------
    pd.DataFrame
        One row per JSON object, with columns taken from the objects' keys.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf8") as f:
        # json.loads tolerates surrounding whitespace but not an empty document.
        rows = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(rows)

# Load the HumanEval problems from the default path and preview the first rows.
humaneval_df = load_humaneval()
humaneval_df.head()

In [None]:
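# Example record (JSON, not Python) -- presumably one line of the eval
# results file; `task_id` and `results` are the fields read below:
# {
#   "task_id": "HumanEval/0",
#   "pass@k": 0.33,
#   "num_correct": 1,
#   "num_total": 3,
#   "results": [true, false, false]
# }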

In [None]:
def explode_eval_results(path: Path) -> pd.DataFrame:
    """Expand per-task evaluation records into one row per sample.

    Each non-blank line of the file is parsed as a JSON object with a
    ``task_id`` and a ``results`` list. Every entry of ``results`` becomes
    one output row.

    Parameters
    ----------
    path : Path
        Location of the JSONL evaluation-results file.

    Returns
    -------
    pd.DataFrame
        Columns ``task_id``, ``sample_id`` and ``passed``. ``sample_id`` is the
        position of the entry inside ``results`` and ``passed`` is that entry
        coerced to ``bool``. The columns are present even when the file holds
        no records, so a later merge on ``task_id`` / ``sample_id`` still works.

    Raises
    ------
    KeyError
        If a record lacks ``task_id`` or ``results``.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    rows = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines; json.loads("") would raise.
                continue
            obj = json.loads(line)
            task_id = obj["task_id"]
            # NOTE(review): assumes the order of `results` matches the
            # sample_id used in the LLM call log -- confirm against the evaluator.
            rows.extend(
                {"task_id": task_id, "sample_id": i, "passed": bool(passed)}
                for i, passed in enumerate(obj["results"])
            )
    # Explicit columns keep the schema stable for an empty input.
    return pd.DataFrame(rows, columns=["task_id", "sample_id", "passed"])

# Expand the run's evaluation results to one row per sample and preview them.
eval_df = explode_eval_results(EVAL_RESULTS_PATH)
eval_df.head()

In [None]:
# Join each evaluated sample with its logged LLM call (on task_id + sample_id),
# then with the HumanEval reference fields for its task (on task_id).
# Both joins are left joins, so every row of eval_df is kept.
# NOTE: llm_df and the HumanEval selection both carry a "prompt" column; with
# pandas' default suffixes they end up as "prompt_x" (LLM log) and
# "prompt_y" (HumanEval) in the result.
merged = (
    eval_df
    .merge(llm_df, how="left", on=["task_id", "sample_id"])
    .merge(
        humaneval_df[["task_id", "prompt", "canonical_solution", "entry_point", "test"]],
        how="left",
        on="task_id",
    )
)

merged.head()

In [None]:
def has_extra_output(raw: str) -> bool:
    """Heuristically flag a raw response that contains more than bare code.

    The check is case-insensitive and purely substring-based. It returns
    ``True`` when the text contains a Markdown code fence or any of a fixed
    list of explanatory phrases.

    Parameters
    ----------
    raw : str
        Raw response text. Non-string values (e.g. ``None`` or ``NaN`` for a
        missing response) are treated as having no extra output.

    Returns
    -------
    bool
        ``True`` if a fence or explanatory phrase is found, else ``False``.
    """
    # Missing values would otherwise fail on `.lower()`.
    if not isinstance(raw, str):
        return False

    lowered = raw.lower()
    if "```" in lowered:
        return True

    # Substring match: short entries such as "step" also hit inside longer words.
    keywords = (
        "the function", "this function", "explanation",
        "step", "we can", "in summary", "first,", "second,", "finally",
    )
    return any(k in lowered for k in keywords)