In [None]:
import json
from pathlib import Path
import pandas as pd

root = Path("exp1")

rows = []
for f in root.rglob("*.json"):
    # Expected layout: exp1/exp1/N?/model/metrics_*.json, so path components
    # 2 and 3 hold the N setting and the model; "?" marks shallower paths.
    parts = f.parts
    n, model = (parts[idx] if idx < len(parts) else "?" for idx in (2, 3))
    # Any file without "pointwise" in its name is treated as a distribution file.
    kind = "distribution"
    if "pointwise" in f.name:
        kind = "pointwise"

    data = json.loads(f.read_text(encoding="utf-8"))
    for domain, metrics in data.items():
        row = dict(N=n, model=model, kind=kind, domain=domain)
        row.update(metrics)
        # Flatten the nested cleaning statistics; a missing entry yields None.
        cs = metrics.get("cleaning_stats") or {}
        for stat in ("original_rows", "human_null", "llm_invalid"):
            row[stat] = cs.get(stat)
        rows.append(row)

df = pd.DataFrame(rows)

# Split by metrics kind and order rows by run setting, model and domain.
sort_keys = ["N", "model", "domain"]
pointwise = df.loc[df["kind"] == "pointwise"].copy().sort_values(sort_keys)
distribution = df.loc[df["kind"] == "distribution"].copy().sort_values(sort_keys)

# Columns reported for each metrics kind
id_cols = ["N", "model", "domain"]
pointwise_cols = id_cols + ["spearman_rho", "spearman_p", "js_divergence", "rmse", "mae"]
distribution_cols = id_cols + ["mae", "coverage_rate"]

display(pointwise.loc[:, pointwise_cols])
display(distribution.loc[:, distribution_cols])


In [None]:
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as mpl

# 1) Load every JSON file found under exp1/
root = Path("exp1")
rows = []

for f in root.rglob("*.json"):
    # The two enclosing directory names are taken as the model and the N setting.
    model, n = f.parent.name, f.parent.parent.name
    kind = "pointwise" if "pointwise" in f.name else "distribution"

    with f.open(encoding="utf-8") as fh:
        data = json.load(fh)

    # One output row per domain entry; its metrics become columns.
    for domain, metrics in data.items():
        row = dict(N=n, model=model, kind=kind, domain=domain)
        row.update(metrics)
        rows.append(row)

df = pd.DataFrame(rows)

# 2) Abbreviation rules
def shorten_model(name: str) -> str:
    """Abbreviate a model directory name to a short plotting label.

    The substitutions are applied in order as plain substring replacements.
    Names that match no rule are returned unchanged.

    Args:
        name: Raw model name (taken from the directory name by the caller).

    Returns:
        The abbreviated name, e.g. ``"gpt-5-mini-minimal"`` -> ``"g5m-NT"``.
    """
    # Order matters: "nothinking" must be handled before "thinking",
    # otherwise it would be turned into "noT".
    substitutions = (
        ("gpt-5-mini-", "g5m-"),
        ("gpt_5_mini_", "g5m-"),
        ("gpt-3.5-", "g3.5-"),
        ("gemini-3-flash-preview-", "ge3-"),
        ("gemini_3_flash_", "ge3-"),
        ("Qwen3-30B-", "q3-"),
        # The "q3-" prefix already ends with a dash, so the suffix must not add
        # another one (the previous "-NT" produced "q3--NT").
        ("A3B-Instruct-2507-FP8", "NT"),
        ("nothinking", "NT"),
        ("Thinking", "T"),
        ("thinking", "T"),
        ("medium", "T"),
        ("minimal", "NT"),
        ("turbo", "NT"),
    )
    s = name
    for old, new in substitutions:
        s = s.replace(old, new)
    return s

df["model_short"] = df["model"].apply(shorten_model)

# 3) Take the pointwise metrics from N1 and the distribution metrics from N50,
#    then join them on domain + model_short
key_cols = ["domain", "model_short"]
is_point_n1 = (df["kind"] == "pointwise") & (df["N"] == "N1")
is_dist_n50 = (df["kind"] == "distribution") & (df["N"] == "N50")

point = df.loc[is_point_n1, key_cols + ["spearman_rho", "js_divergence", "rmse"]].copy()
dist = df.loc[is_dist_n50, key_cols + ["mae", "coverage_rate"]].copy()

# Inner join: only (domain, model) pairs present in both frames survive.
merged = point.merge(dist, on=key_cols, how="inner")

# 4) Metric direction and normalization (log1p on some metrics to spread values apart)
better_high = {"spearman_rho", "coverage_rate"}
better_low = {"js_divergence", "rmse", "mae"}
log_metrics = {"js_divergence", "rmse", "mae"}  # passed through log1p before scaling

def normalize_domain(df_domain, metrics):
    """Min-max scale the given metric columns so that higher is always better.

    Args:
        df_domain: DataFrame holding one column per entry of ``metrics``.
        metrics: Names of the columns to normalize.

    Returns:
        A copy of ``df_domain`` whose metric columns are rescaled using the
        column's own min and max; the input frame is not modified. A column
        whose values are all (almost) equal is set to 1.0.
    """
    df_norm = df_domain.copy()
    for m in metrics:
        series = df_norm[m].astype(float)

        # Negative values are clipped to 0 before the log1p transform.
        if m in log_metrics:
            series = np.log1p(series.clip(lower=0))

        # Flip lower-is-better metrics so every axis points the same way.
        if m in better_low:
            series = -series

        lo, hi = series.min(), series.max()
        span = hi - lo
        if span < 1e-12:
            # Degenerate range: avoid dividing by (almost) zero.
            df_norm[m] = 1.0
            continue
        df_norm[m] = (series - lo) / span
    return df_norm

# 5) Colour mapping: variants of the same base model share one colour
def base_model_from_short(s: str) -> str:
    """Strip the NT / T variant markers from a shortened model name.

    Every "NT" substring is removed first, then every remaining "T"
    character, wherever they occur in the string.
    """
    without_nt = "".join(s.split("NT"))
    return without_nt.translate({ord("T"): None})

merged["base_model"] = merged["model_short"].apply(base_model_from_short)
base_models = sorted(merged["base_model"].unique())

# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is available on both old and new releases.
cmap = plt.get_cmap("tab20")
# Sample evenly over [0, 1) so the colours are not taken from one part of the
# palette only. (Earlier attempts -- "Set2" sampled by count and tab20 with an
# integer stride -- were dropped in favour of this.)
vals = np.linspace(0, 1, len(base_models), endpoint=False)
color_map = {bm: cmap(v) for bm, v in zip(base_models, vals)}

# 6) Radar chart helper


def radar_plot(df_domain, title):
    """Draw one radar chart with a polygon per row of ``df_domain``.

    The five metrics are normalized with ``normalize_domain`` and plotted on a
    polar axis. Line style and marker encode the variant found in
    ``model_short`` ("NT": dashed with circles, "T": solid with squares,
    otherwise solid with circles); the colour comes from ``color_map`` keyed
    by the base model name.

    Args:
        df_domain: Rows to plot; needs ``model_short`` and the metric columns.
        title: Text placed before the fixed title suffix.
    """
    metrics = ["spearman_rho", "js_divergence", "rmse", "mae", "coverage_rate"]
    labels = metrics
    font_name = "Times New Roman"

    # One spoke per metric; the first angle is repeated to close each polygon.
    spokes = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
    angles = spokes + spokes[:1]

    df_plot = normalize_domain(df_domain, metrics)

    plt.figure(figsize=(7.5, 7.5))
    ax = plt.subplot(111, polar=True)

    for _, record in df_plot.iterrows():
        values = [record[m] for m in metrics]
        values = values + values[:1]

        name = record["model_short"]
        # "NT" has to be tested first because it also contains "T".
        if "NT" in name:
            ls, marker = "--", "o"
        elif "T" in name:
            ls, marker = "-", "s"
        else:
            ls, marker = "-", "o"

        color = color_map.get(base_model_from_short(name), "C0")

        ax.plot(
            angles,
            values,
            linewidth=2.1,
            linestyle=ls,
            color=color,
            marker=marker,
            markersize=4,
            label=name,
        )
        ax.fill(angles, values, color=color, alpha=0.04)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontfamily=font_name, fontsize=20)
    ax.tick_params(axis="x", pad=22)
    ax.set_ylim(0, 1)
    ax.set_title(
        title + " (Log+Normalized, Higher=Better)",
        y=1.08,
        fontfamily=font_name,
        fontsize=18,
    )
    ax.grid(True)
    ax.legend(
        loc="upper right",
        bbox_to_anchor=(1.35, 1.15),
        prop={"family": font_name, "size": 18},
    )
    plt.show()

# 7) One radar chart per domain, in sorted order
domain_names = sorted(merged["domain"].unique())
for domain in domain_names:
    in_domain = merged["domain"] == domain
    radar_plot(merged[in_domain], f"Radar - {domain}")


In [None]:
import json
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1) Load every JSON file found under exp1/
root = Path("exp1")
rows = []

for f in root.rglob("*.json"):
    model_dir = f.parent
    # Parent directory name -> model, grandparent directory name -> N setting.
    model = model_dir.name
    n = model_dir.parent.name
    kind = "distribution"
    if "pointwise" in f.name:
        kind = "pointwise"

    data = json.loads(f.read_text(encoding="utf-8"))
    for domain, metrics in data.items():
        row = {"N": n, "model": model, "kind": kind, "domain": domain}
        row.update(metrics)
        rows.append(row)

df = pd.DataFrame(rows)

# 2) Model name abbreviation
def shorten_model(name: str) -> str:
    """Abbreviate a model name by applying substring replacements in order.

    Names that match none of the patterns are returned unchanged.
    """
    # "nothinking" is listed before "thinking" so it becomes "NT", not "noT".
    substitutions = (
        ("gpt-5-mini-", "g5m-"),
        ("gpt_5_mini_", "g5m-"),
        ("gpt-3.5-", "g3.5-"),
        ("gemini-3-flash-preview-", "ge3-"),
        ("gemini_3_flash_", "ge3-"),
        ("Qwen3-30B-", "q3-"),
        ("A3B-Instruct-2507-FP8", "NT"),
        ("nothinking", "NT"),
        ("Thinking", "T"),
        ("thinking", "T"),
        ("medium", "T"),
        ("minimal", "NT"),
        ("turbo", "NT"),
    )
    shortened = name
    for old, new in substitutions:
        shortened = shortened.replace(old, new)
    return shortened

df["model_short"] = df["model"].apply(shorten_model)

# 3) Combine the five metrics: three from N1 pointwise, two from N50 distribution
key_cols = ["domain", "model_short"]
is_point_n1 = (df["kind"] == "pointwise") & (df["N"] == "N1")
is_dist_n50 = (df["kind"] == "distribution") & (df["N"] == "N50")

n1 = df.loc[is_point_n1, key_cols + ["spearman_rho", "js_divergence", "rmse"]].copy()
n50 = df.loc[is_dist_n50, key_cols + ["mae", "coverage_rate"]].copy()

merged = n1.merge(n50, on=key_cols, how="inner")

# Long format: one row per (domain, model, metric).
metrics_order = ["spearman_rho", "rmse", "js_divergence", "mae", "coverage_rate"]
plot_long = merged.melt(
    id_vars=key_cols,
    value_vars=metrics_order,
    var_name="metric",
    value_name="value",
)
plot_long["metric"] = pd.Categorical(
    plot_long["metric"], categories=metrics_order, ordered=True
)

# Preferred plotting order, restricted to the models actually present.
models_present = set(plot_long["model_short"].unique())
model_order = [
    m
    for m in (
        "g3.5-NT",
        "gpt-4o-mini",
        "g5m-NT",
        "g5m-T",
        "ge3-NT",
        "ge3-T",
        "q3-NT",
        "q3-T",
    )
    if m in models_present
]
# Short model name -> label shown on the y axis.
model_label_map = dict(
    [
        ("g3.5-NT", "GPT3.5"),
        ("gpt-4o-mini", "GPT4om"),
        ("g5m-NT", "GPT5m-nt"),
        ("g5m-T", "GPT5m-t"),
        ("ge3-NT", "GEM3f-nt"),
        ("ge3-T", "GEM3f-t"),
        ("q3-NT", "QWE3-nt"),
        ("q3-T", "QWE3-t"),
    ]
)

def format_model(ms: str) -> str:
    """Return the display label for a short model name, or the name itself if unmapped."""
    try:
        return model_label_map[ms]
    except KeyError:
        return ms

# Preferred row order; fall back to alphabetical when none of these domains is present.
domains_present = plot_long["domain"].unique()
domain_order = [
    d for d in ("spending", "labor", "credit") if d in domains_present
] or sorted(plot_long["domain"].unique())

# Domain key -> row label.
domain_label_map = {"spending": "Spending", "labor": "Labor", "credit": "Credit"}
# Metric key -> panel title.
metric_label_map = {
    "spearman_rho": "Spearman_rho",  # fixed: was misspelled "Sperman_rho"
    "js_divergence": "JS_Divergence",
    "rmse": "RMSE",
    "mae": "MAE",
    "coverage_rate": "Coverage_Rate",
}

def format_domain(d: str) -> str:
    """Return the display label for a domain; unmapped domains are title-cased."""
    return domain_label_map.get(d, d.title())

def format_metric(m: str) -> str:
    """Return the display label for a metric; unmapped metrics are returned as-is."""
    return metric_label_map.get(m, m)

# 4) Lollipop chart + mean guide line + thinking / non-thinking styling
#    + highlighted stems for models that beat the reference threshold
rc_overrides = dict.fromkeys(
    (
        "font.size",
        "axes.titlesize",
        "axes.labelsize",
        "legend.fontsize",
        "figure.titlesize",
    ),
    18,
)
rc_overrides.update({"xtick.labelsize": 13, "ytick.labelsize": 13})
sns.set_theme(style="whitegrid", font="Times New Roman", rc=rc_overrides)

# One row per domain, one column per metric.
n_rows, n_cols = len(domain_order), len(metrics_order)
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(3.0 * n_cols, 2.9 * n_rows),
    sharex=False,
    sharey=False,
)

# plt.subplots squeezes singleton dimensions; rebuild a nested sequence so that
# axes[i][j] is always valid.
if n_rows == 1:
    axes = [[axes]] if n_cols == 1 else [axes]
elif n_cols == 1:
    axes = [[ax] for ax in axes]

def line_style(model_short: str) -> str:
    """Return the line style for a model's stem.

    Both the "NT" and the non-"NT" case currently yield a solid line.
    """
    is_non_thinking = "NT" in model_short
    return {True: "-", False: "-"}[is_non_thinking]

def marker_style(model_short: str) -> str:
    """Return "o" for "NT" models and gpt-4o-mini, "s" for every other model."""
    round_marker = any(tag in model_short for tag in ("NT", "gpt-4o-mini"))
    return "o" if round_marker else "s"

# Direction: 1 = higher is better, -1 = lower is better
metric_dir = {
    **dict.fromkeys(("spearman_rho", "coverage_rate"), 1),
    **dict.fromkeys(("js_divergence", "rmse", "mae"), -1),
}

def is_g35_or_qwen(ms: str) -> bool:
    """Tell whether a short model name matches the g3.5 / q3- / qwen patterns."""
    if "g3.5" in ms:
        return True
    if ms.startswith("q3-"):
        return True
    return "qwen" in ms

def is_gemini3_or_gpt5(ms: str) -> bool:
    """Tell whether a short model name contains "ge3-" or "g5"."""
    return any(marker in ms for marker in ("ge3-", "g5"))

# Display labels in plotting order; filtered per panel below.
display_candidates = [
    "GPT3.5",
    "GPT4om",
    "GPT5m-nt",
    "GPT5m-t",
    "GEM3f-nt",
    "GEM3f-t",
    "QWE3-nt",
    "QWE3-t",
]
# Reference models that the QWE3 and GPT3.5 stems are compared against.
reference_models = ["g5m-NT", "g5m-T", "ge3-NT", "ge3-T"]
# Stem colour used when a model with this name prefix beats the reference
# threshold (red for QWE3, blue for GPT3.5).
highlight_by_prefix = {"q3-": "#d62728", "g3.5": "#1f77b4"}

for i, domain in enumerate(domain_order):
    for j, metric in enumerate(metrics_order):
        ax = axes[i][j]

        in_panel = (plot_long["domain"] == domain) & (plot_long["metric"] == metric)
        sub = plot_long[in_panel].copy()
        # NOTE(review): models missing from model_order become NaN here, and
        # line_style() below would then raise TypeError -- confirm every model is listed.
        sub["model_short"] = pd.Categorical(sub["model_short"], categories=model_order, ordered=True)
        sub = sub.sort_values("model_short")
        sub["model_display"] = sub["model_short"].map(format_model)
        model_display_order = [
            m for m in display_candidates if m in sub["model_display"].unique()
        ]
        sub["model_display"] = pd.Categorical(
            sub["model_display"], categories=model_display_order, ordered=True
        )

        # Threshold = the weakest reference value for this metric, so "beating
        # the threshold" means beating at least one reference model.
        ref_vals = sub[sub["model_short"].isin(reference_models)]["value"]
        direction = metric_dir.get(metric, 1)
        if len(ref_vals) == 0:
            best_ref = None
        elif direction == 1:
            best_ref = ref_vals.min()
        else:
            best_ref = ref_vals.max()

        # Draw each model separately so its style can be set individually.
        for _, r in sub.iterrows():
            ls = line_style(r["model_short"])
            mk = marker_style(r["model_short"])

            line_color = "#9aa0a6"
            if best_ref is not None:
                if direction == 1:
                    better = r["value"] > best_ref
                else:
                    better = r["value"] < best_ref
                if better:
                    short_name = str(r["model_short"])
                    for prefix, highlight in highlight_by_prefix.items():
                        if short_name.startswith(prefix):
                            line_color = highlight

            ax.hlines(
                y=r["model_display"],
                xmin=0,
                xmax=r["value"],
                color=line_color,
                linewidth=1.8,
                linestyle=ls,
                alpha=0.95,
            )
            ax.scatter(
                r["value"],
                r["model_display"],
                s=55,
                marker=mk,
                color="#1f77b4",
                zorder=3,
            )

        # Zero reference line
        ax.axvline(0, color="black", linestyle="--", linewidth=1, alpha=0.7)

        # Mean guide line
        mean_val = sub["value"].mean()
        ax.axvline(mean_val, color="#d62728", linestyle=":", linewidth=1.6, alpha=0.9)

        ax.invert_yaxis()
        ax.set_title(format_metric(metric), fontsize=18)
        ax.set_xlabel("value")
        # Only the first column carries the domain label and the model tick labels.
        if j == 0:
            ax.set_ylabel(format_domain(domain))
        else:
            ax.set_ylabel("")
            ax.set_yticklabels([])

# The title must match the data sources used above: spearman_rho, rmse and
# js_divergence come from the N1 pointwise frame, while mae and coverage_rate
# come from the N50 distribution frame (the old title attributed mae to N1).
fig.suptitle(
    "Exp1 Lollipop View: N1 (spearman_rho/rmse/js_divergence) + N50 (mae/coverage_rate)",
    y=1.02,
    fontsize=18
)
plt.tight_layout()
plt.show()



In [None]:
import json
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1) Load every JSON file found under exp1/
root = Path("exp1")
rows = []

for f in root.rglob("*.json"):
    # Parent directory name -> model, grandparent directory name -> N setting.
    model = f.parent.name
    n = f.parents[1].name
    kind = ("distribution", "pointwise")["pointwise" in f.name]

    with f.open(encoding="utf-8") as fh:
        data = json.load(fh)

    for domain, metrics in data.items():
        row = {"N": n, "model": model, "kind": kind, "domain": domain}
        row.update(metrics)
        rows.append(row)

df = pd.DataFrame(rows)

# 2) Model name abbreviation
def shorten_model(name: str) -> str:
    """Return a short plotting label for a raw model name.

    Each pattern below is replaced wherever it occurs, in insertion order;
    a name containing none of them comes back unchanged.
    """
    # Insertion order is significant: "nothinking" precedes "thinking" so that
    # it maps to "NT" rather than "noT".
    replacements = {
        "gpt-5-mini-": "g5m-",
        "gpt_5_mini_": "g5m-",
        "gpt-3.5-": "g3.5-",
        "gemini-3-flash-preview-": "ge3-",
        "gemini_3_flash_": "ge3-",
        "Qwen3-30B-": "q3-",
        "A3B-Instruct-2507-FP8": "NT",
        "nothinking": "NT",
        "Thinking": "T",
        "thinking": "T",
        "medium": "T",
        "minimal": "NT",
        "turbo": "NT",
    }
    result = name
    for pattern in replacements:
        result = result.replace(pattern, replacements[pattern])
    return result

df["model_short"] = df["model"].apply(shorten_model)

# 3) Combine the five metrics: three from N1 pointwise, two from N50 distribution
join_keys = ["domain", "model_short"]
pointwise_n1 = (df["kind"] == "pointwise") & (df["N"] == "N1")
distribution_n50 = (df["kind"] == "distribution") & (df["N"] == "N50")

n1 = df.loc[pointwise_n1, join_keys + ["spearman_rho", "js_divergence", "rmse"]].copy()
n50 = df.loc[distribution_n50, join_keys + ["mae", "coverage_rate"]].copy()

# Inner join keeps only (domain, model) pairs that have both kinds of metrics.
merged = n1.merge(n50, on=join_keys, how="inner")

metrics_order = ["spearman_rho", "rmse", "js_divergence", "mae", "coverage_rate"]
plot_long = merged.melt(
    id_vars=join_keys,
    value_vars=metrics_order,
    var_name="metric",
    value_name="value",
)
plot_long["metric"] = pd.Categorical(
    plot_long["metric"], categories=metrics_order, ordered=True
)

# Preferred plotting order, restricted to the models actually present.
available_models = set(plot_long["model_short"].unique())
model_order = [
    m
    for m in (
        "g3.5-NT",
        "gpt-4o-mini",
        "g5m-NT",
        "g5m-T",
        "ge3-NT",
        "ge3-T",
        "q3-NT",
        "q3-T",
    )
    if m in available_models
]
# Short model name -> label shown on the y axis.
model_label_map = dict(
    zip(
        ("g3.5-NT", "gpt-4o-mini", "g5m-NT", "g5m-T", "ge3-NT", "ge3-T", "q3-NT", "q3-T"),
        ("GPT3.5", "GPT4om", "GPT5m-nt", "GPT5m-t", "GEM3f-nt", "GEM3f-t", "QWE3-nt", "QWE3-t"),
    )
)

def format_model(ms: str) -> str:
    """Look up the display label of a short model name; unmapped names pass through."""
    if ms in model_label_map:
        return model_label_map[ms]
    return ms

# Preferred row order; fall back to alphabetical when none of these domains is present.
known_domains = plot_long["domain"].unique()
domain_order = [d for d in ("spending", "labor", "credit") if d in known_domains]
if len(domain_order) == 0:
    domain_order = sorted(plot_long["domain"].unique())

# Domain key -> row label.
domain_label_map = {"spending": "Spending", "labor": "Labor", "credit": "Credit"}
# Metric key -> panel title.
metric_label_map = {
    "spearman_rho": "Spearman_rho",  # fixed: was misspelled "Sperman_rho"
    "js_divergence": "JS_Divergence",
    "rmse": "RMSE",
    "mae": "MAE",
    "coverage_rate": "Coverage_Rate",
}

def format_domain(d: str) -> str:
    """Return the display label for a domain; unmapped domains are title-cased."""
    return domain_label_map.get(d, d.title())

def format_metric(m: str) -> str:
    """Return the display label for a metric; unmapped metrics are returned as-is."""
    return metric_label_map.get(m, m)

# 4) Lollipop chart + mean guide line + thinking / non-thinking styling
#    + highlighted stems for models that beat the reference threshold
theme_rc = {
    "font.size": 18,
    "axes.titlesize": 18,
    "axes.labelsize": 18,
    "legend.fontsize": 18,
    "figure.titlesize": 18,
}
for tick_key in ("xtick.labelsize", "ytick.labelsize"):
    theme_rc[tick_key] = 15
sns.set_theme(style="whitegrid", font="Times New Roman", rc=theme_rc)

# One row per domain, one column per metric.
grid_rows = len(domain_order)
grid_cols = len(metrics_order)
fig, axes = plt.subplots(
    grid_rows,
    grid_cols,
    figsize=(3.0 * grid_cols, 2.9 * grid_rows),
    sharex=False,
    sharey=False,
)

# plt.subplots squeezes singleton dimensions; rebuild a nested sequence so that
# axes[i][j] is always valid.
if grid_rows == 1:
    axes = [[axes]] if grid_cols == 1 else [axes]
elif grid_cols == 1:
    axes = [[ax] for ax in axes]

def line_style(model_short: str) -> str:
    """Return the stem line style; solid for both "NT" and non-"NT" models."""
    if "NT" in model_short:
        return "-"
    return "-"

def marker_style(model_short: str) -> str:
    """Return "o" for "NT" models and gpt-4o-mini, "s" for every other model."""
    if "NT" in model_short:
        return "o"
    if "gpt-4o-mini" in model_short:
        return "o"
    return "s"

# Direction: 1 = higher is better, -1 = lower is better
metric_dir = dict(
    spearman_rho=1,
    coverage_rate=1,
    js_divergence=-1,
    rmse=-1,
    mae=-1,
)

def is_g35_or_qwen(ms: str) -> bool:
    """Tell whether a short model name matches the g3.5 / q3- / qwen patterns."""
    return any(("g3.5" in ms, ms.startswith("q3-"), "qwen" in ms))

def is_gemini3_or_gpt5(ms: str) -> bool:
    """Tell whether a short model name contains "ge3-" or "g5"."""
    if "ge3-" in ms:
        return True
    return "g5" in ms

# Display labels in plotting order; filtered per panel below.
display_candidates = [
    "GPT3.5",
    "GPT4om",
    "GPT5m-nt",
    "GPT5m-t",
    "GEM3f-nt",
    "GEM3f-t",
    "QWE3-nt",
    "QWE3-t",
]
# Reference models that the QWE3 and GPT3.5 stems are compared against.
reference_models = ["g5m-NT", "g5m-T", "ge3-NT", "ge3-T"]
# Stem colour used when a model with this name prefix beats the reference
# threshold (red for QWE3, blue for GPT3.5).
highlight_by_prefix = {"q3-": "#d62728", "g3.5": "#1f77b4"}

for i, domain in enumerate(domain_order):
    for j, metric in enumerate(metrics_order):
        ax = axes[i][j]

        panel_mask = (plot_long["domain"] == domain) & (plot_long["metric"] == metric)
        sub = plot_long[panel_mask].copy()
        # NOTE(review): models missing from model_order become NaN here, and
        # line_style() below would then raise TypeError -- confirm every model is listed.
        sub["model_short"] = pd.Categorical(sub["model_short"], categories=model_order, ordered=True)
        sub = sub.sort_values("model_short")
        sub["model_display"] = sub["model_short"].map(format_model)
        model_display_order = [
            m for m in display_candidates if m in sub["model_display"].unique()
        ]
        sub["model_display"] = pd.Categorical(
            sub["model_display"], categories=model_display_order, ordered=True
        )

        # Threshold = the weakest reference value for this metric, so "beating
        # the threshold" means beating at least one reference model.
        ref_vals = sub[sub["model_short"].isin(reference_models)]["value"]
        direction = metric_dir.get(metric, 1)
        if len(ref_vals) == 0:
            best_ref = None
        elif direction == 1:
            best_ref = ref_vals.min()
        else:
            best_ref = ref_vals.max()

        # Draw each model separately so its style can be set individually.
        for _, r in sub.iterrows():
            ls = line_style(r["model_short"])
            mk = marker_style(r["model_short"])

            line_color = "#9aa0a6"
            if best_ref is not None:
                if direction == 1:
                    better = r["value"] > best_ref
                else:
                    better = r["value"] < best_ref
                if better:
                    short_name = str(r["model_short"])
                    for prefix, highlight in highlight_by_prefix.items():
                        if short_name.startswith(prefix):
                            line_color = highlight

            ax.hlines(
                y=r["model_display"],
                xmin=0,
                xmax=r["value"],
                color=line_color,
                linewidth=1.8,
                linestyle=ls,
                alpha=0.95,
            )
            ax.scatter(
                r["value"],
                r["model_display"],
                s=55,
                marker=mk,
                color="#1f77b4",
                zorder=3,
            )

        # Zero reference line
        ax.axvline(0, color="black", linestyle="--", linewidth=1, alpha=0.7)

        # Mean guide line
        mean_val = sub["value"].mean()
        ax.axvline(mean_val, color="#d62728", linestyle=":", linewidth=1.6, alpha=0.9)

        ax.invert_yaxis()
        ax.set_title(format_metric(metric), fontsize=18)
        ax.set_xlabel("Value")
        # Only the first column carries the domain label and the model tick labels.
        if j == 0:
            ax.set_ylabel(format_domain(domain))
        else:
            ax.set_ylabel("")
            ax.set_yticklabels([])

# The title must match the data sources used above: spearman_rho, rmse and
# js_divergence come from the N1 pointwise frame, while mae and coverage_rate
# come from the N50 distribution frame (the old title attributed mae to N1).
fig.suptitle(
    "Exp1 Lollipop View: N1 (spearman_rho/rmse/js_divergence) + N50 (mae/coverage_rate)",
    y=1.02,
    fontsize=18
)
plt.tight_layout()
plt.show()

# Save the figure as <out_dir>/<root.name>.pdf, creating the directory if needed.
# NOTE(review): the output base directory is a hard-coded absolute path.
out_dir = Path("/xxxxxxxxxxxxxx/result_analysed_final/exp1").joinpath(root.name)
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir.joinpath(f"{root.name}.pdf")

# bbox_inches="tight" keeps the suptitle placed above the axes (y=1.02) inside the file.
fig.savefig(out_path, dpi=300, bbox_inches="tight")
print("Saved:", out_path)