In [1]:
import os
import json
import numpy as np

def parse_dataset_from_fname(fname: str):
    """Return the dataset tag embedded in a result file name.

    The tag is the second-to-last ``_``-separated field, e.g.
    ``Qwen3-4B-Base-math-puzzle-science_PUZZLE_n5.jsonl`` -> ``"PUZZLE"``.
    Because only that single field is taken, a name such as
    ``..._skywork_math_n5.jsonl`` yields ``"math"``.

    Returns ``"UNKNOWN"`` when the name has fewer than two fields or when
    ``fname`` cannot be split at all (e.g. it is not a string).
    """
    try:
        # Only the last two separators matter, so split from the right and
        # unpack; too few fields raises and falls through to the fallback.
        *_, dataset, _suffix = fname.rsplit("_", 2)
    except Exception:
        return "UNKNOWN"
    return dataset

def to_bool(x):
    """Coerce a loosely-typed "correct" flag to a plain Python ``bool``.

    Accepted inputs:
      * ``bool`` / ``np.bool_``      -> their truth value
      * ``int`` / ``np.integer``     -> ``True`` when non-zero
      * ``float`` / ``np.floating``  -> ``True`` when non-zero and not NaN
      * ``str``                      -> ``True`` for "true", "1" or "yes"
                                        (case-insensitive, surrounding
                                        whitespace ignored)

    Anything else (``None``, lists, dicts, ...) is treated as ``False``.

    Floats and strings used to fall through to ``False``, which silently
    counted records such as ``{"correct": 1.0}`` or ``{"correct": "true"}``
    as wrong.
    """
    if isinstance(x, (bool, np.bool_)):
        return bool(x)
    if isinstance(x, (int, np.integer)):
        return bool(x)
    if isinstance(x, (float, np.floating)):
        # NaN is truthy in Python; a missing/undefined score is not "correct".
        return bool(x) and not np.isnan(x)
    if isinstance(x, str):
        return x.strip().lower() in ("true", "1", "yes")
    return False


def compute_folder_acc(folder):
    """Compute per-file accuracy for every ``.jsonl`` file directly in ``folder``.

    Each file is read line by line as JSON. A line contributes to the tally
    only if it parses to a JSON object that has a ``"correct"`` key; its value
    is coerced with ``to_bool``. Blank lines, lines that are not valid JSON,
    lines that parse to something other than an object (numbers, lists,
    strings, ``null``) and objects without ``"correct"`` are skipped.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        dict mapping file name -> ``{"total": int, "correct": int, "acc": float}``,
        with files visited in sorted name order. ``acc`` is ``0.0`` for a file
        with no scorable records.

    Raises:
        OSError: if ``folder`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    results = {}

    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".jsonl"):
            continue

        path = os.path.join(folder, fname)
        # A directory that happens to be named "*.jsonl" cannot be opened as
        # a file; skip it instead of aborting the whole scan.
        if not os.path.isfile(path):
            continue

        total = 0
        correct_cnt = 0

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue

                # Valid JSON need not be an object: `5` or `null` would raise
                # TypeError on the membership test, and `["correct"]` would
                # pass it but fail on indexing. Only dict records are scored.
                if not isinstance(obj, dict) or "correct" not in obj:
                    continue

                total += 1
                correct_cnt += int(to_bool(obj["correct"]))

        acc = correct_cnt / total if total > 0 else 0.0
        results[fname] = {
            "total": total,
            "correct": correct_cnt,
            "acc": acc,
        }

    return results

In [24]:

from collections import defaultdict

# Score every .jsonl file under this results folder.
folder = "/scratch/pioneer/jobs/wxy320/verl/motivation"
stats = compute_folder_acc(folder)

# Bucket the per-file stats by dataset tag: dataset -> [(fname, stat), ...]
by_dataset = defaultdict(list)
for fname, s in stats.items():
    by_dataset[parse_dataset_from_fname(fname)].append((fname, s))

# ===== Print: one section per dataset, files in name order =====
for dataset in sorted(by_dataset):
    print(f"\n===== Dataset: {dataset} =====")
    for fname, s in sorted(by_dataset[dataset]):
        print(f"{fname:60s} | total={s['total']:6d} | acc={s['acc']:.4f}")


===== Dataset: PUZZLE =====
Qwen3-4B-Base-logic_PUZZLE_n5.jsonl                          | total=  1975 | acc=0.0522
Qwen3-4B-Base-math_PUZZLE_n5.jsonl                           | total=  1975 | acc=0.0466
Qwen3-4B-Base-puzzle_PUZZLE_n5.jsonl                         | total=  1975 | acc=0.2015
Qwen3-4B-Base-science_PUZZLE_n5.jsonl                        | total=  1975 | acc=0.0405

===== Dataset: kk =====
Qwen3-4B-Base-logic_kk_n5.jsonl                              | total=  3500 | acc=0.8814
Qwen3-4B-Base-math_kk_n5.jsonl                               | total=  3500 | acc=0.3426
Qwen3-4B-Base-puzzle_kk_n5.jsonl                             | total=  3500 | acc=0.3311
Qwen3-4B-Base-science_kk_n5.jsonl                            | total=  3500 | acc=0.2811

===== Dataset: math =====
Qwen3-4B-Base-logic_skywork_math_n5.jsonl                    | total=  2500 | acc=0.7160
Qwen3-4B-Base-math_skywork_math_n5.jsonl                     | total=  2500 | acc=0.8276
Qwen3-4B-Base-puzzle_skywork_

In [3]:

from collections import defaultdict

# Score every .jsonl file under this results folder.
folder = "/scratch/pioneer/jobs/wxy320/verl/change"
stats = compute_folder_acc(folder)

# Bucket the per-file stats by dataset tag: dataset -> [(fname, stat), ...]
by_dataset = defaultdict(list)
for fname, s in stats.items():
    by_dataset[parse_dataset_from_fname(fname)].append((fname, s))

# ===== Print: one section per dataset, files in name order =====
for dataset in sorted(by_dataset):
    print(f"\n===== Dataset: {dataset} =====")
    for fname, s in sorted(by_dataset[dataset]):
        print(f"{fname:60s} | total={s['total']:6d} | acc={s['acc']:.4f}")


===== Dataset: PUZZLE =====
Qwen3-4B-Base-logic-puzzle_PUZZLE_n5.jsonl                   | total=  1975 | acc=0.2030
Qwen3-4B-Base-math-logic_PUZZLE_n5.jsonl                     | total=  1975 | acc=0.0567
Qwen3-4B-Base-math-puzzle_PUZZLE_n5.jsonl                    | total=  1975 | acc=0.1853
Qwen3-4B-Base-puzzle-logic_PUZZLE_n5.jsonl                   | total=  1975 | acc=0.2046
Qwen3-4B-Base-puzzle-math_PUZZLE_n5.jsonl                    | total=  1975 | acc=0.1970
Qwen3-4B-Base-puzzle-science_PUZZLE_n5.jsonl                 | total=  1975 | acc=0.1909
Qwen3-4B-Base-puzzle_PUZZLE_n5.jsonl                         | total=  1975 | acc=0.2015
Qwen3-4B-Base-science-puzzle_PUZZLE_n5.jsonl                 | total=  1975 | acc=0.2187

===== Dataset: kk =====
Qwen3-4B-Base-logic-math_kk_n5.jsonl                         | total=  3500 | acc=0.8600
Qwen3-4B-Base-logic-puzzle_kk_n5.jsonl                       | total=  3500 | acc=0.8814
Qwen3-4B-Base-logic-science_kk_n5.jsonl                 

In [10]:
from collections import defaultdict

# Score every .jsonl file under this results folder.
folder = "/scratch/pioneer/jobs/wxy320/verl/result"
stats = compute_folder_acc(folder)

# Bucket the per-file stats by dataset tag: dataset -> [(fname, stat), ...]
by_dataset = defaultdict(list)
for fname, s in stats.items():
    by_dataset[parse_dataset_from_fname(fname)].append((fname, s))

# ===== Print: one section per dataset, files in name order =====
for dataset in sorted(by_dataset):
    print(f"\n===== Dataset: {dataset} =====")
    for fname, s in sorted(by_dataset[dataset]):
        print(f"{fname:60s} | total={s['total']:6d} | acc={s['acc']:.4f}")


===== Dataset: PUZZLE =====
Qwen3-4B-Base-math-puzzle-science_PUZZLE_n5.jsonl            | total=  1975 | acc=0.1894
Qwen3-4B-Base-math-science-puzzle_PUZZLE_n5.jsonl            | total=  1975 | acc=0.1706
Qwen3-4B-Base-msp_PUZZLE_n5.jsonl                            | total=  1975 | acc=0.2142
Qwen3-4B-Base-puzzle-math-science_PUZZLE_n5.jsonl            | total=  1975 | acc=0.1899
Qwen3-4B-Base-puzzle-science-math_PUZZLE_n5.jsonl            | total=  1975 | acc=0.1772
Qwen3-4B-Base-science-math-puzzle_PUZZLE_n5.jsonl            | total=  1975 | acc=0.2020
Qwen3-4B-Base-science-puzzle-math_PUZZLE_n5.jsonl            | total=  1975 | acc=0.2010

===== Dataset: math =====
Qwen3-4B-Base-math-puzzle-science_skywork_math_n5.jsonl      | total=  2500 | acc=0.8176
Qwen3-4B-Base-math-science-puzzle_skywork_math_n5.jsonl      | total=  2500 | acc=0.8048
Qwen3-4B-Base-msp_skywork_math_n5.jsonl                      | total=  2500 | acc=0.7992
Qwen3-4B-Base-puzzle-math-science_skywork_math_n5.json

In [2]:
from collections import defaultdict

# Score every .jsonl file under this results folder.
folder = "/scratch/pioneer/jobs/wxy320/verl/change_logic"
stats = compute_folder_acc(folder)

# Bucket the per-file stats by dataset tag: dataset -> [(fname, stat), ...]
by_dataset = defaultdict(list)
for fname, s in stats.items():
    by_dataset[parse_dataset_from_fname(fname)].append((fname, s))

# ===== Print: one section per dataset, files in name order =====
for dataset in sorted(by_dataset):
    print(f"\n===== Dataset: {dataset} =====")
    for fname, s in sorted(by_dataset[dataset]):
        print(f"{fname:60s} | total={s['total']:6d} | acc={s['acc']:.4f}")


===== Dataset: kk =====
Qwen3-4B-Base-logic-math-science_kk_n5.jsonl                 | total=  3500 | acc=0.8731
Qwen3-4B-Base-logic-science-math_kk_n5.jsonl                 | total=  3500 | acc=0.8171
Qwen3-4B-Base-math-logic-science_kk_n5.jsonl                 | total=  3500 | acc=0.8300
Qwen3-4B-Base-math-science-logic_kk_n5.jsonl                 | total=  3500 | acc=0.7591
Qwen3-4B-Base-msl_kk_n5.jsonl                                | total=  3500 | acc=0.9537
Qwen3-4B-Base-science-logic-math_kk_n5.jsonl                 | total=  3500 | acc=0.7620
Qwen3-4B-Base-science-math-logic_kk_n5.jsonl                 | total=  3500 | acc=0.6380

===== Dataset: math =====
Qwen3-4B-Base-logic-math-science_skywork_math_n5.jsonl       | total=  2500 | acc=0.8400
Qwen3-4B-Base-logic-science-math_skywork_math_n5.jsonl       | total=  2500 | acc=0.7696
Qwen3-4B-Base-math-logic-science_skywork_math_n5.jsonl       | total=  2500 | acc=0.8256
Qwen3-4B-Base-math-science-logic_skywork_math_n5.jsonl    

In [2]:
from collections import defaultdict

# Score every .jsonl file under this results folder.
folder = "/scratch/pioneer/jobs/wxy320/verl/change_logic2"
stats = compute_folder_acc(folder)

# Bucket the per-file stats by dataset tag: dataset -> [(fname, stat), ...]
by_dataset = defaultdict(list)
for fname, s in stats.items():
    by_dataset[parse_dataset_from_fname(fname)].append((fname, s))

# ===== Print: one section per dataset, files in name order =====
for dataset in sorted(by_dataset):
    print(f"\n===== Dataset: {dataset} =====")
    for fname, s in sorted(by_dataset[dataset]):
        print(f"{fname:60s} | total={s['total']:6d} | acc={s['acc']:.4f}")


===== Dataset: PUZZLE =====
Qwen3-4B-Base-logic-puzzle_PUZZLE_n5.jsonl                   | total=  1975 | acc=0.2030

===== Dataset: kk =====
Qwen3-4B-Base-logic-puzzle_kk_n5.jsonl                       | total=  3500 | acc=0.8814
Qwen3-4B-Base-science-math-logic_kk_n5.jsonl                 | total=  3500 | acc=0.6351

===== Dataset: science =====
Qwen3-4B-Base-math-sl_science_n5.jsonl                       | total=   990 | acc=0.3939
