In [16]:
import re
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"


import json
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Dict, Any
import sys
import subprocess
import shutil
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams
from tqdm.auto import tqdm

In [17]:
# Sanity check: report how many CUDA devices torch can see
# (CUDA_VISIBLE_DEVICES is set in the imports cell before torch is imported).
import torch
print("visible cuda devices:", torch.cuda.device_count())

visible cuda devices: 4


In [18]:
# Migrated from verl.utils.reward_score.math_verify (compute_score).
# Reduces reliance on installing verl; evaluation should be verl-free.
try:
    from math_verify.errors import TimeoutException
    from math_verify.metric import math_metric
    from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig
except ImportError:
    print("To use Math-Verify, please install it first by running `pip install math-verify`.")


def compute_score(model_output: str, ground_truth: str, timeout_score: float = 0) -> float:
    r"""Score a model response against a ground-truth answer with Math-Verify.

    The ground truth is wrapped in ``\boxed{...}`` and compared with
    ``model_output`` by a ``math_metric`` verifier that extracts LaTeX from the
    gold answer and either plain expressions or LaTeX from the prediction.

    Args:
        model_output: Raw text produced by the model.
        ground_truth: Reference answer (without ``\boxed{}``).
        timeout_score: Score to return when verification raises
            ``TimeoutException``.

    Returns:
        The score reported by the verifier, ``timeout_score`` on a timeout, or
        ``0.0`` if verification raised any other exception.
    """
    verify_func = math_metric(
        gold_extraction_target=(LatexExtractionConfig(),),
        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
    )
    ret_score = 0.0

    # Wrap the ground truth in \boxed{} format for verification
    ground_truth_boxed = "\\boxed{" + ground_truth + "}"
    try:
        ret_score, _ = verify_func([ground_truth_boxed], [model_output])
    except TimeoutException:
        # Must be handled before the generic handler below; otherwise a
        # timeout that derives from Exception would be swallowed as a plain
        # failure and timeout_score would never be used.
        ret_score = timeout_score
    except Exception:
        # Deliberately best-effort: an unparsable or unverifiable answer
        # scores 0.0 instead of aborting the whole evaluation run.
        pass

    return ret_score

# Traverse Ckpts

In [4]:
@dataclass(frozen=True)
class CheckpointRef:
    """Immutable reference to one checkpoint step inside an experiment directory."""
    exp_dir: Path  # experiment directory that contains the step directories
    exp_name: str  # name of the experiment (scan_exp_dir uses exp_dir's last path component)
    step: int  # training step number parsed from the step directory name
    step_dir: Path  # the per-step directory, e.g. .../global_step_60
    actor_dir: Path  # "actor" directory located under step_dir

def parse_step_from_stepdir(step_dir_name: str) -> Optional[int]:
    """
    Extract the step number from a checkpoint directory name.

    Recognised spellings include:
      global_step_60
      global_steps_60
      step_60
      global_step60 (less common)

    Returns the step as an int, or None when the name contains no step marker
    followed by digits.
    """
    found = re.search(r"(global_steps|global_step|step)[^\d]*(\d+)", step_dir_name)
    if found is None:
        return None
    # Group 2 holds the digit run that follows the step marker.
    return int(found.group(2))

def find_actor_dir(step_dir: Path) -> Path:
    """Locate the ``actor`` directory that belongs to a checkpoint step.

    Lookup order:
      1. ``<step_dir>/actor`` if it is a directory.
      2. Otherwise, any nested directory named ``actor``; one containing
         ``fsdp_config.json`` is preferred.
      3. Otherwise, the first nested ``actor`` directory.

    Nested candidates are ordered shallowest-first and then by path, so the
    result does not depend on the filesystem's listing order.

    Args:
        step_dir: The per-step checkpoint directory to search.

    Returns:
        Path to the selected ``actor`` directory.

    Raises:
        FileNotFoundError: If no directory named ``actor`` exists under
            ``step_dir``.
    """
    direct = step_dir / "actor"
    if direct.is_dir():
        return direct
    # rglob yields entries in filesystem order, which varies between machines;
    # sort so that repeated scans always pick the same directory.
    cands = sorted(
        (p for p in step_dir.rglob("actor") if p.is_dir()),
        key=lambda p: (len(p.parts), str(p)),
    )
    if not cands:
        raise FileNotFoundError(f"Cannot find actor dir under {step_dir}")
    # Prefer the one that looks like FSDP dir
    for p in cands:
        if (p / "fsdp_config.json").exists():
            return p
    return cands[0]

def scan_exp_dir(exp_dir: str) -> List[CheckpointRef]:
    """
    Collect the checkpoints stored directly under a single experiment directory.

    Expected layout:
      <exp_dir>/
        global_step_0/
          actor/...
        global_step_60/
          actor/...
        ...

    Entries that are not directories, or whose name carries no step number,
    are ignored. Raises FileNotFoundError when <exp_dir> is not a directory
    (and propagates the one raised when a step directory has no actor dir).

    Returns the CheckpointRef list ordered by ascending step.
    """
    root = Path(exp_dir).expanduser().resolve()
    if not root.is_dir():
        raise FileNotFoundError(f"exp_dir not found: {root}")

    refs: List[CheckpointRef] = []
    for entry in root.iterdir():
        # Only sub-directories whose name encodes a step are checkpoints.
        step_no = parse_step_from_stepdir(entry.name) if entry.is_dir() else None
        if step_no is None:
            continue
        refs.append(
            CheckpointRef(
                exp_dir=root,
                exp_name=root.name,
                step=step_no,
                step_dir=entry,
                actor_dir=find_actor_dir(entry),
            )
        )

    return sorted(refs, key=lambda ref: ref.step)

In [5]:

# Experiment directory whose step checkpoints are scanned below.
checkpoint_dir = "/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2"

# Discover every checkpoint step under the experiment directory (sorted by step) and display them.
ckpts = scan_exp_dir(checkpoint_dir)
ckpts

[CheckpointRef(exp_dir=PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2'), exp_name='openthoughts-grpo-qwen3_0p6b_fsdp2', step=60, step_dir=PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/global_step_60'), actor_dir=PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/global_step_60/actor')),
 CheckpointRef(exp_dir=PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2'), exp_name='openthoughts-grpo-qwen3_0p6b_fsdp2', step=120, step_dir=PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/global_step_120'), actor_dir=PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/gr

# Checkpoint converter

In [6]:
def is_merged_hf_dir(model_dir: Path) -> bool:
    """Return True when ``model_dir`` is a directory that contains ``config.json``."""
    if not model_dir.is_dir():
        return False
    config_file = model_dir / "config.json"
    return config_file.exists()

def default_cache_root_for_exp(exp_dir: Path) -> Path:
    """Return ``<exp_dir>/_merged_hf_cache``, the directory under which merged checkpoints are placed."""
    return exp_dir.joinpath("_merged_hf_cache")

def default_target_dir_for_ckpt(ckpt: CheckpointRef) -> Path:
    """Return the merge output directory for ``ckpt``: ``<cache root>/global_step_<step>``."""
    cache_root = default_cache_root_for_exp(ckpt.exp_dir)
    return cache_root.joinpath(f"global_step_{ckpt.step}")

def run_cmd_capture(cmd):
    """Run ``cmd`` as a subprocess, capturing its text output.

    The command line is echoed with a ``[CMD]`` prefix before it runs.
    Returns ``(stdout, stderr)`` on success. On a non-zero exit status a
    RuntimeError is raised carrying the return code and the last 4000
    characters of each stream.
    """
    print("[CMD]", " ".join(str(tok) for tok in cmd))
    argv = [str(tok) for tok in cmd]
    proc = subprocess.run(argv, capture_output=True, text=True)
    if proc.returncode == 0:
        return proc.stdout, proc.stderr
    # Keep only the tail of each stream so the error stays readable.
    raise RuntimeError(
        f"rc={proc.returncode}\nSTDOUT:\n{proc.stdout[-4000:]}\nSTDERR:\n{proc.stderr[-4000:]}\n"
    )

def merge_one_fsdp_ckpt(
    ckpt: CheckpointRef,
    backend: str = "fsdp",
    force: bool = False,
    python_exec: Optional[str] = None,
) -> Path:
    """Merge one sharded checkpoint into its cache directory via ``verl.model_merger``.

    Args:
        ckpt: Checkpoint whose ``actor_dir`` is merged.
        backend: Value passed to the merger's ``--backend`` option.
        force: If True, delete any existing target directory and merge again;
            if False, an already merged target is reused.
        python_exec: Interpreter used to run the merger module; defaults to
            the interpreter running this code.

    Returns:
        Path of the target directory holding the merged model.

    Raises:
        RuntimeError: If the merger subprocess exits with a non-zero status
            (raised by ``run_cmd_capture``).
    """
    tgt = default_target_dir_for_ckpt(ckpt)
    if is_merged_hf_dir(tgt) and not force:
        print(f"[skip] {tgt}")
        return tgt
    if force and tgt.exists():
        shutil.rmtree(tgt)
    tgt.mkdir(parents=True, exist_ok=True)
    if python_exec is None:
        python_exec = sys.executable
    cmd = [
        python_exec, "-m", "verl.model_merger", "merge",
        "--backend", backend,
        "--local_dir", str(ckpt.actor_dir),
        "--target_dir", str(tgt),
    ]
    try:
        run_cmd_capture(cmd)
    except BaseException:
        # A failed or interrupted merge can leave a half-written directory.
        # Remove it so a later call cannot mistake it for a finished merge
        # (the skip check above only looks for config.json).
        shutil.rmtree(tgt, ignore_errors=True)
        raise
    print(f"[ok] merged -> {tgt}")
    return tgt

def merge_all_steps_for_exp_dir(
    exp_dir: str,
    backend: str = "fsdp",
    force: bool = False,
) -> List[Tuple[int, Path]]:
    """Merge every checkpoint found under ``exp_dir``.

    Checkpoints are processed in the order returned by ``scan_exp_dir``.
    Returns a list of ``(step, merged_dir)`` pairs in that same order.
    """
    return [
        (ref.step, merge_one_fsdp_ckpt(ref, backend=backend, force=force))
        for ref in scan_exp_dir(exp_dir)
    ]

In [7]:
# Merge every checkpoint of the experiment; with force=False, steps that are already merged are skipped.
merged = merge_all_steps_for_exp_dir(checkpoint_dir, backend="fsdp", force=False)
# Show the first three (step, merged_dir) pairs and the total number of steps.
merged[:3], len(merged)

[skip] /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60
[skip] /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_120
[skip] /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_180
[skip] /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_240
[skip] /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_300
[skip] /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_319


([(60,
   PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60')),
  (120,
   PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_120')),
  (180,
   PosixPath('/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_180'))],
 6)

In [8]:
# Smoke test: load the first merged checkpoint with Hugging Face transformers
# and generate one chat completion to confirm the merged weights are usable.
step, model_dir = merged[0]
model_dir = Path(model_dir)
print("Testing:", step, model_dir)

# model_name = "Qwen/Qwen3-0.6B"
model_name = model_dir

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",  # `torch_dtype` is deprecated in the installed transformers
    device_map="auto"
)

# prepare the model input
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1024
)
# Keep only the newly generated tokens (drop the prompt prefix).
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# parsing thinking content
# Look the end-of-thinking token up by name instead of hard-coding its id
# (151668 in the Qwen3 vocabulary), so the split follows the loaded tokenizer.
think_end_id = tokenizer.convert_tokens_to_ids("</think>")
try:
    # rindex: position just after the last </think> token
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
except ValueError:
    # No </think> in the output: treat everything as final content.
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)


Testing: 60 /mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60


The tokenizer you are loading from '/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!


thinking content: <think>
Okay, the user wants a short introduction to a large language model. Let me start by recalling the key points. Large language models are AI systems designed to understand and generate human language, right? They can process text in multiple languages and perform various tasks like answering questions, writing, or even creative writing.

I should mention that they're trained on massive amounts of text, which allows them to learn complex patterns and nuances. Also, they can handle multiple languages and adapt to different contexts. Maybe include how they're used in various applications like customer service, content creation, or research.

Wait, should I add something about the training data and the model's capabilities? That could make it more comprehensive. Also, maybe touch on their efficiency and performance. Make sure the language is simple and engaging for someone who might not be familiar with the technical terms. Avoid jargon but still sound informative.

# Validation Parquet

In [9]:
def _de_numpy(x):
    """Convert numpy containers/scalars into pure Python recursively."""
    # numpy scalar -> python scalar
    if isinstance(x, np.generic):
        return x.item()
    # numpy array -> list (then recurse)
    if isinstance(x, np.ndarray):
        return [_de_numpy(v) for v in x.tolist()]
    # dict
    if isinstance(x, dict):
        return {k: _de_numpy(v) for k, v in x.items()}
    # list/tuple
    if isinstance(x, (list, tuple)):
        return [_de_numpy(v) for v in x]
    return x

def load_records_from_parquet(parquet_path: str):
    """Read a parquet file into a list of plain-Python record dicts.

    Each row is stripped of numpy types via ``_de_numpy`` and its ``"prompt"``
    entry is normalised to a Python list: a missing/None prompt becomes ``[]``,
    a single dict is wrapped in a list, a list is kept, and any other value is
    passed through ``list(...)`` as a best effort.
    """
    rows = pd.read_parquet(parquet_path).to_dict("records")

    cleaned = []
    for row in rows:
        row = _de_numpy(row)

        # The prompt may have been stored as an object ndarray of message dicts;
        # after _de_numpy it is normally already a list.
        prompt = row.get("prompt", None)
        if prompt is None:
            normalized = []
        elif isinstance(prompt, dict):
            # (edge) a lone message dict -> one-element list
            normalized = [prompt]
        elif isinstance(prompt, list):
            normalized = prompt
        else:
            # unexpected container type: best-effort conversion
            normalized = list(prompt)
        row["prompt"] = normalized

        cleaned.append(row)
    return cleaned

In [10]:
# Parquet file holding the validation split used for evaluation.
parquet_path = "/mnt/local/shared/michaelw/mlf2/verl/reproduce/data/openthoughts3/local_parquet_dir/test.parquet"
# Load the rows as plain-Python dicts with a normalized "prompt" list.
records = load_records_from_parquet(parquet_path)

In [11]:
# Number of records loaded from the parquet file.
len(records)

4787

In [12]:
# Inspect the first record to see which fields are available.
records[0]

{'data_source': 'open-thoughts/OpenThoughts3-1.2M',
 'prompt': [{'content': "Let's think step by step and solve this problem. Find the smallest possible value of the sum $\\lvert x + 1\\rvert + \\lvert x + 3\\rvert + \\lvert x + 7\\rvert$.",
   'role': 'user'}],
 'ability': 'math',
 'reward_model': {'ground_truth': '6', 'style': 'rule'},
 'extra_info': {'answer': '<think> Okay, so I need to find the smallest possible value of the sum |x + 1| + |x + 3| + |x + 7|. Hmm, absolute value expressions can sometimes be tricky because they change their behavior depending on whether the inside is positive or negative. I remember that the sum of absolute values often has its minimum at a median point or something like that. Maybe I should think about how the different terms behave as x changes.\n\nFirst, let me recall that the absolute value function |a| is equal to a when a ≥ 0 and -a when a < 0. So, each term in the sum |x +1|, |x +3|, and |x +7| will change their expressions when x is equal to 

In [13]:
# First chat message of the first record's prompt.
records[0]["prompt"][0]

{'content': "Let's think step by step and solve this problem. Find the smallest possible value of the sum $\\lvert x + 1\\rvert + \\lvert x + 3\\rvert + \\lvert x + 7\\rvert$.",
 'role': 'user'}

In [14]:
# Ground-truth answer string of the first record (the value predictions are scored against).
records[0]["reward_model"]["ground_truth"]

'6'

In [19]:
def build_vllm_llm(
    model_dir: str,
    gpu_memory_utilization: float = 0.8,
    dtype: str = "auto",
    max_model_len: int | None = None,
    tensor_parallel_size: int = 1,
) -> LLM:
    """Construct a vLLM ``LLM`` for the model stored at ``model_dir``.

    ``max_model_len`` is forwarded only when it is given; every other argument
    is always passed through, and stats logging is disabled.
    """
    engine_args = {
        "model": str(model_dir),
        "dtype": dtype,
        "tensor_parallel_size": tensor_parallel_size,
        "gpu_memory_utilization": gpu_memory_utilization,
        "disable_log_stats": True,
    }
    if max_model_len is None:
        # Leave the context length to vLLM when the caller did not set one.
        return LLM(**engine_args)
    return LLM(**engine_args, max_model_len=max_model_len)


def vllm_generate_solutions(
    llm: LLM,
    records: List[Dict[str, Any]],
    batch_size: int = 32,
    max_new_tokens: int = 512,
    tqdm_desc: str = "vLLM inference",
) -> List[str]:
    """Generate one completion per record with ``llm.chat``.

    Records are sent in slices of ``batch_size``; each record's ``"prompt"``
    (a list of chat messages) is the input. Sampling uses temperature 0.0,
    top_p 1.0 and at most ``max_new_tokens`` tokens. Returns the generated
    texts in the same order as ``records``.
    """
    sampling = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        max_tokens=max_new_tokens,
    )

    batch_starts = range(0, len(records), batch_size)
    n_batches = (len(records) + batch_size - 1) // batch_size

    completions: List[str] = []
    # One progress tick per batch.
    for start in tqdm(batch_starts, desc=tqdm_desc, total=n_batches):
        # llm.chat takes a list of conversations, each a list of {"role", "content"} dicts.
        conversations = [rec["prompt"] for rec in records[start:start + batch_size]]
        results = llm.chat(conversations, sampling_params=sampling)
        completions.extend([res.outputs[0].text for res in results])

    return completions


def eval_records_with_vllm(
    model_dir: str,
    records: List[Dict[str, Any]],
    batch_size: int = 32,
    max_new_tokens: int = 512,
    gpu_memory_utilization: float = 0.90,
    dtype: str = "auto",
    max_model_len: int | None = None,
    tensor_parallel_size: int = 1,
) -> Tuple[float, List[bool], List[str]]:
    """Generate answers for ``records`` with vLLM and score them.

    A new vLLM engine is built for ``model_dir``, used to generate one
    completion per record, and then released so that another checkpoint can be
    evaluated in the same process.

    Args:
        model_dir: Directory of the model to load.
        records: Records providing ``"prompt"`` and
            ``["reward_model"]["ground_truth"]``.
        batch_size: Number of prompts per generation call.
        max_new_tokens: Maximum number of generated tokens per prompt.
        gpu_memory_utilization: Forwarded to the vLLM engine.
        dtype: Forwarded to the vLLM engine.
        max_model_len: Forwarded to the vLLM engine when not None.
        tensor_parallel_size: Forwarded to the vLLM engine.

    Returns:
        ``(accuracy, per_record_correct, solutions)``; accuracy is ``0.0`` for
        an empty ``records`` list.
    """
    import gc

    import torch

    llm = build_vllm_llm(
        model_dir=model_dir,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=dtype,
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
    )

    try:
        solutions = vllm_generate_solutions(
            llm=llm,
            records=records,
            batch_size=batch_size,
            max_new_tokens=max_new_tokens,
            tqdm_desc="vLLM inference",
        )
    finally:
        # Best-effort release of the engine: without this, each call keeps its
        # reserved share of GPU memory and evaluating several checkpoints in
        # one kernel eventually runs out of memory.
        del llm
        gc.collect()
        torch.cuda.empty_cache()

    oks: List[bool] = []
    # tqdm: score-level progress
    for r, sol in tqdm(
        zip(records, solutions),
        desc="compute_score",
        total=len(records),
    ):
        gt = r["reward_model"]["ground_truth"]
        # compute_score returns a numeric score; any non-zero value counts as correct.
        ok = bool(compute_score(sol, gt))
        oks.append(ok)

    # max(1, ...) avoids a division by zero when there are no records.
    acc = sum(oks) / max(1, len(oks))
    return acc, oks, solutions

In [20]:
# Quick sanity run: evaluate the merged checkpoint on the first 64 validation records.
acc, oks, sols = eval_records_with_vllm(
    model_dir=str(model_dir),
    records=records[:64],
    batch_size=16,
    max_new_tokens=512,
)

# Report the accuracy plus one truncated prediction next to its ground truth.
print("acc@64:", acc)
print("example pred:", sols[0][:400])
print("example gt:", records[0]["reward_model"]["ground_truth"])

INFO 01-28 17:37:18 [utils.py:233] non-default args: {'disable_log_stats': True, 'model': '/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60'}
INFO 01-28 17:37:18 [model.py:547] Resolved architecture: Qwen3ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-28 17:37:18 [model.py:1510] Using max model len 40960


INFO 01-28 17:37:18 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=16384.


The tokenizer you are loading from '/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.




  import pynvml  # type: ignore[import]


INFO 01-28 17:37:24 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:25 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:25 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60', speculative_config=None, tokenizer='/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforc

[1;36m(EngineCore_DP0 pid=1692977)[0;0m W0128 17:37:26.727000 1692977 site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
[1;36m(EngineCore_DP0 pid=1692977)[0;0m W0128 17:37:26.727000 1692977 site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:27 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:27 [topk_topp_sampler.py:55] Using FlashInfer for top-p & top-k sampling.
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:27 [gpu_model_runner.py:2602] Starting to load model /mnt/local/shared/michaelw/mlf2/verl/repr

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  3.97it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  3.96it/s]
[1;36m(EngineCore_DP0 pid=1692977)[0;0m 


[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:28 [default_loader.py:267] Loading weights took 0.28 seconds
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:28 [gpu_model_runner.py:2653] Model loading took 1.1201 GiB and 0.536634 seconds
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:34 [backends.py:548] Using cache directory: /mnt/local/shared//michaelw/.cache/vllm/torch_compile_cache/158d2f012a/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:34 [backends.py:559] Dynamo bytecode transform time: 5.55 s
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:37 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.479 s
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:38 [monitor.py:34] torch.compile takes 5.55 s in total
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:39 [gpu_worker.py:298] Available KV cache memory: 67.91 GiB
[1;36m(

[1;36m(EngineCore_DP0 pid=1692977)[0;0m 2026-01-28 17:37:39,771 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
[1;36m(EngineCore_DP0 pid=1692977)[0;0m 2026-01-28 17:37:39,802 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:02<00:00, 30.34it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:01<00:00, 41.65it/s]


[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:44 [gpu_model_runner.py:3480] Graph capturing finished in 4 secs, took -0.22 GiB
[1;36m(EngineCore_DP0 pid=1692977)[0;0m INFO 01-28 17:37:44 [core.py:210] init engine (profile, create kv cache, warmup model) took 15.42 seconds


[1;36m(EngineCore_DP0 pid=1692977)[0;0m The tokenizer you are loading from '/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


INFO 01-28 17:37:44 [llm.py:306] Supported_tasks: ['generate']


vLLM inference:   0%|          | 0/4 [00:00<?, ?it/s]

The tokenizer you are loading from '/mnt/local/shared/michaelw/mlf2/verl/reproduce/grpo/checkpoints/mw_verl_recipe_reasoning/openthoughts-grpo-qwen3_0p6b_fsdp2/_merged_hf_cache/global_step_60' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


INFO 01-28 17:37:45 [chat_utils.py:560] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

compute_score:   0%|          | 0/64 [00:00<?, ?it/s]



acc@64: 0.140625
example pred: <think>
Okay, so I need to find the smallest possible value of the sum of absolute values: |x + 1| + |x + 3| + |x + 7|. Hmm, absolute value functions can sometimes be tricky because they change their behavior depending on the value of x. I remember that the sum of absolute values often has a minimum at a point where the expressions inside the absolute values change sign. But since there are three 
example gt: 6
