In [1]:
from __future__ import annotations
import os
import json
import importlib

import torch
import argparse
from tqdm import tqdm
from datetime import datetime
from omegaconf import OmegaConf
from rstar_deepthink.agents import BS, MCTS
from rstar_deepthink.solver import Solver
from rstar_deepthink.config import BaseConfig


In [2]:
from rstar_deepthink import solver
from rstar_deepthink import agents

In [3]:
torch.set_num_threads(12)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def load_qaf(filename: str):
    if filename.endswith(".json"):
        with open(filename, "r") as f:
            data = json.load(f)
        if "example" in data:
            data = data["example"]
    elif filename.endswith(".jsonl"):
        data = []
        with open(filename, "r") as f:
            lines = f.readlines()
        for line in lines:
            data.append(json.loads(line))
    else:
        raise ValueError(f"Unrecognized file format: {filename}")
    return data

def batch(iterable, n=-1):
    l = len(iterable)
    if n <= 0:
        n = l
    for ndx in range(0, l, n):
        yield iterable[ndx: min(ndx + n, l)]

def parse_args():
    base_dir = '/groups/kjun/tnn/datasets/'
    
    # dataset path
    data_dir = base_dir + "/prm800k/math_splits"

    llm_tokenizer_dir = base_dir + "/Llama-3.2-1B-Instruct"
    prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data"
    
    args = argparse.ArgumentParser()
    args.add_argument('--custom_cfg', type=str, default="config/sft_eval_mcts.yaml")
    args.add_argument("--qaf", type=str, default="eval_data/math500_test.json", help="quesuion and answer file")
    args.add_argument('--model_dir', type=str, default=f"{llm_tokenizer_dir}") 
    args.add_argument('--reward_model_dir', type=str, default=f"{prm_tokenizer_dir}") 
    args.add_argument('--save_in_model', type=str, default="results/")
    args = args.parse_args()
    return args

In [4]:
# args = parse_args()

base_dir = '/groups/kjun/tnn/datasets/'
    
# dataset path
data_dir = base_dir + "/prm800k/math_splits"

llm_tokenizer_dir = base_dir + "/Qwen2.5-Math-1.5B-Instruct"
prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data"
prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data-Modified"
# prm_tokenizer_dir = base_dir + "/Llama-3.2-1B-RM-GSM8k"

args = argparse.Namespace()
args.custom_cfg = f"config/sft_eval_mcts.yaml"
args.qaf = f"eval_data/math500_test.json"
args.model_dir = f"{llm_tokenizer_dir}"
args.reward_model_dir = f"{prm_tokenizer_dir}"
args.save_in_model = f"results/"

config = OmegaConf.structured(BaseConfig)
if args.custom_cfg:
    custom_config = OmegaConf.load(args.custom_cfg)
    config = OmegaConf.merge(config, custom_config)
config = OmegaConf.create(OmegaConf.to_yaml(config, resolve=True))
if args.model_dir:
    config.model_dir = args.model_dir
if args.reward_model_dir:
    config.reward_model_dir = args.reward_model_dir
print(config)

llm_version = os.path.basename(config.model_dir.rstrip("/"))

# data = load_qaf(args.qaf)
# data = data[100:101]
b_solver = Solver(config=config)

# init agent
if config.mode == "mcts":
    agent = MCTS
elif config.mode == "bs":
    agent = BS
else:
    raise NotImplementedError
if args.reward_model_dir:
    llm_version += "." + args.reward_model_dir.split("/")[-1]

print(llm_version)

saved_jsonl_file = f"{args.qaf}.{config.mode}.{llm_version}.{datetime.now().strftime('%Y%m%d%H%M%S')}.jsonl" 
    
if args.save_in_model:
    saved_jsonl_file = args.save_in_model + '.jsonl'
    saved_jsonl_file_dir = os.path.dirname(saved_jsonl_file)
    os.makedirs(saved_jsonl_file_dir, exist_ok=True)
    


{'mode': 'mcts', 'model_dir': '/groups/kjun/tnn/datasets//Qwen2.5-Math-1.5B-Instruct', 'reward_model_dir': '/groups/kjun/tnn/datasets//Llama3.1-8B-PRM-Deepseek-Data-Modified', 'few_shot_path': './rstar_deepthink/few_shots/few_shots.json', 'prompt_path': './rstar_deepthink/few_shots/sft_prompt.json', 'num_few_shot': 0, 'prompt_wrap': 'rstar', 'result_unwrap': 'rstar', 'step_delim': '\n', 'temperature': 1.0, 'top_p': 1.0, 'top_k': -1, 'use_beam_search': False, 'best_of': 2, 'max_tokens': 2048, 'seed': None, 'swap_space': 12, 'n_generate_sample': 2, 'stop': ['<end_of_step>', '<end_of_code>', '<end_of_answer>'], 'step_beam_width': 1, 'max_depth': 16, 'iterations': 2, 'positive_reward': 1.0, 'negative_reward': -1.0, 'errors_threshold': 1, 'need_value_func': True, 'update_leaf_value': True, 'c_puct': 2.0, 'is_sampling': False, 'prune': False, 'batch_size': 8000, 'max_model_len': 4096, 'terminal_sample': False, 'llm_gpu_memory_utilization': 0.5, 'tp': 1, 'save_intermediate_rollouts': True}
IN

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 06-05 16:48:33 model_runner.py:1099] Loading model weights took 2.3185 GB
INFO 06-05 16:48:39 config.py:510] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 06-05 16:48:39 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='/groups/kjun/tnn/datasets//Qwen2.5-Math-1.5B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Qwen2.5-Math-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=Observabili

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 06-05 16:48:41 model_runner.py:1099] Loading model weights took 2.8797 GB
INFO 06-05 16:48:41 worker.py:241] Memory profiling takes 0.49 seconds
INFO 06-05 16:48:41 worker.py:241] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.50) = 15.87GiB
INFO 06-05 16:48:41 worker.py:241] model weights take 2.88GiB; non_torch_memory takes 0.17GiB; PyTorch activation peak memory takes 1.40GiB; the rest of the memory reserved for KV Cache is 11.42GiB.
INFO 06-05 16:48:42 gpu_executor.py:76] # GPU blocks: 26721, # CPU blocks: 28086
INFO 06-05 16:48:42 gpu_executor.py:80] Maximum concurrency for 4096 tokens per request: 104.38x
INFO 06-05 16:48:46 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 5.50 seconds
Qwen2.5-Math-1.5B-Instruct.Llama3.1-8B-PRM-Deepseek-Data-Modified


In [5]:
# importlib.reload(agents.mcts)
# importlib.reload(solver)

In [6]:
print(b_solver.max_agent_steps)
# data = data[:1]
data = load_qaf(args.qaf)
data = data[:1]
data[0]['question'] = '''
Half the value of $3x-9$ is $x+37$. What is the value of $x$?
'''
data[0]['answer'] = "90^\circ"

2


In [7]:
print(data)

[{'index': 0, 'question': '\nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n', 'answer': '90^\\circ'}]


In [8]:
with open(saved_jsonl_file, "a+", encoding='utf-8') as writer:
    for cur_data in tqdm(batch(data, config.batch_size), desc="Main Processing"):
        b_agents = [agent(config=config, question=d["question"], ground_truth=str(d["answer"])) 
                  for d in cur_data]
        jsonlines = b_solver.solve(b_agents, saved_jsonl_file, cur_data)
        for d in cur_data:
            question = d["question"]
            d["rstar"] = jsonlines[question]
            writer.write(json.dumps(d, ensure_ascii=False) + '\n')
            writer.flush()

Main Processing: 0it [00:00, ?it/s]


-> select_next_step
None
state={'text': '', 'extra_info': 'question: \nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n', 'action': '', 'action_input': '', 'final_answer': ''} additional_state_keys=['action', 'action_input', 'final_answer'] parent=None children=[] depth=0 is_terminal=False reward=None value=0 tag='0' consecutive_errors=0 c_puct=2.0 inited=False
-----------------Current Rollout:  0 -----------------
-----------------Current Step:  0 -----------------

-> current_agents
[MCTSNode(state={'text': '', 'extra_info': 'question: \nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n', 'action': '', 'action_input': '', 'final_answer': ''}, additional_state_keys=['action', 'action_input', 'final_answer'], parent=None, children=[], depth=0, is_terminal=False, reward=None, value=0, tag='0', consecutive_errors=0, c_puct=2.0, inited=False)]

-> llm prompts
["<|user|>:\n\nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n\n\n<|assistant|>: Let'


Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts:  50%|█████     | 1/2 [00:05<00:05,  5.30s/it, est. speed input: 9.43 toks/s, output: 151.88 toks/s][A

step_expansion outputs
[RequestOutput(request_id=0, prompt="<|user|>:\n\nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n\n\n<|assistant|>: Let's think step by step and solve the problem with code.", prompt_token_ids=[27, 91, 872, 91, 29, 1447, 42627, 279, 897, 315, 400, 18, 87, 12, 24, 3, 374, 400, 87, 10, 18, 22, 12947, 3555, 374, 279, 897, 315, 400, 87, 3, 30831, 27, 91, 77091, 91, 26818, 6771, 594, 1744, 3019, 553, 3019, 323, 11625, 279, 3491, 448, 2038, 13], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" We'll follow the steps to:\n\n1. Express the given problem mathematically.\n2. Solve the mathematical equation using Python for accuracy.\n\n#### fits活动\n\nThe given equation in the problem is:\n\\[\n\\frac{1}{2} \\cdot (3x - 9) = x + 37\n\\]\n\nWe'll transform this equation algebraically and then solve for \\(x\\). Let's start expressing this in Python.\n```python\nfrom sympy import(lines coerc_m注





-> generate_next_step
0
state={'text': '', 'extra_info': 'question: \nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n', 'action': '', 'action_input': '', 'final_answer': ''} additional_state_keys=['action', 'action_input', 'final_answer'] parent=None children=[] depth=0 is_terminal=False reward=None value=0 tag='0' consecutive_errors=0 c_puct=2.0 inited=False
RequestOutput(request_id=0, prompt="<|user|>:\n\nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n\n\n<|assistant|>: Let's think step by step and solve the problem with code.", prompt_token_ids=[27, 91, 872, 91, 29, 1447, 42627, 279, 897, 315, 400, 18, 87, 12, 24, 3, 374, 400, 87, 10, 18, 22, 12947, 3555, 374, 279, 897, 315, 400, 87, 3, 30831, 27, 91, 77091, 91, 26818, 6771, 594, 1744, 3019, 553, 3019, 323, 11625, 279, 3491, 448, 2038, 13], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" We'll follow the steps to:\n\n1. Express the g


Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 53.37it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


-> value outputs
[<rstar_deepthink.llms.llms.Reward object at 0x7fedaff2c9d0>, <rstar_deepthink.llms.llms.Reward object at 0x7fed84562310>]

-> valid_agents

-> select_next_step
state={'text': '', 'extra_info': 'question: \nHalf the value of $3x-9$ is $x+37$. What is the value of $x$?\n', 'action': '', 'action_input': '', 'final_answer': ''} additional_state_keys=['action', 'action_input', 'final_answer'] parent=None children=[MCTSNode(state={'text': " We'll follow the steps to:\n\n1. Express the given problem mathematically.\n2. Solve the mathematical equation using Python for accuracy.\n\n#### fits活动\n\nThe given equation in the problem is:\n\\[\n\\frac{1}{2} \\cdot (3x - 9) = x + 37\n\\]\n\nWe'll transform this equation algebraically and then solve for \\(x\\). Let's start expressing this in Python.\n```python\nfrom sympy import(lines coerc_m注册color along various up QUE.All👓要注意天气       business\nimport sympy as sp\n\n# Define the variable\nx = sp.symbols('x')\n\n# Define the equati




state={'text': ' Here is how we can approach the problem:\n\nThe problem states: "Half the value of \\(3x-9\\) is \\(x+37\\)."\n\nWe need to determine the value of \\(x\\).\n\nStep 1: Set up the equation based on the given information.\n\nHalf of \\(3x - 9\\) is written as:\n\\[\n\\frac{1}{2} \\times (3x - 9) = x + 37 \n\\]\n\nStep 2: Simplify the equation:\n\\[\n\\frac{3x - 9}{2} = x + 37 \n\\]\n\nStep 3: Eliminate the fraction by multiplying both sides by 2:\n\\[\n3x - 9 = 2(x + 37)\n\\]\n\nStep 4: Distribute the 2 on the right-hand side:\n\\[\n3x - 9 = 2x + 74\n\\]\n\nStep 5: Isolate \\(x\\) by moving all \\(x\\) terms to one side and constants to the other:\n\\[\n3x - 2x = 74 + 9\n\\]\n\nStep 6: Solve for \\(x\\):\n\\[\nx = 83\n\\]\n\nLet\'s verify this by using Python to ensure our steps and calculations are accurate.\n```python\nfrom sympy import symbols, Eq, solve\r\n\r\n# Define the variable\r\nx = symbols(\'x\')\r\n\r\n# Define the equation\r\nequation = Eq((1/2) * (3*x - 9), 

Main Processing: 1it [00:16, 16.83s/it]

None

-> expanded_agents
-----------------Current Rollout:  0 -----------------
-----------------Current Step:  1 -----------------

-> current_agents
[]

-> llm prompts
[]
[]

-> select_next_step
None
None
-----------------Current Rollout:  1 -----------------
-----------------Current Step:  0 -----------------

-> current_agents
[]

-> llm prompts
[]
[]





In [9]:
print(agents[0].config.terminal_sample)

TypeError: 'module' object is not subscriptable