In [1]:
from __future__ import annotations
import os
import json
import importlib
import argparse
from functools import partial

import random
import numpy as np

import torch


from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config

from core import mcts_search_v51
from core.reward_models import RLHFFlow
from utils.load_data import load_data_prm800k

# from core.llm_engine import rm_engine
# from core.llms import rm_generate
# import logging
# logging.basicConfig(format='%(message)s', level=logging.ERROR)

INFO 07-14 17:35:30 [__init__.py:244] Automatically detected platform cuda.


In [2]:
# Record which GPU ids this process may use. CUDA_VISIBLE_DEVICES is a
# comma-separated list; device "0" is assumed when the variable is unset.
# GPUS is always bound (an empty list without CUDA) so that any later
# reference to it cannot fail with NameError on a CPU-only machine.
if torch.cuda.is_available():
    visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', "0")
    # Strip whitespace and drop empty entries such as those from "0, 1" or "0,".
    GPUS = [gpu_id.strip() for gpu_id in visible_devices.split(',') if gpu_id.strip()]
    print(GPUS)
else:
    GPUS = []
    print("CUDA is not available.")

['0']


In [3]:
# Root directory from which the dataset and model paths below are derived.
base_dir = '/groups/kjun/tnn/datasets/'

# Dataset path. os.path.join is used instead of string concatenation because
# base_dir already ends with "/", and `base_dir + "/..."` produced paths with
# a doubled separator ("datasets//...").
data_dir = os.path.join(base_dir, "prm800k", "math_splits")

# LLM and PRM (reward model) checkpoint paths.
llm_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct")
prm_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data")

# Alternative checkpoints tried previously (kept for reference):
# llm_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf")
# prm_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data-GGUF", "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf")
# prm_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data-Modified")

In [4]:
# GPU-memory fractions for the vLLM engines that load llm_dir:
# llm_total_gpu is the combined share, of which this (generation) engine takes
# llm_gpu_memory_utilization; the embedding engine created later is given the
# difference between the two.
llm_total_gpu = 0.4
llm_gpu_memory_utilization = 0.2

# Note carried over from an earlier revision of this cell: V100 doesn't support
# enable_prefix_caching or enable_chunked_prefill, so neither option is passed.
llm_engine_kwargs = dict(
    model=llm_dir,
    tensor_parallel_size=1,
    swap_space=16,
    max_model_len=5000,
    gpu_memory_utilization=llm_gpu_memory_utilization,
    enforce_eager=True,
    distributed_executor_backend=None,
    dtype="float16",
    seed=0,
)
llm_vllm = LLM(**llm_engine_kwargs)

# Report the memory torch has allocated on device 0, in GiB.
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))



INFO 07-14 17:35:44 [config.py:823] This model supports multiple tasks: {'score', 'classify', 'generate', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 07-14 17:35:44 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-14 17:35:48 [default_loader.py:272] Loading weights took 1.33 seconds
INFO 07-14 17:35:48 [model_runner.py:1203] Model loading took 2.3185 GiB and 1.460454 seconds
INFO 07-14 17:35:49 [worker.py:294] Memory profiling takes 0.51 seconds
INFO 07-14 17:35:49 [worker.py:294] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.20) = 6.35GiB
INFO 07-14 17:35:49 [worker.py:294] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 2.75GiB.
INFO 07-14 17:35:49 [executor_base.py:113] # cuda blocks: 5631, # CPU blocks: 32768
INFO 07-14 17:35:49 [executor_base.py:118] Maximum concurrency for 5000 tokens per request: 18.02x
INFO 07-14 17:35:55 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 7.07 seconds
#--- memory: 5.07647705078125


In [5]:
# Second vLLM engine over the same checkpoint (llm_dir), created with
# task="embed". Its memory fraction is whatever remains of llm_total_gpu after
# the generation engine's share.
embed_gpu_memory_utilization = llm_total_gpu-llm_gpu_memory_utilization

embed_engine_kwargs = dict(
    model=llm_dir,
    tensor_parallel_size=1,
    task="embed",
    swap_space=16,
    max_model_len=5000,
    gpu_memory_utilization=embed_gpu_memory_utilization,
    enforce_eager=True,
    distributed_executor_backend=None,
    dtype="float16",
    seed=0,
)
llm_vllm_embeds = LLM(**embed_engine_kwargs)

# Report the memory torch has allocated on device 0, in GiB.
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

INFO 07-14 17:35:56 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct,

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-14 17:35:58 [default_loader.py:272] Loading weights took 1.32 seconds
INFO 07-14 17:35:59 [model_runner.py:1203] Model loading took 2.3029 GiB and 1.372816 seconds
#--- memory: 7.379390716552734


In [6]:
# Load the reward model from prm_dir onto cuda:0, then report the memory torch
# has allocated on device 0 in GiB (2**30 bytes == 1024**3 bytes).
prm = RLHFFlow(
    model_path=prm_dir,
    device_map='cuda:0',
)
print('#--- memory:', torch.cuda.memory_allocated(0) / 2**30)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#--- memory: 22.336918354034424


In [7]:
# NOTE(review): `stop` is not defined in the visible notebook, so this cell
# raises NameError (see its recorded output). Presumably a deliberate halt so
# that "Run All" stops once the models are loaded - confirm, and consider a
# clearer guard, since a raising cell breaks Restart & Run All.
stop

NameError: name 'stop' is not defined

In [8]:
# Load the PRM800K math splits from data_dir.
# Later cells index the result as data_by_levels[level][q_idx]['problem'];
# presumably it groups questions by difficulty level (the recorded output
# lists counts for keys 1-5) - confirm against load_data_prm800k.
data_by_levels = load_data_prm800k(data_dir)

1: 43
2: 90
3: 105
4: 128
5: 134


In [26]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def _apply_overrides(cfg, **overrides):
    """Set each keyword as an attribute on cfg, in the order given."""
    for option_name, option_value in overrides.items():
        setattr(cfg, option_name, option_value)


config = Config()

# general params
_apply_overrides(
    config,
    agg_strategy='last',
    n=4,                      # number of budgets to be generated per depth
    beam_width=4,             # number of nodes left after selection
    lookahead=0,              # not used for now
    max_depths=10,            # search terminates once this depth is reached
    sort_completed=False,
    filter_duplicates=True,   # remove any duplicates in the last list of trajs
    seed=0,
)

# mcts parameters
_apply_overrides(config, num_phases=100, num_batches=1)
# Derived value: reads num_batches and max_depths set above.
config.batch_budget = config.num_batches*config.max_depths

_apply_overrides(
    config,
    lam=10,
    normalize_embeds=True,
    cpuct_root=0,
    cpuct_leaf=0,
    ds_beta=1.0,
    ds_alpha=10.0,
    use_ppl=True,
    version="v51",
)



In [10]:
level = 4                                   # level of difficulty of questions
questions_at_level = data_by_levels[level]  # level 4 has 128 questions

# Cap on how many questions to run; set to None to run every question at this
# level. The cap is clamped to the number available so the indexing below can
# never run past the end of the data.
max_questions = 1
if max_questions is None:
    num_questions = len(questions_at_level)
else:
    num_questions = min(max_questions, len(questions_at_level))
num_trials = 1
print(f"num_questions = {num_questions}")
print(f"num_trials = {num_trials}")

# get batch of questions ['q1', 'q2', ...]
batch_of_questions = [questions_at_level[q_idx]['problem'] for q_idx in range(num_questions)]

num_questions = 1
num_trials = 1


In [None]:
# Show the selected problem statements (the 'problem' field of each question).
print(batch_of_questions)

In [27]:
# Re-execute core.mcts_search_v51 so edits to the module on disk take effect
# without restarting the kernel. Objects created from the module before the
# reload keep their old class definitions.
importlib.reload(mcts_search_v51)

<module 'core.mcts_search_v51' from '/home/u20/tnguyen9210/tnn1/LLMs/llm-reasoning-methods/core/mcts_search_v51.py'>

In [28]:

# Run one MCTS search per question.
# Each question's completions are appended to completions_by_question so that
# results are not lost when more than one question is run; `agent` and
# `agent_completions` still refer to the last question after the loop.
completions_by_question = []
for question in batch_of_questions:
    agent = mcts_search_v51.MCTS(config=config, question=question)
    agent_completions = mcts_search_v51.mcts_search(question, agent, config, llm_vllm, llm_vllm_embeds, prm)
    completions_by_question.append(agent_completions)
    print(agent_completions)



-> p = 0

-> d = 0
select_child
node.__visit_count = 0
0
0
0
0
best_childs_len = 1

-> d = 1
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 2
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 3
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 4
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 5
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 6
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 7
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 8
select_child
node.__visit_count = 1
0
0
0
0
best_childs_len = 1

-> d = 9
select_child
node.__visit_count = 5


['## Step 1: Express y and z in terms of x\nWe start by expressing y and z in terms of x using the given equations.\nFrom the first equation, we get $y = \\frac{2}{3}x$.\nFrom the second equation, we get $z = -\\frac{6}{4}x = -\\frac{3}{2}x$.\n\n## Step 2: Find the direction ratios of the first line\nThe direction ratios of the first line are (2, 3, -3).\n\n## Step 3: Find the direction ratios of the second line\nThe direction ratios of the second line are (6, -1, -4).\n\n## Step 4: Calculate the angle between the lines using the dot product\nNow, we use the formula for finding the angle between two lines:\n\\[ \\cos \\theta = \\frac{\\mathbf{a} \\cdot \\mathbf{b}}{|\\mathbf{a}| |\\mathbf{b}|} \\]\n\nwhere $\\mathbf{a}$ and $\\mathbf{b}$ are the direction ratios of the lines.\n\n## Step 5: Substitute the direction ratios into the formula\nWe substitute the direction ratios of the lines into the formula:\n\\[\\cos \\theta = \\frac{(2, 3, -3) \\cdot (6, -1, -4)}{|(2, 3, -3)| |(6, -1, -4)

In [None]:
# Earlier per-node inspection, kept for reference; it reads node.state["text"].
# NOTE(review): the recorded run printed agent_completions as a list of
# strings, so this loop presumably no longer applies - confirm before
# re-enabling it.
# for idx, node in enumerate(agent_completions):
#     print(f"\n-> idx = {idx}")
#     print(node.state["text"])
# Print the completions from the most recent search.
print(agent_completions)