In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random
import numpy as np

In [None]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config

from core.best_of_n import best_of_n_v11, best_of_n_v12

In [None]:
# Record which GPUs this kernel may use: the comma-separated device ids from
# CUDA_VISIBLE_DEVICES, falling back to "0" when the variable is unset.
if torch.cuda.is_available():
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
else:
    # Bind GPUS on the CPU-only path as well, so code that reads it afterwards
    # sees an empty list instead of failing with a NameError.
    GPUS = []
    print("CUDA is not available.")

In [None]:
# base_dir: root folder that holds both the dataset and the model checkpoints
base_dir = '/groups/kjun/tnn/datasets/'

# dataset path
# os.path.join is used instead of `base_dir + "/..."`: base_dir already ends
# with a slash, so plain concatenation produced paths containing "//".
data_dir = os.path.join(base_dir, "prm800k", "math_splits")

# llm and prm path (GGUF quantized weight files)
llm_dir = os.path.join(
    base_dir, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
)
prm_dir = os.path.join(
    base_dir,
    "Llama3.1-8B-PRM-Deepseek-Data-GGUF",
    "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf",
)

# checkpoint directories; llm_tokenizer_dir is passed below both as the standard
# model and as the tokenizer of the (commented-out) GGUF variant
llm_tokenizer_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct")
prm_tokenizer_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data")

In [None]:
# load data
# data_by_levels is indexed further down as data_by_levels[level][q_idx]['problem'],
# with level given as a string such as '1'.
# NOTE(review): load_data_prm800k is not imported or defined in any cell visible
# here, so this line presumably raises NameError on a fresh kernel — confirm
# which module provides it and add the import to the imports cell.
data_by_levels = load_data_prm800k(data_dir)

# load random_seeds (disabled; the experiment loop currently passes
# 10000 + trial_idx instead of values read from random_seeds.txt)
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

In [4]:
# baseline: gpu_memory_utilization=0.2
# use the standard model
llm_vllm = LLM(
    model=llm_tokenizer_dir,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.7,  # let vLLM use up to 70% of the GPU memory
    # enable_prefix_caching=True,  # V100 doesn't support enable_prefix_caching
    # enable_chunked_prefill=False, # and enable_chunked_prefill
    max_model_len=5000,
    dtype="float16",
    seed=123,
)

# # use the gguf quantized model
# llm_regular = LLM(
#     model = llm_dir,
#     tokenizer = llm_tokenizer_dir,
#     tensor_parallel_size=1,
#     gpu_memory_utilization = 0.2,  # let vLLM use up to 20% of the GPU memory
#     max_model_len = 5000,
#     dtype = "float16",
#     seed = 123)

# Release cached blocks before measuring, then report the memory allocated
# through PyTorch (in GiB) on every visible device. Iterating over
# device_count() instead of hard-coding indices 0 and 1 keeps the cell usable
# when a different number of GPUs is visible.
gc.collect()
torch.cuda.empty_cache()
for device_idx in range(torch.cuda.device_count()):
    print('#--- memory:', torch.cuda.memory_allocated(device_idx)/(1024**3))

INFO 03-23 11:42:28 [config.py:583] This model supports multiple tasks: {'classify', 'score', 'generate', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-23 11:42:28 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.1) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=Fal

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-23 11:42:32 [loader.py:429] Loading weights took 1.29 seconds
INFO 03-23 11:42:32 [model_runner.py:1146] Model loading took 2.3185 GB and 1.388610 seconds
INFO 03-23 11:42:33 [worker.py:267] Memory profiling takes 0.61 seconds
INFO 03-23 11:42:33 [worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.50) = 15.87GiB
INFO 03-23 11:42:33 [worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 12.25GiB.
INFO 03-23 11:42:33 [executor_base.py:111] # cuda blocks: 25088, # CPU blocks: 8192
INFO 03-23 11:42:33 [executor_base.py:116] Maximum concurrency for 10000 tokens per request: 40.14x
INFO 03-23 11:42:34 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:13<00:00,  2.53it/s]

INFO 03-23 11:42:48 [model_runner.py:1570] Graph capturing finished in 14 secs, took 0.13 GiB
INFO 03-23 11:42:48 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 16.42 seconds





#--- memory: 14.584884643554688
#--- memory: 0.0


In [None]:
# Load the same checkpoint through Hugging Face transformers and place it on the
# second GPU. The checkpoint path lives in `llm_tokenizer_dir`; no variable
# named `llm_tokenizer_path` is defined anywhere in the notebook.
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
llm_tf = AutoModelForCausalLM.from_pretrained(llm_tokenizer_dir).to("cuda:1")
# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id

# Release cached blocks, then report PyTorch-allocated memory (GiB). Device 1 is
# reported as well because that is where llm_tf was just moved.
gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))

In [24]:
# general params
config = Config()
config.n = 4

level = '1'
# Cap the batch at 20 questions, but never ask for more than the level holds
# (indexing past the end of data_by_levels[level] would fail).
max_questions = 20
num_questions = min(max_questions, len(data_by_levels[level]))
if num_questions == 0:
    raise ValueError(f"no questions available for level {level!r}")
num_trials = 200
print(f"num_questions = {num_questions}")

# get batch of questions
batch_of_questions = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

# select search algo
# The imports cell binds best_of_n_v11 / best_of_n_v12 directly; no name
# `best_of_n` is in scope, so the functions are referenced by their own names.
search_name = 'best_of_n'
algo_type = 1
if search_name == 'best_of_n':
    search_algo = best_of_n_v11 if algo_type == 1 else best_of_n_v12
    algo_tag = 'v11' if algo_type == 1 else 'v12'
else:
    # Fail here instead of leaving search_algo unbound for the code below.
    raise ValueError(f"unknown search_name: {search_name!r}")
print(search_algo)

# run search_algo and save results
# The file name carries the variant that was actually selected, and the output
# folder is created up front so open() does not fail on a fresh checkout.
# NOTE: mode 'w' overwrites the results of any previous run with the same name.
os.makedirs("results", exist_ok=True)
result_dir = f"results/generate_bon_prm800k_level{level}_n{config.n}_{algo_tag}.jsonl"
log_every = 1  # print timing statistics every `log_every` trials
start_time = time.time()
with open(result_dir, 'w', encoding = 'utf-8') as fout:
    for trial_idx in range(num_trials):
        # best_of_n(batch_of_questions, config, llm_vllm, random_seeds[trial_idx])
        # The last argument changes per trial; presumably it is the sampling
        # seed (see the commented-out random_seeds call above) — TODO confirm.
        results = search_algo(batch_of_questions, config, llm_vllm, 10000+trial_idx)

        # one JSON document per line (JSONL); flush so that finished trials
        # reach the file even if the run is cut short
        json.dump(results, fout)
        fout.write('\n')
        fout.flush()

        # compute the time
        if trial_idx % log_every == 0:
            total_time = time.time() - start_time
            time_per_trial = total_time/(trial_idx+1)
            time_per_question = time_per_trial/num_questions
            print(f"trial {trial_idx}")
            print(f"it takes {time_per_question:0.4f}s per question")
            print(f"it takes {time_per_trial:0.4f}s per trial")

total_time = time.time() - start_time
print(f"it takes {total_time:0.4f}s in total")

num_trials = 50
num_questions = 43
<function best_of_n_v11 at 0x7f4a52bc1440>


KeyboardInterrupt: 