In [3]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random
import numpy as np

from dataclasses import dataclass

In [14]:
import torch 
import torch.distributed as dist
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config
from sal.search.utils import build_conv, generate_k_steps, last

from core import select_diverse
from utils.load_data import load_data_prm800k

In [3]:
if torch.cuda.is_available():
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
else:
    print("CUDA is not available.")

['0', '1', '2', '3']


In [4]:
# base_dir
base_dir = '/groups/kjun/tnn/datasets/'

# dataset path
data_dir = base_dir + "/prm800k/math_splits"

# llm and prm path
llm_dir = base_dir + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_dir = base_dir + "/Llama-3.2-1B-Instruct"
prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data"

In [5]:
#  load data 
data_by_levels = load_data_prm800k(data_dir)

# load random_seeds     
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

1: 43
2: 90
3: 105
4: 128
5: 134


In [6]:
# baseline: gpu_memory_utilization=0.2
# use the standard model 
llm_vllm = LLM(
        model = llm_tokenizer_dir,
        tensor_parallel_size=1,
        gpu_memory_utilization = 0.7,  # Utilize 50% of GPU memory
        # enable_prefix_caching=True,  # V100 doesn't support enable_prefix_caching 
        # enable_chunked_prefill=False, # and enable_chunked_prefill
        max_model_len = 5000,
        dtype = "float16",
        seed = 123)
    
    # # use the gguf quantized model 
    # llm_regular = LLM(
    #     model = llm_dir,
    #     tokenizer = llm_tokenizer_dir,
    #     tensor_parallel_size=1,
    #     gpu_memory_utilization = 0.2,  # Utilize 50% of GPU memory
    #     max_model_len = 5000,
    #     dtype = "float16",
    #     seed = 123)


gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))

INFO 04-03 18:56:41 __init__.py:207] Automatically detected platform cuda.
INFO 04-03 18:56:49 config.py:549] This model supports multiple tasks: {'reward', 'classify', 'embed', 'generate', 'score'}. Defaulting to 'generate'.
INFO 04-03 18:56:49 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-03 18:56:52 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 04-03 18:56:53 worker.py:267] Memory profiling takes 0.49 seconds
INFO 04-03 18:56:53 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.70) = 22.21GiB
INFO 04-03 18:56:53 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 18.62GiB.
INFO 04-03 18:56:53 executor_base.py:111] # cuda blocks: 38125, # CPU blocks: 8192
INFO 04-03 18:56:53 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 122.00x
INFO 04-03 18:56:55 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.29it/s]

INFO 04-03 18:57:10 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.13 GiB
INFO 04-03 18:57:10 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.70 seconds





#--- memory: 20.959694862365723
#--- memory: 0.0


In [9]:
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
llm_tf = AutoModelForCausalLM.from_pretrained(llm_tokenizer_dir).to("cuda:1")
# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 20.959694862365723


In [None]:
@dataclass
class Beam:
    prompt: str
    templated_prompt: str
    index: int
    current_text: str | None
    next_texts: list[str] | None
    lookahead_texts: list[str] | None
    stop_reasons: list[str | None] | None
    best_scores: list[float]  # the PRM scores
    all_scores: list[list[float]]  # all PRM scores
    previous_text: str | None
    pruned: False
    history: list[str]
    completed: bool = False
    completion_tokens: int = 0


def _select_diverse(X_embeds, X_lprobs, X_ppl, K, V):
    num_arms = len(X_embeds)
    _V = copy.deepcopy(V)
    A_idxes = []
    A_embeds = []
    tol = 0.0001
    for it in range(K):
        _V_inv = np.linalg.inv(_V)
        arm_vals = np.einsum('ij,jk,ik->i', X_embeds, _V_inv, X_embeds)
        max_val = np.max([val for idx, val in enumerate(arm_vals) if idx not in A_idxes])
        # candidate_idxes = np.where(np.abs(arm_vals-max_val) < tol)[0]
        candidate_idxes = [
            arm_idx for arm_idx, arm_val in enumerate(arm_vals)
            if (np.abs(max_val - arm_val) <= tol) and (arm_idx not in A_idxes)
        ]

        best_idx = max(candidate_idxes, key=lambda i: X_ppl[i])
        # print(arm_vals)
        # print(X_lprobs)
        # print(candidate_idxes)
        # print(best_idx)

        best_embeds = X_embeds[best_idx]
        # print(best_embeds.shape)

        # update V
        _V = _V + np.matmul(best_embeds, best_embeds.T)

        # update A
        A_idxes.append(best_idx)

        # print(_V.shape)
        # print(max_val)
        # print(max_idx)
        # print(A_idxes)

    return A_idxes


In [29]:




def _select_diverse_search(batch_of_questions, config: Config, llm_vllm: LLM, llm_tf, llm_tokenizer) -> list[Beam]:
    

    beams: list[Beam] = []
    for p_idx, prompt in enumerate(batch_of_questions):
        beams.append(
            Beam(
                prompt=prompt,
                templated_prompt=prompt,
                index=p_idx,
                current_text="",
                next_texts=None,
                lookahead_texts=None,
                pruned=False,
                completed=False,  # New flag to track completion
                stop_reasons=None,
                history=[],
                best_scores=[],
                all_scores=[],
                previous_text=None,
                completion_tokens=0,
            )
        )            

    completed_beams: list[Beam] = []
    completed_answer = [[] for _ in range(len(batch_of_questions))]
    # completion_ntokens = [[] for _ in range(len(batch_of_questions))]

    active_beams = [b for b in beams if not b.pruned]
    
    convs = [
        build_conv(b.prompt, b.current_text, config.system_prompt)
        for b in active_beams
    ]
    # continue_final_message = i > 0
    # add_generation_prompt = i == 0

    tokenizer = llm_vllm.get_tokenizer()

    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template
        
    templated_convs = tokenizer.apply_chat_template(
        convs,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )

    templated_convs = [c for conv in templated_convs for c in [conv]*config.n]

    sampling_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        stop=["\n\n"],
        include_stop_str_in_output=True,
        n=1,
    )
    
    # Generate responses 
    llm_responses = llm_vllm.generate(
        templated_convs,
        sampling_params=sampling_params,
        use_tqdm=False,
    )

    # completions = [[]]
    # for r_idx, r in enumerate(llm_responses):
    #     print(f"r_idx = {r_idx}")
    #     for output in r.outputs:
    #         print(output)

    completions = [[] for _ in range(len(batch_of_questions))]
    completion_ntokens = [[] for _ in range(len(batch_of_questions))]
    for r_idx, r in enumerate(llm_responses):
        idx = r_idx // config.n
        output = r.outputs[0]
        # print(output.text)
        if 
        completions[idx].append(output.text)
        completion_ntokens[idx].append(len(output.token_ids))

    print(completion_ntokens)
    # print(templated_convs)
    stop
      

    return completed_beams


# general params
config = Config()
config.n = 4
config.beam_width = 2
config.lookahead = 0
config.num_iterations = 1

# diverse_select params
config.lam = 10
config.normalize_embeds = True

level = '4'
num_questions = len(data_by_levels[level])
num_questions = 2
num_trials = 1
print(f"num_questions = {num_questions}")
print(f"num_trials = {num_trials}")

# get batch of questions
batch_of_questions = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

beam_results = _select_diverse_search(batch_of_questions, config, llm_vllm, llm_tf, tokenizer)

num_questions = 2
num_trials = 1
[[77, 96, 70, 74], [343, 424, 336, 439]]


NameError: name 'stop' is not defined