In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random
import numpy as np

In [12]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.search.utils import Beam, build_conv, generate_k_steps, last
from sal.config import Config


from core import best_of_n
from utils.load_data import load_data_prm800k


In [3]:
# Record which GPU ids this kernel may use. GPUS is bound in both branches so
# that later cells can reference it even on a CPU-only machine.
if torch.cuda.is_available():
    # CUDA_VISIBLE_DEVICES is a comma-separated list of ids; fall back to "0" when unset.
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
else:
    GPUS = []
    print("CUDA is not available.")

['0', '1', '2', '3']


In [4]:
# Root folder that holds the dataset and every model checkpoint.
base_dir = '/groups/kjun/tnn/datasets/'

# Paths are built with os.path.join so the trailing slash of base_dir does not
# produce doubled separators ("datasets//...") in the resulting strings.

# dataset path
data_dir = os.path.join(base_dir, "prm800k", "math_splits")

# GGUF-quantized weights for the generator (llm) and the process reward model (prm)
llm_dir = os.path.join(
    base_dir, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
)
prm_dir = os.path.join(
    base_dir,
    "Llama3.1-8B-PRM-Deepseek-Data-GGUF",
    "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf",
)

# Unquantized checkpoints; they provide the tokenizers and are also what the
# cells below load as the standard (non-GGUF) model.
llm_tokenizer_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct")
prm_tokenizer_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data")

In [8]:
# Load the PRM800K math split found under data_dir. Later cells index the result as
# data_by_levels[level][question_index]['problem'], with string level keys such as '1'.
# NOTE(review): the "level: count" lines in the cell output presumably come from the loader — confirm.
data_by_levels = load_data_prm800k(data_dir)

# Optional per-trial seeds read from disk. Disabled: the experiment cell seeds
# each trial with 10000 + trial_idx instead.
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

1: 43
2: 90
3: 105
4: 128
5: 134


In [9]:
# vLLM engine over the standard (non-GGUF) Llama-3.2-1B-Instruct checkpoint.
# The baseline configuration used gpu_memory_utilization=0.2.
llm_vllm = LLM(
    model=llm_tokenizer_dir,
    dtype="float16",
    seed=123,
    max_model_len=5000,
    tensor_parallel_size=1,
    # Fraction of GPU memory the engine may claim (0.7 = 70%).
    gpu_memory_utilization=0.7,
    # enable_prefix_caching / enable_chunked_prefill stay unset:
    # V100 doesn't support enable_prefix_caching and enable_chunked_prefill.
)

# Alternative kept for reference: the GGUF-quantized model with the same tokenizer.
# llm_regular = LLM(
#     model = llm_dir,
#     tokenizer = llm_tokenizer_dir,
#     tensor_parallel_size=1,
#     gpu_memory_utilization = 0.2,
#     max_model_len = 5000,
#     dtype = "float16",
#     seed = 123)

# Release Python garbage and cached CUDA blocks, then report the memory
# (in GiB) that torch has allocated on devices 0 and 1.
gc.collect()
torch.cuda.empty_cache()
for _gpu_index in (0, 1):
    print('#--- memory:', torch.cuda.memory_allocated(_gpu_index)/(1024**3))

INFO 03-26 08:48:58 __init__.py:207] Automatically detected platform cuda.
INFO 03-26 08:49:14 config.py:549] This model supports multiple tasks: {'generate', 'embed', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 03-26 08:49:14 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-26 08:49:19 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 03-26 08:49:20 worker.py:267] Memory profiling takes 0.69 seconds
INFO 03-26 08:49:20 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.70) = 22.21GiB
INFO 03-26 08:49:20 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 18.62GiB.
INFO 03-26 08:49:20 executor_base.py:111] # cuda blocks: 38125, # CPU blocks: 8192
INFO 03-26 08:49:20 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 122.00x
INFO 03-26 08:49:22 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:14<00:00,  2.41it/s]

INFO 03-26 08:49:36 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.13 GiB
INFO 03-26 08:49:36 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.16 seconds





#--- memory: 20.959694862365723
#--- memory: 0.0


In [11]:
# Hugging Face tokenizer and model for the same checkpoint the vLLM engine uses.
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
llm_tf = AutoModelForCausalLM.from_pretrained(llm_tokenizer_dir)
# The Hugging Face model is placed on the second GPU.
llm_tf = llm_tf.to("cuda:1")
# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id

gc.collect()
torch.cuda.empty_cache()
# Reports device 0 only; the model loaded above lives on cuda:1.
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 20.959694862365723


In [None]:
import copy
import logging
from collections import defaultdict

import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams

from sal.config import Config
from sal.models.reward_models import PRM
from sal.search.utils import Beam, build_conv, generate_k_steps, last
from sal.utils.score import aggregate_scores



def _select_diverse(embeds_list, K, V):
    num_arms = len(embeds_list)
    _V = copy.deepcopy(V)
    # S_embeds = copy.deepcopy(embeds_list)
    A_idxes = []
    A_embeds = []

    for it in range(K):
        max_val = -10
        max_idx = None
        max_embeds = None
        for arm_idx, arm_embed in enumerate(embeds_list):
            # print(arm_idx)
            # print(arm_embed.shape)

            if arm_idx in A_idxes:
                continue

            # normalize the embeddings
            # norm = np.linalg.norm(arm_embed)
            # arm_embed /= norm

            # compute Mahalanobis norm
            arm_val = np.matmul(np.matmul(arm_embed, np.linalg.inv(_V)), arm_embed.T)
            # print(arm_val)
            if arm_val > max_val:
                max_val = arm_val
                max_idx = arm_idx
                max_embed = arm_embed

        # update V
        _V = _V + np.matmul(max_embed, max_embed.T)

        # update A
        A_idxes.append(max_idx)

        # print(_V.shape)
        # print(max_val)
        # print(max_idx)
        # print(A_idxes)

    return A_idxes

def _last_token_embedding(text, llm_tf, llm_tokenizer, normalize):
    """Return the final-layer hidden state of the last token of ``text`` as a 1-D float array."""
    # Send the inputs to the device that actually holds the model. A bare
    # "cuda" resolves to device 0 and fails when the model sits on another GPU.
    device = next(llm_tf.parameters()).device
    with torch.no_grad():
        inputs = llm_tokenizer(text, return_tensors="pt").to(device)
        output = llm_tf(**inputs, output_hidden_states=True)
    last_hidden_state = output.hidden_states[-1]
    # .float() keeps the conversion valid for dtypes numpy cannot represent.
    embedding = last_hidden_state[:, -1, :].squeeze(0).detach().float().cpu().numpy()
    if normalize:
        norm = np.linalg.norm(embedding)
        if norm > 0:  # leave an all-zero vector as is instead of dividing by zero
            embedding = embedding / norm
    return embedding


def _diverse_search_one_prompt(prompt, config, llm, llm_tf, llm_tokenizer):
    """Run the step-wise diverse search for a single prompt and return its completed beams."""
    # Intermediate steps stop at a blank line so that each call yields one step.
    sampling_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        stop=["\n\n"],
        include_stop_str_in_output=True,
        n=1,
    )

    active_beams: list[Beam] = [
        Beam(
            prompt=prompt,
            index=i,
            current_text="",
            next_texts=None,
            lookahead_texts=None,
            pruned=False,
            completed=False,  # New flag to track completion
            stop_reasons=None,
            history=[],
            best_scores=[],
            all_scores=[],
            previous_text=None,
            completion_tokens=0,
        )
        for i in range(config.n)
    ]
    completed_beams: list[Beam] = []

    # The tokenizer and its chat template do not change between iterations.
    tokenizer = llm.get_tokenizer()
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template

    for i in range(config.num_iterations):
        active_beams = [b for b in active_beams if not b.pruned]
        if not active_beams:
            # Nothing left to extend; also avoids dividing by zero below.
            break

        # Duplicate active beams to ensure that we have config.n beams per iteration
        if len(active_beams) != config.n:
            repeats = (config.n // len(active_beams)) + 1
            active_beams = [
                copy.deepcopy(b) for b in (active_beams * repeats)[: config.n]
            ]
            if len(active_beams) != config.n:
                raise ValueError(
                    f"Expected {config.n} active beams, but got {len(active_beams)}"
                )

        is_last_iteration = i == config.num_iterations - 1
        if is_last_iteration:
            # Last iteration, generate to EOS (no step-level stop string)
            sampling_params = SamplingParams(
                temperature=config.temperature,
                max_tokens=config.max_tokens,
                top_p=config.top_p,
                n=1,
            )

        convs = [
            build_conv(b.prompt, b.current_text, config.system_prompt)
            for b in active_beams
        ]
        # The first step opens a new assistant turn; later steps continue it.
        templated_convs = tokenizer.apply_chat_template(
            convs,
            add_generation_prompt=(i == 0),
            continue_final_message=(i > 0),
            tokenize=False,
        )
        lookahead = 0 if is_last_iteration else config.lookahead
        gen_results = generate_k_steps(
            templated_convs, lookahead, llm, sampling_params, 1
        )

        for beam, gen_result in zip(active_beams, gen_results, strict=True):
            beam.next_texts = gen_result.next_texts
            beam.stop_reasons = gen_result.stop_reasons
            beam.lookahead_texts = gen_result.lookahead_texts
            beam.completion_tokens += gen_result.completion_tokens
            beam.current_text += beam.next_texts[0]
            beam.history.append(beam.next_texts[0])

            if (
                beam.stop_reasons[0] == "EOS"
                or beam.stop_reasons[0] == "length"
                or beam.next_texts[0] == ""
            ):
                beam.completed = True
                completed_beams.append(beam)

        # NOTE: PRM scoring is intentionally not performed here, so
        # beam.all_scores stays empty and pruning relies on diversity alone.
        active_beams = [b for b in active_beams if not b.completed]

        # Early stopping if all beams are completed
        if not active_beams:
            break

        # Embed each beam's text so far (all steps up to now).
        completions_embeds = [
            _last_token_embedding(
                b.current_text, llm_tf, llm_tokenizer, config.normalize_embeddings
            )
            for b in active_beams
        ]

        # Size the regulariser from the embeddings instead of assuming a fixed
        # hidden size, and keep K within [1, number of active beams].
        V = config.lam * np.eye(completions_embeds[0].shape[-1])
        K = min(max(1, config.n // 2), len(active_beams))
        selected_idxes = set(_select_diverse(completions_embeds, K, V))

        for idx, beam in enumerate(active_beams):
            if idx not in selected_idxes:
                beam.pruned = True

    # Sorting by aggregated score only makes sense when beams carry scores;
    # with scoring disabled above they do not, so generation order is kept.
    if config.sort_completed and all(b.all_scores for b in completed_beams):
        completed_beams = sorted(
            completed_beams,
            key=lambda b: aggregate_scores(b.all_scores, config.agg_strategy),
            reverse=True,
        )
    completed_beams = completed_beams[: config.n]

    if not completed_beams:
        # Nothing finished; there is nothing to duplicate.
        return []

    if len(completed_beams) != config.n:
        # If we don't have enough completed_beams, duplicate until we reach config.n
        repeats = (config.n // len(completed_beams)) + 1
        completed_beams = [
            copy.deepcopy(b) for b in (completed_beams * repeats)[: config.n]
        ]

    return completed_beams


def _select_diverse_search(batch_of_prompts, config: Config, llm: LLM, llm_tf, llm_tokenizer, prm) -> list[Beam]:
    """Step-wise search that prunes beams by embedding diversity.

    For every prompt, ``config.n`` beams are extended one step at a time for at
    most ``config.num_iterations`` rounds. After each round the unfinished
    beams are embedded with ``llm_tf`` (last-token, final-layer hidden state),
    ``_select_diverse`` keeps about half of them, and the rest are pruned; the
    survivors are duplicated back up to ``config.n`` in the next round.

    Prompts are searched one after another. Sharing a single beam pool across
    prompts would truncate it to ``config.n`` beams and drop every prompt but
    the first.

    Args:
        batch_of_prompts: iterable of prompt strings.
        config: search configuration; reads ``n``, ``num_iterations``,
            ``temperature``, ``max_tokens``, ``top_p``, ``lookahead``,
            ``system_prompt``, ``custom_chat_template``,
            ``normalize_embeddings``, ``lam``, ``sort_completed`` and
            ``agg_strategy``.
        llm: vLLM engine used for generation.
        llm_tf: model called with ``output_hidden_states=True`` to embed beams.
        llm_tokenizer: tokenizer matching ``llm_tf``.
        prm: accepted for interface compatibility; currently unused because
            reward-model scoring is disabled.

    Returns:
        Completed beams, up to ``config.n`` per prompt (finished beams are
        duplicated to reach ``config.n``). A prompt with no finished beam
        contributes nothing.

    Raises:
        ValueError: if the active beams cannot be brought back to ``config.n``.
    """
    all_completed: list[Beam] = []
    for prompt in batch_of_prompts:
        all_completed.extend(
            _diverse_search_one_prompt(prompt, config, llm, llm_tf, llm_tokenizer)
        )
    return all_completed

def select_diverse_search(batch_of_prompts, config: Config, llm: LLM, llm_tf, llm_tokenizer, prm):
    """Run the diverse search and summarise its beams per prompt.

    Args:
        batch_of_prompts: iterable of prompt strings.
        config: search configuration; ``agg_strategy`` is read here.
        llm, llm_tf, llm_tokenizer, prm: forwarded to ``_select_diverse_search``.

    Returns:
        Dict with the keys ``"completions"``, ``"pred"``,
        ``"completion_tokens"`` and ``"best_score"``. ``"completions"``,
        ``"pred"`` and ``"best_score"`` hold one entry per prompt, in the order
        of ``batch_of_prompts``; ``"completion_tokens"`` is left empty.

        ``"pred"`` is the completion with the highest aggregated score. When
        the beams carry no scores, the first completion is used and the
        matching ``"best_score"`` is ``None``. A prompt without any beam gets
        ``[]`` / ``None`` / ``None``.
    """
    beam_results = _select_diverse_search(batch_of_prompts, config, llm, llm_tf, llm_tokenizer, prm)

    # Group together alike beams
    grouped_results = defaultdict(list)
    for beam in beam_results:
        grouped_results[beam.prompt].append(beam)

    results = {"completions": [], "pred": [], "completion_tokens": [], "best_score": []}

    for p in batch_of_prompts:
        beams = grouped_results.get(p, [])
        completions = [b.current_text for b in beams]
        results["completions"].append(completions)

        if not beams:
            # Keep the per-prompt lists aligned instead of failing in argmax.
            results["best_score"].append(None)
            results["pred"].append(None)
            continue

        if all(b.all_scores for b in beams):
            agg_scores = [
                aggregate_scores(b.all_scores, config.agg_strategy) for b in beams
            ]
            best_idx = int(np.argmax(agg_scores))
            best_score = agg_scores[best_idx]
        else:
            # Unscored beams cannot be ranked; fall back to the first one.
            best_idx, best_score = 0, None

        results["best_score"].append(best_score)
        results["pred"].append(completions[best_idx])

    return results


In [24]:
# general params
config = Config()
config.n = 4

level = '1'
max_questions = 20
num_trials = 200
# Use at most max_questions, but never index past what this level provides.
num_questions = min(max_questions, len(data_by_levels[level]))
print(f"num_questions = {num_questions}")

# get batch of questions
batch_of_questions = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

# select search algo; algo_tag names the variant in the output file
search_name = 'best_of_n'
algo_type = 1
if search_name == 'best_of_n':
    if algo_type == 1:
        search_algo, algo_tag = best_of_n.best_of_n_v11, 'v11'
    else:
        search_algo, algo_tag = best_of_n.best_of_n_v12, 'v12'
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"Unknown search_name: {search_name!r}")
print(search_algo)

# run search_algo and save results, one JSON line per trial
result_dir = f"results/generate_bon_prm800k_level{level}_n{config.n}_{algo_tag}.jsonl"
os.makedirs(os.path.dirname(result_dir), exist_ok=True)
start_time = time.time()
with open(result_dir, 'w', encoding = 'utf-8') as fout:
    for trial_idx in range(num_trials):
        # Each trial gets its own seed: 10000 + trial_idx.
        results = search_algo(batch_of_questions, config, llm_vllm, 10000+trial_idx)
        json.dump(results, fout)
        fout.write('\n')
        # Push the finished trial to disk so an interrupted run keeps it.
        fout.flush()

        # running averages over the trials completed so far
        total_time = time.time() - start_time
        time_per_trial = total_time/(trial_idx+1)
        time_per_question = time_per_trial/num_questions
        print(f"trial {trial_idx}")
        print(f"it takes {time_per_question:0.4f}s per question")
        print(f"it takes {time_per_trial:0.4f}s per trial")

total_time = time.time() - start_time
print(f"it takes {total_time:0.4f}s in total")

num_trials = 50
num_questions = 43
<function best_of_n_v11 at 0x7f4a52bc1440>


KeyboardInterrupt: 