In [1]:
import gc
import copy
import pprint
import json
import os, psutil
from collections import defaultdict

import time
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from dataclasses import dataclass

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams
from sal.models.reward_models import RLHFFlow

from sal.search.utils import Beam, build_conv, generate_k_steps, last
from sal.config import Config
from sal.models.reward_models import PRM
from sal.utils.score import aggregate_scores

In [3]:
cpu_percent = psutil.cpu_percent(interval=1)
print(f"CPU Usage: {cpu_percent}%")

# RAM usage
virtual_memory = psutil.virtual_memory()
print(f"Total RAM: {virtual_memory.total / (1024 ** 3):.2f} GB")
print(f"Available RAM: {virtual_memory.available / (1024 ** 3):.2f} GB")
print(f"Used RAM: {virtual_memory.used / (1024 ** 3):.2f} GB")
print(f"RAM Usage Percentage: {virtual_memory.percent}%")

if torch.cuda.is_available():
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
    for gpu_index in  GPUS:
        print(f"\n-> gpu {gpu_index}")
        gpu_index = int(gpu_index)
        # gpu_index = 0  # Change this if you have multiple GPUs
        total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
        reserved_memory = torch.cuda.memory_reserved(gpu_index)
        allocated_memory = torch.cuda.memory_allocated(gpu_index)
        free_memory = reserved_memory - allocated_memory
    
        print(f"Total GPU Memory: {total_memory / 1024 ** 3:.2f} GB")
        print(f"Allocated GPU Memory: {allocated_memory / 1024 ** 3:.2f} GB")
        print(f"Available GPU Memory: {free_memory / 1024 ** 3:.2f} GB")
else:
    print("CUDA is not available.")

CPU Usage: 0.1%
Total RAM: 503.68 GB
Available RAM: 475.61 GB
Used RAM: 14.14 GB
RAM Usage Percentage: 5.6%
['0', '1', '2', '3']

-> gpu 0
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 1
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 2
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 3
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB


In [4]:
# base_path
base_path = '/groups/kjun/tnn/datasets/'

# dataset path
dataset_path = base_path + "/prm800k/math_splits"

# llm and prm path
llm_path = base_path + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_path = base_path + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_path = base_path + "/Llama-3.2-1B-Instruct"
prm_tokenizer_path = base_path + "/Llama3.1-8B-PRM-Deepseek-Data"

In [5]:
# baseline: gpu_memory_utilization=0.2
vllm_inference = LLM(
    model = llm_tokenizer_path,
    gpu_memory_utilization = 0.2,  # Utilize 50% of GPU memory
    max_model_len = 5000,
    dtype = "float16",
    seed = 123)

gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

INFO 03-19 13:41:38 __init__.py:207] Automatically detected platform cuda.
INFO 03-19 13:41:45 config.py:549] This model supports multiple tasks: {'classify', 'reward', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 03-19 13:41:45 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-19 13:41:49 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 03-19 13:41:49 worker.py:267] Memory profiling takes 0.50 seconds
INFO 03-19 13:41:49 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.20) = 6.35GiB
INFO 03-19 13:41:49 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 2.75GiB.
INFO 03-19 13:41:50 executor_base.py:111] # cuda blocks: 5631, # CPU blocks: 8192
INFO 03-19 13:41:50 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 18.02x
INFO 03-19 13:41:51 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:16<00:00,  2.09it/s]

INFO 03-19 13:42:08 model_runner.py:1562] Graph capturing finished in 17 secs, took 0.13 GiB
INFO 03-19 13:42:08 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 19.61 seconds





#--- memory: 5.084694862365723


In [6]:
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_path)
llm_transformer = AutoModelForCausalLM.from_pretrained(llm_tokenizer_path).to("cuda:1")
# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 5.084694862365723


In [7]:
question_text = "Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$"
input_batch = {"problem": [question_text]}

In [17]:
prm = RLHFFlow(prm_tokenizer_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
# general params
config = Config()
config.agg_strategy = 'last'
config.n = 4
config.lookahead = 0
config.num_iterations = 10

# diverse_select params
config.lam = 10
config.normalize_embeddings = True


def _embed_diverse_search(batch_of_prompts, config: Config, llm: LLM, llm_tf, llm_tokenizer, prm) -> list[Beam]:
    sampling_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        stop=["\n\n"],
        include_stop_str_in_output=True,
        n=1,
    )

    beams: list[Beam] = []
    for prompt in batch_of_prompts:
        for i in range(config.n):
            beams.append(
                Beam(
                    prompt=prompt,
                    index=i,
                    current_text="",
                    next_texts=None,
                    lookahead_texts=None,
                    pruned=False,
                    completed=False,  # New flag to track completion
                    stop_reasons=None,
                    history=[],
                    best_scores=[],
                    all_scores=[],
                    previous_text=None,
                    completion_tokens=0,
                )
            )

    completed_beams: list[Beam] = []

    # for i in tqdm(range(config.num_iterations), desc="Beam search iterations"):
    for i in range(config.num_iterations):
        if i == 0:
            active_beams = [b for b in beams if not b.pruned]
        else:
            active_beams = [b for b in active_beams if not b.pruned]

        # Duplicate active beams to ensure that we have config.n beams per iteration
        if len(active_beams) != config.n:
            repeats = (config.n // len(active_beams)) + 1
            print(
                f"Extending active_beams with {repeats} repetitions to reach size {config.n}"
            )
            extended_active_beams = [
                copy.deepcopy(b) for b in (active_beams * repeats)[: config.n]
            ]
            active_beams = extended_active_beams
            if len(active_beams) != config.n:
                raise ValueError(
                    f"Expected {config.n} active beams, but got {len(active_beams)}"
                )

        if i == config.num_iterations - 1:
            # Last iteration, generate to EOS
            sampling_params = SamplingParams(
                temperature=config.temperature,
                max_tokens=config.max_tokens,
                top_p=config.top_p,
                n=1,
            )

        convs = [
            build_conv(b.prompt, b.current_text, config.system_prompt)
            for b in active_beams
        ]
        continue_final_message = i > 0
        add_generation_prompt = i == 0

        tokenizer = llm.get_tokenizer()
        if config.custom_chat_template is not None:
            tokenizer.chat_template = config.custom_chat_template
        templated_convs = tokenizer.apply_chat_template(
            convs,
            add_generation_prompt=add_generation_prompt,
            continue_final_message=continue_final_message,
            tokenize=False,
        )
        lookahead = 0 if i == config.num_iterations - 1 else config.lookahead
        gen_results = generate_k_steps(
            templated_convs, lookahead, llm, sampling_params, 1
        )

        prompts, completions = [], []
        next_active_beams = []
        for beam, gen_result in zip(active_beams, gen_results, strict=True):
            beam.next_texts = gen_result.next_texts
            beam.stop_reasons = gen_result.stop_reasons
            beam.lookahead_texts = gen_result.lookahead_texts
            beam.completion_tokens += gen_result.completion_tokens
            beam.current_text += beam.next_texts[0]
            beam.history.append(beam.next_texts[0])

            if (
                beam.stop_reasons[0] == "EOS"
                or beam.stop_reasons[0] == "length"
                or beam.next_texts[0] == ""
            ):
                beam.completed = True
                completed_beams.append(beam)
                
            prompts.append(beam.prompt)
            completions.append([beam.current_text])

        scores = prm.score(prompts, completions)

        agg_scores = [
            [aggregate_scores(s, config.agg_strategy) for s in score]
            for score in scores
        ]

        for beam, score in zip(active_beams, scores, strict=True):
            beam.all_scores = score[0]

        # Now filter active_beams and agg_scores for beams that are completed
        agg_scores = [
            agg_scores[i] for i, b in enumerate(active_beams) if not b.completed
        ]
        active_beams = [b for b in active_beams if not b.completed]

        # Early stopping if all beams are completed
        if len(active_beams) == 0:
            break

        # get completion's embeddings 
        completions_embeds = []
        for beam in active_beams:
            with torch.no_grad():
                # get beam.current_text which include previous all steps upto now 
                inputs = llm_tokenizer(beam.current_text, return_tensors="pt").to(llm_tf.device)
                output = llm_tf(**inputs, output_hidden_states=True)
                # print(output)
                last_hidden_state = output.hidden_states[-1]
                last_token_embedding = last_hidden_state[:, -1, :].squeeze(0).detach().cpu().numpy()
                # print(last_hidden_state.shape)
                # print(last_token_embedding)
                
                # normalize the embeddings
                if config.normalize_embeddings:
                    norm = np.linalg.norm(last_token_embedding)
                    last_token_embedding /= norm
                    
                completions_embeds.append(last_token_embedding)
            
        V = config.lam*np.eye(2048)
        K = int(config.n / 2)
        selected_idxes = _select_diverse(completions_embeds, K, V)
        print(len(completions_embeds))
        print(selected_idxes)

        for idx, beam in enumerate(active_beams):
            if idx not in selected_idxes:
                beam.pruned = True

    # Filter completed beams for those with top config.n scores
    if config.sort_completed:
        completed_beams = sorted(
            completed_beams,
            key=lambda b: aggregate_scores(b.all_scores, config.agg_strategy),
            reverse=True,
        )[: config.n]
    else:
        completed_beams = completed_beams[: config.n]

    if len(completed_beams) != config.n:
        # If we don't have enough completed_beams, duplicate until we reach config.n
        repeats = (config.n // len(completed_beams)) + 1
        print(
            f"Extending completed_beams with {repeats} repetitions to reach size {config.n}"
        )
        extended_completed_beams = [
            copy.deepcopy(b) for b in (completed_beams * repeats)[: config.n]
        ]
        completed_beams = extended_completed_beams

    return completed_beams



In [25]:
def _select_diverse(embeds_list, K, V):
    num_arms = len(embeds_list)
    _V = copy.deepcopy(V)
    # S_embeds = copy.deepcopy(embeds_list)
    A_idxes = []
    A_embeds = []
    
    for it in range(K):
        max_val = -10
        max_idx = None
        max_embeds = None
        for arm_idx, arm_embed in enumerate(embeds_list):
            # print(arm_idx)
            # print(arm_embed.shape)
            
            if arm_idx in A_idxes:
                continue 

            # normalize the embeddings
            # norm = np.linalg.norm(arm_embed)
            # arm_embed /= norm
            
            # compute Mahalanobis norm
            arm_val = np.matmul(np.matmul(arm_embed, np.linalg.inv(_V)), arm_embed.T)
            # print(arm_val)
            if arm_val > max_val:
                max_val = arm_val
                max_idx = arm_idx
                max_embed = arm_embed

        # update V
        _V = _V + np.matmul(max_embed, max_embed.T)

        # update A 
        A_idxes.append(max_idx)

        # print(_V.shape)
        # print(max_val)
        # print(max_idx)
        # print(A_idxes)
    
    return A_idxes


beam_results = _embed_diverse_search(input_batch['problem'], config, vllm_inference, llm_transformer, tokenizer, prm)

4
[1, 2]
Extending active_beams with 3 repetitions to reach size 4
4
[2, 1]
Extending active_beams with 3 repetitions to reach size 4
4
[1, 2]
Extending active_beams with 3 repetitions to reach size 4
4
[0, 1]
Extending active_beams with 3 repetitions to reach size 4
4
[3, 1]
Extending active_beams with 3 repetitions to reach size 4
4
[1, 2]
Extending active_beams with 3 repetitions to reach size 4
2
[1, 0]
Extending active_beams with 3 repetitions to reach size 4


In [28]:
print(beam_results[0])

Beam(prompt='Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \\le \theta < 2 \\pi.$', index=2, current_text='## Step 1: Understand the conversion formulas\nTo convert from rectangular coordinates $(x, y)$ to polar coordinates $(r, \\theta)$, we use the formulas $r = \\sqrt{x^2 + y^2}$ and $\\theta = \\tan^{-1}\\left(\\frac{y}{x}\\right)$.\n\n## Step 2: Apply the conversion formulas\nWe have the point $(0, 3)$ in rectangular coordinates. We will substitute $x = 0$ and $y = 3$ into the formulas.\n\n## Step 3: Calculate the value of $r$\n$r = \\sqrt{0^2 + 3^2} = \\sqrt{0 + 9} = \\sqrt{9} = 3$.\n\n## Step 4: Calculate the value of $\\theta$\n$\\theta = \\tan^{-1}\\left(\\frac{3}{0}\\right)$\n\n## Step 5: Evaluate the arctangent\nSince $\\theta = \\tan^{-1}\\left(\\frac{3}{0}\\right)$ is undefined, we must use the fact that $\\tan^{-1}(0) = 0$ and the periodicity of the arctangent function. So $\\th

In [None]:
# del(llm_transformer)
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))