In [1]:
import os, psutil, gc
import time 
import json
import pprint
import copy 

from collections import defaultdict
import random
import numpy as np

import multiprocessing as mp

from dataclasses import dataclass

In [2]:
import torch 
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
import torch.distributed as dist
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config
from sal.search.utils import build_conv, generate_k_steps, last

from core import select_diverse
from utils.load_data import load_data_prm800k

In [3]:
# Record the GPU ids visible to this kernel.
# GPUS is always bound (empty list without CUDA) so later cells can read it safely.
if torch.cuda.is_available():
    visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible_devices:
        GPUS = visible_devices.split(',')
    else:
        # No device mask set: every device torch can enumerate is usable.
        GPUS = [str(device_idx) for device_idx in range(torch.cuda.device_count())]
    print(GPUS)
else:
    GPUS = []
    print("CUDA is not available.")

['0', '1', '2', '3']


In [4]:
# Root directory that holds every dataset and model checkpoint used below.
base_dir = '/groups/kjun/tnn/datasets/'

# Dataset path (PRM800K MATH splits).
# os.path.join avoids the doubled "//" that plain concatenation produced,
# because base_dir already ends with a slash.
data_dir = os.path.join(base_dir, "prm800k", "math_splits")

# GGUF-quantized weights for the generator LLM and the process reward model (PRM).
llm_dir = os.path.join(
    base_dir, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf")
prm_dir = os.path.join(
    base_dir, "Llama3.1-8B-PRM-Deepseek-Data-GGUF", "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf")

# Unquantized checkpoints; these directories also provide the tokenizers.
llm_tokenizer_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct")
prm_tokenizer_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data")

In [5]:
# Load the PRM800K MATH problems found under data_dir.
# Later cells index the result as data_by_levels[level][q_idx]['problem'].
# NOTE(review): the per-level counts in the recorded output are presumably printed
# by load_data_prm800k itself — confirm against utils.load_data.
data_by_levels = load_data_prm800k(data_dir)

# Optional: load a fixed list of per-trial random seeds (currently disabled).
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

1: 43
2: 90
3: 105
4: 128
5: 134


In [6]:
# Generation engine: the standard (non-quantized) model, served by vLLM in float16.
# The author's baseline note records gpu_memory_utilization=0.2; this run lets vLLM
# claim 70% of the GPU instead.
# enable_prefix_caching / enable_chunked_prefill are left unset because the V100
# does not support them.
llm_vllm = LLM(
    model=llm_tokenizer_dir,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.7,
    max_model_len=5000,
    dtype="float16",
    seed=123,
)

# Alternative (disabled): the GGUF-quantized model paired with the original tokenizer.
# llm_regular = LLM(
#     model=llm_dir,
#     tokenizer=llm_tokenizer_dir,
#     tensor_parallel_size=1,
#     gpu_memory_utilization=0.2,
#     max_model_len=5000,
#     dtype="float16",
#     seed=123)

# Drop unreferenced objects and cached CUDA blocks, then report the memory
# (in GiB) that torch has allocated on devices 0 and 1.
gc.collect()
torch.cuda.empty_cache()
for device_idx in (0, 1):
    print('#--- memory:', torch.cuda.memory_allocated(device_idx)/(1024**3))

INFO 04-23 10:41:11 __init__.py:207] Automatically detected platform cuda.
INFO 04-23 10:41:18 config.py:549] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 04-23 10:41:18 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-23 10:41:22 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 04-23 10:41:22 worker.py:267] Memory profiling takes 0.49 seconds
INFO 04-23 10:41:22 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.70) = 22.21GiB
INFO 04-23 10:41:22 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 18.62GiB.
INFO 04-23 10:41:23 executor_base.py:111] # cuda blocks: 38125, # CPU blocks: 8192
INFO 04-23 10:41:23 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 122.00x
INFO 04-23 10:41:24 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.31it/s]

INFO 04-23 10:41:39 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.13 GiB
INFO 04-23 10:41:39 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.61 seconds





#--- memory: 20.959694862365723
#--- memory: 0.0


In [7]:
# Hugging Face copy of the same checkpoint, placed on the second GPU (cuda:1).
# The search cell below calls it with output_hidden_states=True to obtain
# hidden states and logits.
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
llm_tf = AutoModelForCausalLM.from_pretrained(llm_tokenizer_dir)
llm_tf = llm_tf.to("cuda:1")

# Free cached blocks and report the memory (GiB) torch has allocated on device 0.
gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 20.959694862365723


In [20]:
@dataclass
class Beam:
    q_idx: int
    question: str
    templated_conv: str
    current_text: str | None
    next_texts: list[str] | None
    lookahead_texts: list[str] | None
    stop_reasons: list[str | None] | None
    best_scores: list[float]  # the PRM scores
    all_scores: list[list[float]]  # all PRM scores
    previous_text: str | None
    pruned: False
    history: list[str]
    completed: bool = False
    completion_tokens: int = 0


def _select_diverse(X_embeds, X_lprobs, X_ppl, K, V):
    num_arms = len(X_embeds)
    _V = copy.deepcopy(V)
    A_idxes = []
    A_embeds = []
    tol = 0.0001
    for it in range(K):
        _V_inv = np.linalg.inv(_V)
        arm_vals = np.einsum('ij,jk,ik->i', X_embeds, _V_inv, X_embeds)
        max_val = np.max([val for idx, val in enumerate(arm_vals) if idx not in A_idxes])
        # candidate_idxes = np.where(np.abs(arm_vals-max_val) < tol)[0]
        candidate_idxes = [
            arm_idx for arm_idx, arm_val in enumerate(arm_vals)
            if (np.abs(max_val - arm_val) <= tol) and (arm_idx not in A_idxes)
        ]

        best_idx = max(candidate_idxes, key=lambda i: X_lprobs[i])
        # print(arm_vals)
        # print(X_lprobs)
        # print(candidate_idxes)
        # print(best_idx)
        
        best_embeds = X_embeds[best_idx]
        best_embeds = best_embeds.reshape(-1, 1)
        # print(best_embeds.shape)

        # update V
        _V = _V + np.matmul(best_embeds, best_embeds.T)

        # update A
        A_idxes.append(best_idx)

        # print(_V.shape)
        # print(max_val)
        # print(max_idx)
        # print(A_idxes)

    return A_idxes


In [9]:
# NOTE(review): `stop` is not defined anywhere in view, so this cell raises NameError
# (see the recorded output). It appears to serve as a manual halt for "Run All" —
# confirm and remove before sharing the notebook.
stop

NameError: name 'stop' is not defined

In [24]:
def _replenish_beams(beams, n):
    """Top each question's surviving beams back up to ``n`` by cloning random survivors."""
    by_question = defaultdict(list)
    for beam in beams:
        by_question[beam.q_idx].append(beam)

    refilled = []
    for q_idx in sorted(by_question):
        survivors = by_question[q_idx]
        refilled.extend(survivors)
        deficit = n - len(survivors)
        if deficit > 0:
            picks = np.random.randint(0, len(survivors), size=deficit)
            # Deep copies: every clone must extend its own current_text; sharing
            # one Beam object would append each new step to it several times.
            refilled.extend(copy.deepcopy(survivors[int(pick)]) for pick in picks)
    return refilled


def _embed_and_score(text, llm_tf, llm_tokenizer, normalize):
    """Return (last-token embedding, summed token log-prob, perplexity) of ``text`` under ``llm_tf``."""
    with torch.no_grad():
        inputs = llm_tokenizer(text, return_tensors="pt").to(llm_tf.device)
        outputs = llm_tf(**inputs, output_hidden_states=True)

        # Final-layer hidden state of the last token.
        last_hidden_state = outputs.hidden_states[-1]
        embeds = last_hidden_state[:, -1, :].squeeze(0).detach().cpu().numpy()

        # Next-token targets: the input ids shifted by one position. The sum
        # therefore covers every token of ``text`` after the first one.
        labels = inputs['input_ids'][:, 1:]
        shifted_logits = outputs.logits[:, :-1, :]
        nll_sum = CrossEntropyLoss(reduction='sum')(
            shifted_logits.reshape(-1, shifted_logits.size(-1)), labels.reshape(-1)
        )
        log_prob = -float(nll_sum.detach().cpu())

        # Perplexity = exp(mean negative log-likelihood per predicted token).
        num_predicted = labels.numel()
        ppl = float(np.exp(-log_prob / max(num_predicted, 1)))

    if normalize:
        norm = np.linalg.norm(embeds)
        if norm > 0:  # leave an all-zero vector untouched instead of producing NaNs
            embeds = embeds / norm

    return embeds, log_prob, ppl


def _select_diverse_search(batch_of_questions, config: Config, llm_vllm: LLM, llm_tf, llm_tokenizer,
                           verbose: bool = True) -> dict:
    """Step-wise search that keeps a diverse subset of beams after every step.

    Every question starts with ``config.n`` beams. In each iteration all active
    beams generate one more step with ``llm_vllm`` (up to the next blank line; the
    final iteration generates without a stop string). Finished beams are set
    aside. The remaining beams are embedded with ``llm_tf`` and, per question,
    ``_select_diverse`` keeps ``int(config.n / config.beam_width)`` of them; the
    rest are pruned. Before the next iteration each question is topped back up to
    ``config.n`` beams with copies of randomly chosen survivors (``np.random``).

    Args:
        batch_of_questions: list of question strings.
        config: search settings. Reads ``temperature``, ``max_tokens``, ``top_p``,
            ``n``, ``beam_width``, ``num_iterations``, ``lookahead``,
            ``system_prompt``, ``custom_chat_template``, ``lam`` and
            ``normalize_embeds``.
        llm_vllm: vLLM engine used for generation and chat templating.
        llm_tf: Hugging Face causal LM used for embeddings and log-probabilities.
        llm_tokenizer: tokenizer paired with ``llm_tf``.
        verbose: print progress and the text of every beam (default ``True``).

    Returns:
        ``defaultdict(list)`` whose ``"completions"`` entry holds, per question,
        the ``current_text`` of every completed beam.
    """
    log = print if verbose else (lambda *args, **kwargs: None)
    num_questions = len(batch_of_questions)

    results = defaultdict(list)
    if num_questions == 0:
        results["completions"] = []
        return results

    # Intermediate steps stop at the first blank line; the last one runs until
    # the model stops on its own or hits max_tokens.
    step_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        stop=["\n\n"],
        include_stop_str_in_output=True,
        n=1,
    )
    final_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        n=1,
    )

    # Chat templating uses the vLLM engine's tokenizer; set it up once.
    tokenizer = llm_vllm.get_tokenizer()
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template

    # Number of beams that survive the selection step for each question.
    num_keep = int(config.n / config.beam_width)

    completed_beams: list[Beam] = []
    active_beams: list[Beam] = [
        Beam(
            q_idx=q_idx,
            question=question,
            templated_conv="",
            current_text="",
            next_texts=None,
            lookahead_texts=None,
            pruned=False,
            completed=False,
            stop_reasons=None,
            history=[],
            best_scores=[],
            all_scores=[],
            previous_text=None,
            completion_tokens=0,
        )
        for q_idx, question in enumerate(batch_of_questions)
        for _ in range(config.n)
    ]

    for it in range(config.num_iterations):
        log(f"\n-> {it}")
        is_last = it == config.num_iterations - 1

        if it > 0:
            # Apply the previous round's selection, then refill per question.
            survivors = [b for b in active_beams if not b.pruned]
            active_beams = _replenish_beams(survivors, config.n)
            if not active_beams:
                log("break")
                break

        log(f"initial: len = {len(active_beams)}")
        convs = [
            build_conv(b.question, b.current_text, config.system_prompt)
            for b in active_beams
        ]
        # First step opens the assistant turn; later steps continue it.
        templated_convs = tokenizer.apply_chat_template(
            convs,
            add_generation_prompt=(it == 0),
            continue_final_message=(it > 0),
            tokenize=False,
        )

        lookahead = 0 if is_last else config.lookahead
        sampling_params = final_params if is_last else step_params
        gen_results = generate_k_steps(
            templated_convs, lookahead, llm_vllm, sampling_params, 1
        )

        # Collect the generation results into the beams.
        for beam, gen_result in zip(active_beams, gen_results, strict=True):
            beam.next_texts = gen_result.next_texts
            beam.stop_reasons = gen_result.stop_reasons
            beam.lookahead_texts = gen_result.lookahead_texts
            beam.completion_tokens += gen_result.completion_tokens
            beam.current_text += gen_result.next_texts[0]
            beam.templated_prompt = gen_result.prompt

            if (
                beam.stop_reasons[0] == "EOS"
                or beam.stop_reasons[0] == "length"
                or beam.next_texts[0] == ""
            ):
                beam.completed = True
                completed_beams.append(beam)

        log(f"after generation: len = {len(active_beams)}")
        for b_idx, b in enumerate(active_beams):
            log(f"b_idx = {b_idx}")
            log(b.current_text)

        # Filter out completed beams.
        active_beams = [b for b in active_beams if not b.completed]
        log(f"after filtered completed beams: len = {len(active_beams)}")

        # Early stopping if all beams are completed.
        if not active_beams:
            log("break")
            break
        # Nothing is generated after the last iteration, so scoring would be wasted.
        if is_last:
            break

        # Embed and score every active beam, grouped by question.
        batch_embeds = [[] for _ in range(num_questions)]
        batch_log_probs = [[] for _ in range(num_questions)]
        batch_ppl = [[] for _ in range(num_questions)]
        batch_beams = [[] for _ in range(num_questions)]

        for beam in active_beams:
            # Prompt fed to the generator plus the step it just produced.
            gen_prompt = beam.templated_prompt + beam.next_texts[0]
            embeds, log_prob, ppl = _embed_and_score(
                gen_prompt, llm_tf, llm_tokenizer, config.normalize_embeds
            )
            batch_embeds[beam.q_idx].append(embeds)
            batch_log_probs[beam.q_idx].append(log_prob)
            batch_ppl[beam.q_idx].append(ppl)
            batch_beams[beam.q_idx].append(beam)

        log("after generation: batch_beams")
        log(len(batch_beams))
        log(len(batch_beams[0]))

        # Keep a diverse subset per question and mark the rest as pruned.
        for q_idx in range(num_questions):
            question_beams = batch_beams[q_idx]
            if len(question_beams) <= num_keep:
                continue

            # Size the regulariser from the embeddings instead of assuming a
            # fixed hidden size.
            embed_dim = batch_embeds[q_idx][0].shape[-1]
            V = config.lam * np.eye(embed_dim)

            selected_idxes = set(_select_diverse(
                batch_embeds[q_idx], batch_log_probs[q_idx], batch_ppl[q_idx], num_keep, V))

            for idx, beam in enumerate(question_beams):
                if idx not in selected_idxes:
                    beam.pruned = True

    # Collect the completions from the completed beams.
    completions = [[] for _ in range(num_questions)]
    for beam in completed_beams:
        completions[beam.q_idx].append(beam.current_text)

    results["completions"] = completions
    return results

# general params
config = Config()
config.n = 4               # beams started per question
config.beam_width = 2      # int(n / beam_width) beams survive each selection round
config.lookahead = 0
config.num_iterations = 10

# diverse_select params
config.lam = 10                  # the selection matrix starts as lam * identity
config.normalize_embeds = True   # L2-normalise embeddings before selection

level = 4
# Smoke-test size: a single question. Raise the cap (up to
# len(data_by_levels[level])) to run the whole level.
num_questions = min(1, len(data_by_levels[level]))
num_trials = 1
trial_idx = 0
print(f"num_questions = {num_questions}")
print(f"num_trials = {num_trials}")

# get batch of questions
batch_of_questions = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

# Seed every RNG in play so a re-run reproduces the same search.
seed = 100000 + trial_idx
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

results = _select_diverse_search(batch_of_questions, config, llm_vllm, llm_tf, tokenizer)

num_questions = 1
num_trials = 1

-> 0
initial: len = 4
after generation: len = 4
b_idx = 0
## Step 1:  To find the angle between two lines given by their parametric equations, we first need to identify the direction vectors of each line.
## Step 2:  The direction vector of the given line $6x = -y = -4z$ can be identified from its coefficients as $\vec{v} = \langle 6, -1, -4 \rangle$.
## Step 3:  For the first line $2x = 3y = -z$, we can identify its direction vector as $\vec{w} = \langle 2, \frac{1}{3}, -1 \rangle$.
## Step 4:  To calculate the angle between these two lines, we can use the dot product formula for vectors, which is $\vec{a} \cdot \vec{b} = |\vec{a}| |\vec{b}| \cos \theta$, where $\theta$ is the angle between the vectors.
## Step 5:  The magnitude of $\vec{v}$ is given by $|\vec{v}| = \sqrt{6^2 + (-1)^2 + (-4)^2} = \sqrt{36 + 1 + 16} = \sqrt{53}$.
## Step 6:  The magnitude of $\vec{w}$ is given by $|\vec{w}| = \sqrt{2^2 + \left(\frac{1}{3}\right)^2 + (-1)^2} = \sqrt{4 +

In [15]:
# For each question, print how many completions were collected, then each
# completion preceded by its index.
for q_idx, question_completions in enumerate(results["completions"]):
    print(len(question_completions))
    for c_idx, completion in enumerate(question_completions):
        print(f"\n-> c_idx = {c_idx}")
        print(completion)

4

-> c_idx = 0
## Step 1: Identify the direction vectors of the lines
The direction vector of the first line is given by $\mathbf{d}_1 = (2, 3, -1)$, and the direction vector of the second line is given by $\mathbf{d}_2 = (6, -1, -4)$.

## Step 2: Calculate the dot product of the direction vectors
The dot product of $\mathbf{d}_1$ and $\mathbf{d}_2$ is calculated as:
\[(2, 3, -1) \cdot (6, -1, -4) = (2)(6) + (3)(-1) + (-1)(-4) = 12 - 3 + 4 = 13.\]

## Step 3: Calculate the magnitudes of the direction vectors
The magnitude of $\mathbf{d}_1$ is calculated as:
\[\|\mathbf{d}_1\| = \sqrt{(2)^2 + (3)^2 + (-1)^2} = \sqrt{4 + 9 + 1} = \sqrt{14}.\]
The magnitude of $\mathbf{d}_2$ is calculated as:
\[\|\mathbf{d}_2\| = \sqrt{(6)^2 + (-1)^2 + (-4)^2} = \sqrt{36 + 1 + 16} = \sqrt{53}.\]

## Step 4: Calculate the cosine of the angle between the lines
The cosine of the angle $\theta$ between the lines is given by:
\[\cos \theta = \frac{\mathbf{d}_1 \cdot \mathbf{d}_2}{\|\mathbf{d}_1\| \|\mathbf{d}