In [1]:
'''
Apply multiprocessing to  _select_diverse function to efficiently handle batches of beams across multiple questions.
'''

import os, psutil, gc
import time 
import json
import pprint
import copy 

from collections import defaultdict
import random
import numpy as np

import multiprocessing as mp

from dataclasses import dataclass

In [2]:
import torch 
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
import torch.distributed as dist
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config
from sal.search.utils import build_conv, generate_k_steps, last
from sal.utils.score import aggregate_scores

# from core import select_diverse_v31
from core.reward_models import RLHFFlow

from utils.load_data import load_data_prm800k

In [3]:
if torch.cuda.is_available():
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
else:
    print("CUDA is not available.")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

['0', '1', '2', '3']


In [4]:
# base_dir
base_dir = '/groups/kjun/tnn/datasets/'

# dataset path
data_dir = base_dir + "/prm800k/math_splits"

# llm and prm path
llm_dir = base_dir + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_dir = base_dir + "/Llama-3.2-1B-Instruct"
prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data"

In [5]:
#  load data 
data_by_levels = load_data_prm800k(data_dir)

# load random_seeds     
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

1: 43
2: 90
3: 105
4: 128
5: 134


In [6]:
# baseline: gpu_memory_utilization=0.2
# use the standard model 
llm_vllm = LLM(
        model = llm_tokenizer_dir,
        tensor_parallel_size=1,
        gpu_memory_utilization = 0.7,  # Utilize 50% of GPU memory
        # enable_prefix_caching=True,  # V100 doesn't support enable_prefix_caching 
        # enable_chunked_prefill=False, # and enable_chunked_prefill
        max_model_len = 5000,
        dtype = "float16",
        seed = 0)
    
    # # use the gguf quantized model 
    # llm_regular = LLM(
    #     model = llm_dir,
    #     tokenizer = llm_tokenizer_dir,
    #     tensor_parallel_size=1,
    #     gpu_memory_utilization = 0.2,  # Utilize 50% of GPU memory
    #     max_model_len = 5000,
    #     dtype = "float16",
    #     seed = 123)


gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))

INFO 05-15 17:31:23 __init__.py:207] Automatically detected platform cuda.
INFO 05-15 17:31:31 config.py:549] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 05-15 17:31:31 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 05-15 17:31:34 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 05-15 17:31:35 worker.py:267] Memory profiling takes 0.48 seconds
INFO 05-15 17:31:35 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.70) = 22.21GiB
INFO 05-15 17:31:35 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 18.62GiB.
INFO 05-15 17:31:35 executor_base.py:111] # cuda blocks: 38125, # CPU blocks: 8192
INFO 05-15 17:31:35 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 122.00x
INFO 05-15 17:31:36 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.27it/s]

INFO 05-15 17:31:52 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.13 GiB
INFO 05-15 17:31:52 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.84 seconds





#--- memory: 20.959694862365723
#--- memory: 0.0


In [7]:
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
llm_tf = AutoModelForCausalLM.from_pretrained(llm_tokenizer_dir).to("cuda:1")
# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 20.959694862365723


In [49]:
llm_tf.eval()
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'


In [9]:
prm = RLHFFlow(model_path=prm_tokenizer_dir, device_map='cuda:2')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
stop

NameError: name 'stop' is not defined

In [11]:

from core import diverse_reward_search
from core import diverse_reward_search_v12

import importlib


In [59]:
importlib.reload(diverse_reward_search)
importlib.reload(diverse_reward_search_v12)

# general params
config = Config()
config.n = 4
config.beam_width = 2
config.lookahead = 0
config.num_depths = 3
config.filter_duplicates = True

# diverse_select params
config.lam = 10
config.normalize_embeds = True

config.ds_beta = 0
config.ds_alpha = 1.0

level = 4
num_questions = len(data_by_levels[level])
num_questions = 2
num_trials = 1
print(f"num_questions = {num_questions}")
print(f"num_trials = {num_trials}")

# get batch of questions
batch_of_questions = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

torch.manual_seed(100000+0)
torch.cuda.manual_seed(100000+0)

start_time = time.time()
results = diverse_reward_search.diverse_search(batch_of_questions, config, llm_vllm, llm_tf, tokenizer, prm)
print(f"Time taken: {time.time() - start_time} seconds")

num_questions = 2
num_trials = 1
diverse_search_v12

-> 0

-> after filtered completions: len = 7
b_idx = 0
## Step 1: Express the first set of points in terms of a single variable.
Let's express $y$ and $z$ in terms of $x$ using the given equations: 
\[y = \frac{2}{3}x\]
\[z = -\frac{2}{3}x\]


b_idx = 1
## Step 1: Identify the direction vectors of the lines


b_idx = 2
## Step 1: Find the direction vectors of the two lines
To find the angle between the lines, we first need to find their direction vectors. The direction vector of the first line is (2, 3, -1) and the direction vector of the second line is (6, -1, -4).


b_idx = 3
## Step 1: Express the first equation in terms of y
From the given equation $2x = 3y$, we can express y in terms of x as $y = \frac{2}{3}x$.


b_idx = 4
## Step 1: Identify the graph's characteristics
The graph of $y = a \sin (bx + c) + d$ for some positive constants $a,$ $b,$ $c,$ and $d$ is a sinusoidal function with amplitude $a$, vertical shift $d$, and pe

In [19]:
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))

#--- memory: 6.1091742515563965


In [None]:
# print(len(beam_results))
# pprint.pprint(results)
for items in results['completions']:
    print(len(items))
# print(results['completions'])

In [None]:
# print(len(beam_results))
# pprint.pprint(results)
print(results['completions'])