In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random
import numpy as np


import logging
import warnings

# Root logging at INFO so library messages show up in the cell output.
logging.basicConfig(level=logging.INFO)

# Notebook-local logger, pinned to INFO as well.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Hide every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

In [2]:
import torch 
# from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config
from sal.search import beam_search, best_of_n, dvts
from sal.utils.data import get_dataset, save_dataset
from sal.utils.parser import H4ArgumentParser
from sal.utils.score import score

# from sal.models.reward_models import load_prm
from core.reward_models import RLHFFlow

from datasets import Dataset, load_dataset

INFO:datasets:PyTorch version 2.5.1 available.


In [3]:
# Record the GPU ids usable by this process.
# GPUS is always bound (an empty list when CUDA is missing) so that any later
# reference does not fail with a NameError on a CPU-only machine.
if torch.cuda.is_available():
    visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible_devices is None:
        # With the variable unset every device is visible, so enumerate them
        # instead of assuming that only device "0" exists.
        GPUS = [str(index) for index in range(torch.cuda.device_count())]
    else:
        GPUS = visible_devices.split(',')
    print(GPUS)
else:
    GPUS = []
    print("CUDA is not available.")

['0', '1', '2', '3']


In [4]:
# Root directory that holds the dataset cache, model files and tokenizers.
base_dir = '/groups/kjun/tnn/datasets/'

# Every path below is built with os.path.join: base_dir already ends with "/",
# so plain string concatenation with "/name" produced "datasets//name".

# dataset path (used as the datasets cache directory)
data_dir = os.path.join(base_dir, "math500")

# GGUF-quantized llm and prm files
llm_dir = os.path.join(
    base_dir, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
)
prm_dir = os.path.join(
    base_dir,
    "Llama3.1-8B-PRM-Deepseek-Data-GGUF",
    "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf",
)

# directories of the matching non-GGUF checkpoints / tokenizers
llm_tokenizer_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct")
prm_tokenizer_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data")

In [5]:
# Start from the default Config and apply the notebook-specific overrides,
# in the order listed.
config = Config()
overrides = {
    "n": 4,
    "approach": "beam_search",
    "search_batch_size": 1,
    "sort_completed": True,
    "filter_duplicates": True,
    # Intended sample cap; drop this entry to run on the whole dataset.
    # NOTE(review): the only visible use of num_samples is a commented-out
    # `select` in the data-loading cell, so it may currently have no effect.
    "num_samples": 10,
    "seed": 0,
}
for option_name, option_value in overrides.items():
    setattr(config, option_name, option_value)

# Directory assigned to config.output_dir.
result_dir = "results/"
config.output_dir = result_dir
# pprint.pprint(config)

In [6]:
# Value matched against each example's 'level' field.
level = 2

# Load the configured dataset split (cached under data_dir) and keep only the
# examples whose 'level' equals `level`.
# data_by_levels = load_data_prm800k(data_dir)
dataset = load_dataset(
    config.dataset_name,
    split=config.dataset_split,
    cache_dir=data_dir,
).filter(lambda row: row['level'] == level)
print(len(dataset))

# Optional subsetting to config.num_samples examples (currently disabled):
# dataset = dataset.select(range(min(len(dataset), config.num_samples)))

# Optional: load random seeds from disk (currently disabled):
# random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
# random_seeds = [int(seed) for seed in random_seeds]

90


In [7]:
# Generator LLM served through vLLM, loaded from the standard (non-GGUF)
# checkpoint directory. A baseline run used gpu_memory_utilization=0.2.
llm_vllm = LLM(
    model=llm_tokenizer_dir,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.7,  # fraction of the GPU's memory vLLM may use
    # enable_prefix_caching=True,   # V100 doesn't support enable_prefix_caching
    # enable_chunked_prefill=False, # and enable_chunked_prefill
    max_model_len=5000,
    dtype="float16",
    seed=123,
)

# Alternative setup using the GGUF-quantized model (disabled):
# llm_regular = LLM(
#     model = llm_dir,
#     tokenizer = llm_tokenizer_dir,
#     tensor_parallel_size=1,
#     gpu_memory_utilization = 0.2,
#     max_model_len = 5000,
#     dtype = "float16",
#     seed = 123)

# Release unreferenced Python objects and cached CUDA blocks, then report the
# memory currently allocated by PyTorch on devices 0 and 1, in GiB.
gc.collect()
torch.cuda.empty_cache()
for device_index in (0, 1):
    print('#--- memory:', torch.cuda.memory_allocated(device_index)/(1024**3))

INFO 04-10 19:08:27 __init__.py:207] Automatically detected platform cuda.
INFO 04-10 19:08:34 config.py:549] This model supports multiple tasks: {'embed', 'classify', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 04-10 19:08:34 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-10 19:08:38 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 04-10 19:08:39 worker.py:267] Memory profiling takes 0.49 seconds
INFO 04-10 19:08:39 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.70) = 22.21GiB
INFO 04-10 19:08:39 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 18.62GiB.
INFO 04-10 19:08:39 executor_base.py:111] # cuda blocks: 38125, # CPU blocks: 8192
INFO 04-10 19:08:39 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 122.00x
INFO 04-10 19:08:40 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.26it/s]

INFO 04-10 19:08:56 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.13 GiB
INFO 04-10 19:08:56 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.99 seconds





#--- memory: 20.959694862365723
#--- memory: 0.0


In [8]:
# Build the reward model from the PRM checkpoint directory and place it on
# cuda:1 (the vLLM generator above reports its memory on device 0).
prm = RLHFFlow(
    model_path=prm_tokenizer_dir,
    device_map='cuda:1',
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Choose the search routine from config.approach rather than hard-coding it.
# The config cell sets approach = "beam_search" and the upload cell tags the
# results as "beam-...", so a hard-coded best_of_n here would publish results
# under a label that does not match the algorithm that produced them.
APPROACHES = {
    "beam_search": beam_search,
    "best_of_n": best_of_n,
    "dvts": dvts,
}
if config.approach not in APPROACHES:
    raise ValueError(
        f"Unknown approach {config.approach!r}; expected one of {sorted(APPROACHES)}"
    )
approach_fn = APPROACHES[config.approach]

# Run the search over the dataset in batches of config.search_batch_size.
# load_from_cache_file=False forces a fresh run instead of reusing a cached
# result of a previous map call.
dataset = dataset.map(
    approach_fn,
    batched=True,
    batch_size=config.search_batch_size,
    fn_kwargs={"config": config, "llm": llm_vllm, "prm": prm},
    desc="Running search",
    load_from_cache_file=False,
)

Running search:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
# Pretty-print every example of the searched dataset, one after another.
printer = pprint.PrettyPrinter()
for data in dataset:
    printer.pprint(data)

In [None]:
# Pass the search results through sal's `score` helper and rebind `dataset`
# to whatever it returns.
dataset = score(
    dataset,
    config,
)

In [None]:
# Config name under which this run is stored: it encodes config.n and the
# selected difficulty level, plus a fixed "beam" prefix and "v11" suffix.
revision = f"beam-n{config.n}-level{level}-v11"
print(revision)

# Target dataset repository on the Hugging Face Hub.
dataset_id = "tnguyen9210/LLM-Reasoning-Math-500"

# Upload the dataset as the configuration named by `revision`.
dataset.push_to_hub(dataset_id, config_name=revision)

# Local alternative (disabled):
# save_dataset(dataset, config)