In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random
import numpy as np

In [2]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config

INFO 03-23 00:31:44 [__init__.py:256] Automatically detected platform cuda.


In [3]:
!nvidia-smi topo -m

	[4mGPU0	GPU1	GPU2	GPU3	CPU Affinity	NUMA Affinity	GPU NUMA ID[0m
GPU0	 X 	NODE	SYS	SYS	1-24	0		N/A
GPU1	NODE	 X 	SYS	SYS	1-24	0		N/A
GPU2	SYS	SYS	 X 	NODE	49-72	1		N/A
GPU3	SYS	SYS	NODE	 X 	49-72	1		N/A

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks


In [4]:
# os.environ["VLLM_USE_V1"] = "0" 
# os.environ["CUDA_VISIBLE_DEVICES"]="0,3"
# !export NCCL_P2P_LEVEL=NVL
# os.environ['NCCL_P2P_LEVEL'] = 'NVL'
# os.environ['NCCL_P2P_DISABLE'] = '1'
# %env NCCL_P2P_DISABLE=1


In [5]:
print(os.environ.get('NCCL_P2P_DISABLE'))  # Should print '1'

None


In [6]:
# cpu_percent = psutil.cpu_percent(interval=1)
# print(f"CPU Usage: {cpu_percent}%")

# RAM usage
virtual_memory = psutil.virtual_memory()
print(f"Total RAM: {virtual_memory.total / (1024 ** 3):.2f} GB")
print(f"Available RAM: {virtual_memory.available / (1024 ** 3):.2f} GB")
print(f"Used RAM: {virtual_memory.used / (1024 ** 3):.2f} GB")
print(f"RAM Usage Percentage: {virtual_memory.percent}%")

if torch.cuda.is_available():
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
    for gpu_index in GPUS:
        print(f"\n-> gpu {gpu_index}")
        gpu_index = int(gpu_index)
        # gpu_index = 0  # Change this if you have multiple GPUs
        total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
        reserved_memory = torch.cuda.memory_reserved(gpu_index)
        allocated_memory = torch.cuda.memory_allocated(gpu_index)
        free_memory = reserved_memory - allocated_memory
    
        print(f"Total GPU Memory: {total_memory / 1024 ** 3:.2f} GB")
        print(f"Allocated GPU Memory: {allocated_memory / 1024 ** 3:.2f} GB")
        print(f"Available GPU Memory: {free_memory / 1024 ** 3:.2f} GB")
else:
    print("CUDA is not available.")

Total RAM: 503.68 GB
Available RAM: 475.77 GB
Used RAM: 14.14 GB
RAM Usage Percentage: 5.5%
['0', '1', '2', '3']

-> gpu 0
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 1
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 2
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 3
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB


In [7]:
# base_path
base_path = '/groups/kjun/tnn/datasets/'

# dataset path
dataset_path = base_path + "/prm800k/math_splits"

# llm and prm path
llm_path = base_path + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_path = base_path + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_path = base_path + "/Llama-3.2-1B-Instruct"
prm_tokenizer_path = base_path + "/Llama3.1-8B-PRM-Deepseek-Data"

In [8]:
# baseline: gpu_memory_utilization=0.2
# use the standard model 
llm_vllm = LLM(
    model = llm_tokenizer_path,
    tensor_parallel_size=1,
    gpu_memory_utilization = 0.5,  # Utilize 50% of GPU memory
    max_model_len = 10000,
    dtype = "float16",
    seed = 123)

# # use the gguf quantized model 
# llm_regular = LLM(
#     model = llm_path,
#     tokenizer = llm_tokenizer_path,
#     tensor_parallel_size=1,
#     gpu_memory_utilization = 0.2,  # Utilize 50% of GPU memory
#     max_model_len = 5000,
#     dtype = "float16",
#     seed = 123)


gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))

INFO 03-23 00:31:57 [config.py:583] This model supports multiple tasks: {'reward', 'generate', 'score', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 03-23 00:31:57 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.1) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=Fal

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-23 00:32:01 [loader.py:429] Loading weights took 1.36 seconds
INFO 03-23 00:32:01 [model_runner.py:1146] Model loading took 2.3185 GB and 1.459759 seconds
INFO 03-23 00:32:02 [worker.py:267] Memory profiling takes 0.59 seconds
INFO 03-23 00:32:02 [worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.50) = 15.87GiB
INFO 03-23 00:32:02 [worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 12.25GiB.
INFO 03-23 00:32:02 [executor_base.py:111] # cuda blocks: 25088, # CPU blocks: 8192
INFO 03-23 00:32:02 [executor_base.py:116] Maximum concurrency for 10000 tokens per request: 40.14x
INFO 03-23 00:32:04 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:14<00:00,  2.49it/s]

INFO 03-23 00:32:18 [model_runner.py:1570] Graph capturing finished in 14 secs, took 0.13 GiB
INFO 03-23 00:32:18 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 16.62 seconds





#--- memory: 14.584884643554688
#--- memory: 0.0


In [9]:
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))
# print('#--- memory:', torch.cuda.memory_allocated(2)/(1024**3))
# print('#--- memory:', torch.cuda.memory_allocated(3)/(1024**3))

#--- memory: 14.584884643554688
#--- memory: 0.0


In [10]:
data_by_levels = defaultdict(list)
with open(f"{dataset_path}/test.jsonl", 'r', encoding='utf-8') as filein:
    for line in filein:
        if line.strip():
            data = json.loads(line)
            # print(data['level'])
            data_by_levels[f"{data['level']}"].append(data)

    # data =  [json.loads(line) for line in filein if line.strip()]
    # pprint.pprint(data, compact=True)

for key in range(1,6):
    key = str(key)
    print(f"{key}: {len(data_by_levels[key])}")
    # pprint.pprint(data_by_levels[key][:2], compact=True)
# print(data_by_levels.keys())
# pprint.pprint(data_by_levels['2'], compact=True)

random_seeds = np.loadtxt("random_seeds.txt").astype("int64")
random_seeds = [int(seed) for seed in random_seeds]

1: 43
2: 90
3: 105
4: 128
5: 134


In [11]:

def test_best_of_n_v11(batch_of_prompts, config, llm_vllm, random_seed):
    convs = [
        [
            {"role": "system", "content": config.system_prompt},
            {"role": "user", "content": prompt},
        ]
        for prompt in batch_of_prompts
    ]
    
    tokenizer = llm_vllm.get_tokenizer()
    
    # TODO: set the augmented template from a file
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template
        
    templated_convs = tokenizer.apply_chat_template(
        convs, add_generation_prompt=True, tokenize=False,
    )

    # Duplicate convs to generate config.n completions per prompt so we can do continous batching
    # This makes [p1, p2, p3, p4] become [p1, p1, p2, p2, p3, p3, p4, p4] for e.g. config.n=2
    # templated_convs = [c for conv in templated_convs for c in [conv] * config.n]

    sampling_params = SamplingParams(
        temperature=config.temperature,
        # temperature=0,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        n=config.n,  # generate n outputs
        best_of=config.n,
        # stop=[
        #     "\n\n"
        # ],  # we consider that a step in the problem is indicated by a double newline
        # include_stop_str_in_output=True,
        seed=random_seed,
    )        

    # Generate responses 
    responses = llm_vllm.generate(
        templated_convs,
        sampling_params=sampling_params,
        use_tqdm=False,
    )

    # Re-generate responses if we get more responses than expected
    if len(responses) != len(batch_of_prompts):
        responses = llm_vllm.generate(
            templated_convs,
            sampling_params=sampling_params,
            use_tqdm=False,
        )
        assert len(responses) == len(batch_of_prompts), \
            f"Generated {len(responses)} responses instead of {len(batch_of_prompts)}"
    
    # Collect the completions from responses
    completions = [[] for _ in range(len(batch_of_prompts))]
    completion_ntokens = [[] for _ in range(len(batch_of_prompts))]

    # for r_idx, r in enumerate(responses):
    #     # print(r.request_id)
    #     if len(r.outputs) != config.n:
    #         raise ValueError(f"Generated {len(r.outputs)} completions instead of {config.n}")
            
    #     for output in r.outputs[:config.n]:
    #         print(output.text)
    #         # print(output.stop_reason)
    #         completions[r_idx].append(output.text)
    #         completion_ntokens[r_idx].append(len(output.token_ids))

    return 



In [12]:
def test_best_of_n_v12(batch_of_prompts, config, llm_vllm, random_seed):
    convs = [
        [
            {"role": "system", "content": config.system_prompt},
            {"role": "user", "content": prompt},
        ]
        for prompt in batch_of_prompts
    ]
    
    tokenizer = llm_vllm.get_tokenizer()
    
    # TODO: set the augmented template from a file
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template
        
    templated_convs = tokenizer.apply_chat_template(
        convs, add_generation_prompt=True, tokenize=False,
    )

    # Duplicate convs to generate config.n completions per prompt so we can do continous batching
    # This makes [p1, p2, p3, p4] become [p1, p1, p2, p2, p3, p3, p4, p4] for e.g. config.n=2
    templated_convs = [c for conv in templated_convs for c in [conv] * config.n]

    sampling_params = SamplingParams(
        temperature=config.temperature,
        # temperature=0,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        n=1,  # generate n outputs
        # stop=[
        #     "\n\n"
        # ],  # we consider that a step in the problem is indicated by a double newline
        # include_stop_str_in_output=True,
        seed=random_seed,
    )        

    # Generate responses 
    responses = llm_vllm.generate(
        templated_convs,
        sampling_params=sampling_params,
        use_tqdm=False,
    )

    # Re-generate responses if we get more responses than expected
    if len(responses) != len(batch_of_prompts) * config.n:
        responses = llm_vllm.generate(
            templated_convs,
            sampling_params=sampling_params,
            use_tqdm=False,
        )
        assert len(responses) == len(batch_of_prompts) * config.n, \
            f"Generated {len(responses)} responses instead of {len(batch_of_prompts)}"
    
    # Collect the completions from responses
    completions = [[] for _ in range(len(batch_of_prompts))]
    completion_ntokens = [[] for _ in range(len(batch_of_prompts))]

    # for i in range(len(completions)):
    #     completions[i] = [
    #         output.text
    #         for r in responses[i * config.n : (i + 1) * config.n]
    #         for output in r.outputs
    #     ]
    #     completion_ntokens[i] = [
    #         len(output.token_ids)
    #         for r in responses[i * config.n : (i + 1) * config.n]
    #         for output in r.outputs
    #     ]
    # print(responses)
    
    # for r_idx, r in enumerate(responses):
    #     idx = r_idx // config.n
    #     output = r.outputs[0]
    #     # print(output.text)
    #     completions[idx].append(output.text)
    #     completion_ntokens[idx].append(output.token_ids)

    # print(completions)

    return 

In [13]:
# general params
config = Config()
config.n = 128

level = '1'
num_questions = len(data_by_levels[level])
# num_questions = 2
num_trials = 1
print(f"num_questions = {num_questions}")

method_number = 2
if method_number == 1:
    test_method = test_best_of_n_v11
else:
    test_method = test_best_of_n_v12
print(test_method)

batch_of_prompts = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]
start_time = time.time()
for t_idx in range(num_trials):
    print(f"trial {t_idx}")
    # test_method(batch_of_prompts, config, llm_vllm, random_seeds[t_idx])
    test_method(batch_of_prompts, config, llm_vllm, 10000+t_idx)

    # compute the time
    total_time = time.time() - start_time
    time_per_trial = total_time/(t_idx+1)
    time_per_question = time_per_trial/num_questions
    print(f"it takes {time_per_question:0.4f}s per question")
    print(f"it takes {time_per_trial:0.4f}s for this trial")

print(f"it takes {total_time:0.4f}s in total")

num_questions = 43
<function test_best_of_n_v12 at 0x7f6d1649ff60>
trial 0
it takes 7.4377s per question
it takes 319.8194s for this trial
it takes 319.8194s in total
