In [1]:
import os, psutil
import time 
import gc

import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams
from transformers import BitsAndBytesConfig


In [2]:
cpu_percent = psutil.cpu_percent(interval=1)
print(f"CPU Usage: {cpu_percent}%")

# RAM usage
virtual_memory = psutil.virtual_memory()
print(f"Total RAM: {virtual_memory.total / (1024 ** 3):.2f} GB")
print(f"Available RAM: {virtual_memory.available / (1024 ** 3):.2f} GB")
print(f"Used RAM: {virtual_memory.used / (1024 ** 3):.2f} GB")
print(f"RAM Usage Percentage: {virtual_memory.percent}%")

if torch.cuda.is_available():
    GPUS = os.environ.get('CUDA_VISIBLE_DEVICES', "0").split(',')
    print(GPUS)
    for gpu_index in  GPUS:
        print(f"\n-> gpu {gpu_index}")
        gpu_index = int(gpu_index)
        # gpu_index = 0  # Change this if you have multiple GPUs
        total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
        reserved_memory = torch.cuda.memory_reserved(gpu_index)
        allocated_memory = torch.cuda.memory_allocated(gpu_index)
        free_memory = reserved_memory - allocated_memory
    
        print(f"Total GPU Memory: {total_memory / 1024 ** 3:.2f} GB")
        print(f"Allocated GPU Memory: {allocated_memory / 1024 ** 3:.2f} GB")
        print(f"Available GPU Memory: {free_memory / 1024 ** 3:.2f} GB")
else:
    print("CUDA is not available.")

CPU Usage: 0.1%
Total RAM: 503.68 GB
Available RAM: 475.77 GB
Used RAM: 14.14 GB
RAM Usage Percentage: 5.5%
['0', '1', '2', '3']

-> gpu 0
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 1
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 2
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB

-> gpu 3
Total GPU Memory: 31.73 GB
Allocated GPU Memory: 0.00 GB
Available GPU Memory: 0.00 GB


In [2]:
# base_path
base_path = '/groups/kjun/tnn/datasets/'

# dataset path
dataset_path = base_path + "/prm800k/math_splits"

# llm and prm path
llm_path = base_path + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_path = base_path + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_path = base_path + "/Llama-3.2-1B-Instruct"
prm_tokenizer_path = base_path + "/Llama3.1-8B-PRM-Deepseek-Data"

In [3]:
# prompt = 'Below is the graph of $y = a \\sin (bx + c) + d$ for some positive constants $a,$ $b,$ $c,$ and $d.$  Find the smallest possible value of $c.$\n\n[asy]import TrigMacros;\n\nsize(400);\n\nreal f(real x)\n{\n\treturn 2*sin(3*x + pi) + 1;\n}\n\ndraw(graph(f,-3*pi,3*pi,n=700,join=operator ..),red);\ntrig_axes(-3*pi,3*pi,-4,4,pi/2,1);\nlayer();\nrm_trig_labels(-5,5, 2);\n\nlabel("$1$", (0,1), E);\nlabel("$2$", (0,2), E);\nlabel("$3$", (0,3), E);\nlabel("$-1$", (0,-1), E);\nlabel("$-2$", (0,-2), E);\nlabel("$-3$", (0,-3), E);\n[/asy]' 
prompt =  'If $f(x) = \frac{3x-2}{x-2}$, what is the value of $f(-2) +f(-1)+f(0)$? Express your answer as a common fraction.'
max_new_tokens = 1024 
num_runs = 10

In [4]:
def measure_inference(model, tokenizer, prompt, max_new_tokens, num_runs, use_vllm=False):
    total_time = 0
    total_tokens = 0
    for _ in range(num_runs):
        start_time = time.time()
        if use_vllm:
            # vLLM generates text directly from the prompt
            sampling_params = SamplingParams(temperature=0.8, max_tokens=max_new_tokens, top_p=1.0, n=1, seed=123)
            output = model.generate(prompt, sampling_params)
            generated_text = output[0].outputs[0].text
            generated_ids = output[0].outputs[0].token_ids
            num_tokens = len(generated_ids)
        else:
            # Transformers requires tokenization
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            output = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
            num_tokens = len(tokenizer.encode(generated_text))
            
        end_time = time.time()
        total_time += (end_time - start_time)
        # total_tokens += len(tokenizer.encode(generated_text)) if not use_vllm else max_new_tokens
        total_tokens += num_tokens
        
    latency = total_time / num_runs
    throughput = total_tokens / total_time
    avg_tokens = total_tokens / num_runs
    return latency, throughput, avg_tokens, generated_text

In [14]:
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 0.0079345703125


In [21]:
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_path)

# model_regular = AutoModelForCausalLM.from_pretrained(
#     llm_tokenizer_path, device_map='cuda:0')

model_regular = AutoModelForCausalLM.from_pretrained(
    llm_path, device_map='cuda:0')

# quantized_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, 
#     bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
# model_regular = AutoModelForCausalLM.from_pretrained(
#     llm_tokenizer_path, quantization_config=quantized_config, device_map='cuda:0')

# model_id = "QuantFactory/Llama-3.2-1B-Instruct-GGUF"
# filename = "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
# model_regular = AutoModelForCausalLM.from_pretrained(
#     model_id, gguf_file=filename, device_map='cuda:0')

# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(2)/(1024**3))
print('#--- memory:', torch.cuda.memory_allocated(3)/(1024**3))

OSError: Incorrect path_or_model_id: '/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [8]:
latency_regular, throughput_regular, avg_tokens, generated_text = measure_inference(model_regular, tokenizer, prompt, max_new_tokens, num_runs)
print(f"Transformers (Regular) - Latency: {latency_regular:.4f}s, Throughput: {throughput_regular:.2f} tokens/s")

Transformers (Regular) - Latency: 6.6855s, Throughput: 79.87 tokens/s


In [None]:
print(avg_tokens)
print(generated_text)

In [20]:
del(tokenizer)
del(model_regular)
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 0.0079345703125


In [6]:
# baseline: gpu_memory_utilization=0.2
llm_regular = LLM(
    model = llm_tokenizer_path,
    gpu_memory_utilization = 0.2,  # Utilize 50% of GPU memory
    max_model_len = 5000,
    dtype = "float16",
    seed = 123)

gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

INFO 03-19 11:43:07 __init__.py:207] Automatically detected platform cuda.
INFO 03-19 11:43:15 config.py:549] This model supports multiple tasks: {'embed', 'score', 'generate', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 03-19 11:43:15 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_m

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-19 11:43:19 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 03-19 11:43:19 worker.py:267] Memory profiling takes 0.45 seconds
INFO 03-19 11:43:19 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.20) = 6.35GiB
INFO 03-19 11:43:19 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 2.75GiB.
INFO 03-19 11:43:19 executor_base.py:111] # cuda blocks: 5631, # CPU blocks: 8192
INFO 03-19 11:43:19 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 18.02x
INFO 03-19 11:43:21 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:13<00:00,  2.59it/s]

INFO 03-19 11:43:34 model_runner.py:1562] Graph capturing finished in 14 secs, took 0.13 GiB
INFO 03-19 11:43:34 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 15.86 seconds





#--- memory: 5.084694862365723


In [7]:
print(llm_regular.encode())

<vllm.engine.llm_engine.LLMEngine object at 0x7f01ceb13250>


In [15]:
latency_vllm_regular, throughput_vllm_regular, avg_tokens, generated_text = measure_inference(llm_regular, None, prompt, max_new_tokens, num_runs, use_vllm=True)
print(f"vLLM (Regular) - Latency: {latency_vllm_regular:.4f}s, Throughput: {throughput_vllm_regular:.2f} tokens/s")

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it, est. speed input: 16.58 toks/s, output: 212.79 toks/s]


ValueError: LLM.encode() is only supported for pooling models. Your model supports the 'pooling' runner, but is currently initialized for the 'generate' runner. Please initialize vLLM using `--task embed`, `--task classify`, `--task score` etc.

In [65]:
print(avg_tokens)
print(generated_text)

629.0
 

## Step 1: Plug in the value of x in the function f(x) to find f(-2)
To find the value of $f(-2)$, we plug in $x = -2$ into the function $f(x) = \frac{3x-2}{x-2}$. This gives us $f(-2) = \frac{3(-2)-2}{-2-2} = \frac{-6-2}{-4} = \frac{-8}{-4} = 2$.

## Step 2: Plug in the value of x in the function f(x) to find f(-1)
To find the value of $f(-1)$, we plug in $x = -1$ into the function $f(x) = \frac{3x-2}{x-2}$. This gives us $f(-1) = \frac{3(-1)-2}{-1-2} = \frac{-3-2}{-3} = \frac{-5}{-3} = \frac{5}{3}$.

## Step 3: Plug in the value of x in the function f(x) to find f(0)
To find the value of $f(0)$, we plug in $x = 0$ into the function $f(x) = \frac{3x-2}{x-2}$. This gives us $f(0) = \frac{3(0)-2}{0-2} = \frac{-2}{-2} = 1$.

## Step 4: Add up the values of f(-2), f(-1), and f(0)
Now that we have found the values of $f(-2)$, $f(-1)$, and $f(0)$, which are 2, $\frac{5}{3}$, and 1 respectively, we can add them up. $f(-2) + f(-1) + f(0) = 2 + \frac{5}{3} + 1$.

## Step 5: Calculate 

In [12]:
# del(llm_regular)
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 0.0394287109375


In [67]:
# increase gpu_memory_utilization=0.5
llm_regular = LLM(
    model = llm_tokenizer_path,
    gpu_memory_utilization = 0.7,  # Utilize 50% of GPU memory
    max_model_len = 5000,
    dtype = "float16",
    seed = 123)

gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

INFO 03-17 17:42:41 config.py:549] This model supports multiple tasks: {'reward', 'score', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 03-17 17:42:41 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=123, serve

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-17 17:42:44 model_runner.py:1115] Loading model weights took 2.3029 GB
INFO 03-17 17:42:44 worker.py:267] Memory profiling takes 0.38 seconds
INFO 03-17 17:42:44 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.70) = 22.21GiB
INFO 03-17 17:42:44 worker.py:267] model weights take 2.30GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.18GiB; the rest of the memory reserved for KV Cache is 18.73GiB.
INFO 03-17 17:42:44 executor_base.py:111] # cuda blocks: 38353, # CPU blocks: 8192
INFO 03-17 17:42:44 executor_base.py:116] Maximum concurrency for 5000 tokens per request: 122.73x
INFO 03-17 17:42:45 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.23it/s]

INFO 03-17 17:43:00 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.04 GiB
INFO 03-17 17:43:00 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 16.80 seconds





In [69]:
latency_vllm_regular, throughput_vllm_regular, avg_tokens, generated_text = measure_inference(llm_regular, None, prompt, max_new_tokens, num_runs, use_vllm=True)
print(f"vLLM (Regular) - Latency: {latency_vllm_regular:.4f}s, Throughput: {throughput_vllm_regular:.2f} tokens/s")

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.95s/it, est. speed input: 16.64 toks/s, output: 213.61 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it, est. speed input: 16.72 toks/s, output: 214.61 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it, est. speed input: 16.72 toks/s, output: 214.61 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it, est. speed input: 16.71 toks/s, output: 214.53 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it, est. speed input: 16.71 toks/s, output: 214.56 toks/s]

vLLM (Regular) - Latency: 2.9366s, Throughput: 214.19 tokens/s





In [70]:
print(avg_tokens)
print(generated_text)

629.0
 

## Step 1: Plug in the value of x in the function f(x) to find f(-2)
To find the value of $f(-2)$, we plug in $x = -2$ into the function $f(x) = \frac{3x-2}{x-2}$. This gives us $f(-2) = \frac{3(-2)-2}{-2-2} = \frac{-6-2}{-4} = \frac{-8}{-4} = 2$.

## Step 2: Plug in the value of x in the function f(x) to find f(-1)
To find the value of $f(-1)$, we plug in $x = -1$ into the function $f(x) = \frac{3x-2}{x-2}$. This gives us $f(-1) = \frac{3(-1)-2}{-1-2} = \frac{-3-2}{-3} = \frac{-5}{-3} = \frac{5}{3}$.

## Step 3: Plug in the value of x in the function f(x) to find f(0)
To find the value of $f(0)$, we plug in $x = 0$ into the function $f(x) = \frac{3x-2}{x-2}$. This gives us $f(0) = \frac{3(0)-2}{0-2} = \frac{-2}{-2} = 1$.

## Step 4: Add up the values of f(-2), f(-1), and f(0)
Now that we have found the values of $f(-2)$, $f(-1)$, and $f(0)$, which are 2, $\frac{5}{3}$, and 1 respectively, we can add them up. $f(-2) + f(-1) + f(0) = 2 + \frac{5}{3} + 1$.

## Step 5: Calculate 

In [35]:
sampling_params = SamplingParams(temperature=0.8, max_tokens=max_new_tokens, top_p=1.0, n=1)
output = llm_regular.generate(prompt, sampling_params)
generated_text = output[0].outputs[0].text

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it, est. speed input: 16.52 toks/s, output: 212.05 toks/s]


In [38]:
print(len(output[0].outputs[0].token_ids))

629
