In [1]:
import os

# Pin this kernel to GPU 1. The variable is set before vllm is imported so that
# it is already in place whenever CUDA gets initialised. A `!export ...` shell
# line is deliberately not used: it only changes the environment of a
# short-lived subshell and has no effect on the kernel process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import json
import time

import pandas as pd
from vllm import LLM, SamplingParams
#from vllm.model_executor.models.llama_ee import LlamaForCausalLMEE
#ModelRegistry.register_model("LlamaForCausalLMEE", LlamaForCausalLMEE)
# from huggingface_hub import notebook_login
# notebook_login()

INFO 03-24 11:43:27 __init__.py:183] Automatically detected platform cuda.


In [2]:
def _read_jsonl_prompts(input_file):
    """Return the ``"text"`` field of every non-blank line of a JSONL file."""
    prompts = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at the end of the file)
            # instead of crashing inside json.loads.
            if line.strip():
                prompts.append(json.loads(line)["text"])
    return prompts


def run_batch_inference(llm, input_file, output_file, batch_size=8, max_tokens=100, seed=42):
    """Generate completions for every prompt in a JSONL file and save per-request metrics.

    Each non-blank line of ``input_file`` must be a JSON object with a ``"text"``
    key. All prompts are passed to ``llm.generate`` in a single call; the first
    completion of each request and the request's timing metrics are written to
    ``output_file`` as CSV, one row per prompt. The overall output throughput
    (generated tokens divided by the duration of the ``generate`` call) is
    printed and repeated in every row of the ``throughput`` column.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)`` that returns
            one result per prompt with ``prompt``, ``outputs`` and ``metrics``
            attributes.
        input_file: Path of the JSONL file holding the prompts.
        output_file: Destination of the CSV. When it is a path, missing parent
            directories are created.
        batch_size: Accepted for backward compatibility but not used; the
            prompts are submitted in one ``generate`` call.
        max_tokens: Maximum number of tokens to generate per prompt.
        seed: Sampling seed passed to ``SamplingParams``.

    Returns:
        None. The results are written to ``output_file``.

    Raises:
        OSError: If ``input_file`` cannot be read or ``output_file`` cannot be
            written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line has no ``"text"`` key.
    """
    metric_fields = (
        "arrival_time",
        "last_token_time",
        "first_scheduled_time",
        "first_token_time",
        "time_in_queue",
        "finished_time",
        "scheduler_time",
        "model_forward_time",
        "model_execute_time",
    )

    prompts = _read_jsonl_prompts(input_file)

    # Make sure the destination directory exists *before* the expensive
    # generation, so a bad path cannot throw away a finished run.
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    sampling_params = SamplingParams(seed=seed, max_tokens=max_tokens)

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start_time

    rows = []
    generated_len = 0  # number of tokens generated
    for output in outputs:
        completion = output.outputs[0]
        generated_len += len(completion.token_ids)
        row = {
            "input_prompt": f"\"{output.prompt!r}\"",
            "generated_text": f"\"{completion.text!r}\"",
        }
        # A missing metrics object (or a missing field) is recorded as an empty
        # cell rather than aborting after generation has already completed.
        metrics = output.metrics
        for field in metric_fields:
            row[field] = getattr(metrics, field, None)
        rows.append(row)

    # Guard against a zero-length interval (e.g. nothing to generate).
    output_throughput = generated_len / elapsed if elapsed > 0 else float("nan")
    print(f"Overall output throughput: {output_throughput} tokens/second")

    # Explicit columns keep the CSV header stable even when there are no rows.
    df = pd.DataFrame(rows, columns=["input_prompt", "generated_text", *metric_fields])
    df["throughput"] = output_throughput
    df.to_csv(output_file, index=False)

In [3]:
# Short open-ended prompts used for the side-by-side generation cells below.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Sampling at temperature 0.8 is stochastic. A fixed per-request seed (the same
# value run_batch_inference uses) makes repeated runs comparable, so differences
# between the original model and llama-ee are not just sampling noise.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=500,
    seed=42,
)


In [4]:
# Engine used by the generation and batch-inference cells that reference `llm`.
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    max_model_len=1024,
    enforce_eager=True,
)

INFO 03-24 11:43:42 config.py:520] This model supports multiple tasks: {'embed', 'generate', 'classify', 'reward', 'score'}. Defaulting to 'generate'.
INFO 03-24 11:43:42 llm_engine.py:232] Initializing an LLM engine (v0.7.0) with config: model='meta-llama/Llama-2-7b-chat-hf', speculative_config=None, tokenizer='meta-llama/Llama-2-7b-chat-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-2-7b-chat-hf,

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-24 11:43:51 model_runner.py:1115] Loading model weights took 12.5523 GB
INFO 03-24 11:43:52 worker.py:266] Memory profiling takes 0.89 seconds
INFO 03-24 11:43:52 worker.py:266] the current vLLM instance can use total_gpu_memory (15.77GiB) x gpu_memory_utilization (0.90) = 14.19GiB
INFO 03-24 11:43:52 worker.py:266] model weights take 12.55GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 0.79GiB; the rest of the memory reserved for KV Cache is 0.78GiB.
INFO 03-24 11:43:53 executor_base.py:108] # CUDA blocks: 99, # CPU blocks: 512
INFO 03-24 11:43:53 executor_base.py:113] Maximum concurrency for 1024 tokens per request: 1.55x
[CacheEngine._allocate_kv_cache] kv_cache_shape: (2, 99, 65536)
[CacheEngine._allocate_kv_cache] kv_cache_shape: (2, 512, 65536)
INFO 03-24 11:43:55 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 3.99 seconds


In [3]:
# Reference engine: the log below was captured while running the original llama
# code (llama_original.py).
original_llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    max_model_len=1024,
    enforce_eager=True,
)

INFO 03-23 16:34:57 config.py:520] This model supports multiple tasks: {'classify', 'reward', 'generate', 'embed', 'score'}. Defaulting to 'generate'.
INFO 03-23 16:34:57 llm_engine.py:232] Initializing an LLM engine (v0.7.0) with config: model='meta-llama/Llama-2-7b-chat-hf', speculative_config=None, tokenizer='meta-llama/Llama-2-7b-chat-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-2-7b-chat-hf,

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-23 16:35:06 model_runner.py:1115] Loading model weights took 12.5523 GB
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2048])
[LlamaModel forward] Input_ids shape: torch.Size([2048])
[LlamaForCausalLM forward] Model output shape: torch.Size([2048, 4096])
[compute_logits] Hidden states shape: torch.Size([2048, 4096])
INFO 03-23 16:35:07 worker.py:266] Memory profiling takes 0.83 seconds
INFO 03-23 16:35:07 worker.py:266] the current vLLM instance can use total_gpu_memory (15.77GiB) x gpu_memory_utilization (0.90) = 14.19GiB
INFO 03-23 16:35:07 worker.py:266] model weights take 12.55GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 0.31GiB; the rest of the memory reserved for KV Cache is 1.25GiB.
INFO 03-23 16:35:07 executor_base.py:108] # CUDA blocks: 160, # CPU blocks: 512
INFO 03-23 16:35:07 executor_base.py:113] Maximum concurrency for 1024 tokens per request: 2.50x
INFO 03-23 16:35:10 llm_engine.py:429] init engine (profile, create kv cache, 

In [None]:
# Output of the original llama, kept for reference.
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the first completion returned for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

Processed prompts: 100%|██████████| 4/4 [00:00<00:00,  8.39it/s, est. speed input: 56.62 toks/s, output: 134.20 toks/s]

Prompt: 'Hello, my name is', Generated text: " Sherry and I'm a 35-year-old woman from"
Prompt: 'The president of the United States is', Generated text: ' a member of Congress, which means that he or she is subject to the same'
Prompt: 'The capital of France is', Generated text: ' Paris. This is a fact that is well known and widely accepted. However,'
Prompt: 'The future of AI is', Generated text: ' likely to be shaped by a combination of technological advancements, soci'





In [None]:
# Output of llama-ee (the current llama.py is the llama-ee version).
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the first completion returned for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

input_ids shape before converting: torch.Size([27])
input_ids shape after converting: torch.Size([1, 27])
LlamaModel forward. input_ids shape: torch.Size([1, 27])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 27])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [0, 2]
skip_mask: False, conf: 0.0088043212890625
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [0, 3]
skip_mask: False, conf: 0.1258544921875
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
a

Processed prompts: 100%|██████████| 4/4 [00:00<00:00,  4.47it/s, est. speed input: 30.20 toks/s, output: 71.59 toks/s]

--------EE statistics---------
self.exited_rates: [2, 11]
skip_mask: False, conf: 0.210693359375
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [2, 12]
skip_mask: False, conf: 0.394287109375
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [2, 13]
skip_mask: False, conf: 0.008636474609375
--------EE statistics---------
input_ids shape before converting: to




In [4]:
# Benchmark the reference engine on lmsys_1.jsonl and store its per-request metrics.
run_batch_inference(
    llm=original_llm,
    input_file="lmsys_1.jsonl",
    output_file="results/lmsys_1_original.csv",
)

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[LlamaForCausalLM forward] Input_ids shape: torch.Size([65])
[LlamaModel forward] Input_ids shape: torch.Size([65])
[LlamaForCausalLM forward] Model output shape: torch.Size([65, 4096])
[compute_logits] Hidden states shape: torch.Size([65, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forwar

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it, est. speed input: 25.81 toks/s, output: 79.40 toks/s]

[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] M




In [9]:
# Benchmark `llm` on the same lmsys_1.jsonl prompts and store its per-request metrics.
run_batch_inference(
    llm=llm,
    input_file="lmsys_1.jsonl",
    output_file="results/lmsys_1.csv",
)

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.0696, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.0013, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.0415, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([False,  True], device='cuda:0')
tensor(0.4463, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: True
[get_skip_mask] mask: tensor([ True, False], device='cuda:0')
tensor(0.4592, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: True
[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.1172, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.1113, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it, est. speed input: 26.84 toks/s, output: 82.57 toks/s]

[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.0158, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([ True, False], device='cuda:0')
tensor(0.3352, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: True
[get_skip_mask] mask: tensor([False,  True], device='cuda:0')
tensor(0.4626, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: True
[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.1970, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([False, False], device='cuda:0')
tensor(0.2271, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: False
[get_skip_mask] mask: tensor([False,  True], device='cuda:0')
tensor(0.5024, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: True
[get_skip_mask] mask: tensor([False,  True], device='cuda:0')
tensor(0.3916, device='cuda:0', dtype=torch.float16)
[get_skip_mask] mask: True
[ge


