In [1]:
from vllm import LLM, SamplingParams
import json
import pandas as pd
import time
import os
# Restrict this kernel to GPU index 1.
# Only the os.environ assignment is needed: a `!export ...` line runs in a
# temporary subshell that exits immediately, so it never changes the
# environment of the Python kernel process.
# NOTE(review): this executes after `from vllm import ...` above; it has to take
# effect before CUDA is initialized — consider moving it ahead of that import.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
#from vllm.model_executor.models.llama_ee import LlamaForCausalLMEE
#ModelRegistry.register_model("LlamaForCausalLMEE", LlamaForCausalLMEE)
# from huggingface_hub import notebook_login
# notebook_login()

INFO 03-24 14:47:46 __init__.py:183] Automatically detected platform cuda.


In [2]:
def run_batch_inference(llm, input_file, output_file, sampling_params=None):
    """Generate a completion for every prompt in a JSONL file and save per-request metrics as CSV.

    Args:
        llm: Engine object exposing ``generate(prompts, sampling_params)``. Each
            returned item must provide ``prompt``, ``outputs[0].text``,
            ``outputs[0].token_ids`` and a ``metrics`` object.
        input_file: Path to a JSONL file. Every non-blank line must be a JSON
            object with a ``"text"`` key holding the prompt.
        output_file: Destination of the CSV. When it is a path, missing parent
            directories are created.
        sampling_params: Optional sampling configuration passed to
            ``llm.generate``. Defaults to ``SamplingParams(seed=42, max_tokens=100)``.

    Returns:
        None. The overall output throughput is printed and one CSV row per
        request is written to ``output_file``.

    Raises:
        json.JSONDecodeError: A non-blank input line is not valid JSON.
        KeyError: An input record has no ``"text"`` key.
    """
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped
    # instead of being handed to json.loads.
    with open(input_file, "r", encoding="utf-8") as f:
        prompts = [json.loads(line)["text"] for line in f if line.strip()]

    # Create the destination directory up front so that a missing folder does
    # not discard the results of a long generation run.
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    if sampling_params is None:
        sampling_params = SamplingParams(seed=42, max_tokens=100)

    # perf_counter is monotonic, so the interval cannot be distorted by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start_time

    # Attributes copied verbatim from each request's metrics object, in CSV
    # column order. The engine config logged in this notebook has
    # collect_model_forward_time=False and collect_model_execute_time=False, so
    # the last two are presumably empty in these runs.
    metric_fields = (
        "arrival_time",
        "last_token_time",
        "first_scheduled_time",
        "first_token_time",
        "time_in_queue",
        "finished_time",
        "scheduler_time",
        "model_forward_time",
        "model_execute_time",
    )

    records = []
    generated_len = 0  # number of tokens generated across all requests
    for output in outputs:
        completion = output.outputs[0]
        generated_len += len(completion.token_ids)
        # The repr wrapped in literal double quotes keeps newlines escaped so
        # that each request stays on a single CSV line.
        record = {
            "input_prompt": f"\"{output.prompt!r}\"",
            "generated_text": f"\"{completion.text!r}\"",
        }
        metrics = output.metrics
        for field in metric_fields:
            record[field] = getattr(metrics, field)
        records.append(record)

    # Guard against a zero-length interval instead of raising ZeroDivisionError.
    output_throughput = generated_len / elapsed if elapsed > 0 else float("nan")
    print(f"Overall output throughput: {output_throughput} tokens/second")

    # Explicit columns keep the header stable even when there are no requests.
    df = pd.DataFrame(records, columns=["input_prompt", "generated_text", *metric_fields])
    df["throughput"] = output_throughput  # same run-level value on every row
    df.to_csv(output_file, index=False)

In [3]:
# Four short prompts used for a qualitative side-by-side of the generated text.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Sampling is stochastic (temperature=0.8, top_p=0.95), so a fixed seed is
# needed for the completions to be repeatable across runs and comparable
# between model variants. 42 matches the seed used in run_batch_inference.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=500, seed=42)


In [4]:
# Engine used by the generation and benchmark cells below.
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    max_model_len=1024,  # maximum sequence length (reported as max_seq_len=1024 in the engine log)
    enforce_eager=True,
    max_num_seqs=4,  # presumably the "batchsize" referred to in the benchmark cells — confirm
)

INFO 03-24 14:48:01 config.py:520] This model supports multiple tasks: {'score', 'classify', 'generate', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-24 14:48:01 llm_engine.py:232] Initializing an LLM engine (v0.7.0) with config: model='meta-llama/Llama-2-7b-chat-hf', speculative_config=None, tokenizer='meta-llama/Llama-2-7b-chat-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-2-7b-chat-hf,

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-24 14:48:10 model_runner.py:1115] Loading model weights took 12.5523 GB
INFO 03-24 14:48:11 worker.py:266] Memory profiling takes 0.93 seconds
INFO 03-24 14:48:11 worker.py:266] the current vLLM instance can use total_gpu_memory (15.77GiB) x gpu_memory_utilization (0.90) = 14.19GiB
INFO 03-24 14:48:11 worker.py:266] model weights take 12.55GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 0.67GiB; the rest of the memory reserved for KV Cache is 0.90GiB.
INFO 03-24 14:48:11 executor_base.py:108] # CUDA blocks: 115, # CPU blocks: 512
INFO 03-24 14:48:11 executor_base.py:113] Maximum concurrency for 1024 tokens per request: 1.80x
[CacheEngine._allocate_kv_cache] kv_cache_shape: (2, 115, 65536)
[CacheEngine._allocate_kv_cache] kv_cache_shape: (2, 512, 65536)
INFO 03-24 14:48:14 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 4.03 seconds


In [3]:
# Reference engine: the output below comes from running the original llama
# code (llama_original.py).
# NOTE(review): the logs show 12.55GiB of weights against 14.19GiB of usable GPU
# memory, so this engine presumably cannot coexist with `llm` in one kernel
# session — confirm how the two are meant to be run.
original_llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    max_model_len=1024,
    enforce_eager=True,
    max_num_seqs=4,
)

INFO 03-24 14:24:46 config.py:520] This model supports multiple tasks: {'reward', 'score', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 03-24 14:24:46 llm_engine.py:232] Initializing an LLM engine (v0.7.0) with config: model='meta-llama/Llama-2-7b-chat-hf', speculative_config=None, tokenizer='meta-llama/Llama-2-7b-chat-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-2-7b-chat-hf,

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-24 14:24:55 model_runner.py:1115] Loading model weights took 12.5523 GB
INFO 03-24 14:24:56 worker.py:266] Memory profiling takes 0.87 seconds
INFO 03-24 14:24:56 worker.py:266] the current vLLM instance can use total_gpu_memory (15.77GiB) x gpu_memory_utilization (0.90) = 14.19GiB
INFO 03-24 14:24:56 worker.py:266] model weights take 12.55GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 0.18GiB; the rest of the memory reserved for KV Cache is 1.39GiB.
INFO 03-24 14:24:56 executor_base.py:108] # CUDA blocks: 177, # CPU blocks: 512
INFO 03-24 14:24:56 executor_base.py:113] Maximum concurrency for 1024 tokens per request: 2.77x
[CacheEngine._allocate_kv_cache] kv_cache_shape: (2, 177, 65536)
[CacheEngine._allocate_kv_cache] kv_cache_shape: (2, 512, 65536)
INFO 03-24 14:24:59 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 3.91 seconds


In [None]:
# Output of the original llama, kept for reference.
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the first completion returned for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

Processed prompts: 100%|██████████| 4/4 [00:00<00:00,  8.39it/s, est. speed input: 56.62 toks/s, output: 134.20 toks/s]

Prompt: 'Hello, my name is', Generated text: " Sherry and I'm a 35-year-old woman from"
Prompt: 'The president of the United States is', Generated text: ' a member of Congress, which means that he or she is subject to the same'
Prompt: 'The capital of France is', Generated text: ' Paris. This is a fact that is well known and widely accepted. However,'
Prompt: 'The future of AI is', Generated text: ' likely to be shaped by a combination of technological advancements, soci'





In [None]:
# Output of llama-ee (the current llama.py is the llama-ee variant).
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the first completion returned for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

input_ids shape before converting: torch.Size([27])
input_ids shape after converting: torch.Size([1, 27])
LlamaModel forward. input_ids shape: torch.Size([1, 27])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 27])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [0, 2]
skip_mask: False, conf: 0.0088043212890625
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [0, 3]
skip_mask: False, conf: 0.1258544921875
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
a

Processed prompts: 100%|██████████| 4/4 [00:00<00:00,  4.47it/s, est. speed input: 30.20 toks/s, output: 71.59 toks/s]

--------EE statistics---------
self.exited_rates: [2, 11]
skip_mask: False, conf: 0.210693359375
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [2, 12]
skip_mask: False, conf: 0.394287109375
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [2, 13]
skip_mask: False, conf: 0.008636474609375
--------EE statistics---------
input_ids shape before converting: to




In [4]:
# Smoke test of the reference engine on the small lmsys_1 input.
run_batch_inference(
    original_llm,
    input_file="lmsys_1.jsonl",
    output_file="results/lmsys_1_original.csv",
)

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[LlamaForCausalLM forward] Input_ids shape: torch.Size([65])
[LlamaModel forward] Input_ids shape: torch.Size([65])
[LlamaForCausalLM forward] Model output shape: torch.Size([65, 4096])
[compute_logits] Hidden states shape: torch.Size([65, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forwar

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it, est. speed input: 25.81 toks/s, output: 79.40 toks/s]

[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] M




In [None]:
# Same small lmsys_1 input, run through `llm`.
run_batch_inference(
    llm,
    input_file="lmsys_1.jsonl",
    output_file="results/lmsys_1.csv",
)

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

input_ids shape before converting: torch.Size([65])
input_ids shape after converting: torch.Size([65])
LlamaModel forward. input_ids shape: torch.Size([65])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([65])
[compute_logits] hidden_states shape: torch.Size([65, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_embeddings_padded=32000, tp_size=1)
[get_skip_mask] mask dim: 1
--------[LlamaModel: forward] EE statistics at layer20---------
self.exited_rates([num_ee, num_no_ee]): [30, 172]
skip_mask: False, conf: 0.005096435546875
--------[LlamaModel: forward] EE statistics---------
[LlamaModel: forward] Returning hidden_states shape: torch.Size([65, 4096])
[LlamaForCausalLM forward] Model output shape: torch.Size([65, 4096])
[compute_logits] hidden_states shape: torch.Size([65, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_em

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it, est. speed input: 26.82 toks/s, output: 82.52 toks/s]

[LlamaModel: forward] Returning hidden_states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] hidden_states shape: torch.Size([2, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_embeddings_padded=32000, tp_size=1)
input_ids shape before converting: torch.Size([2])
input_ids shape after converting: torch.Size([2])
LlamaModel forward. input_ids shape: torch.Size([2])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([2])
[compute_logits] hidden_states shape: torch.Size([2, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_embeddings_padded=32000, tp_size=1)
[get_skip_mask] mask dim: 1
--------[LlamaModel: forward] EE statistics at layer20---------
self.exited_rates([num_ee, num_no_ee]): [44, 252]
skip_mask: False, conf: 0.0056304931640625
--------[LlamaModel: forwa




In [5]:
# 100-prompt benchmark; results are stored under the "eager" label.
run_batch_inference(
    llm,
    input_file="inputs/lmsys_100.jsonl",
    output_file="results/lmsys_100_eager.csv",
)

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  29%|██▉       | 29/100 [00:10<00:31,  2.29it/s, est. speed input: 254.07 toks/s, output: 264.21 toks/s]



Processed prompts:  57%|█████▋    | 57/100 [00:22<00:36,  1.17it/s, est. speed input: 278.02 toks/s, output: 238.29 toks/s]



Processed prompts: 100%|██████████| 100/100 [00:36<00:00,  2.70it/s, est. speed input: 308.53 toks/s, output: 255.23 toks/s]


Overall output throughput: 254.72538793728546 tokens/second


In [5]:
# 100-prompt benchmark; results are stored under the "lazy" label.
run_batch_inference(
    llm,
    input_file="inputs/lmsys_100.jsonl",
    output_file="results/lmsys_100_lazy.csv",
)

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  41%|████      | 41/100 [00:14<00:19,  2.99it/s, est. speed input: 252.93 toks/s, output: 239.00 toks/s]



Processed prompts:  58%|█████▊    | 58/100 [00:25<00:32,  1.29it/s, est. speed input: 255.39 toks/s, output: 197.28 toks/s]



Processed prompts: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s, est. speed input: 287.37 toks/s, output: 220.44 toks/s]


Overall output throughput: 220.03491680150898 tokens/second


In [5]:
# batchsize = 4
# NOTE(review): another cell in this notebook also writes
# results/lmsys_100_off.csv, so whichever runs last overwrites the other —
# confirm which run the file is supposed to hold.
run_batch_inference(
    llm,
    input_file="inputs/lmsys_100.jsonl",
    output_file="results/lmsys_100_off.csv",
)

Processed prompts:  56%|█████▌    | 56/100 [00:40<00:28,  1.55it/s, est. speed input: 137.51 toks/s, output: 123.71 toks/s]



Processed prompts: 100%|██████████| 100/100 [01:10<00:00,  1.42it/s, est. speed input: 162.51 toks/s, output: 125.95 toks/s]

Overall output throughput: 125.82064581870641 tokens/second





In [None]:
# batchsize = 4 + totally turn off
# NOTE(review): another cell in this notebook also writes
# results/lmsys_100_off.csv, so whichever runs last overwrites the other —
# confirm which run the file is supposed to hold.
run_batch_inference(
    llm,
    input_file="inputs/lmsys_100.jsonl",
    output_file="results/lmsys_100_off.csv",
)

Processed prompts: 100%|██████████| 100/100 [01:00<00:00,  1.67it/s, est. speed input: 189.99 toks/s, output: 146.85 toks/s]

Overall output throughput: 146.67840422611025 tokens/second





In [None]:
# batchsize = 8
# NOTE(review): the "batchsize = 4" baseline cell writes the same
# results/lmsys_100_original.csv, so whichever runs last overwrites the other —
# confirm which run the file is supposed to hold.
run_batch_inference(
    original_llm,
    input_file="inputs/lmsys_100.jsonl",
    output_file="results/lmsys_100_original.csv",
)

Processed prompts: 100%|██████████| 100/100 [00:33<00:00,  3.00it/s, est. speed input: 342.28 toks/s, output: 264.57 toks/s]

Overall output throughput: 264.0085491509796 tokens/second





In [4]:
# batchsize = 4
# NOTE(review): the "batchsize = 8" baseline cell writes the same
# results/lmsys_100_original.csv, so whichever runs last overwrites the other —
# confirm which run the file is supposed to hold.
run_batch_inference(
    original_llm,
    input_file="inputs/lmsys_100.jsonl",
    output_file="results/lmsys_100_original.csv",
)

Processed prompts: 100%|██████████| 100/100 [00:58<00:00,  1.71it/s, est. speed input: 195.53 toks/s, output: 151.14 toks/s]

Overall output throughput: 150.94671541175452 tokens/second



