In [1]:
import os

# Select the GPUs *before* importing vllm (which pulls in torch), so the setting is
# guaranteed to be in place when CUDA is first initialised in this kernel.
# Two devices are exposed to match tensor_parallel_size=2 in the LLM(...) cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# NOTE: the former `!export CUDA_VISIBLE_DEVICES=0,1` line was dropped. A `!` command runs
# in its own short-lived subshell, so the export never reached this kernel's environment.

import json
import time

import pandas as pd
from vllm import LLM, SamplingParams
#from vllm.model_executor.models.llama_ee import LlamaForCausalLMEE
#ModelRegistry.register_model("LlamaForCausalLMEE", LlamaForCausalLMEE)
# from huggingface_hub import notebook_login
# notebook_login()

INFO 03-29 15:10:06 __init__.py:183] Automatically detected platform cuda.


In [2]:
def run_batch_inference(llm, input_file, output_file, sampling_params=None):
    """Run offline batch generation over a JSONL prompt file and save per-request metrics.

    Args:
        llm: Engine object exposing ``generate(prompts, sampling_params)``
            (a ``vllm.LLM`` instance in this notebook).
        input_file: Path to a JSON Lines file. Every non-blank line must be a JSON
            object with a ``"text"`` field holding the prompt.
        output_file: Destination CSV path. Missing parent directories are created.
        sampling_params: Optional sampling configuration forwarded to
            ``llm.generate``. When omitted, ``SamplingParams(seed=42, max_tokens=100)``
            is used, which is what this function always used before.

    Returns:
        None. The overall output throughput is printed and one CSV row per request
        is written to ``output_file``.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a JSON object has no ``"text"`` field.
    """
    # Stream the file rather than materialising it with readlines(), and skip blank
    # lines (e.g. a trailing newline) instead of crashing inside json.loads.
    with open(input_file, "r", encoding="utf-8") as f:
        prompts = [json.loads(line)["text"] for line in f if line.strip()]

    if sampling_params is None:
        sampling_params = SamplingParams(seed=42, max_tokens=100)

    # perf_counter() is monotonic and high resolution, so the measured interval is
    # not distorted by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start_time

    # Per-request timing fields copied from each output's metrics object,
    # in the order they appear as CSV columns.
    metric_fields = (
        "arrival_time",
        "last_token_time",
        "first_scheduled_time",
        "first_token_time",
        "time_in_queue",
        "finished_time",
        "scheduler_time",
        "model_forward_time",
        "model_execute_time",
    )

    records = []
    generated_len = 0  # total number of tokens generated across all requests
    for output in outputs:
        completion = output.outputs[0]  # only the first completion is recorded
        generated_len += len(completion.token_ids)
        record = {
            # Kept exactly as before: repr() of the text wrapped in literal double quotes.
            "input_prompt": f"\"{output.prompt!r}\"",
            "generated_text": f"\"{completion.text!r}\"",
        }
        # The metrics object may be absent (None) or lack a field; store None
        # rather than aborting after the expensive generation has finished.
        metrics = getattr(output, "metrics", None)
        for field in metric_fields:
            record[field] = getattr(metrics, field, None)
        records.append(record)

    # Guard against a zero-length interval so the summary never raises ZeroDivisionError.
    output_throughput = generated_len / elapsed if elapsed > 0 else float("nan")
    print(f"Overall output throughput: {output_throughput} tokens/second")

    df = pd.DataFrame(records, columns=["input_prompt", "generated_text", *metric_fields])
    # The same overall throughput value is repeated on every row.
    df["throughput"] = output_throughput

    # Create the output directory on demand so the results are not lost at the very end.
    out_dir = os.path.dirname(os.fspath(output_file))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_file, index=False)

In [3]:
# Single CNN-article summarisation prompt passed to llm.generate() in the generation cells.
prompts = [
    'Article: (CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard. Summarize the article in three sentences. Summary:'
]
# Sampling configuration for the ad-hoc generation cells: temperature 1.0, top_p 1.0,
# max_tokens=800. run_batch_inference() builds its own SamplingParams and ignores this one.
sampling_params = SamplingParams(temperature=1.0, top_p=1.0, max_tokens=800)


In [4]:
# Build the vLLM engine: meta-llama/Meta-Llama-3-8b in float16 with tensor_parallel_size=2
# (two GPUs are exposed through CUDA_VISIBLE_DEVICES="0,1" in the first cell).
# max_model_len=1024, enforce_eager=True, max_num_seqs=1 and seed=42 are passed straight to vLLM.
# NOTE(review): later benchmark cells are labelled "batchsize = 4" / "batchsize = 8", which does
# not match max_num_seqs=1 here. Presumably this cell was edited and re-run between
# benchmarks; confirm which configuration produced each CSV.
llm = LLM(model="meta-llama/Meta-Llama-3-8b", max_model_len=1024, enforce_eager=True, max_num_seqs=1, dtype="float16", seed=42, tensor_parallel_size=2)

INFO 03-29 15:10:28 config.py:520] This model supports multiple tasks: {'classify', 'score', 'generate', 'embed', 'reward'}. Defaulting to 'generate'.
INFO 03-29 15:10:28 config.py:1328] Defaulting to use mp for distributed inference
INFO 03-29 15:10:28 llm_engine.py:232] Initializing an LLM engine (v0.7.0) with config: model='meta-llama/Meta-Llama-3-8b', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=2818292)[0;0m INFO 03-29 15:10:45 model_runner.py:1115] Loading model weights took 7.4830 GB
INFO 03-29 15:10:45 model_runner.py:1115] Loading model weights took 7.4830 GB
[1;36m(VllmWorkerProcess pid=2818292)[0;0m returning hidden states: tensor([[ 2.4062,  1.0938, -0.0267,  ..., -1.1611,  0.4702, -1.7100],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 2.4062,  1.0938, -0.0267,  ..., -1.1611,  0.4702, -1.7100],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 2.4062,  1.0938, -0.0267,  ..., -1.1611,  0.4702, -1.7100],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         ...,
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 2.4062,  1.0938, -0.0267,  ..., -1.1611,  0.4702, -1.7100],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 2.4062,  1.0938, -0.0267,  ..., -1.1611,  0.4702, -1.7100],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 2.4062,  1.0938, -0.0267,  ..., -1.1611,  0.4702, -1.7100]],
[1;36m(VllmWorkerProce

[1;36m(VllmWorkerProcess pid=2818292)[0;0m returning hidden states: tensor([[ 4.0039, -0.5000, -1.9941,  ..., -3.7480,  0.8428,  2.7031],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 1.8525, -5.2148, -2.9082,  ...,  4.5000, -0.4395, -2.2422],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [-1.0469,  0.4756, -0.0826,  ..., -0.8335, -0.1664,  1.1475],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         ...,
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 0.1879, -2.7012,  3.4570,  ..., -0.4714,  1.0674,  1.1543],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [ 1.8691, -2.3320,  2.8750,  ..., -2.0410,  1.2432, -0.3259],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m         [-0.0872, -2.8379, -0.4094,  ...,  0.4395,  0.9897,  0.3643]],
[1;36m(VllmWorkerProcess pid=2818292)[0;0m        device='cuda:1', dtype=torch.float16)
[1;36m(VllmWorkerProcess pid=2818292)[0;0m returning hidden states: tensor([[ 0.5596, -0.1158,  1.0508,  ...,  1.1582,  1.7148,  2.283

In [5]:
# Output of the original Llama model, kept for reference.
# NOTE(review): this cell and the llama-ee cell both call the same `llm` object. Presumably
# the model implementation was swapped and the engine rebuilt in between; confirm.
outputs = llm.generate(prompts, sampling_params)

# Print each prompt next to the text of the first completion returned for it.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

[InputPreprocessor] tokenized_ids: [128000, 17064, 25, 320, 56555, 8, 12388, 11, 323, 701, 8352, 690, 387, 56016, 13, 3011, 1253, 5222, 1093, 459, 1560, 83775, 1008, 425, 11, 719, 994, 1901, 22732, 426, 583, 784, 569, 659, 16117, 6773, 311, 3041, 832, 315, 1077, 81960, 311, 264, 35058, 11, 1077, 65352, 35526, 709, 449, 2466, 828, 13, 1102, 19543, 304, 4848, 6978, 12588, 1380, 64805, 13, 3011, 14792, 323, 289, 13111, 1077, 13, 330, 40, 3463, 358, 574, 2133, 311, 1520, 420, 832, 1732, 889, 358, 1541, 956, 1440, 11, 719, 279, 2144, 430, 779, 1690, 1274, 649, 617, 264, 2324, 9070, 11, 430, 596, 5128, 2466, 1359, 426, 583, 784, 569, 3309, 20352, 22325, 735, 15881, 13, 3005, 1253, 2733, 33687, 304, 1077, 65352, 555, 264, 5190, 2410, 13, 330, 12947, 369, 682, 279, 1862, 323, 34296, 1359, 264, 4068, 389, 264, 5690, 2199, 304, 1077, 836, 1373, 13, 330, 40, 1440, 420, 4553, 11879, 374, 1790, 11493, 1109, 682, 315, 603, 13, 358, 1101, 1440, 358, 2846, 1120, 279, 50596, 1210, 20352, 4250, 10356, 2

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

returning hidden states: tensor([[ 4.0039, -0.5000, -1.9941,  ..., -3.7480,  0.8428,  2.7031],
        [ 1.8525, -5.2148, -2.9082,  ...,  4.5000, -0.4395, -2.2422],
        [-1.0469,  0.4756, -0.0826,  ..., -0.8335, -0.1664,  1.1475],
        ...,
        [ 0.1879, -2.7012,  3.4570,  ..., -0.4714,  1.0674,  1.1543],
        [ 1.8691, -2.3320,  2.8750,  ..., -2.0410,  1.2432, -0.3259],
        [-0.0872, -2.8379, -0.4094,  ...,  0.4395,  0.9897,  0.3643]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[ 0.5596, -0.1158,  1.0508,  ...,  1.1582,  1.7148,  2.2832]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[-1.0146e+00, -1.1396e+00,  9.3031e-04,  ..., -1.0635e-02,
          9.5264e-01,  1.9258e+00]], device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[-0.0765, -3.2930, -1.1621,  ..., -0.8877, -0.1743,  0.8477]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[-0.5840, -2.2

Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.09s/it, est. speed input: 28.22 toks/s, output: 23.00 toks/s]

returning hidden states: tensor([[-0.7134, -0.9683, -2.1191,  ..., -1.4893, -1.6816,  0.5962]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[ 0.3992, -4.6602, -2.2734,  ...,  0.3870, -2.1055, -0.4324]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[-0.1849, -5.6797,  0.9180,  ..., -0.6875, -0.6646, -0.4419]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[ 0.4187, -5.8828, -0.0119,  ...,  0.7148, -0.6401, -0.4219]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[-1.4082, -4.6484, -0.9014,  ..., -0.6484,  0.3503,  1.1572]],
       device='cuda:0', dtype=torch.float16)
returning hidden states: tensor([[-0.5215, -2.8887,  3.9668,  ..., -1.2002,  1.7676,  0.9878]],
       device='cuda:0', dtype=torch.float16)
Prompt: 'Article: (CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give




In [None]:
# Output of llama-ee (the current llama.py is the llama-ee variant).
# Overwrites the global `outputs` produced by any earlier generate() cell.
outputs = llm.generate(prompts, sampling_params)

# Print each prompt, the text of its first completion, and that completion's token ids.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    generated_token_ids = output.outputs[0].token_ids
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print(f"Generated token ids: {generated_token_ids!r}")

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

input_ids shape before converting: torch.Size([27])
input_ids shape after converting: torch.Size([1, 27])
LlamaModel forward. input_ids shape: torch.Size([1, 27])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 27])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [0, 2]
skip_mask: False, conf: 0.0088043212890625
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [0, 3]
skip_mask: False, conf: 0.1258544921875
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
a

Processed prompts: 100%|██████████| 4/4 [00:00<00:00,  4.47it/s, est. speed input: 30.20 toks/s, output: 71.59 toks/s]

--------EE statistics---------
self.exited_rates: [2, 11]
skip_mask: False, conf: 0.210693359375
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [2, 12]
skip_mask: False, conf: 0.394287109375
--------EE statistics---------
input_ids shape before converting: torch.Size([4])
input_ids shape after converting: torch.Size([1, 4])
LlamaModel forward. input_ids shape: torch.Size([1, 4])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([1, 4])
Force kv_caches to None
--------EE statistics---------
self.exited_rates: [2, 13]
skip_mask: False, conf: 0.008636474609375
--------EE statistics---------
input_ids shape before converting: to




In [6]:
# Re-print the results already held in the global `outputs`; no new generation happens here.
# NOTE(review): depends on `outputs` left behind by whichever generate() cell ran last.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    generated_token_ids = output.outputs[0].token_ids
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print(f"Generated token ids: {generated_token_ids!r}")

Prompt: 'Article: (CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard. Summarize the article in three sentences. Summary:', Generated text: ' Summarize the main points of the article. Who: who, in what city? What: where happens, events or condition

In [4]:
# Run lmsys_1.jsonl through `original_llm` (presumably the unmodified-model baseline).
# NOTE(review): `original_llm` is not defined in any visible cell, and the input path lacks the
# "inputs/" prefix used by later cells. This fails on a fresh kernel unless both exist; confirm.
run_batch_inference(original_llm, "lmsys_1.jsonl", "results/lmsys_1_original.csv")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[LlamaForCausalLM forward] Input_ids shape: torch.Size([65])
[LlamaModel forward] Input_ids shape: torch.Size([65])
[LlamaForCausalLM forward] Model output shape: torch.Size([65, 4096])
[compute_logits] Hidden states shape: torch.Size([65, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forwar

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it, est. speed input: 25.81 toks/s, output: 79.40 toks/s]

[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] Hidden states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Input_ids shape: torch.Size([2])
[LlamaModel forward] Input_ids shape: torch.Size([2])
[LlamaForCausalLM forward] M




In [None]:
# Run lmsys_1.jsonl through `llm` and save the per-request metrics to results/lmsys_1.csv.
# NOTE(review): the input path lacks the "inputs/" prefix used by later cells; confirm location.
run_batch_inference(llm, "lmsys_1.jsonl", "results/lmsys_1.csv")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

input_ids shape before converting: torch.Size([65])
input_ids shape after converting: torch.Size([65])
LlamaModel forward. input_ids shape: torch.Size([65])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([65])
[compute_logits] hidden_states shape: torch.Size([65, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_embeddings_padded=32000, tp_size=1)
[get_skip_mask] mask dim: 1
--------[LlamaModel: forward] EE statistics at layer20---------
self.exited_rates([num_ee, num_no_ee]): [30, 172]
skip_mask: False, conf: 0.005096435546875
--------[LlamaModel: forward] EE statistics---------
[LlamaModel: forward] Returning hidden_states shape: torch.Size([65, 4096])
[LlamaForCausalLM forward] Model output shape: torch.Size([65, 4096])
[compute_logits] hidden_states shape: torch.Size([65, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_em

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it, est. speed input: 26.82 toks/s, output: 82.52 toks/s]

[LlamaModel: forward] Returning hidden_states shape: torch.Size([2, 4096])
[LlamaForCausalLM forward] Model output shape: torch.Size([2, 4096])
[compute_logits] hidden_states shape: torch.Size([2, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_embeddings_padded=32000, tp_size=1)
input_ids shape before converting: torch.Size([2])
input_ids shape after converting: torch.Size([2])
LlamaModel forward. input_ids shape: torch.Size([2])
attention_mask shape: None
-------------------------
Is first rank. input ids shape: torch.Size([2])
[compute_logits] hidden_states shape: torch.Size([2, 4096]). lm_head shape: ParallelLMHead(num_embeddings=32000, embedding_dim=4096, org_vocab_size=32000, num_embeddings_padded=32000, tp_size=1)
[get_skip_mask] mask dim: 1
--------[LlamaModel: forward] EE statistics at layer20---------
self.exited_rates([num_ee, num_no_ee]): [44, 252]
skip_mask: False, conf: 0.0056304931640625
--------[LlamaModel: forwa




In [5]:
# Benchmark `llm` on inputs/lmsys_100.jsonl; metrics go to results/lmsys_100_eager.csv.
# NOTE(review): a later cell targets the same CSV path, so whichever finishes last wins.
run_batch_inference(llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_eager.csv")

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  29%|██▉       | 29/100 [00:10<00:31,  2.29it/s, est. speed input: 254.07 toks/s, output: 264.21 toks/s]



Processed prompts:  57%|█████▋    | 57/100 [00:22<00:36,  1.17it/s, est. speed input: 278.02 toks/s, output: 238.29 toks/s]



Processed prompts: 100%|██████████| 100/100 [00:36<00:00,  2.70it/s, est. speed input: 308.53 toks/s, output: 255.23 toks/s]


Overall output throughput: 254.72538793728546 tokens/second


In [5]:
# batchsize = 4
# NOTE(review): the visible LLM(...) cell uses max_num_seqs=1. Presumably the engine was
# rebuilt with a batch size of 4 before this run; confirm.
# Run inputs/lmsys_1.jsonl through `llm`; metrics go to results/lmsys_1_eager.csv.
run_batch_inference(llm, "inputs/lmsys_1.jsonl", "results/lmsys_1_eager.csv")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

hidden_states shape: torch.Size([65, 4096])
logits shape: torch.Size([65, 32000])
skip_mask: False, conf: 0.0
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.069580078125
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.0013275146484375
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.04150390625
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: True, conf: 0.4462890625
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: True, conf: 0.459228515625
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.1171875
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.00716400146484375
hidden_states shape: torch.Size([2, 4096])
logits shape: t

Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it, est. speed input: 22.62 toks/s, output: 69.60 toks/s]

logits shape: torch.Size([2, 32000])
skip_mask: True, conf: 0.51416015625
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.02276611328125
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.024688720703125
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.031585693359375
hidden_states shape: torch.Size([2, 4096])
logits shape: torch.Size([2, 32000])
skip_mask: False, conf: 0.0072174072265625
Overall output throughput: 69.34218724025946 tokens/second





In [5]:
# batchsize = 4
# Targets results/lmsys_100_eager.csv, the same path as an earlier lmsys_100 cell.
# The recorded execution ended in KeyboardInterrupt during generation, i.e. before the
# CSV write at the end of run_batch_inference().
run_batch_inference(llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_eager.csv")

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:   1%|          | 1/100 [00:02<04:18,  2.61s/it, est. speed input: 4.99 toks/s, output: 38.37 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:   5%|▌         | 5/100 [00:05<01:26,  1.10it/s, est. speed input: 44.06 toks/s, output: 98.78 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:   9%|▉         | 9/100 [00:06<00:57,  1.58it/s, est. speed input: 99.42 toks/s, output: 129.51 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:  10%|█         | 10/100 [00:07<01:00,  1.48it/s, est. speed input: 89.15 toks/s, output: 127.27 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:  13%|█▎        | 13/100 [00:09<01:01,  1.42it/s, est. speed input: 90.48 toks/s, output: 128.89 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:  14%|█▍        | 14/100 [00:11<01:08,  1.26it/s, est. speed input: 81.38 toks/s, output: 123.43 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

Processed prompts:  17%|█▋        | 17/100 [00:12<00:57,  1.45it/s, est. speed input: 76.73 toks/s, output: 131.17 toks/s]

[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode] Using PagedAttention V1.
[PagedAttention.forward_decode]

KeyboardInterrupt: 

In [5]:
# Benchmark `llm` on inputs/lmsys_100.jsonl; metrics go to results/lmsys_100_lazy.csv.
# NOTE(review): "lazy" presumably names a model/engine variant configured outside the
# visible cells; confirm which configuration produced this file.
run_batch_inference(llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_lazy.csv")

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  41%|████      | 41/100 [00:14<00:19,  2.99it/s, est. speed input: 252.93 toks/s, output: 239.00 toks/s]



Processed prompts:  58%|█████▊    | 58/100 [00:25<00:32,  1.29it/s, est. speed input: 255.39 toks/s, output: 197.28 toks/s]



Processed prompts: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s, est. speed input: 287.37 toks/s, output: 220.44 toks/s]


Overall output throughput: 220.03491680150898 tokens/second


In [5]:
# batchsize = 4
# Benchmark `llm` on inputs/lmsys_100.jsonl; metrics go to results/lmsys_100_off.csv.
# NOTE(review): the "totally turn off" cell writes to this same path and will overwrite it.
run_batch_inference(llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_off.csv")

Processed prompts:  56%|█████▌    | 56/100 [00:40<00:28,  1.55it/s, est. speed input: 137.51 toks/s, output: 123.71 toks/s]



Processed prompts: 100%|██████████| 100/100 [01:10<00:00,  1.42it/s, est. speed input: 162.51 toks/s, output: 125.95 toks/s]

Overall output throughput: 125.82064581870641 tokens/second





In [None]:
# batchsize = 4 + totally turn off
# NOTE(review): writes results/lmsys_100_off.csv, the same path as the plain "batchsize = 4"
# off run, so that run's CSV is overwritten. Presumably a distinct file name was intended; confirm.
run_batch_inference(llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_off.csv")

Processed prompts: 100%|██████████| 100/100 [01:00<00:00,  1.67it/s, est. speed input: 189.99 toks/s, output: 146.85 toks/s]

Overall output throughput: 146.67840422611025 tokens/second





In [None]:
# batchsize = 8
# Benchmark `original_llm` on inputs/lmsys_100.jsonl; metrics go to results/lmsys_100_original.csv.
# NOTE(review): `original_llm` is not defined in any visible cell, and the batchsize = 4
# original run writes to the same CSV path; confirm both.
run_batch_inference(original_llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_original.csv")

Processed prompts: 100%|██████████| 100/100 [00:33<00:00,  3.00it/s, est. speed input: 342.28 toks/s, output: 264.57 toks/s]

Overall output throughput: 264.0085491509796 tokens/second





In [4]:
# batchsize = 4
# Benchmark `original_llm` on inputs/lmsys_100.jsonl; metrics go to results/lmsys_100_original.csv.
# NOTE(review): `original_llm` is not defined in any visible cell, and the batchsize = 8
# original run writes to the same CSV path; confirm both.
run_batch_inference(original_llm, "inputs/lmsys_100.jsonl", "results/lmsys_100_original.csv")

Processed prompts: 100%|██████████| 100/100 [00:58<00:00,  1.71it/s, est. speed input: 195.53 toks/s, output: 151.14 toks/s]

Overall output throughput: 150.94671541175452 tokens/second



