# 1. Install prerequisites

In [None]:
from importlib.metadata import version

In [1]:
version('torch')

'2.4.0'

In [5]:
version('triton')

'3.0.0'

In [None]:
# Pinned to the version this benchmark was recorded with (see the version() output below).
%pip install transformers==4.44.2

In [2]:
version('transformers')

'4.44.2'

## vLLM

https://docs.vllm.ai/en/latest/

In [None]:
# Pinned to the version this benchmark was recorded with (see the version() output below).
%pip install vllm==0.6.1.post2

In [3]:
version('vllm')

'0.6.1.post2'

## SGLang

https://sglang.readthedocs.io/en/latest/

In [None]:
# Pinned to the version this benchmark was recorded with (see the version() output below).
%pip install "sglang[all]==0.3.1.post2"

In [4]:
version('sglang')

'0.3.1.post2'

In [None]:
# Pinned to the release this benchmark was recorded with; the index URL selects the CUDA 12.4 / torch 2.4 wheels.
%pip install flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu124/torch2.4/

In [6]:
version('flashinfer')

'0.1.6+cu124torch2.4'

## ollama

https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install

In [11]:
# Download and unpack the ollama Linux binaries into ./ollama.
# `mkdir -p` keeps the cell re-runnable; `curl -f` aborts on HTTP errors instead of saving an error page as the tarball.
!mkdir -p ollama && curl -fL https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz && tar -C ./ollama -xzf ollama-linux-amd64.tgz && rm ollama-linux-amd64.tgz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   117  100   117    0     0    566      0 --:--:-- --:--:-- --:--:--   567
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1583M  100 1583M    0     0  17.1M      0  0:01:32  0:01:32 --:--:-- 16.9M


```bash
./ollama/bin/ollama serve &
```

https://github.com/ollama/ollama-python

https://github.com/ollama/ollama/tree/main/docs

In [None]:
# Pinned to the version this benchmark was recorded with (see the version() output below).
%pip install ollama==0.3.3

In [9]:
version('ollama')

'0.3.3'

# 2. Performance testing functions

## Test models

https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct

https://huggingface.co/collections/neuralmagic/llama-31-quantization-66a3f907f48d07feabb8f300

In [1]:
# Hugging Face checkpoints under test, keyed by a short alias.
# The percentage noted above each entry is its OpenLLM leaderboard score
# (presumably relative to the unquantized baseline - confirm on the model cards).
test_models = {
    # 100.0 %
    "llama-3.1": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # 99.8 %
    "llama-3.1:w8a16": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",
    # 99.5 % - warning the "FP8-dynamic" version is MUCH slower on RTX 4090 !
    "llama-3.1:fp8": "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",
    # 99.4 %
    "llama-3.1:w8a8": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
    # 97.1 %
    "llama-3.1:w4a16": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
}

Note: to create custom quantized versions of a model, see https://docs.vllm.ai/en/latest/quantization/fp8.html#quantization-process

https://ollama.com/library/llama3.1

```bash
./ollama/bin/ollama pull llama3.1:8b-instruct-fp16
./ollama/bin/ollama pull llama3.1:8b-instruct-q8_0
./ollama/bin/ollama pull llama3.1:8b-instruct-q4_0
```

In [2]:
# ollama model tags under test, keyed by a short alias.
# These are the tags pulled with `ollama pull` in the shell snippet above.
ollama_test_models = {
    "llama-3.1": "llama3.1:8b-instruct-fp16",
    "llama-3.1:int8": "llama3.1:8b-instruct-q8_0",
    "llama-3.1:int4": "llama3.1:8b-instruct-q4_0",
}

## Test prompts

In [3]:
# Benchmark conversations: one single-turn chat per French bank, all sharing
# the same system prompt so that only the user question differs.
_SYSTEM_PROMPT = "Tu es un assistant utile et professionnel qui répond toujours en français."
_USER_QUESTIONS = (
    "Quels sont les avantages du Crédit Mutuel ?",
    "Quels sont les avantages du Crédit Agricole ?",
    "Quels sont les avantages de la Société Générale ?",
    "Quels sont les avantages de la BNP ?",
)

test_messages = [
    [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    for question in _USER_QUESTIONS
]

In [4]:
from transformers import AutoTokenizer

# Tokenizers already loaded, keyed by checkpoint name, so that repeated
# benchmark runs do not re-load the tokenizer on every call.
_TOKENIZER_CACHE = {}


def format_prompt(messages, model):
    """Render chat messages into prompt text using the checkpoint's chat template.

    Args:
        messages: a conversation (list of {"role", "content"} dicts) or a list
            of such conversations, passed as-is to `apply_chat_template`.
        model: Hugging Face checkpoint name whose tokenizer supplies the chat
            template.

    Returns:
        Whatever `apply_chat_template(..., tokenize=False)` returns; for a list
        of conversations this is a list with one prompt string per
        conversation, each ending with the generation prompt.
    """
    tokenizer = _TOKENIZER_CACHE.get(model)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model)
        _TOKENIZER_CACHE[model] = tokenizer
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [5]:
format_prompt(test_messages, test_models["llama-3.1"])

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nTu es un assistant utile et professionnel qui répond toujours en français.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuels sont les avantages du Crédit Mutuel ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nTu es un assistant utile et professionnel qui répond toujours en français.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuels sont les avantages du Crédit Agricole ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nTu es un assistant utile et professionnel qui répond toujours en français.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuels sont 

### vLLM

In [6]:
# Authenticate vLLM with the Hugging Face Hub by exporting the access token
# as the HF_TOKEN environment variable.
import os

# Location of the file holding the token. It defaults to the original path and
# can be overridden with HF_TOKEN_FILE so the notebook runs on other machines
# without editing this cell.
hf_token_path = os.environ.get("HF_TOKEN_FILE", "/workspace/hftoken")

with open(hf_token_path, "r", encoding="utf-8") as token_file:
    myhftoken = token_file.read().strip()

# Fail here rather than later with an opaque authentication error.
if not myhftoken:
    raise ValueError(f"Hugging Face token file {hf_token_path!r} is empty")

os.environ["HF_TOKEN"] = myhftoken

In [7]:
import time
from vllm import LLM, SamplingParams

def vllm_load(model, gpu_memory_utilization=0.99, max_model_len=8192, **engine_kwargs):
    """Create a vLLM engine for `model` and tag it with the checkpoint name.

    Args:
        model: Hugging Face checkpoint name to load.
        gpu_memory_utilization: forwarded to `vllm.LLM`; defaults to the value
            used for the recorded benchmarks.
        max_model_len: forwarded to `vllm.LLM`; defaults to the value used for
            the recorded benchmarks.
        **engine_kwargs: any additional keyword arguments for `vllm.LLM`.

    Returns:
        The `LLM` instance, with an extra `_model` attribute holding `model`.
    """
    llm = LLM(
        model,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        **engine_kwargs,
    )
    # vllm_generate() reads this attribute to format prompts with the
    # tokenizer that matches the loaded checkpoint.
    llm._model = model
    return llm

def vllm_generate(messages, llm):
    """Measure vLLM generation throughput for batch sizes 1..len(messages).

    Args:
        messages: list of conversations, each a list of {"role", "content"}
            dicts.
        llm: engine returned by vllm_load(); its `_model` attribute names the
            checkpoint used to format the prompts.

    Prints the text generated by a warmup request, then one line per batch
    size with the aggregate and per-sequence tokens/sec. Returns None.
    """
    print("vLLM performance test:")

    prompts = format_prompt(messages, llm._model)
    sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

    # warmup
    outputs = llm.generate(prompts[0], sampling_params)
    print(f"Generated text: {outputs[0].outputs[0].text!r}")

    for batch_size in range(1, len(messages) + 1):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        outputs = llm.generate(prompts[:batch_size], sampling_params)
        elapsed = time.perf_counter() - start_time

        # Count only generated tokens, summed over every sequence of the batch.
        tokenscount = sum(len(output.outputs[0].token_ids) for output in outputs)

        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size {batch_size}: {tokens_per_sec:.2f} tokens/sec ({batch_size} x {tokens_per_sec/batch_size:.2f})")

In [8]:
llm = vllm_load(test_models["llama-3.1"])

INFO 09-22 14:24:00 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, en

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-22 14:24:17 model_runner.py:1008] Loading model weights took 14.9888 GB
INFO 09-22 14:24:19 gpu_executor.py:122] # GPU blocks: 3610, # CPU blocks: 2048
INFO 09-22 14:24:31 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 14:24:31 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 14:24:42 model_runner.py:1430] Graph capturing finished in 11 secs.


In [17]:
vllm_generate(test_messages*8, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.58s/it, est. speed input: 6.58 toks/s, output: 53.47 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt plus élevés sur les comptes d'épargne que de nombreux autres établissements bancaires.\n2. **Taux d'emprunt compétitifs** : Les prêts personnels, les prêts immobiliers et les prêts pour la mobilité sont proposés avec des taux d'intérêt attractifs.\n3. **Services personnalisés** : Le Crédit Mutuel offre des services personnalisés et adaptés aux besoins de ses membres et clients, grâce à une approche relationnelle et à une connaissance approfondie de leurs situations financières.\n4. **Sécurité et confidentialité** : Le Crédit Mutuel s'engage à protéger les données personnelles et financières de ses membres et clients, conformément aux règles de protection des données.\n5. **Participation aux décisions** : En tant que membre 

Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.30s/it, est. speed input: 6.78 toks/s, output: 55.07 toks/s]


- batch size 1: 55.06 tokens/sec (1 x 55.06)


Processed prompts: 100%|███████████| 2/2 [00:09<00:00,  4.79s/it, est. speed input: 13.24 toks/s, output: 106.79 toks/s]


- batch size 2: 106.76 tokens/sec (2 x 53.38)


Processed prompts: 100%|███████████| 3/3 [00:09<00:00,  3.21s/it, est. speed input: 19.84 toks/s, output: 159.53 toks/s]


- batch size 3: 159.49 tokens/sec (3 x 53.16)


Processed prompts: 100%|███████████| 4/4 [00:09<00:00,  2.41s/it, est. speed input: 26.13 toks/s, output: 209.16 toks/s]


- batch size 4: 209.11 tokens/sec (4 x 52.28)


Processed prompts: 100%|███████████| 5/5 [00:09<00:00,  1.95s/it, est. speed input: 32.38 toks/s, output: 255.03 toks/s]


- batch size 5: 254.98 tokens/sec (5 x 51.00)


Processed prompts: 100%|███████████| 6/6 [00:09<00:00,  1.63s/it, est. speed input: 38.78 toks/s, output: 314.30 toks/s]


- batch size 6: 314.21 tokens/sec (6 x 52.37)


Processed prompts: 100%|███████████| 7/7 [00:09<00:00,  1.40s/it, est. speed input: 45.17 toks/s, output: 365.40 toks/s]


- batch size 7: 365.28 tokens/sec (7 x 52.18)


Processed prompts: 100%|███████████| 8/8 [00:09<00:00,  1.23s/it, est. speed input: 51.42 toks/s, output: 417.88 toks/s]


- batch size 8: 417.78 tokens/sec (8 x 52.22)


Processed prompts: 100%|███████████| 9/9 [00:09<00:00,  1.10s/it, est. speed input: 57.23 toks/s, output: 453.61 toks/s]


- batch size 9: 453.50 tokens/sec (9 x 50.39)


Processed prompts: 100%|█████████| 10/10 [00:09<00:00,  1.00it/s, est. speed input: 63.26 toks/s, output: 513.27 toks/s]


- batch size 10: 513.14 tokens/sec (10 x 51.31)


Processed prompts: 100%|█████████| 11/11 [00:10<00:00,  1.09it/s, est. speed input: 69.12 toks/s, output: 558.33 toks/s]


- batch size 11: 558.14 tokens/sec (11 x 50.74)


Processed prompts: 100%|█████████| 12/12 [00:10<00:00,  1.19it/s, est. speed input: 74.86 toks/s, output: 607.20 toks/s]


- batch size 12: 607.03 tokens/sec (12 x 50.59)


Processed prompts: 100%|█████████| 13/13 [00:10<00:00,  1.28it/s, est. speed input: 80.73 toks/s, output: 656.08 toks/s]


- batch size 13: 655.89 tokens/sec (13 x 50.45)


Processed prompts: 100%|█████████| 14/14 [00:10<00:00,  1.36it/s, est. speed input: 86.00 toks/s, output: 690.02 toks/s]


- batch size 14: 689.78 tokens/sec (14 x 49.27)


Processed prompts: 100%|█████████| 15/15 [00:10<00:00,  1.47it/s, est. speed input: 92.83 toks/s, output: 748.79 toks/s]


- batch size 15: 748.58 tokens/sec (15 x 49.91)


Processed prompts: 100%|█████████| 16/16 [00:10<00:00,  1.56it/s, est. speed input: 98.23 toks/s, output: 791.13 toks/s]


- batch size 16: 790.84 tokens/sec (16 x 49.43)


Processed prompts: 100%|█████████| 17/17 [00:10<00:00,  1.56it/s, est. speed input: 98.22 toks/s, output: 794.11 toks/s]


- batch size 17: 793.86 tokens/sec (17 x 46.70)


Processed prompts: 100%|████████| 18/18 [00:11<00:00,  1.64it/s, est. speed input: 103.14 toks/s, output: 837.51 toks/s]


- batch size 18: 837.21 tokens/sec (18 x 46.51)


Processed prompts: 100%|████████| 19/19 [00:11<00:00,  1.73it/s, est. speed input: 108.98 toks/s, output: 872.24 toks/s]


- batch size 19: 871.95 tokens/sec (19 x 45.89)


Processed prompts: 100%|████████| 20/20 [00:11<00:00,  1.79it/s, est. speed input: 112.62 toks/s, output: 906.29 toks/s]


- batch size 20: 905.96 tokens/sec (20 x 45.30)


Processed prompts: 100%|████████| 21/21 [00:11<00:00,  1.88it/s, est. speed input: 118.43 toks/s, output: 959.20 toks/s]


- batch size 21: 958.86 tokens/sec (21 x 45.66)


Processed prompts: 100%|███████| 22/22 [00:11<00:00,  1.96it/s, est. speed input: 123.62 toks/s, output: 1003.89 toks/s]


- batch size 22: 1003.52 tokens/sec (22 x 45.61)


Processed prompts: 100%|███████| 23/23 [00:11<00:00,  1.97it/s, est. speed input: 124.36 toks/s, output: 1009.31 toks/s]


- batch size 23: 1008.92 tokens/sec (23 x 43.87)


Processed prompts: 100%|███████| 24/24 [00:11<00:00,  2.04it/s, est. speed input: 128.40 toks/s, output: 1043.48 toks/s]


- batch size 24: 1043.09 tokens/sec (24 x 43.46)


Processed prompts: 100%|███████| 25/25 [00:11<00:00,  2.12it/s, est. speed input: 133.65 toks/s, output: 1076.93 toks/s]


- batch size 25: 1076.50 tokens/sec (25 x 43.06)


Processed prompts: 100%|███████| 26/26 [00:11<00:00,  2.17it/s, est. speed input: 136.85 toks/s, output: 1102.69 toks/s]


- batch size 26: 1102.26 tokens/sec (26 x 42.39)


Processed prompts: 100%|███████| 27/27 [00:11<00:00,  2.27it/s, est. speed input: 142.88 toks/s, output: 1157.88 toks/s]


- batch size 27: 1157.37 tokens/sec (27 x 42.87)


Processed prompts: 100%|███████| 28/28 [00:11<00:00,  2.34it/s, est. speed input: 147.12 toks/s, output: 1189.74 toks/s]


- batch size 28: 1189.24 tokens/sec (28 x 42.47)


Processed prompts: 100%|███████| 29/29 [00:12<00:00,  2.40it/s, est. speed input: 151.45 toks/s, output: 1223.51 toks/s]


- batch size 29: 1222.97 tokens/sec (29 x 42.17)


Processed prompts: 100%|███████| 30/30 [00:12<00:00,  2.44it/s, est. speed input: 153.90 toks/s, output: 1243.11 toks/s]


- batch size 30: 1242.63 tokens/sec (30 x 41.42)


Processed prompts: 100%|███████| 31/31 [00:12<00:00,  2.53it/s, est. speed input: 159.80 toks/s, output: 1282.74 toks/s]


- batch size 31: 1282.23 tokens/sec (31 x 41.36)


Processed prompts: 100%|███████| 32/32 [00:12<00:00,  2.56it/s, est. speed input: 161.42 toks/s, output: 1308.09 toks/s]

- batch size 32: 1307.55 tokens/sec (32 x 40.86)





### SGLang

In [5]:
import json, time
import sglang

def sglang_load(model, **runtime_kwargs):
    """Start an SGLang runtime for `model` and tag it with the checkpoint name.

    Args:
        model: Hugging Face checkpoint name, passed as `model_path`.
        **runtime_kwargs: optional extra keyword arguments forwarded to
            `sglang.Runtime`, e.g. to configure the server comparably to the
            vLLM engine. None are passed by default.

    Returns:
        The `sglang.Runtime` instance, with an extra `_model` attribute
        holding `model`.
    """
    runtime = sglang.Runtime(model_path=model, **runtime_kwargs)
    # sglang_generate() reads this attribute to format prompts with the
    # tokenizer that matches the loaded checkpoint.
    runtime._model = model
    return runtime

def sglang_generate(messages, runtime):
    """Measure SGLang generation throughput for batch sizes 1..len(messages).

    Args:
        messages: list of conversations, each a list of {"role", "content"}
            dicts.
        runtime: runtime returned by sglang_load(); its `_model` attribute
            names the checkpoint used to format the prompts.

    Prints the text generated by a warmup request, then one line per batch
    size with the aggregate and per-sequence tokens/sec. Returns None.
    """
    print("SGLang performance test:")

    prompts = format_prompt(messages, runtime._model)
    sampling_params = {"temperature": 0.7, "top_p": 0.8, "repetition_penalty": 1.05, "max_new_tokens": 512}

    # warmup: a single prompt string is decoded as one JSON object
    output = json.loads(runtime.generate(prompt=prompts[0], sampling_params=sampling_params))
    print(f"Generated text: {output['text']!r}")

    for batch_size in range(1, len(messages) + 1):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        # A list of prompts is decoded as a list with one result per prompt.
        outputs = json.loads(runtime.generate(prompt=prompts[:batch_size], sampling_params=sampling_params))
        elapsed = time.perf_counter() - start_time

        # Count only generated tokens, summed over every sequence of the batch.
        tokenscount = sum(result["meta_info"]["completion_tokens"] for result in outputs)

        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size {batch_size}: {tokens_per_sec:.2f} tokens/sec ({batch_size} x {tokens_per_sec/batch_size:.2f})")

In [None]:
runtime = sglang_load(test_models["llama-3.1"])

In [7]:
sglang_generate(test_messages*8, runtime)

SGLang performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre divers avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les dépôts** : Le Crédit Mutuel propose des taux d'intérêt plus élevés que les banques traditionnelles pour les dépôts, ce qui permet aux clients de gagner plus d'argent sur leurs économies.\n\n2. **Taux d'emprunt compétitifs** : Les emprunts du Crédit Mutuel sont souvent moins chers que ceux proposés par les banques traditionnelles, ce qui peut aider les clients à se procurer des prêts à des conditions avantageuses.\n\n3. **Services personnalisés** : En tant que banque coopérative, le Crédit Mutuel prend en compte les besoins spécifiques de ses clients et offre des services personnalisés pour répondre à leurs attentes.\n\n4. **Transparence et sécurité** : Le Crédit Mutuel est connu pour sa transparence financière et sa sécurité, ce qui rassure les clients qui s

### Ollama

https://github.com/ollama/ollama-python

In [45]:
import ollama

ollama.list()

{'models': [{'name': 'llama3.1:8b-instruct-q4_0',
   'model': 'llama3.1:8b-instruct-q4_0',
   'modified_at': '2024-09-22T11:43:03.484852591+02:00',
   'size': 4661230766,
   'digest': '42182419e9508c30c4b1fe55015f06b65f4ca4b9e28a744be55008d21998a093',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q4_0'}},
  {'name': 'llama3.1:8b-instruct-q8_0',
   'model': 'llama3.1:8b-instruct-q8_0',
   'modified_at': '2024-09-22T11:33:10.006555339+02:00',
   'size': 8540789934,
   'digest': 'b158ded76fa05be6bce8a682099ce5df8c5571340a04cf63a2923464679db576',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q8_0'}},
  {'name': 'llama3.1:8b-instruct-fp16',
   'model': 'llama3.1:8b-instruct-fp16',
   'modified_at': '2024-09-21T22:54:30.926572546+02:00',
   'size': 16068910253

In [27]:
print(ollama.show("llama3.1:8b-instruct-fp16")['template'])

{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
{{- if .System }}

{{ .System }}
{{- end }}
{{- if .Tools }}

Cutting Knowledge Date: December 2023

When you receive a tool call response, use the output to format an answer to the orginal user question.

You are a helpful assistant with tool calling capabilities.
{{- end }}<|eot_id|>
{{- end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
{{- if and $.Tools $last }}

Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.

Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.

{{ range $.Tools }}
{{- . }}
{{ end }}
Question: {{ .Content }}<|eot_id|>
{{- else }}

{{ .Content }}<|eot_id|>
{{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|>

{{ e

https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion

In [41]:
import ollama

def ollama_load(model):
    """Make ollama load `model` by sending it a one-token generation request.

    ollama keeps models 5 min in memory by default, they are reloaded by a
    query; this request serves as that query.

    Args:
        model: ollama model tag.

    Returns:
        The same `model` tag, for use with ollama_generate().
    """
    ollama.generate(
        model=model,
        prompt="load",
        raw=True,
        options={"num_predict": 1},
        stream=False,
    )
    return model

# ollama API only supports batch size 1
def ollama_generate(messages, model, tokenizer_model=None):
    """Measure ollama generation throughput, one request per conversation.

    Args:
        messages: list of conversations, each a list of {"role", "content"}
            dicts.
        model: ollama model tag, as returned by ollama_load().
        tokenizer_model: Hugging Face checkpoint whose chat template is used
            to format the prompts. The ollama tag cannot be used for this
            because format_prompt() loads a Hugging Face tokenizer. Defaults
            to test_models["llama-3.1"].

    Prints the text generated by a warmup request, then one tokens/sec line
    per conversation. Returns None.
    """
    # This cell only imports ollama; do not depend on another cell for `time`.
    import time

    print("ollama performance test:")

    # The prompts used to be formatted with `runtime._model`, i.e. the global
    # SGLang runtime, which raised NameError unless the SGLang cells had run
    # first. The default below is the checkpoint that runtime was loaded with.
    if tokenizer_model is None:
        tokenizer_model = test_models["llama-3.1"]
    prompts = format_prompt(messages, tokenizer_model)
    sampling_params = {"temperature": 0.7, "top_p": 0.8, "repeat_penalty": 1.05, "num_predict": 512}

    # warmup (prompts are already chat-formatted and sent with raw=True)
    output = ollama.generate(model=model, prompt=prompts[0], raw=True, options=sampling_params, stream=False)
    print(f"Generated text: {output['response']!r}")

    for prompt in prompts:
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        output = ollama.generate(model=model, prompt=prompt, raw=True, options=sampling_params, stream=False)
        elapsed = time.perf_counter() - start_time

        # 'eval_count' is the token count reported in the ollama response.
        tokenscount = output['eval_count']
        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size 1: {tokens_per_sec:.2f} tokens/sec")

In [43]:
model = ollama_load(ollama_test_models["llama-3.1"])

In [44]:
ollama_generate(test_messages, model)

ollama performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre divers avantages à ses adhérents. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts réduits sur les prêts** : Le Crédit Mutuel propose des taux d'intérêt compétitifs pour les prêts personnels, hypothécaires et professionnels.\n2. **Avantages fiscaux** : Les adhérents du Crédit Mutuel peuvent bénéficier de réductions fiscales sur leurs prêts et épargnes, en fonction de leur situation financière et de leurs impôts.\n3. **Assistance financière** : Le Crédit Mutuel offre des services d'assistance financière pour aider ses adhérents à gérer leurs finances et à atteindre leurs objectifs économiques.\n4. **Produits diversifiés** : Le Crédit Mutuel propose une gamme complète de produits bancaires, tels que les comptes courants, les livrets d'épargne, les cartes de crédit, les assurances et les investissements.\n5. **Services en ligne** : Les adhérents du Crédit Mutuel ont accè

# 3. Performance tests on RTX 4090

## vLLM

### FP16

In [7]:
llm = vllm_load(test_models["llama-3.1"])

INFO 09-22 12:18:56 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, en

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-22 12:19:07 model_runner.py:1008] Loading model weights took 14.9888 GB
INFO 09-22 12:19:08 gpu_executor.py:122] # GPU blocks: 3610, # CPU blocks: 2048
INFO 09-22 12:19:08 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 12:19:08 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 12:19:18 model_runner.py:1430] Graph capturing finished in 10 secs.


In [8]:
vllm_generate(test_messages*12, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.62s/it, est. speed input: 6.55 toks/s, output: 53.23 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt plus élevés sur les comptes d'épargne que de nombreux autres établissements bancaires.\n2. **Taux d'emprunt compétitifs** : Les prêts personnels, les prêts immobiliers et les prêts pour la mobilité sont proposés avec des taux d'intérêt attractifs.\n3. **Services personnalisés** : Le Crédit Mutuel offre des services personnalisés et adaptés aux besoins de ses membres et clients, grâce à une approche relationnelle et à une connaissance approfondie de leurs situations financières.\n4. **Sécurité et confidentialité** : Le Crédit Mutuel s'engage à protéger les données personnelles et financières de ses membres et clients, conformément aux règles de protection des données.\n5. **Participation aux décisions** : En tant que membre 

Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.29s/it, est. speed input: 6.78 toks/s, output: 55.11 toks/s]


- batch size 1: 55.10 tokens/sec (1 x 55.10)


Processed prompts: 100%|███████████| 2/2 [00:09<00:00,  4.79s/it, est. speed input: 13.26 toks/s, output: 106.91 toks/s]


- batch size 2: 106.89 tokens/sec (2 x 53.44)


Processed prompts: 100%|███████████| 3/3 [00:09<00:00,  3.20s/it, est. speed input: 19.89 toks/s, output: 159.98 toks/s]


- batch size 3: 159.94 tokens/sec (3 x 53.31)


Processed prompts: 100%|███████████| 4/4 [00:09<00:00,  2.41s/it, est. speed input: 26.12 toks/s, output: 209.04 toks/s]


- batch size 4: 209.00 tokens/sec (4 x 52.25)


Processed prompts: 100%|███████████| 5/5 [00:09<00:00,  1.95s/it, est. speed input: 32.36 toks/s, output: 254.88 toks/s]


- batch size 5: 254.81 tokens/sec (5 x 50.96)


Processed prompts: 100%|███████████| 6/6 [00:09<00:00,  1.63s/it, est. speed input: 38.83 toks/s, output: 314.72 toks/s]


- batch size 6: 314.63 tokens/sec (6 x 52.44)


Processed prompts: 100%|███████████| 7/7 [00:09<00:00,  1.40s/it, est. speed input: 45.15 toks/s, output: 365.32 toks/s]


- batch size 7: 365.23 tokens/sec (7 x 52.18)


Processed prompts: 100%|███████████| 8/8 [00:09<00:00,  1.23s/it, est. speed input: 51.20 toks/s, output: 416.13 toks/s]


- batch size 8: 416.02 tokens/sec (8 x 52.00)


Processed prompts: 100%|███████████| 9/9 [00:09<00:00,  1.10s/it, est. speed input: 57.18 toks/s, output: 453.22 toks/s]


- batch size 9: 453.06 tokens/sec (9 x 50.34)


Processed prompts: 100%|█████████| 10/10 [00:09<00:00,  1.01it/s, est. speed input: 63.46 toks/s, output: 514.91 toks/s]


- batch size 10: 514.74 tokens/sec (10 x 51.47)


Processed prompts: 100%|█████████| 11/11 [00:09<00:00,  1.10it/s, est. speed input: 69.57 toks/s, output: 561.93 toks/s]


- batch size 11: 561.78 tokens/sec (11 x 51.07)


Processed prompts: 100%|█████████| 12/12 [00:10<00:00,  1.19it/s, est. speed input: 75.24 toks/s, output: 610.24 toks/s]


- batch size 12: 610.03 tokens/sec (12 x 50.84)


Processed prompts: 100%|█████████| 13/13 [00:10<00:00,  1.28it/s, est. speed input: 80.76 toks/s, output: 656.31 toks/s]


- batch size 13: 656.12 tokens/sec (13 x 50.47)


Processed prompts: 100%|█████████| 14/14 [00:10<00:00,  1.36it/s, est. speed input: 86.09 toks/s, output: 690.78 toks/s]


- batch size 14: 690.56 tokens/sec (14 x 49.33)


Processed prompts: 100%|█████████| 15/15 [00:10<00:00,  1.47it/s, est. speed input: 92.65 toks/s, output: 747.39 toks/s]


- batch size 15: 747.14 tokens/sec (15 x 49.81)


Processed prompts: 100%|█████████| 16/16 [00:10<00:00,  1.56it/s, est. speed input: 98.35 toks/s, output: 792.05 toks/s]


- batch size 16: 791.76 tokens/sec (16 x 49.49)


Processed prompts: 100%|█████████| 17/17 [00:10<00:00,  1.55it/s, est. speed input: 97.86 toks/s, output: 791.16 toks/s]


- batch size 17: 790.86 tokens/sec (17 x 46.52)


Processed prompts: 100%|████████| 18/18 [00:11<00:00,  1.63it/s, est. speed input: 102.61 toks/s, output: 833.17 toks/s]


- batch size 18: 832.88 tokens/sec (18 x 46.27)


Processed prompts: 100%|████████| 19/19 [00:11<00:00,  1.73it/s, est. speed input: 108.87 toks/s, output: 871.30 toks/s]


- batch size 19: 871.01 tokens/sec (19 x 45.84)


Processed prompts: 100%|████████| 20/20 [00:11<00:00,  1.79it/s, est. speed input: 112.79 toks/s, output: 907.66 toks/s]


- batch size 20: 907.32 tokens/sec (20 x 45.37)


Processed prompts: 100%|████████| 21/21 [00:11<00:00,  1.88it/s, est. speed input: 118.24 toks/s, output: 957.62 toks/s]


- batch size 21: 957.29 tokens/sec (21 x 45.59)


Processed prompts: 100%|███████| 22/22 [00:11<00:00,  1.96it/s, est. speed input: 123.82 toks/s, output: 1005.52 toks/s]


- batch size 22: 1005.15 tokens/sec (22 x 45.69)


Processed prompts: 100%|███████| 23/23 [00:11<00:00,  1.96it/s, est. speed input: 123.65 toks/s, output: 1003.52 toks/s]


- batch size 23: 1003.09 tokens/sec (23 x 43.61)


Processed prompts: 100%|███████| 24/24 [00:11<00:00,  2.03it/s, est. speed input: 127.89 toks/s, output: 1039.36 toks/s]


- batch size 24: 1038.99 tokens/sec (24 x 43.29)


Processed prompts: 100%|███████| 25/25 [00:11<00:00,  2.09it/s, est. speed input: 131.69 toks/s, output: 1061.09 toks/s]


- batch size 25: 1060.58 tokens/sec (25 x 42.42)


Processed prompts: 100%|███████| 26/26 [00:11<00:00,  2.18it/s, est. speed input: 137.70 toks/s, output: 1109.60 toks/s]


- batch size 26: 1109.19 tokens/sec (26 x 42.66)


Processed prompts: 100%|███████| 27/27 [00:11<00:00,  2.26it/s, est. speed input: 142.36 toks/s, output: 1153.66 toks/s]


- batch size 27: 1153.22 tokens/sec (27 x 42.71)


Processed prompts: 100%|███████| 28/28 [00:12<00:00,  2.33it/s, est. speed input: 146.82 toks/s, output: 1187.31 toks/s]


- batch size 28: 1186.84 tokens/sec (28 x 42.39)


Processed prompts: 100%|███████| 29/29 [00:12<00:00,  2.40it/s, est. speed input: 151.49 toks/s, output: 1223.90 toks/s]


- batch size 29: 1223.40 tokens/sec (29 x 42.19)


Processed prompts: 100%|███████| 30/30 [00:12<00:00,  2.44it/s, est. speed input: 154.00 toks/s, output: 1243.88 toks/s]


- batch size 30: 1243.38 tokens/sec (30 x 41.45)


Processed prompts: 100%|███████| 31/31 [00:12<00:00,  2.53it/s, est. speed input: 159.80 toks/s, output: 1282.70 toks/s]


- batch size 31: 1282.13 tokens/sec (31 x 41.36)


Processed prompts: 100%|███████| 32/32 [00:12<00:00,  2.59it/s, est. speed input: 163.12 toks/s, output: 1321.86 toks/s]


- batch size 32: 1321.37 tokens/sec (32 x 41.29)


Processed prompts: 100%|███████| 33/33 [00:12<00:00,  2.61it/s, est. speed input: 164.50 toks/s, output: 1331.06 toks/s]


- batch size 33: 1330.54 tokens/sec (33 x 40.32)


Processed prompts: 100%|███████| 34/34 [00:12<00:00,  2.68it/s, est. speed input: 169.09 toks/s, output: 1366.55 toks/s]


- batch size 34: 1365.97 tokens/sec (34 x 40.18)


Processed prompts: 100%|███████| 35/35 [00:12<00:00,  2.75it/s, est. speed input: 173.34 toks/s, output: 1405.81 toks/s]


- batch size 35: 1405.09 tokens/sec (35 x 40.15)


Processed prompts: 100%|███████| 36/36 [00:12<00:00,  2.81it/s, est. speed input: 177.34 toks/s, output: 1431.51 toks/s]


- batch size 36: 1430.91 tokens/sec (36 x 39.75)


Processed prompts: 100%|███████| 37/37 [00:12<00:00,  2.85it/s, est. speed input: 179.79 toks/s, output: 1452.48 toks/s]


- batch size 37: 1451.85 tokens/sec (37 x 39.24)


Processed prompts: 100%|███████| 38/38 [00:13<00:00,  2.91it/s, est. speed input: 183.41 toks/s, output: 1478.96 toks/s]


- batch size 38: 1478.29 tokens/sec (38 x 38.90)


Processed prompts: 100%|███████| 39/39 [00:13<00:00,  2.99it/s, est. speed input: 188.84 toks/s, output: 1533.46 toks/s]


- batch size 39: 1532.73 tokens/sec (39 x 39.30)


Processed prompts: 100%|███████| 40/40 [00:13<00:00,  3.03it/s, est. speed input: 190.64 toks/s, output: 1518.94 toks/s]


- batch size 40: 1518.18 tokens/sec (40 x 37.95)


Processed prompts: 100%|███████| 41/41 [00:13<00:00,  3.11it/s, est. speed input: 195.87 toks/s, output: 1575.50 toks/s]


- batch size 41: 1574.79 tokens/sec (41 x 38.41)


Processed prompts: 100%|███████| 42/42 [00:13<00:00,  3.16it/s, est. speed input: 199.45 toks/s, output: 1613.63 toks/s]


- batch size 42: 1612.86 tokens/sec (42 x 38.40)


Processed prompts: 100%|███████| 43/43 [00:13<00:00,  3.18it/s, est. speed input: 200.46 toks/s, output: 1625.72 toks/s]


- batch size 43: 1624.96 tokens/sec (43 x 37.79)


Processed prompts: 100%|███████| 44/44 [00:13<00:00,  3.27it/s, est. speed input: 205.80 toks/s, output: 1659.72 toks/s]


- batch size 44: 1658.89 tokens/sec (44 x 37.70)


Processed prompts: 100%|███████| 45/45 [00:21<00:00,  2.09it/s, est. speed input: 131.66 toks/s, output: 1064.46 toks/s]


- batch size 45: 1064.15 tokens/sec (45 x 23.65)


Processed prompts: 100%|███████| 46/46 [00:21<00:00,  2.10it/s, est. speed input: 132.23 toks/s, output: 1071.14 toks/s]


- batch size 46: 1070.79 tokens/sec (46 x 23.28)


Processed prompts: 100%|███████| 47/47 [00:22<00:00,  2.13it/s, est. speed input: 134.17 toks/s, output: 1087.55 toks/s]


- batch size 47: 1087.21 tokens/sec (47 x 23.13)


Processed prompts: 100%|███████| 48/48 [00:22<00:00,  2.13it/s, est. speed input: 134.46 toks/s, output: 1085.58 toks/s]

- batch size 48: 1085.12 tokens/sec (48 x 22.61)





### w8a16

In [8]:
# Load the w8a16-quantized Llama 3.1 8B Instruct checkpoint
# (neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16) through the notebook's
# vllm_load helper and keep the resulting engine in `llm` for the benchmark cell.
llm = vllm_load(test_models["llama-3.1:w8a16"])

INFO 09-22 12:47:04 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quant

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 12:47:08 model_runner.py:1008] Loading model weights took 8.4927 GB
INFO 09-22 12:47:09 gpu_executor.py:122] # GPU blocks: 6716, # CPU blocks: 2048
INFO 09-22 12:47:09 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 12:47:09 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 12:47:18 model_runner.py:1430] Graph capturing finished in 9 secs.


In [10]:
# Run the vLLM throughput test on the w8a16 engine loaded above.
# `test_messages*18` repeats the prompt collection 18 times (assuming it is a list);
# presumably this sets the largest batch size the helper can reach - TODO confirm
# against vllm_generate.
# NOTE: the recorded output steps through batch sizes 1, 2, 3, ... and ends with a
# KeyboardInterrupt while starting batch size 69, i.e. the run was stopped manually.
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:06<00:00,  6.05s/it, est. speed input: 6.94 toks/s, output: 84.63 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre de nombreux avantages à ses adhérents et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés** : Le Crédit Mutuel propose des taux d'intérêt plus élevés que les banques traditionnelles, notamment pour les comptes courants et les prêts.\n2. **Services personnalisés** : En tant que banque coopérative, le Crédit Mutuel met l'accent sur la proximité et la personnalisation de ses services. Les clients ont accès à des conseillers financiers compétents qui leur proposent des solutions adaptées à leurs besoins.\n3. **Transparence et sécurité** : Le Crédit Mutuel est connu pour sa transparence dans les tarifs et les conditions de prêt. Les clients sont également protégés par des garanties de sécurité renforcées.\n4. **Épargne et placement** : Le Crédit Mutuel propose une gamme de produits d'épargne et de placement attractifs, notamment des comptes d'épargne, des livrets et des fonds 

Processed prompts: 100%|█████████████| 1/1 [00:05<00:00,  5.82s/it, est. speed input: 7.21 toks/s, output: 87.92 toks/s]


- batch size 1: 87.89 tokens/sec (1 x 87.89)


Processed prompts: 100%|███████████| 2/2 [00:05<00:00,  2.98s/it, est. speed input: 14.25 toks/s, output: 171.63 toks/s]


- batch size 2: 171.58 tokens/sec (2 x 85.79)


Processed prompts: 100%|███████████| 3/3 [00:05<00:00,  1.99s/it, est. speed input: 21.48 toks/s, output: 257.74 toks/s]


- batch size 3: 257.67 tokens/sec (3 x 85.89)


Processed prompts: 100%|███████████| 4/4 [00:06<00:00,  1.50s/it, est. speed input: 27.99 toks/s, output: 335.94 toks/s]


- batch size 4: 335.81 tokens/sec (4 x 83.95)


Processed prompts: 100%|███████████| 5/5 [00:06<00:00,  1.21s/it, est. speed input: 34.65 toks/s, output: 413.27 toks/s]


- batch size 5: 413.13 tokens/sec (5 x 82.63)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.02s/it, est. speed input: 41.37 toks/s, output: 502.31 toks/s]


- batch size 6: 502.13 tokens/sec (6 x 83.69)


Processed prompts: 100%|███████████| 7/7 [00:06<00:00,  1.14it/s, est. speed input: 48.05 toks/s, output: 563.09 toks/s]


- batch size 7: 562.86 tokens/sec (7 x 80.41)


Processed prompts: 100%|███████████| 8/8 [00:06<00:00,  1.29it/s, est. speed input: 54.08 toks/s, output: 658.42 toks/s]


- batch size 8: 658.07 tokens/sec (8 x 82.26)


Processed prompts: 100%|███████████| 9/9 [00:06<00:00,  1.44it/s, est. speed input: 60.28 toks/s, output: 734.72 toks/s]


- batch size 9: 734.43 tokens/sec (9 x 81.60)


Processed prompts: 100%|█████████| 10/10 [00:06<00:00,  1.59it/s, est. speed input: 66.78 toks/s, output: 806.88 toks/s]


- batch size 10: 806.50 tokens/sec (10 x 80.65)


Processed prompts: 100%|█████████| 11/11 [00:06<00:00,  1.69it/s, est. speed input: 71.30 toks/s, output: 857.11 toks/s]


- batch size 11: 856.78 tokens/sec (11 x 77.89)


Processed prompts: 100%|█████████| 12/12 [00:06<00:00,  1.87it/s, est. speed input: 78.49 toks/s, output: 956.26 toks/s]


- batch size 12: 955.85 tokens/sec (12 x 79.65)


Processed prompts: 100%|████████| 13/13 [00:06<00:00,  2.01it/s, est. speed input: 84.51 toks/s, output: 1017.83 toks/s]


- batch size 13: 1017.28 tokens/sec (13 x 78.25)


Processed prompts: 100%|████████| 14/14 [00:06<00:00,  2.15it/s, est. speed input: 90.58 toks/s, output: 1087.73 toks/s]


- batch size 14: 1087.24 tokens/sec (14 x 77.66)


Processed prompts: 100%|████████| 15/15 [00:06<00:00,  2.29it/s, est. speed input: 96.63 toks/s, output: 1162.61 toks/s]


- batch size 15: 1162.08 tokens/sec (15 x 77.47)


Processed prompts: 100%|███████| 16/16 [00:06<00:00,  2.46it/s, est. speed input: 103.28 toks/s, output: 1235.94 toks/s]


- batch size 16: 1235.28 tokens/sec (16 x 77.20)


Processed prompts: 100%|████████| 17/17 [00:07<00:00,  2.37it/s, est. speed input: 99.74 toks/s, output: 1199.24 toks/s]


- batch size 17: 1198.74 tokens/sec (17 x 70.51)


Processed prompts: 100%|███████| 18/18 [00:07<00:00,  2.48it/s, est. speed input: 104.09 toks/s, output: 1259.25 toks/s]


- batch size 18: 1258.70 tokens/sec (18 x 69.93)


Processed prompts: 100%|███████| 19/19 [00:07<00:00,  2.61it/s, est. speed input: 109.93 toks/s, output: 1317.82 toks/s]


- batch size 19: 1317.21 tokens/sec (19 x 69.33)


Processed prompts: 100%|███████| 20/20 [00:07<00:00,  2.67it/s, est. speed input: 112.21 toks/s, output: 1365.50 toks/s]


- batch size 20: 1364.70 tokens/sec (20 x 68.24)


Processed prompts: 100%|███████| 21/21 [00:07<00:00,  2.86it/s, est. speed input: 119.95 toks/s, output: 1436.85 toks/s]


- batch size 21: 1436.10 tokens/sec (21 x 68.39)


Processed prompts: 100%|███████| 22/22 [00:07<00:00,  2.98it/s, est. speed input: 125.10 toks/s, output: 1498.98 toks/s]


- batch size 22: 1498.23 tokens/sec (22 x 68.10)


Processed prompts: 100%|███████| 23/23 [00:07<00:00,  3.09it/s, est. speed input: 130.16 toks/s, output: 1557.75 toks/s]


- batch size 23: 1557.03 tokens/sec (23 x 67.70)


Processed prompts: 100%|███████| 24/24 [00:07<00:00,  3.21it/s, est. speed input: 134.87 toks/s, output: 1614.59 toks/s]


- batch size 24: 1613.82 tokens/sec (24 x 67.24)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.34it/s, est. speed input: 140.42 toks/s, output: 1693.10 toks/s]


- batch size 25: 1692.18 tokens/sec (25 x 67.69)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.39it/s, est. speed input: 142.59 toks/s, output: 1714.22 toks/s]


- batch size 26: 1713.37 tokens/sec (26 x 65.90)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.57it/s, est. speed input: 150.38 toks/s, output: 1784.17 toks/s]


- batch size 27: 1783.18 tokens/sec (27 x 66.04)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.64it/s, est. speed input: 152.80 toks/s, output: 1857.85 toks/s]


- batch size 28: 1856.88 tokens/sec (28 x 66.32)


Processed prompts: 100%|███████| 29/29 [00:07<00:00,  3.73it/s, est. speed input: 156.48 toks/s, output: 1885.38 toks/s]


- batch size 29: 1884.27 tokens/sec (29 x 64.97)


Processed prompts: 100%|███████| 30/30 [00:07<00:00,  3.85it/s, est. speed input: 161.69 toks/s, output: 1931.98 toks/s]


- batch size 30: 1930.94 tokens/sec (30 x 64.36)


Processed prompts: 100%|███████| 31/31 [00:08<00:00,  3.87it/s, est. speed input: 162.87 toks/s, output: 1967.71 toks/s]


- batch size 31: 1966.60 tokens/sec (31 x 63.44)


Processed prompts: 100%|███████| 32/32 [00:07<00:00,  4.06it/s, est. speed input: 170.60 toks/s, output: 2071.56 toks/s]


- batch size 32: 2070.27 tokens/sec (32 x 64.70)


Processed prompts: 100%|███████| 33/33 [00:08<00:00,  3.99it/s, est. speed input: 167.64 toks/s, output: 2013.23 toks/s]


- batch size 33: 2012.02 tokens/sec (33 x 60.97)


Processed prompts: 100%|███████| 34/34 [00:08<00:00,  4.06it/s, est. speed input: 170.75 toks/s, output: 2057.06 toks/s]


- batch size 34: 2055.94 tokens/sec (34 x 60.47)


Processed prompts: 100%|███████| 35/35 [00:08<00:00,  4.08it/s, est. speed input: 171.43 toks/s, output: 2072.74 toks/s]


- batch size 35: 2071.45 tokens/sec (35 x 59.18)


Processed prompts: 100%|███████| 36/36 [00:08<00:00,  4.22it/s, est. speed input: 177.45 toks/s, output: 2139.55 toks/s]


- batch size 36: 2138.32 tokens/sec (36 x 59.40)


Processed prompts: 100%|███████| 37/37 [00:08<00:00,  4.32it/s, est. speed input: 181.51 toks/s, output: 2183.74 toks/s]


- batch size 37: 2182.29 tokens/sec (37 x 58.98)


Processed prompts: 100%|███████| 38/38 [00:08<00:00,  4.40it/s, est. speed input: 184.81 toks/s, output: 2239.70 toks/s]


- batch size 38: 2238.40 tokens/sec (38 x 58.91)


Processed prompts: 100%|███████| 39/39 [00:08<00:00,  4.45it/s, est. speed input: 186.98 toks/s, output: 2260.57 toks/s]


- batch size 39: 2259.25 tokens/sec (39 x 57.93)


Processed prompts: 100%|███████| 40/40 [00:08<00:00,  4.62it/s, est. speed input: 193.99 toks/s, output: 2339.56 toks/s]


- batch size 40: 2338.13 tokens/sec (40 x 58.45)


Processed prompts: 100%|███████| 41/41 [00:08<00:00,  4.67it/s, est. speed input: 196.33 toks/s, output: 2359.76 toks/s]


- batch size 41: 2358.16 tokens/sec (41 x 57.52)


Processed prompts: 100%|███████| 42/42 [00:08<00:00,  4.72it/s, est. speed input: 198.40 toks/s, output: 2396.78 toks/s]


- batch size 42: 2395.28 tokens/sec (42 x 57.03)


Processed prompts: 100%|███████| 43/43 [00:09<00:00,  4.71it/s, est. speed input: 198.19 toks/s, output: 2383.91 toks/s]


- batch size 43: 2382.54 tokens/sec (43 x 55.41)


Processed prompts: 100%|███████| 44/44 [00:09<00:00,  4.88it/s, est. speed input: 204.93 toks/s, output: 2442.57 toks/s]


- batch size 44: 2440.99 tokens/sec (44 x 55.48)


Processed prompts: 100%|███████| 45/45 [00:09<00:00,  4.94it/s, est. speed input: 207.50 toks/s, output: 2503.28 toks/s]


- batch size 45: 2501.75 tokens/sec (45 x 55.59)


Processed prompts: 100%|███████| 46/46 [00:09<00:00,  4.95it/s, est. speed input: 207.94 toks/s, output: 2512.30 toks/s]


- batch size 46: 2510.82 tokens/sec (46 x 54.58)


Processed prompts: 100%|███████| 47/47 [00:09<00:00,  5.10it/s, est. speed input: 214.43 toks/s, output: 2596.01 toks/s]


- batch size 47: 2594.32 tokens/sec (47 x 55.20)


Processed prompts: 100%|███████| 48/48 [00:09<00:00,  5.20it/s, est. speed input: 218.22 toks/s, output: 2615.13 toks/s]


- batch size 48: 2613.30 tokens/sec (48 x 54.44)


Processed prompts: 100%|███████| 49/49 [00:09<00:00,  5.39it/s, est. speed input: 226.31 toks/s, output: 2720.64 toks/s]


- batch size 49: 2718.71 tokens/sec (49 x 55.48)


Processed prompts: 100%|███████| 50/50 [00:09<00:00,  5.55it/s, est. speed input: 233.29 toks/s, output: 2790.36 toks/s]


- batch size 50: 2788.50 tokens/sec (50 x 55.77)


Processed prompts: 100%|███████| 51/51 [00:09<00:00,  5.17it/s, est. speed input: 217.31 toks/s, output: 2616.69 toks/s]


- batch size 51: 2615.17 tokens/sec (51 x 51.28)


Processed prompts: 100%|███████| 52/52 [00:09<00:00,  5.32it/s, est. speed input: 223.62 toks/s, output: 2702.69 toks/s]


- batch size 52: 2700.85 tokens/sec (52 x 51.94)


Processed prompts: 100%|███████| 53/53 [00:09<00:00,  5.41it/s, est. speed input: 227.09 toks/s, output: 2724.34 toks/s]


- batch size 53: 2722.50 tokens/sec (53 x 51.37)


Processed prompts: 100%|███████| 54/54 [00:09<00:00,  5.40it/s, est. speed input: 227.07 toks/s, output: 2739.90 toks/s]


- batch size 54: 2738.10 tokens/sec (54 x 50.71)


Processed prompts: 100%|███████| 55/55 [00:09<00:00,  5.52it/s, est. speed input: 231.88 toks/s, output: 2794.47 toks/s]


- batch size 55: 2792.75 tokens/sec (55 x 50.78)


Processed prompts: 100%|███████| 56/56 [00:10<00:00,  5.50it/s, est. speed input: 230.98 toks/s, output: 2791.16 toks/s]


- batch size 56: 2789.30 tokens/sec (56 x 49.81)


Processed prompts: 100%|███████| 57/57 [00:10<00:00,  5.62it/s, est. speed input: 236.13 toks/s, output: 2836.53 toks/s]


- batch size 57: 2834.65 tokens/sec (57 x 49.73)


Processed prompts: 100%|███████| 58/58 [00:10<00:00,  5.71it/s, est. speed input: 239.74 toks/s, output: 2868.93 toks/s]


- batch size 58: 2866.93 tokens/sec (58 x 49.43)


Processed prompts: 100%|███████| 59/59 [00:10<00:00,  5.69it/s, est. speed input: 239.28 toks/s, output: 2891.93 toks/s]


- batch size 59: 2890.12 tokens/sec (59 x 48.99)


Processed prompts: 100%|███████| 60/60 [00:10<00:00,  5.85it/s, est. speed input: 245.72 toks/s, output: 2945.32 toks/s]


- batch size 60: 2943.27 tokens/sec (60 x 49.05)


Processed prompts: 100%|███████| 61/61 [00:10<00:00,  5.85it/s, est. speed input: 245.85 toks/s, output: 2951.12 toks/s]


- batch size 61: 2949.07 tokens/sec (61 x 48.35)


Processed prompts: 100%|███████| 62/62 [00:10<00:00,  5.93it/s, est. speed input: 249.18 toks/s, output: 2990.77 toks/s]


- batch size 62: 2988.63 tokens/sec (62 x 48.20)


Processed prompts: 100%|███████| 63/63 [00:10<00:00,  5.92it/s, est. speed input: 248.93 toks/s, output: 2987.75 toks/s]


- batch size 63: 2985.80 tokens/sec (63 x 47.39)


Processed prompts: 100%|███████| 64/64 [00:10<00:00,  5.93it/s, est. speed input: 249.17 toks/s, output: 2987.57 toks/s]


- batch size 64: 2985.45 tokens/sec (64 x 46.65)


Processed prompts: 100%|███████| 65/65 [00:13<00:00,  4.89it/s, est. speed input: 205.33 toks/s, output: 2475.45 toks/s]


- batch size 65: 2474.07 tokens/sec (65 x 38.06)


Processed prompts: 100%|███████| 66/66 [00:13<00:00,  4.85it/s, est. speed input: 203.57 toks/s, output: 2432.96 toks/s]


- batch size 66: 2431.67 tokens/sec (66 x 36.84)


Processed prompts: 100%|███████| 67/67 [00:13<00:00,  4.87it/s, est. speed input: 204.60 toks/s, output: 2458.98 toks/s]


- batch size 67: 2457.61 tokens/sec (67 x 36.68)


Processed prompts: 100%|███████| 68/68 [00:14<00:00,  4.75it/s, est. speed input: 199.54 toks/s, output: 2413.53 toks/s]


- batch size 68: 2412.29 tokens/sec (68 x 35.47)


Processed prompts:   0%|                     | 0/69 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### fp8

Note: the "FP8-dynamic" version is much slower than the "FP8" version (per-sequence throughput, FP8-dynamic vs FP8):
- batch size 16: 61 tokens/sec vs 77 tokens/sec
- batch size 32: 42 tokens/sec vs 62 tokens/sec
- batch size 48: 03 tokens/sec vs 52 tokens/sec !

In [8]:
# Load the FP8-quantized Llama 3.1 8B Instruct checkpoint
# (neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8) through the notebook's vllm_load
# helper and keep the resulting engine in `llm` for the benchmark cell.
llm = vllm_load(test_models["llama-3.1:fp8"])

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

INFO 09-22 13:39:33 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8, use_v2_block_manager=Fal

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

INFO 09-22 13:39:35 model_runner.py:997] Starting to load model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8...
INFO 09-22 13:39:36 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 13:47:57 model_runner.py:1008] Loading model weights took 8.4889 GB
INFO 09-22 13:47:58 gpu_executor.py:122] # GPU blocks: 6856, # CPU blocks: 2048
INFO 09-22 13:47:58 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 13:47:58 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 13:48:07 model_runner.py:1430] Graph capturing finished in 9 secs.


In [9]:
# Run the vLLM throughput test on the FP8 engine loaded above.
# `test_messages*16` repeats the prompt collection 16 times (assuming it is a list);
# presumably this sets the largest batch size the helper can reach - TODO confirm
# against vllm_generate.
# NOTE: in the recorded output, throughput drops sharply from batch size 56 onwards,
# and the run ends with a KeyboardInterrupt while starting batch size 67.
vllm_generate(test_messages*16, llm)

vLLM performance test:


Processed prompts: 100%|████████████| 1/1 [00:06<00:00,  6.11s/it, est. speed input: 10.32 toks/s, output: 83.84 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt plus élevés sur les comptes d'épargne comparés aux banques traditionnelles.\n2. **Frais de gestion réduits** : Les frais de gestion pour les comptes courants et les prêts sont souvent moins élevés chez le Crédit Mutuel que dans les banques conventionnelles.\n3. **Prêts personnalisés** : Le Crédit Mutuel prend en compte la situation personnelle et financière de chaque client pour proposer des prêts adaptés à ses besoins.\n4. **Soutien à l'entrepreneuriat** : Le Crédit Mutuel propose des solutions financières spécifiques pour les entrepreneurs et les PME, notamment des prêts et des services de financement.\n5. **Services bancaires complets** : Le Crédit Mutuel propose une gamme complète de services bancaires, y compris les co

Processed prompts: 100%|████████████| 1/1 [00:05<00:00,  5.93s/it, est. speed input: 10.62 toks/s, output: 86.31 toks/s]


- batch size 1: 86.27 tokens/sec (1 x 86.27)


Processed prompts: 100%|███████████| 2/2 [00:06<00:00,  3.04s/it, est. speed input: 20.88 toks/s, output: 168.32 toks/s]


- batch size 2: 168.27 tokens/sec (2 x 84.14)


Processed prompts: 100%|███████████| 3/3 [00:06<00:00,  2.03s/it, est. speed input: 31.39 toks/s, output: 252.42 toks/s]


- batch size 3: 252.33 tokens/sec (3 x 84.11)


Processed prompts: 100%|███████████| 4/4 [00:06<00:00,  1.53s/it, est. speed input: 41.17 toks/s, output: 334.58 toks/s]


- batch size 4: 334.44 tokens/sec (4 x 83.61)


Processed prompts: 100%|███████████| 5/5 [00:06<00:00,  1.24s/it, est. speed input: 51.01 toks/s, output: 411.35 toks/s]


- batch size 5: 411.19 tokens/sec (5 x 82.24)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.03s/it, est. speed input: 61.37 toks/s, output: 497.41 toks/s]


- batch size 6: 497.24 tokens/sec (6 x 82.87)


Processed prompts: 100%|███████████| 7/7 [00:06<00:00,  1.12it/s, est. speed input: 71.09 toks/s, output: 575.11 toks/s]


- batch size 7: 574.88 tokens/sec (7 x 82.13)


Processed prompts: 100%|███████████| 8/8 [00:06<00:00,  1.28it/s, est. speed input: 80.54 toks/s, output: 654.54 toks/s]


- batch size 8: 654.25 tokens/sec (8 x 81.78)


Processed prompts: 100%|███████████| 9/9 [00:06<00:00,  1.43it/s, est. speed input: 90.25 toks/s, output: 729.52 toks/s]


- batch size 9: 729.21 tokens/sec (9 x 81.02)


Processed prompts: 100%|█████████| 10/10 [00:06<00:00,  1.57it/s, est. speed input: 99.38 toks/s, output: 806.05 toks/s]


- batch size 10: 805.72 tokens/sec (10 x 80.57)


Processed prompts: 100%|████████| 11/11 [00:06<00:00,  1.74it/s, est. speed input: 109.74 toks/s, output: 889.25 toks/s]


- batch size 11: 888.85 tokens/sec (11 x 80.80)


Processed prompts: 100%|████████| 12/12 [00:06<00:00,  1.88it/s, est. speed input: 118.63 toks/s, output: 964.09 toks/s]


- batch size 12: 963.66 tokens/sec (12 x 80.31)


Processed prompts: 100%|███████| 13/13 [00:06<00:00,  2.03it/s, est. speed input: 127.69 toks/s, output: 1026.37 toks/s]


- batch size 13: 1025.87 tokens/sec (13 x 78.91)


Processed prompts: 100%|███████| 14/14 [00:06<00:00,  2.14it/s, est. speed input: 134.68 toks/s, output: 1086.59 toks/s]


- batch size 14: 1086.05 tokens/sec (14 x 77.57)


Processed prompts: 100%|███████| 15/15 [00:06<00:00,  2.29it/s, est. speed input: 144.63 toks/s, output: 1172.89 toks/s]


- batch size 15: 1172.37 tokens/sec (15 x 78.16)


Processed prompts: 100%|███████| 16/16 [00:06<00:00,  2.42it/s, est. speed input: 152.37 toks/s, output: 1232.30 toks/s]


- batch size 16: 1231.70 tokens/sec (16 x 76.98)


Processed prompts: 100%|███████| 17/17 [00:06<00:00,  2.47it/s, est. speed input: 155.67 toks/s, output: 1264.67 toks/s]


- batch size 17: 1264.05 tokens/sec (17 x 74.36)


Processed prompts: 100%|███████| 18/18 [00:06<00:00,  2.61it/s, est. speed input: 164.44 toks/s, output: 1327.22 toks/s]


- batch size 18: 1326.57 tokens/sec (18 x 73.70)


Processed prompts: 100%|███████| 19/19 [00:06<00:00,  2.74it/s, est. speed input: 173.17 toks/s, output: 1394.73 toks/s]


- batch size 19: 1394.01 tokens/sec (19 x 73.37)


Processed prompts: 100%|███████| 20/20 [00:07<00:00,  2.84it/s, est. speed input: 179.14 toks/s, output: 1443.90 toks/s]


- batch size 20: 1443.12 tokens/sec (20 x 72.16)


Processed prompts: 100%|███████| 21/21 [00:06<00:00,  3.02it/s, est. speed input: 190.27 toks/s, output: 1545.87 toks/s]


- batch size 21: 1545.00 tokens/sec (21 x 73.57)


Processed prompts: 100%|███████| 22/22 [00:07<00:00,  3.13it/s, est. speed input: 197.52 toks/s, output: 1604.08 toks/s]


- batch size 22: 1603.21 tokens/sec (22 x 72.87)


Processed prompts: 100%|███████| 23/23 [00:07<00:00,  3.26it/s, est. speed input: 205.49 toks/s, output: 1667.17 toks/s]


- batch size 23: 1666.15 tokens/sec (23 x 72.44)


Processed prompts: 100%|███████| 24/24 [00:07<00:00,  3.40it/s, est. speed input: 214.05 toks/s, output: 1732.91 toks/s]


- batch size 24: 1731.86 tokens/sec (24 x 72.16)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.23it/s, est. speed input: 203.68 toks/s, output: 1641.56 toks/s]


- batch size 25: 1640.63 tokens/sec (25 x 65.63)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.36it/s, est. speed input: 212.02 toks/s, output: 1720.50 toks/s]


- batch size 26: 1719.50 tokens/sec (26 x 66.13)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.46it/s, est. speed input: 218.23 toks/s, output: 1758.78 toks/s]


- batch size 27: 1757.78 tokens/sec (27 x 65.10)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.57it/s, est. speed input: 225.07 toks/s, output: 1820.45 toks/s]


- batch size 28: 1819.32 tokens/sec (28 x 64.98)


Processed prompts: 100%|███████| 29/29 [00:07<00:00,  3.64it/s, est. speed input: 229.44 toks/s, output: 1856.89 toks/s]


- batch size 29: 1855.89 tokens/sec (29 x 64.00)


Processed prompts: 100%|███████| 30/30 [00:08<00:00,  3.67it/s, est. speed input: 231.08 toks/s, output: 1874.22 toks/s]


- batch size 30: 1873.13 tokens/sec (30 x 62.44)


Processed prompts: 100%|███████| 31/31 [00:08<00:00,  3.80it/s, est. speed input: 239.93 toks/s, output: 1946.91 toks/s]


- batch size 31: 1945.79 tokens/sec (31 x 62.77)


Processed prompts: 100%|███████| 32/32 [00:08<00:00,  3.88it/s, est. speed input: 244.64 toks/s, output: 1988.15 toks/s]


- batch size 32: 1986.91 tokens/sec (32 x 62.09)


Processed prompts: 100%|███████| 33/33 [00:08<00:00,  3.89it/s, est. speed input: 244.79 toks/s, output: 1959.25 toks/s]


- batch size 33: 1958.11 tokens/sec (33 x 59.34)


Processed prompts: 100%|███████| 34/34 [00:08<00:00,  3.96it/s, est. speed input: 249.55 toks/s, output: 2020.37 toks/s]


- batch size 34: 2019.27 tokens/sec (34 x 59.39)


Processed prompts: 100%|███████| 35/35 [00:08<00:00,  4.08it/s, est. speed input: 257.34 toks/s, output: 2081.09 toks/s]


- batch size 35: 2079.73 tokens/sec (35 x 59.42)


Processed prompts: 100%|███████| 36/36 [00:08<00:00,  4.17it/s, est. speed input: 262.86 toks/s, output: 2136.26 toks/s]


- batch size 36: 2134.90 tokens/sec (36 x 59.30)


Processed prompts: 100%|███████| 37/37 [00:08<00:00,  4.23it/s, est. speed input: 266.38 toks/s, output: 2150.39 toks/s]


- batch size 37: 2149.06 tokens/sec (37 x 58.08)


Processed prompts: 100%|███████| 38/38 [00:08<00:00,  4.36it/s, est. speed input: 274.64 toks/s, output: 2227.99 toks/s]


- batch size 38: 2226.55 tokens/sec (38 x 58.59)


Processed prompts: 100%|███████| 39/39 [00:08<00:00,  4.64it/s, est. speed input: 292.55 toks/s, output: 2370.01 toks/s]


- batch size 39: 2368.33 tokens/sec (39 x 60.73)


Processed prompts: 100%|███████| 40/40 [00:08<00:00,  4.47it/s, est. speed input: 281.69 toks/s, output: 2281.73 toks/s]


- batch size 40: 2280.26 tokens/sec (40 x 57.01)


Processed prompts: 100%|███████| 41/41 [00:08<00:00,  4.57it/s, est. speed input: 287.97 toks/s, output: 2330.75 toks/s]


- batch size 41: 2329.16 tokens/sec (41 x 56.81)


Processed prompts: 100%|███████| 42/42 [00:09<00:00,  4.61it/s, est. speed input: 290.65 toks/s, output: 2353.57 toks/s]


- batch size 42: 2351.93 tokens/sec (42 x 56.00)


Processed prompts: 100%|███████| 43/43 [00:09<00:00,  4.64it/s, est. speed input: 292.56 toks/s, output: 2369.20 toks/s]


- batch size 43: 2367.68 tokens/sec (43 x 55.06)


Processed prompts: 100%|███████| 44/44 [00:09<00:00,  4.75it/s, est. speed input: 299.25 toks/s, output: 2423.04 toks/s]


- batch size 44: 2421.30 tokens/sec (44 x 55.03)


Processed prompts: 100%|███████| 45/45 [00:09<00:00,  4.75it/s, est. speed input: 299.00 toks/s, output: 2421.10 toks/s]


- batch size 45: 2419.48 tokens/sec (45 x 53.77)


Processed prompts: 100%|███████| 46/46 [00:09<00:00,  4.84it/s, est. speed input: 305.23 toks/s, output: 2474.15 toks/s]


- batch size 46: 2472.53 tokens/sec (46 x 53.75)


Processed prompts: 100%|███████| 47/47 [00:09<00:00,  4.92it/s, est. speed input: 310.48 toks/s, output: 2512.15 toks/s]


- batch size 47: 2510.48 tokens/sec (47 x 53.41)


Processed prompts: 100%|███████| 48/48 [00:09<00:00,  4.90it/s, est. speed input: 309.00 toks/s, output: 2507.15 toks/s]


- batch size 48: 2505.45 tokens/sec (48 x 52.20)


Processed prompts: 100%|███████| 49/49 [00:10<00:00,  4.81it/s, est. speed input: 303.06 toks/s, output: 2448.22 toks/s]


- batch size 49: 2446.60 tokens/sec (49 x 49.93)


Processed prompts: 100%|███████| 50/50 [00:10<00:00,  4.81it/s, est. speed input: 302.88 toks/s, output: 2459.86 toks/s]


- batch size 50: 2458.17 tokens/sec (50 x 49.16)


Processed prompts: 100%|███████| 51/51 [00:10<00:00,  4.89it/s, est. speed input: 308.39 toks/s, output: 2497.62 toks/s]


- batch size 51: 2496.01 tokens/sec (51 x 48.94)


Processed prompts: 100%|███████| 52/52 [00:10<00:00,  5.09it/s, est. speed input: 320.59 toks/s, output: 2598.08 toks/s]


- batch size 52: 2596.31 tokens/sec (52 x 49.93)


Processed prompts: 100%|███████| 53/53 [00:10<00:00,  4.95it/s, est. speed input: 311.58 toks/s, output: 2519.97 toks/s]


- batch size 53: 2518.22 tokens/sec (53 x 47.51)


Processed prompts: 100%|███████| 54/54 [00:10<00:00,  5.08it/s, est. speed input: 319.88 toks/s, output: 2592.56 toks/s]


- batch size 54: 2590.62 tokens/sec (54 x 47.97)


Processed prompts: 100%|███████| 55/55 [00:10<00:00,  5.10it/s, est. speed input: 321.73 toks/s, output: 2603.96 toks/s]


- batch size 55: 2602.18 tokens/sec (55 x 47.31)


Processed prompts: 100%|███████| 56/56 [00:28<00:00,  1.99it/s, est. speed input: 125.12 toks/s, output: 1013.03 toks/s]


- batch size 56: 1012.78 tokens/sec (56 x 18.09)


Processed prompts: 100%|████████| 57/57 [00:29<00:00,  1.96it/s, est. speed input: 123.31 toks/s, output: 997.82 toks/s]


- batch size 57: 997.55 tokens/sec (57 x 17.50)


Processed prompts: 100%|████████| 58/58 [00:30<00:00,  1.90it/s, est. speed input: 119.90 toks/s, output: 971.78 toks/s]


- batch size 58: 971.53 tokens/sec (58 x 16.75)


Processed prompts: 100%|████████| 59/59 [00:31<00:00,  1.90it/s, est. speed input: 119.68 toks/s, output: 970.45 toks/s]


- batch size 59: 970.21 tokens/sec (59 x 16.44)


Processed prompts: 100%|████████| 60/60 [00:31<00:00,  1.90it/s, est. speed input: 119.84 toks/s, output: 972.38 toks/s]


- batch size 60: 972.13 tokens/sec (60 x 16.20)


Processed prompts: 100%|████████| 61/61 [00:32<00:00,  1.91it/s, est. speed input: 120.05 toks/s, output: 974.06 toks/s]


- batch size 61: 973.83 tokens/sec (61 x 15.96)


Processed prompts: 100%|████████| 62/62 [00:32<00:00,  1.91it/s, est. speed input: 120.49 toks/s, output: 973.36 toks/s]


- batch size 62: 973.11 tokens/sec (62 x 15.70)


Processed prompts: 100%|████████| 63/63 [00:32<00:00,  1.94it/s, est. speed input: 122.35 toks/s, output: 989.52 toks/s]


- batch size 63: 989.27 tokens/sec (63 x 15.70)


Processed prompts: 100%|████████| 64/64 [00:33<00:00,  1.94it/s, est. speed input: 121.93 toks/s, output: 986.78 toks/s]


- batch size 64: 986.53 tokens/sec (64 x 15.41)


Processed prompts: 100%|████████| 65/65 [00:34<00:00,  1.91it/s, est. speed input: 120.07 toks/s, output: 972.32 toks/s]


- batch size 65: 972.07 tokens/sec (65 x 14.95)


Processed prompts: 100%|████████| 66/66 [00:34<00:00,  1.92it/s, est. speed input: 120.88 toks/s, output: 978.95 toks/s]


- batch size 66: 978.71 tokens/sec (66 x 14.83)


Processed prompts:   0%|                     | 0/67 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### w8a8

In [8]:
# Load the model registered under the "llama-3.1:w8a8" key of `test_models` via the
# notebook's `vllm_load` helper and bind the result to `llm` for the benchmark cell below.
# The engine log recorded under this cell reports the checkpoint as
# neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 (quantization=compressed-tensors).
# NOTE(review): `llm` is rebound once per quantization variant and the execution counter
# restarts at In [8] in each section, so the kernel was presumably restarted between
# variants to free GPU memory - confirm before relying on Restart & Run All.
llm = vllm_load(test_models["llama-3.1:w8a8"])

config.json:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

INFO 09-22 14:08:30 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantiz

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

INFO 09-22 14:08:33 model_runner.py:997] Starting to load model neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8...
INFO 09-22 14:08:33 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00002-of-00002.safetensors:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/43.5k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 14:16:47 model_runner.py:1008] Loading model weights took 8.4939 GB
INFO 09-22 14:16:48 gpu_executor.py:122] # GPU blocks: 6759, # CPU blocks: 2048
INFO 09-22 14:16:48 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 14:16:48 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 14:16:57 model_runner.py:1430] Graph capturing finished in 9 secs.


In [9]:
# Run the vLLM throughput sweep on the w8a8 model loaded above, passing `test_messages`
# repeated 18 times (presumably a list of chat prompts - TODO confirm against its definition).
# The recorded output reports one line per batch size, starting at 1 and growing by 1.
# NOTE(review): in this recorded run throughput drops sharply from batch size 29 onward
# (1721.85 tokens/sec at 28 vs 759.69 at 29) and the cell ends with KeyboardInterrupt at
# batch size 33, so results for larger batches are missing for this variant.
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:06<00:00,  6.90s/it, est. speed input: 6.09 toks/s, output: 74.24 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus faibles** : Le Crédit Mutuel propose des taux d'intérêt compétitifs pour les prêts, les comptes courants et les épargnes.\n2. **Accès à des services bancaires complets** : Le Crédit Mutuel offre une gamme complète de services bancaires, y compris des prêts, des comptes courants, des épargnes, des cartes de crédit, des assurances et des investissements.\n3. **Conseils personnalisés** : Les conseillers du Crédit Mutuel peuvent vous aider à gérer votre patrimoine, à élaborer un plan financier et à atteindre vos objectifs économiques.\n4. **Transparence et sécurité** : Le Crédit Mutuel est une banque coopérative, ce qui signifie que les décisions sont prises collectivement par les membres et non par des actionnaires. Cela assure une transparence et une sécurité supplémentaires.\n5. **Services en li

Processed prompts: 100%|█████████████| 1/1 [00:06<00:00,  6.63s/it, est. speed input: 6.33 toks/s, output: 77.21 toks/s]


- batch size 1: 77.18 tokens/sec (1 x 77.18)


Processed prompts: 100%|███████████| 2/2 [00:06<00:00,  3.39s/it, est. speed input: 12.53 toks/s, output: 149.72 toks/s]


- batch size 2: 149.67 tokens/sec (2 x 74.84)


Processed prompts: 100%|███████████| 3/3 [00:06<00:00,  2.27s/it, est. speed input: 18.83 toks/s, output: 221.19 toks/s]


- batch size 3: 221.12 tokens/sec (3 x 73.71)


Processed prompts: 100%|███████████| 4/4 [00:06<00:00,  1.70s/it, est. speed input: 24.69 toks/s, output: 277.91 toks/s]


- batch size 4: 277.82 tokens/sec (4 x 69.45)


Processed prompts: 100%|███████████| 5/5 [00:06<00:00,  1.38s/it, est. speed input: 30.48 toks/s, output: 360.57 toks/s]


- batch size 5: 360.46 tokens/sec (5 x 72.09)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.16s/it, est. speed input: 36.45 toks/s, output: 442.11 toks/s]


- batch size 6: 441.98 tokens/sec (6 x 73.66)


Processed prompts: 100%|███████████| 7/7 [00:06<00:00,  1.00it/s, est. speed input: 42.40 toks/s, output: 494.18 toks/s]


- batch size 7: 493.99 tokens/sec (7 x 70.57)


Processed prompts: 100%|███████████| 8/8 [00:06<00:00,  1.14it/s, est. speed input: 48.02 toks/s, output: 580.86 toks/s]


- batch size 8: 580.67 tokens/sec (8 x 72.58)


Processed prompts: 100%|███████████| 9/9 [00:07<00:00,  1.27it/s, est. speed input: 53.51 toks/s, output: 649.17 toks/s]


- batch size 9: 648.93 tokens/sec (9 x 72.10)


Processed prompts: 100%|█████████| 10/10 [00:07<00:00,  1.42it/s, est. speed input: 59.71 toks/s, output: 715.61 toks/s]


- batch size 10: 715.38 tokens/sec (10 x 71.54)


Processed prompts: 100%|█████████| 11/11 [00:07<00:00,  1.55it/s, est. speed input: 65.46 toks/s, output: 785.15 toks/s]


- batch size 11: 784.85 tokens/sec (11 x 71.35)


Processed prompts: 100%|█████████| 12/12 [00:07<00:00,  1.68it/s, est. speed input: 70.58 toks/s, output: 849.79 toks/s]


- batch size 12: 849.42 tokens/sec (12 x 70.79)


Processed prompts: 100%|█████████| 13/13 [00:07<00:00,  1.81it/s, est. speed input: 75.96 toks/s, output: 914.40 toks/s]


- batch size 13: 914.05 tokens/sec (13 x 70.31)


Processed prompts: 100%|█████████| 14/14 [00:07<00:00,  1.90it/s, est. speed input: 79.95 toks/s, output: 952.74 toks/s]


- batch size 14: 952.35 tokens/sec (14 x 68.02)


Processed prompts: 100%|████████| 15/15 [00:07<00:00,  2.05it/s, est. speed input: 86.48 toks/s, output: 1028.98 toks/s]


- batch size 15: 1028.55 tokens/sec (15 x 68.57)


Processed prompts: 100%|████████| 16/16 [00:07<00:00,  2.17it/s, est. speed input: 91.31 toks/s, output: 1107.42 toks/s]


- batch size 16: 1106.96 tokens/sec (16 x 69.18)


Processed prompts: 100%|████████| 17/17 [00:07<00:00,  2.25it/s, est. speed input: 94.65 toks/s, output: 1123.09 toks/s]


- batch size 17: 1122.60 tokens/sec (17 x 66.04)


Processed prompts: 100%|███████| 18/18 [00:07<00:00,  2.40it/s, est. speed input: 100.96 toks/s, output: 1194.59 toks/s]


- batch size 18: 1194.09 tokens/sec (18 x 66.34)


Processed prompts: 100%|███████| 19/19 [00:07<00:00,  2.51it/s, est. speed input: 105.88 toks/s, output: 1262.74 toks/s]


- batch size 19: 1262.16 tokens/sec (19 x 66.43)


Processed prompts: 100%|███████| 20/20 [00:07<00:00,  2.61it/s, est. speed input: 109.41 toks/s, output: 1300.31 toks/s]


- batch size 20: 1299.72 tokens/sec (20 x 64.99)


Processed prompts: 100%|███████| 21/21 [00:07<00:00,  2.75it/s, est. speed input: 115.49 toks/s, output: 1377.52 toks/s]


- batch size 21: 1376.91 tokens/sec (21 x 65.57)


Processed prompts: 100%|███████| 22/22 [00:07<00:00,  2.84it/s, est. speed input: 119.28 toks/s, output: 1429.30 toks/s]


- batch size 22: 1428.64 tokens/sec (22 x 64.94)


Processed prompts: 100%|███████| 23/23 [00:07<00:00,  2.95it/s, est. speed input: 124.36 toks/s, output: 1456.42 toks/s]


- batch size 23: 1455.72 tokens/sec (23 x 63.29)


Processed prompts: 100%|███████| 24/24 [00:07<00:00,  3.09it/s, est. speed input: 129.65 toks/s, output: 1500.97 toks/s]


- batch size 24: 1500.26 tokens/sec (24 x 62.51)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.20it/s, est. speed input: 134.22 toks/s, output: 1593.73 toks/s]


- batch size 25: 1592.90 tokens/sec (25 x 63.72)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.26it/s, est. speed input: 137.04 toks/s, output: 1645.39 toks/s]


- batch size 26: 1644.59 tokens/sec (26 x 63.25)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.42it/s, est. speed input: 143.74 toks/s, output: 1724.50 toks/s]


- batch size 27: 1723.58 tokens/sec (27 x 63.84)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.52it/s, est. speed input: 147.78 toks/s, output: 1722.77 toks/s]


- batch size 28: 1721.85 tokens/sec (28 x 61.49)


Processed prompts: 100%|█████████| 29/29 [00:18<00:00,  1.54it/s, est. speed input: 64.72 toks/s, output: 759.85 toks/s]


- batch size 29: 759.69 tokens/sec (29 x 26.20)


Processed prompts: 100%|█████████| 30/30 [00:31<00:00,  1.06s/it, est. speed input: 39.75 toks/s, output: 467.74 toks/s]


- batch size 30: 467.67 tokens/sec (30 x 15.59)


Processed prompts: 100%|█████████| 31/31 [00:46<00:00,  1.49s/it, est. speed input: 28.29 toks/s, output: 337.61 toks/s]


- batch size 31: 337.58 tokens/sec (31 x 10.89)


Processed prompts: 100%|█████████| 32/32 [00:59<00:00,  1.87s/it, est. speed input: 22.40 toks/s, output: 269.94 toks/s]


- batch size 32: 269.92 tokens/sec (32 x 8.44)


Processed prompts:   0%|                     | 0/33 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### w4a16

In [8]:
# Load the model registered under the "llama-3.1:w4a16" key of `test_models` via the
# notebook's `vllm_load` helper and bind the result to `llm` for the benchmark cell below.
# The engine log recorded under this cell reports the checkpoint as
# neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16, run with the gptq_marlin kernel
# and dtype=torch.float16.
# NOTE(review): `llm` is rebound once per quantization variant and the execution counter
# restarts at In [8] in each section, so the kernel was presumably restarted between
# variants to free GPU memory - confirm before relying on Restart & Run All.
llm = vllm_load(test_models["llama-3.1:w4a16"])

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

INFO 09-22 14:25:08 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-22 14:25:08 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_tim

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

INFO 09-22 14:25:10 model_runner.py:997] Starting to load model neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16...
INFO 09-22 14:25:11 weight_utils.py:242] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.74G [00:00<?, ?B/s]

INFO 09-22 14:30:25 weight_utils.py:287] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 09-22 14:30:26 model_runner.py:1008] Loading model weights took 5.3812 GB
INFO 09-22 14:30:28 gpu_executor.py:122] # GPU blocks: 8325, # CPU blocks: 2048
INFO 09-22 14:30:30 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 14:30:30 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 14:30:40 model_runner.py:1430] Graph capturing finished in 10 secs.


In [9]:
# Run the vLLM throughput sweep on the w4a16 model loaded above, passing `test_messages`
# repeated 18 times (presumably a list of chat prompts - TODO confirm against its definition).
# The recorded output reports one line per batch size, starting at 1 and growing by 1;
# for this variant the recorded output reaches at least batch size 72 without interruption.
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|████████████| 1/1 [00:04<00:00,  4.38s/it, est. speed input: 9.58 toks/s, output: 116.78 toks/s]


Generated text: "Le Crédit Mutuel est une banque mutuelle française qui offre divers avantages à ses clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts sur les comptes courants** : Le Crédit Mutuel offre des intérêts sur les comptes courants, ce qui permet aux clients de gagner de l'argent sans avoir à investir leurs fonds.\n2. **Prêt à usage** : Le Crédit Mutuel propose des prêts à usage avec des taux d'intérêt compétitifs, ce qui peut aider les clients à financer leurs besoins ou leurs projets.\n3. **Assurance de prêt** : Le Crédit Mutuel offre une assurance de prêt qui couvre les risques de défaut de remboursement, ce qui donne aux emprunteurs une sécurité supplémentaire.\n4. **Services bancaires complets** : Le Crédit Mutuel propose une gamme complète de services bancaires, y compris des cartes de crédit, des chèques, des virements et des paiements en ligne.\n5. **Sécurité et protection** : Le Crédit Mutuel met en place des mesures de sécurité pour protéger les

Processed prompts: 100%|███████████| 1/1 [00:04<00:00,  4.08s/it, est. speed input: 10.30 toks/s, output: 125.62 toks/s]


- batch size 1: 125.55 tokens/sec (1 x 125.55)


Processed prompts: 100%|███████████| 2/2 [00:04<00:00,  2.12s/it, est. speed input: 20.02 toks/s, output: 241.18 toks/s]


- batch size 2: 241.06 tokens/sec (2 x 120.53)


Processed prompts: 100%|███████████| 3/3 [00:04<00:00,  1.40s/it, est. speed input: 30.41 toks/s, output: 355.68 toks/s]


- batch size 3: 355.46 tokens/sec (3 x 118.49)


Processed prompts: 100%|███████████| 4/4 [00:04<00:00,  1.06s/it, est. speed input: 39.45 toks/s, output: 478.56 toks/s]


- batch size 4: 478.33 tokens/sec (4 x 119.58)


Processed prompts: 100%|███████████| 5/5 [00:04<00:00,  1.16it/s, est. speed input: 48.76 toks/s, output: 570.07 toks/s]


- batch size 5: 569.82 tokens/sec (5 x 113.96)


Processed prompts: 100%|███████████| 6/6 [00:04<00:00,  1.39it/s, est. speed input: 58.42 toks/s, output: 697.80 toks/s]


- batch size 6: 697.43 tokens/sec (6 x 116.24)


Processed prompts: 100%|███████████| 7/7 [00:04<00:00,  1.59it/s, est. speed input: 67.28 toks/s, output: 787.57 toks/s]


- batch size 7: 787.10 tokens/sec (7 x 112.44)


Processed prompts: 100%|███████████| 8/8 [00:04<00:00,  1.80it/s, est. speed input: 75.68 toks/s, output: 881.61 toks/s]


- batch size 8: 881.11 tokens/sec (8 x 110.14)


Processed prompts: 100%|███████████| 9/9 [00:04<00:00,  2.00it/s, est. speed input: 84.02 toks/s, output: 988.23 toks/s]


- batch size 9: 987.65 tokens/sec (9 x 109.74)


Processed prompts: 100%|████████| 10/10 [00:04<00:00,  2.20it/s, est. speed input: 92.81 toks/s, output: 1067.00 toks/s]


- batch size 10: 1066.43 tokens/sec (10 x 106.64)


Processed prompts: 100%|███████| 11/11 [00:04<00:00,  2.39it/s, est. speed input: 100.90 toks/s, output: 1190.30 toks/s]


- batch size 11: 1189.65 tokens/sec (11 x 108.15)


Processed prompts: 100%|███████| 12/12 [00:04<00:00,  2.60it/s, est. speed input: 109.12 toks/s, output: 1288.86 toks/s]


- batch size 12: 1288.05 tokens/sec (12 x 107.34)


Processed prompts: 100%|███████| 13/13 [00:04<00:00,  2.78it/s, est. speed input: 116.89 toks/s, output: 1381.02 toks/s]


- batch size 13: 1380.25 tokens/sec (13 x 106.17)


Processed prompts: 100%|███████| 14/14 [00:04<00:00,  2.91it/s, est. speed input: 122.61 toks/s, output: 1435.34 toks/s]


- batch size 14: 1434.50 tokens/sec (14 x 102.46)


Processed prompts: 100%|███████| 15/15 [00:04<00:00,  3.15it/s, est. speed input: 132.66 toks/s, output: 1549.33 toks/s]


- batch size 15: 1548.31 tokens/sec (15 x 103.22)


Processed prompts: 100%|███████| 16/16 [00:04<00:00,  3.34it/s, est. speed input: 140.13 toks/s, output: 1636.07 toks/s]


- batch size 16: 1634.87 tokens/sec (16 x 102.18)


Processed prompts: 100%|███████| 17/17 [00:05<00:00,  3.16it/s, est. speed input: 132.65 toks/s, output: 1581.27 toks/s]


- batch size 17: 1580.32 tokens/sec (17 x 92.96)


Processed prompts: 100%|███████| 18/18 [00:05<00:00,  3.29it/s, est. speed input: 138.55 toks/s, output: 1630.24 toks/s]


- batch size 18: 1629.18 tokens/sec (18 x 90.51)


Processed prompts: 100%|███████| 19/19 [00:05<00:00,  3.50it/s, est. speed input: 147.55 toks/s, output: 1711.78 toks/s]


- batch size 19: 1710.76 tokens/sec (19 x 90.04)


Processed prompts: 100%|███████| 20/20 [00:05<00:00,  3.70it/s, est. speed input: 155.47 toks/s, output: 1786.38 toks/s]


- batch size 20: 1785.25 tokens/sec (20 x 89.26)


Processed prompts: 100%|███████| 21/21 [00:05<00:00,  3.77it/s, est. speed input: 158.44 toks/s, output: 1844.72 toks/s]


- batch size 21: 1843.56 tokens/sec (21 x 87.79)


Processed prompts: 100%|███████| 22/22 [00:05<00:00,  3.96it/s, est. speed input: 166.44 toks/s, output: 1955.59 toks/s]


- batch size 22: 1954.32 tokens/sec (22 x 88.83)


Processed prompts: 100%|███████| 23/23 [00:05<00:00,  4.12it/s, est. speed input: 173.21 toks/s, output: 2013.39 toks/s]


- batch size 23: 2011.92 tokens/sec (23 x 87.47)


Processed prompts: 100%|███████| 24/24 [00:05<00:00,  4.24it/s, est. speed input: 178.23 toks/s, output: 2090.27 toks/s]


- batch size 24: 2088.91 tokens/sec (24 x 87.04)


Processed prompts: 100%|███████| 25/25 [00:05<00:00,  4.43it/s, est. speed input: 186.27 toks/s, output: 2175.76 toks/s]


- batch size 25: 2174.08 tokens/sec (25 x 86.96)


Processed prompts: 100%|███████| 26/26 [00:05<00:00,  4.47it/s, est. speed input: 188.00 toks/s, output: 2199.89 toks/s]


- batch size 26: 2198.47 tokens/sec (26 x 84.56)


Processed prompts: 100%|███████| 27/27 [00:05<00:00,  4.69it/s, est. speed input: 197.45 toks/s, output: 2314.77 toks/s]


- batch size 27: 2313.17 tokens/sec (27 x 85.67)


Processed prompts: 100%|███████| 28/28 [00:05<00:00,  4.84it/s, est. speed input: 203.44 toks/s, output: 2359.25 toks/s]


- batch size 28: 2357.53 tokens/sec (28 x 84.20)


Processed prompts: 100%|███████| 29/29 [00:05<00:00,  4.91it/s, est. speed input: 206.42 toks/s, output: 2453.69 toks/s]


- batch size 29: 2452.02 tokens/sec (29 x 84.55)


Processed prompts: 100%|███████| 30/30 [00:06<00:00,  4.96it/s, est. speed input: 208.42 toks/s, output: 2441.66 toks/s]


- batch size 30: 2439.96 tokens/sec (30 x 81.33)


Processed prompts: 100%|███████| 31/31 [00:05<00:00,  5.17it/s, est. speed input: 217.64 toks/s, output: 2538.39 toks/s]


- batch size 31: 2536.60 tokens/sec (31 x 81.83)


Processed prompts: 100%|███████| 32/32 [00:06<00:00,  5.30it/s, est. speed input: 222.50 toks/s, output: 2605.98 toks/s]


- batch size 32: 2604.17 tokens/sec (32 x 81.38)


Processed prompts: 100%|███████| 33/33 [00:06<00:00,  5.14it/s, est. speed input: 215.97 toks/s, output: 2520.17 toks/s]


- batch size 33: 2518.36 tokens/sec (33 x 76.31)


Processed prompts: 100%|███████| 34/34 [00:06<00:00,  5.15it/s, est. speed input: 216.40 toks/s, output: 2504.99 toks/s]


- batch size 34: 2503.17 tokens/sec (34 x 73.62)


Processed prompts: 100%|███████| 35/35 [00:06<00:00,  5.24it/s, est. speed input: 220.19 toks/s, output: 2595.18 toks/s]


- batch size 35: 2593.18 tokens/sec (35 x 74.09)


Processed prompts: 100%|███████| 36/36 [00:06<00:00,  5.36it/s, est. speed input: 225.03 toks/s, output: 2650.47 toks/s]


- batch size 36: 2648.47 tokens/sec (36 x 73.57)


Processed prompts: 100%|███████| 37/37 [00:06<00:00,  5.45it/s, est. speed input: 229.04 toks/s, output: 2696.51 toks/s]


- batch size 37: 2694.60 tokens/sec (37 x 72.83)


Processed prompts: 100%|███████| 38/38 [00:06<00:00,  5.50it/s, est. speed input: 231.22 toks/s, output: 2670.48 toks/s]


- batch size 38: 2668.55 tokens/sec (38 x 70.22)


Processed prompts: 100%|███████| 39/39 [00:06<00:00,  5.69it/s, est. speed input: 239.27 toks/s, output: 2815.79 toks/s]


- batch size 39: 2813.51 tokens/sec (39 x 72.14)


Processed prompts: 100%|███████| 40/40 [00:06<00:00,  5.84it/s, est. speed input: 245.40 toks/s, output: 2855.39 toks/s]


- batch size 40: 2848.50 tokens/sec (40 x 71.21)


Processed prompts: 100%|███████| 41/41 [00:07<00:00,  5.80it/s, est. speed input: 243.61 toks/s, output: 2846.18 toks/s]


- batch size 41: 2843.88 tokens/sec (41 x 69.36)


Processed prompts: 100%|███████| 42/42 [00:07<00:00,  5.96it/s, est. speed input: 250.50 toks/s, output: 2936.79 toks/s]


- batch size 42: 2934.41 tokens/sec (42 x 69.87)


Processed prompts: 100%|███████| 43/43 [00:07<00:00,  6.09it/s, est. speed input: 255.91 toks/s, output: 2955.13 toks/s]


- batch size 43: 2952.75 tokens/sec (43 x 68.67)


Processed prompts: 100%|███████| 44/44 [00:07<00:00,  6.00it/s, est. speed input: 251.98 toks/s, output: 2956.01 toks/s]


- batch size 44: 2953.65 tokens/sec (44 x 67.13)


Processed prompts: 100%|███████| 45/45 [00:07<00:00,  6.20it/s, est. speed input: 260.36 toks/s, output: 3044.16 toks/s]


- batch size 45: 3041.90 tokens/sec (45 x 67.60)


Processed prompts: 100%|███████| 46/46 [00:07<00:00,  6.26it/s, est. speed input: 263.08 toks/s, output: 3097.65 toks/s]


- batch size 46: 3095.31 tokens/sec (46 x 67.29)


Processed prompts: 100%|███████| 47/47 [00:07<00:00,  6.23it/s, est. speed input: 261.82 toks/s, output: 3090.93 toks/s]


- batch size 47: 3036.79 tokens/sec (47 x 64.61)


Processed prompts: 100%|███████| 48/48 [00:07<00:00,  6.43it/s, est. speed input: 270.23 toks/s, output: 3176.00 toks/s]


- batch size 48: 3173.57 tokens/sec (48 x 66.12)


Processed prompts: 100%|███████| 49/49 [00:08<00:00,  5.85it/s, est. speed input: 245.66 toks/s, output: 2914.78 toks/s]


- batch size 49: 2912.62 tokens/sec (49 x 59.44)


Processed prompts: 100%|███████| 50/50 [00:08<00:00,  5.91it/s, est. speed input: 248.53 toks/s, output: 2905.07 toks/s]


- batch size 50: 2903.08 tokens/sec (50 x 58.06)


Processed prompts: 100%|███████| 51/51 [00:09<00:00,  5.66it/s, est. speed input: 238.12 toks/s, output: 2783.81 toks/s]


- batch size 51: 2781.90 tokens/sec (51 x 54.55)


Processed prompts: 100%|███████| 52/52 [00:09<00:00,  5.62it/s, est. speed input: 235.93 toks/s, output: 2762.60 toks/s]


- batch size 52: 2760.68 tokens/sec (52 x 53.09)


Processed prompts: 100%|███████| 53/53 [00:09<00:00,  5.71it/s, est. speed input: 239.64 toks/s, output: 2830.30 toks/s]


- batch size 53: 2828.41 tokens/sec (53 x 53.37)


Processed prompts: 100%|███████| 54/54 [00:09<00:00,  5.73it/s, est. speed input: 240.64 toks/s, output: 2820.32 toks/s]


- batch size 54: 2818.25 tokens/sec (54 x 52.19)


Processed prompts: 100%|███████| 55/55 [00:09<00:00,  5.77it/s, est. speed input: 242.54 toks/s, output: 2883.49 toks/s]


- batch size 55: 2881.63 tokens/sec (55 x 52.39)


Processed prompts: 100%|███████| 56/56 [00:09<00:00,  5.78it/s, est. speed input: 242.64 toks/s, output: 2857.54 toks/s]


- batch size 56: 2855.51 tokens/sec (56 x 50.99)


Processed prompts: 100%|███████| 57/57 [00:09<00:00,  5.80it/s, est. speed input: 243.43 toks/s, output: 2879.73 toks/s]


- batch size 57: 2877.61 tokens/sec (57 x 50.48)


Processed prompts: 100%|███████| 58/58 [00:09<00:00,  5.91it/s, est. speed input: 248.43 toks/s, output: 2923.29 toks/s]


- batch size 58: 2921.28 tokens/sec (58 x 50.37)


Processed prompts: 100%|███████| 59/59 [00:09<00:00,  5.93it/s, est. speed input: 249.22 toks/s, output: 2939.09 toks/s]


- batch size 59: 2937.12 tokens/sec (59 x 49.78)


Processed prompts: 100%|███████| 60/60 [00:09<00:00,  6.09it/s, est. speed input: 255.81 toks/s, output: 2973.72 toks/s]


- batch size 60: 2971.65 tokens/sec (60 x 49.53)


Processed prompts: 100%|███████| 61/61 [00:10<00:00,  5.99it/s, est. speed input: 251.65 toks/s, output: 2963.72 toks/s]


- batch size 61: 2961.58 tokens/sec (61 x 48.55)


Processed prompts: 100%|███████| 62/62 [00:09<00:00,  6.22it/s, est. speed input: 261.29 toks/s, output: 3035.77 toks/s]


- batch size 62: 3033.71 tokens/sec (62 x 48.93)


Processed prompts: 100%|███████| 63/63 [00:10<00:00,  6.29it/s, est. speed input: 264.39 toks/s, output: 3088.26 toks/s]


- batch size 63: 3085.80 tokens/sec (63 x 48.98)


Processed prompts: 100%|███████| 64/64 [00:10<00:00,  6.22it/s, est. speed input: 261.25 toks/s, output: 3032.34 toks/s]


- batch size 64: 3030.35 tokens/sec (64 x 47.35)


Processed prompts: 100%|███████| 65/65 [00:11<00:00,  5.62it/s, est. speed input: 236.04 toks/s, output: 2794.32 toks/s]


- batch size 65: 2792.49 tokens/sec (65 x 42.96)


Processed prompts: 100%|███████| 66/66 [00:11<00:00,  5.80it/s, est. speed input: 243.59 toks/s, output: 2826.58 toks/s]


- batch size 66: 2824.80 tokens/sec (66 x 42.80)


Processed prompts: 100%|███████| 67/67 [00:11<00:00,  5.71it/s, est. speed input: 239.81 toks/s, output: 2794.38 toks/s]


- batch size 67: 2792.58 tokens/sec (67 x 41.68)


Processed prompts: 100%|███████| 68/68 [00:11<00:00,  5.79it/s, est. speed input: 243.15 toks/s, output: 2831.61 toks/s]


- batch size 68: 2829.74 tokens/sec (68 x 41.61)


Processed prompts: 100%|███████| 69/69 [00:11<00:00,  5.84it/s, est. speed input: 245.27 toks/s, output: 2880.51 toks/s]


- batch size 69: 2878.64 tokens/sec (69 x 41.72)


Processed prompts: 100%|███████| 70/70 [00:11<00:00,  5.84it/s, est. speed input: 245.45 toks/s, output: 2876.64 toks/s]


- batch size 70: 2874.84 tokens/sec (70 x 41.07)


Processed prompts: 100%|███████| 71/71 [00:11<00:00,  5.99it/s, est. speed input: 251.64 toks/s, output: 2927.52 toks/s]


- batch size 71: 2925.52 tokens/sec (71 x 41.20)


Processed prompts: 100%|███████| 72/72 [00:12<00:00,  5.91it/s, est. speed input: 248.35 toks/s, output: 2909.87 toks/s]

- batch size 72: 2907.97 tokens/sec (72 x 40.39)



