# 1. Install prerequisites

In [None]:
from importlib.metadata import version

In [1]:
version('torch')

'2.4.0'

In [5]:
version('triton')

'3.0.0'

In [None]:
pip install --upgrade transformers

In [2]:
version('transformers')

'4.44.2'

## vLLM

https://docs.vllm.ai/en/latest/

In [None]:
pip install --upgrade vllm

In [3]:
version('vllm')

'0.6.1.post2'

## SGLang

https://sglang.readthedocs.io/en/latest/

In [None]:
pip install --upgrade "sglang[all]"

In [4]:
version('sglang')

'0.3.1.post2'

In [None]:
pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/

In [6]:
version('flashinfer')

'0.1.6+cu124torch2.4'

## ollama

https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install

In [11]:
!mkdir ollama && curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz && tar -C ./ollama -xzf ollama-linux-amd64.tgz && rm ollama-linux-amd64.tgz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   117  100   117    0     0    566      0 --:--:-- --:--:-- --:--:--   567
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1583M  100 1583M    0     0  17.1M      0  0:01:32  0:01:32 --:--:-- 16.9M


```bash
./ollama/bin/ollama serve &
```

https://github.com/ollama/ollama-python

https://github.com/ollama/ollama/tree/main/docs

In [None]:
pip install --upgrade ollama

In [9]:
version('ollama')

'0.3.3'

# 2. Performance testing functions

## Test models

https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct

https://huggingface.co/collections/neuralmagic/llama-31-quantization-66a3f907f48d07feabb8f300

In [1]:
# Short benchmark alias -> Hugging Face model id used by the vLLM and SGLang tests.
# The percentage on the Llama rows is the OpenLLM leaderboard score.
test_models = {
    # Llama 3.1 8B Instruct: reference model and quantized variants
    "llama-3.1": "meta-llama/Meta-Llama-3.1-8B-Instruct",  # 100.0 %
    "llama-3.1:w8a16": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",  # 99.8 %
    # warning: the "FP8-dynamic" version is MUCH slower on RTX 4090 !
    "llama-3.1:fp8": "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8",  # 99.5 %
    "llama-3.1:w8a8": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",  # 99.4 %
    "llama-3.1:w4a16": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",  # 97.1 %
    # Qwen 2.5 Instruct: 7B / 14B / 32B, GPTQ quantized variants
    "qwen-2.5": "Qwen/Qwen2.5-7B-Instruct",
    "qwen-2.5:w8a16": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8",
    "qwen-2.5:w4a16": "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4",
    "qwen-2.5-14b:w8a16": "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8",
    "qwen-2.5-14b:w4a16": "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4",
    "qwen-2.5-32b:w4a16": "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4",
    # Qwen 2.5 Coder 7B Instruct
    "qwen-2.5-coder": "Qwen/Qwen2.5-Coder-7B-Instruct",
    # NOTE(review): no "<org>/" prefix here - presumably a locally quantized
    # checkpoint directory rather than a Hub repository; confirm.
    "qwen-2.5-coder:fp8": "Qwen2.5-Coder-7B-Instruct-FP8-Dynamic",
}

Note: to create custom quantized versions of a model, follow the vLLM FP8 quantization process: https://docs.vllm.ai/en/latest/quantization/fp8.html#quantization-process

https://ollama.com/library/llama3.1

```bash
./ollama/bin/ollama pull llama3.1:8b-instruct-fp16
./ollama/bin/ollama pull llama3.1:8b-instruct-q8_0
./ollama/bin/ollama pull llama3.1:8b-instruct-q4_0

./ollama/bin/ollama pull qwen2.5:7b-instruct-fp16
./ollama/bin/ollama pull qwen2.5:7b-instruct-q8_0
./ollama/bin/ollama pull qwen2.5:7b-instruct-q4_K_M
./ollama/bin/ollama pull qwen2.5:14b-instruct-q4_K_M
./ollama/bin/ollama pull qwen2.5:32b-instruct-q4_K_M

./ollama/bin/ollama pull qwen2.5-coder:7b-instruct-q8_0
./ollama/bin/ollama pull qwen2.5-coder:7b-instruct-q4_K_M
```

In [2]:
# Short benchmark alias -> ollama model tag (the tags fetched with `ollama pull`).
ollama_test_models = {
    # Llama 3.1 8B Instruct
    "llama-3.1": "llama3.1:8b-instruct-fp16",
    "llama-3.1:int8": "llama3.1:8b-instruct-q8_0",
    "llama-3.1:int4": "llama3.1:8b-instruct-q4_0",
    # Qwen 2.5 Instruct: 7B / 14B / 32B
    "qwen-2.5": "qwen2.5:7b-instruct-fp16",
    "qwen-2.5:int8": "qwen2.5:7b-instruct-q8_0",
    "qwen-2.5:int4": "qwen2.5:7b-instruct-q4_K_M",
    "qwen-2.5-14b:int4": "qwen2.5:14b-instruct-q4_K_M",
    "qwen-2.5-32b:int4": "qwen2.5:32b-instruct-q4_K_M",
    # Qwen 2.5 Coder 7B Instruct
    "qwen-2.5-coder:int8": "qwen2.5-coder:7b-instruct-q8_0",
    "qwen-2.5-coder:int4": "qwen2.5-coder:7b-instruct-q4_K_M",
}

## Test prompts

In [3]:
# Four independent two-turn conversations: the same French system prompt
# followed by one user question each (one conversation per bank).
test_messages = [
    [
        {"role": "system", "content": "Tu es un assistant utile et professionnel qui répond toujours en français."},
        {"role": "user", "content": question},
    ]
    for question in (
        "Quels sont les avantages du Crédit Mutuel ?",
        "Quels sont les avantages du Crédit Agricole ?",
        "Quels sont les avantages de la Société Générale ?",
        "Quels sont les avantages de la BNP ?",
    )
]

In [4]:
from transformers import AutoTokenizer

def format_prompt(messages, model):
    """Render chat messages into raw prompt text using the chat template of `model`.

    Args:
        messages: chat messages as role/content dicts. When a list of
            conversations is passed, one prompt string is produced per conversation.
        model: model identifier whose tokenizer (and chat template) is loaded
            with AutoTokenizer.from_pretrained.

    Returns:
        The untokenized prompt text, ending with the generation prompt that
        opens the assistant turn.
    """
    return AutoTokenizer.from_pretrained(model).apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

In [5]:
format_prompt(test_messages, test_models["qwen-2.5"])

['<|im_start|>system\nTu es un assistant utile et professionnel qui répond toujours en français.<|im_end|>\n<|im_start|>user\nQuels sont les avantages du Crédit Mutuel ?<|im_end|>\n<|im_start|>assistant\n',
 '<|im_start|>system\nTu es un assistant utile et professionnel qui répond toujours en français.<|im_end|>\n<|im_start|>user\nQuels sont les avantages du Crédit Agricole ?<|im_end|>\n<|im_start|>assistant\n',
 '<|im_start|>system\nTu es un assistant utile et professionnel qui répond toujours en français.<|im_end|>\n<|im_start|>user\nQuels sont les avantages de la Société Générale ?<|im_end|>\n<|im_start|>assistant\n',
 '<|im_start|>system\nTu es un assistant utile et professionnel qui répond toujours en français.<|im_end|>\n<|im_start|>user\nQuels sont les avantages de la BNP ?<|im_end|>\n<|im_start|>assistant\n']

### vLLM

In [6]:
# Authenticate vLLM with the Hugging Face Hub.
# The access token is read from a local file, which keeps its value out of the notebook source.
import os

# strip() drops the trailing newline / surrounding whitespace stored in the token file.
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

# Expose the token to this process through the HF_TOKEN environment variable
# (presumably read by the Hugging Face client libraries when downloading gated models - confirm).
os.environ["HF_TOKEN"]=myhftoken

In [7]:
import time
from vllm import LLM, SamplingParams

def vllm_load(model, gpu_memory_utilization=0.99, max_model_len=8192):
    """Create a vLLM engine for `model` and tag it with the model identifier.

    Args:
        model: model identifier passed to vllm.LLM.
        gpu_memory_utilization: value forwarded to vllm.LLM. The default (0.99)
            is the setting this notebook's benchmarks were run with.
        max_model_len: value forwarded to vllm.LLM. The default (8192) is the
            setting this notebook's benchmarks were run with.

    Returns:
        The vllm.LLM instance, with an extra `_model` attribute holding `model`.
    """
    llm = LLM(model, gpu_memory_utilization=gpu_memory_utilization, max_model_len=max_model_len)
    # Remember the model id on the engine: vllm_generate() reads it back to
    # load the matching tokenizer / chat template.
    llm._model = model
    return llm

def vllm_generate(messages, llm):
    """Measure vLLM generation throughput for batch sizes 1..len(messages).

    Args:
        messages: list of chat conversations (each a list of role/content dicts);
            must contain at least one conversation.
        llm: engine returned by vllm_load(); its `_model` attribute selects the
            tokenizer used to render the prompts.

    Prints the text of one warmup completion, then one line per batch size with
    the aggregate generated tokens/sec and the per-request average.
    """
    print("vLLM performance test:")

    prompts = format_prompt(messages, llm._model)
    sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

    # warmup: one untimed request before the measurements start
    outputs = llm.generate(prompts[0], sampling_params)
    print(f"Generated text: {outputs[0].outputs[0].text!r}")

    for batch_size in range(1, len(messages) + 1):
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        outputs = llm.generate(prompts[0:batch_size], sampling_params)
        elapsed = time.perf_counter() - start_time

        # Only generated (completion) tokens count towards the throughput.
        tokenscount = sum(len(output.outputs[0].token_ids) for output in outputs)

        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size {batch_size}: {tokens_per_sec:.2f} tokens/sec ({batch_size} x {tokens_per_sec/batch_size:.2f})")

In [None]:
llm = vllm_load(test_models["llama-3.1"])

In [None]:
vllm_generate(test_messages*8, llm)

### SGLang

In [6]:
import json, time
import sglang

def sglang_load(model):
    """Start an SGLang runtime for `model` and tag it with the model identifier.

    Args:
        model: model identifier, passed to sglang.Runtime as `model_path`.

    Returns:
        The sglang.Runtime instance, with an extra `_model` attribute holding
        `model` (sglang_generate() reads it to load the matching tokenizer).
    """
    engine = sglang.Runtime(model_path=model)
    setattr(engine, "_model", model)
    return engine

def sglang_generate(messages, runtime):
    """Measure SGLang generation throughput for batch sizes 1..len(messages).

    Args:
        messages: list of chat conversations (each a list of role/content dicts);
            must contain at least one conversation.
        runtime: runtime returned by sglang_load(); its `_model` attribute
            selects the tokenizer used to render the prompts.

    Prints the text of one warmup completion, then one line per batch size with
    the aggregate generated tokens/sec and the per-request average.
    """
    print("SGLang performance test:")

    prompts = format_prompt(messages, runtime._model)
    sampling_params = { "temperature":0.7, "top_p":0.8, "repetition_penalty":1.05, "max_new_tokens":512 }

    # warmup: one untimed request before the measurements start
    # (runtime.generate returns a JSON document, decoded here)
    output = json.loads(runtime.generate(prompt=prompts[0], sampling_params=sampling_params))
    print(f"Generated text: {output['text']!r}")

    for batch_size in range(1, len(messages) + 1):
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        outputs = json.loads(runtime.generate(prompt=prompts[0:batch_size], sampling_params=sampling_params))
        elapsed = time.perf_counter() - start_time

        # Only generated (completion) tokens count towards the throughput.
        tokenscount = sum(output["meta_info"]["completion_tokens"] for output in outputs)

        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size {batch_size}: {tokens_per_sec:.2f} tokens/sec ({batch_size} x {tokens_per_sec/batch_size:.2f})")

In [None]:
runtime = sglang_load(test_models["llama-3.1"])

In [None]:
sglang_generate(test_messages*8, runtime)

### Ollama

https://github.com/ollama/ollama-python

In [45]:
import ollama

ollama.list()

{'models': [{'name': 'llama3.1:8b-instruct-q4_0',
   'model': 'llama3.1:8b-instruct-q4_0',
   'modified_at': '2024-09-22T11:43:03.484852591+02:00',
   'size': 4661230766,
   'digest': '42182419e9508c30c4b1fe55015f06b65f4ca4b9e28a744be55008d21998a093',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q4_0'}},
  {'name': 'llama3.1:8b-instruct-q8_0',
   'model': 'llama3.1:8b-instruct-q8_0',
   'modified_at': '2024-09-22T11:33:10.006555339+02:00',
   'size': 8540789934,
   'digest': 'b158ded76fa05be6bce8a682099ce5df8c5571340a04cf63a2923464679db576',
   'details': {'parent_model': '',
    'format': 'gguf',
    'family': 'llama',
    'families': ['llama'],
    'parameter_size': '8.0B',
    'quantization_level': 'Q8_0'}},
  {'name': 'llama3.1:8b-instruct-fp16',
   'model': 'llama3.1:8b-instruct-fp16',
   'modified_at': '2024-09-21T22:54:30.926572546+02:00',
   'size': 16068910253

In [27]:
print(ollama.show("llama3.1:8b-instruct-fp16")['template'])

{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
{{- if .System }}

{{ .System }}
{{- end }}
{{- if .Tools }}

Cutting Knowledge Date: December 2023

When you receive a tool call response, use the output to format an answer to the orginal user question.

You are a helpful assistant with tool calling capabilities.
{{- end }}<|eot_id|>
{{- end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
{{- if and $.Tools $last }}

Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.

Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.

{{ range $.Tools }}
{{- . }}
{{ end }}
Question: {{ .Content }}<|eot_id|>
{{- else }}

{{ .Content }}<|eot_id|>
{{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|>

{{ e

https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion

In [16]:
import ollama
import time

# ollama keeps a model in memory for 5 minutes by default; any query (re)loads it
def ollama_load(model):
    """Make ollama load `model` by sending it a minimal one-token generation request.

    Args:
        model: ollama model tag.

    Returns:
        `model`, unchanged, so the result can be passed on to ollama_generate().
    """
    ollama.generate(
        model=model,
        prompt="load",
        raw=True,
        options={ "num_predict":1 },
        stream=False,
    )
    return model

# ollama API only supports batch size 1
def ollama_generate(messages, model, hftokenizer):
    """Measure ollama generation throughput, one request at a time.

    Args:
        messages: list of chat conversations (each a list of role/content dicts);
            must contain at least one conversation.
        model: ollama model tag to query.
        hftokenizer: model identifier passed to format_prompt() to select the
            tokenizer / chat template that renders the prompts. The prompts are
            then sent to ollama with raw=True.

    Prints the text of one warmup completion, then one tokens/sec line per
    conversation (generated tokens divided by the wall time of the request).
    """
    print("ollama performance test:")

    prompts = format_prompt(messages, hftokenizer)
    sampling_params = { "temperature":0.7, "top_p":0.8, "repeat_penalty":1.05, "num_predict":512 }

    # warmup: one untimed request before the measurements start
    output = ollama.generate(model=model, prompt=prompts[0], raw=True, options=sampling_params, stream=False)
    print(f"Generated text: {output['response']!r}")

    # One rendered prompt per conversation, each sent as its own request.
    for prompt in prompts:
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        output = ollama.generate(model=model, prompt=prompt, raw=True, options=sampling_params, stream=False)
        elapsed = time.perf_counter() - start_time

        # 'eval_count' is read from the ollama response as the generated-token count.
        tokenscount = output['eval_count']
        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size 1: {tokens_per_sec:.2f} tokens/sec")

In [43]:
model = ollama_load(ollama_test_models["llama-3.1"])

In [None]:
ollama_generate(test_messages, model, test_models["llama-3.1"])

# 3. Performance tests on RTX 4090 - llama-3.1

## vLLM

### FP16

In [7]:
llm = vllm_load(test_models["llama-3.1"])

INFO 09-22 12:18:56 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, en

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-22 12:19:07 model_runner.py:1008] Loading model weights took 14.9888 GB
INFO 09-22 12:19:08 gpu_executor.py:122] # GPU blocks: 3610, # CPU blocks: 2048
INFO 09-22 12:19:08 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 12:19:08 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 12:19:18 model_runner.py:1430] Graph capturing finished in 10 secs.


In [8]:
vllm_generate(test_messages*12, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.62s/it, est. speed input: 6.55 toks/s, output: 53.23 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt plus élevés sur les comptes d'épargne que de nombreux autres établissements bancaires.\n2. **Taux d'emprunt compétitifs** : Les prêts personnels, les prêts immobiliers et les prêts pour la mobilité sont proposés avec des taux d'intérêt attractifs.\n3. **Services personnalisés** : Le Crédit Mutuel offre des services personnalisés et adaptés aux besoins de ses membres et clients, grâce à une approche relationnelle et à une connaissance approfondie de leurs situations financières.\n4. **Sécurité et confidentialité** : Le Crédit Mutuel s'engage à protéger les données personnelles et financières de ses membres et clients, conformément aux règles de protection des données.\n5. **Participation aux décisions** : En tant que membre 

Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.29s/it, est. speed input: 6.78 toks/s, output: 55.11 toks/s]


- batch size 1: 55.10 tokens/sec (1 x 55.10)


Processed prompts: 100%|███████████| 2/2 [00:09<00:00,  4.79s/it, est. speed input: 13.26 toks/s, output: 106.91 toks/s]


- batch size 2: 106.89 tokens/sec (2 x 53.44)


Processed prompts: 100%|███████████| 3/3 [00:09<00:00,  3.20s/it, est. speed input: 19.89 toks/s, output: 159.98 toks/s]


- batch size 3: 159.94 tokens/sec (3 x 53.31)


Processed prompts: 100%|███████████| 4/4 [00:09<00:00,  2.41s/it, est. speed input: 26.12 toks/s, output: 209.04 toks/s]


- batch size 4: 209.00 tokens/sec (4 x 52.25)


Processed prompts: 100%|███████████| 5/5 [00:09<00:00,  1.95s/it, est. speed input: 32.36 toks/s, output: 254.88 toks/s]


- batch size 5: 254.81 tokens/sec (5 x 50.96)


Processed prompts: 100%|███████████| 6/6 [00:09<00:00,  1.63s/it, est. speed input: 38.83 toks/s, output: 314.72 toks/s]


- batch size 6: 314.63 tokens/sec (6 x 52.44)


Processed prompts: 100%|███████████| 7/7 [00:09<00:00,  1.40s/it, est. speed input: 45.15 toks/s, output: 365.32 toks/s]


- batch size 7: 365.23 tokens/sec (7 x 52.18)


Processed prompts: 100%|███████████| 8/8 [00:09<00:00,  1.23s/it, est. speed input: 51.20 toks/s, output: 416.13 toks/s]


- batch size 8: 416.02 tokens/sec (8 x 52.00)


Processed prompts: 100%|███████████| 9/9 [00:09<00:00,  1.10s/it, est. speed input: 57.18 toks/s, output: 453.22 toks/s]


- batch size 9: 453.06 tokens/sec (9 x 50.34)


Processed prompts: 100%|█████████| 10/10 [00:09<00:00,  1.01it/s, est. speed input: 63.46 toks/s, output: 514.91 toks/s]


- batch size 10: 514.74 tokens/sec (10 x 51.47)


Processed prompts: 100%|█████████| 11/11 [00:09<00:00,  1.10it/s, est. speed input: 69.57 toks/s, output: 561.93 toks/s]


- batch size 11: 561.78 tokens/sec (11 x 51.07)


Processed prompts: 100%|█████████| 12/12 [00:10<00:00,  1.19it/s, est. speed input: 75.24 toks/s, output: 610.24 toks/s]


- batch size 12: 610.03 tokens/sec (12 x 50.84)


Processed prompts: 100%|█████████| 13/13 [00:10<00:00,  1.28it/s, est. speed input: 80.76 toks/s, output: 656.31 toks/s]


- batch size 13: 656.12 tokens/sec (13 x 50.47)


Processed prompts: 100%|█████████| 14/14 [00:10<00:00,  1.36it/s, est. speed input: 86.09 toks/s, output: 690.78 toks/s]


- batch size 14: 690.56 tokens/sec (14 x 49.33)


Processed prompts: 100%|█████████| 15/15 [00:10<00:00,  1.47it/s, est. speed input: 92.65 toks/s, output: 747.39 toks/s]


- batch size 15: 747.14 tokens/sec (15 x 49.81)


Processed prompts: 100%|█████████| 16/16 [00:10<00:00,  1.56it/s, est. speed input: 98.35 toks/s, output: 792.05 toks/s]


- batch size 16: 791.76 tokens/sec (16 x 49.49)


Processed prompts: 100%|█████████| 17/17 [00:10<00:00,  1.55it/s, est. speed input: 97.86 toks/s, output: 791.16 toks/s]


- batch size 17: 790.86 tokens/sec (17 x 46.52)


Processed prompts: 100%|████████| 18/18 [00:11<00:00,  1.63it/s, est. speed input: 102.61 toks/s, output: 833.17 toks/s]


- batch size 18: 832.88 tokens/sec (18 x 46.27)


Processed prompts: 100%|████████| 19/19 [00:11<00:00,  1.73it/s, est. speed input: 108.87 toks/s, output: 871.30 toks/s]


- batch size 19: 871.01 tokens/sec (19 x 45.84)


Processed prompts: 100%|████████| 20/20 [00:11<00:00,  1.79it/s, est. speed input: 112.79 toks/s, output: 907.66 toks/s]


- batch size 20: 907.32 tokens/sec (20 x 45.37)


Processed prompts: 100%|████████| 21/21 [00:11<00:00,  1.88it/s, est. speed input: 118.24 toks/s, output: 957.62 toks/s]


- batch size 21: 957.29 tokens/sec (21 x 45.59)


Processed prompts: 100%|███████| 22/22 [00:11<00:00,  1.96it/s, est. speed input: 123.82 toks/s, output: 1005.52 toks/s]


- batch size 22: 1005.15 tokens/sec (22 x 45.69)


Processed prompts: 100%|███████| 23/23 [00:11<00:00,  1.96it/s, est. speed input: 123.65 toks/s, output: 1003.52 toks/s]


- batch size 23: 1003.09 tokens/sec (23 x 43.61)


Processed prompts: 100%|███████| 24/24 [00:11<00:00,  2.03it/s, est. speed input: 127.89 toks/s, output: 1039.36 toks/s]


- batch size 24: 1038.99 tokens/sec (24 x 43.29)


Processed prompts: 100%|███████| 25/25 [00:11<00:00,  2.09it/s, est. speed input: 131.69 toks/s, output: 1061.09 toks/s]


- batch size 25: 1060.58 tokens/sec (25 x 42.42)


Processed prompts: 100%|███████| 26/26 [00:11<00:00,  2.18it/s, est. speed input: 137.70 toks/s, output: 1109.60 toks/s]


- batch size 26: 1109.19 tokens/sec (26 x 42.66)


Processed prompts: 100%|███████| 27/27 [00:11<00:00,  2.26it/s, est. speed input: 142.36 toks/s, output: 1153.66 toks/s]


- batch size 27: 1153.22 tokens/sec (27 x 42.71)


Processed prompts: 100%|███████| 28/28 [00:12<00:00,  2.33it/s, est. speed input: 146.82 toks/s, output: 1187.31 toks/s]


- batch size 28: 1186.84 tokens/sec (28 x 42.39)


Processed prompts: 100%|███████| 29/29 [00:12<00:00,  2.40it/s, est. speed input: 151.49 toks/s, output: 1223.90 toks/s]


- batch size 29: 1223.40 tokens/sec (29 x 42.19)


Processed prompts: 100%|███████| 30/30 [00:12<00:00,  2.44it/s, est. speed input: 154.00 toks/s, output: 1243.88 toks/s]


- batch size 30: 1243.38 tokens/sec (30 x 41.45)


Processed prompts: 100%|███████| 31/31 [00:12<00:00,  2.53it/s, est. speed input: 159.80 toks/s, output: 1282.70 toks/s]


- batch size 31: 1282.13 tokens/sec (31 x 41.36)


Processed prompts: 100%|███████| 32/32 [00:12<00:00,  2.59it/s, est. speed input: 163.12 toks/s, output: 1321.86 toks/s]


- batch size 32: 1321.37 tokens/sec (32 x 41.29)


Processed prompts: 100%|███████| 33/33 [00:12<00:00,  2.61it/s, est. speed input: 164.50 toks/s, output: 1331.06 toks/s]


- batch size 33: 1330.54 tokens/sec (33 x 40.32)


Processed prompts: 100%|███████| 34/34 [00:12<00:00,  2.68it/s, est. speed input: 169.09 toks/s, output: 1366.55 toks/s]


- batch size 34: 1365.97 tokens/sec (34 x 40.18)


Processed prompts: 100%|███████| 35/35 [00:12<00:00,  2.75it/s, est. speed input: 173.34 toks/s, output: 1405.81 toks/s]


- batch size 35: 1405.09 tokens/sec (35 x 40.15)


Processed prompts: 100%|███████| 36/36 [00:12<00:00,  2.81it/s, est. speed input: 177.34 toks/s, output: 1431.51 toks/s]


- batch size 36: 1430.91 tokens/sec (36 x 39.75)


Processed prompts: 100%|███████| 37/37 [00:12<00:00,  2.85it/s, est. speed input: 179.79 toks/s, output: 1452.48 toks/s]


- batch size 37: 1451.85 tokens/sec (37 x 39.24)


Processed prompts: 100%|███████| 38/38 [00:13<00:00,  2.91it/s, est. speed input: 183.41 toks/s, output: 1478.96 toks/s]


- batch size 38: 1478.29 tokens/sec (38 x 38.90)


Processed prompts: 100%|███████| 39/39 [00:13<00:00,  2.99it/s, est. speed input: 188.84 toks/s, output: 1533.46 toks/s]


- batch size 39: 1532.73 tokens/sec (39 x 39.30)


Processed prompts: 100%|███████| 40/40 [00:13<00:00,  3.03it/s, est. speed input: 190.64 toks/s, output: 1518.94 toks/s]


- batch size 40: 1518.18 tokens/sec (40 x 37.95)


Processed prompts: 100%|███████| 41/41 [00:13<00:00,  3.11it/s, est. speed input: 195.87 toks/s, output: 1575.50 toks/s]


- batch size 41: 1574.79 tokens/sec (41 x 38.41)


Processed prompts: 100%|███████| 42/42 [00:13<00:00,  3.16it/s, est. speed input: 199.45 toks/s, output: 1613.63 toks/s]


- batch size 42: 1612.86 tokens/sec (42 x 38.40)


Processed prompts: 100%|███████| 43/43 [00:13<00:00,  3.18it/s, est. speed input: 200.46 toks/s, output: 1625.72 toks/s]


- batch size 43: 1624.96 tokens/sec (43 x 37.79)


Processed prompts: 100%|███████| 44/44 [00:13<00:00,  3.27it/s, est. speed input: 205.80 toks/s, output: 1659.72 toks/s]


- batch size 44: 1658.89 tokens/sec (44 x 37.70)


Processed prompts: 100%|███████| 45/45 [00:21<00:00,  2.09it/s, est. speed input: 131.66 toks/s, output: 1064.46 toks/s]


- batch size 45: 1064.15 tokens/sec (45 x 23.65)


Processed prompts: 100%|███████| 46/46 [00:21<00:00,  2.10it/s, est. speed input: 132.23 toks/s, output: 1071.14 toks/s]


- batch size 46: 1070.79 tokens/sec (46 x 23.28)


Processed prompts: 100%|███████| 47/47 [00:22<00:00,  2.13it/s, est. speed input: 134.17 toks/s, output: 1087.55 toks/s]


- batch size 47: 1087.21 tokens/sec (47 x 23.13)


Processed prompts: 100%|███████| 48/48 [00:22<00:00,  2.13it/s, est. speed input: 134.46 toks/s, output: 1085.58 toks/s]

- batch size 48: 1085.12 tokens/sec (48 x 22.61)





### w8a16

In [8]:
llm = vllm_load(test_models["llama-3.1:w8a16"])

INFO 09-22 12:47:04 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quant

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 12:47:08 model_runner.py:1008] Loading model weights took 8.4927 GB
INFO 09-22 12:47:09 gpu_executor.py:122] # GPU blocks: 6716, # CPU blocks: 2048
INFO 09-22 12:47:09 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 12:47:09 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 12:47:18 model_runner.py:1430] Graph capturing finished in 9 secs.


In [10]:
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:06<00:00,  6.05s/it, est. speed input: 6.94 toks/s, output: 84.63 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre de nombreux avantages à ses adhérents et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés** : Le Crédit Mutuel propose des taux d'intérêt plus élevés que les banques traditionnelles, notamment pour les comptes courants et les prêts.\n2. **Services personnalisés** : En tant que banque coopérative, le Crédit Mutuel met l'accent sur la proximité et la personnalisation de ses services. Les clients ont accès à des conseillers financiers compétents qui leur proposent des solutions adaptées à leurs besoins.\n3. **Transparence et sécurité** : Le Crédit Mutuel est connu pour sa transparence dans les tarifs et les conditions de prêt. Les clients sont également protégés par des garanties de sécurité renforcées.\n4. **Épargne et placement** : Le Crédit Mutuel propose une gamme de produits d'épargne et de placement attractifs, notamment des comptes d'épargne, des livrets et des fonds 

Processed prompts: 100%|█████████████| 1/1 [00:05<00:00,  5.82s/it, est. speed input: 7.21 toks/s, output: 87.92 toks/s]


- batch size 1: 87.89 tokens/sec (1 x 87.89)


Processed prompts: 100%|███████████| 2/2 [00:05<00:00,  2.98s/it, est. speed input: 14.25 toks/s, output: 171.63 toks/s]


- batch size 2: 171.58 tokens/sec (2 x 85.79)


Processed prompts: 100%|███████████| 3/3 [00:05<00:00,  1.99s/it, est. speed input: 21.48 toks/s, output: 257.74 toks/s]


- batch size 3: 257.67 tokens/sec (3 x 85.89)


Processed prompts: 100%|███████████| 4/4 [00:06<00:00,  1.50s/it, est. speed input: 27.99 toks/s, output: 335.94 toks/s]


- batch size 4: 335.81 tokens/sec (4 x 83.95)


Processed prompts: 100%|███████████| 5/5 [00:06<00:00,  1.21s/it, est. speed input: 34.65 toks/s, output: 413.27 toks/s]


- batch size 5: 413.13 tokens/sec (5 x 82.63)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.02s/it, est. speed input: 41.37 toks/s, output: 502.31 toks/s]


- batch size 6: 502.13 tokens/sec (6 x 83.69)


Processed prompts: 100%|███████████| 7/7 [00:06<00:00,  1.14it/s, est. speed input: 48.05 toks/s, output: 563.09 toks/s]


- batch size 7: 562.86 tokens/sec (7 x 80.41)


Processed prompts: 100%|███████████| 8/8 [00:06<00:00,  1.29it/s, est. speed input: 54.08 toks/s, output: 658.42 toks/s]


- batch size 8: 658.07 tokens/sec (8 x 82.26)


Processed prompts: 100%|███████████| 9/9 [00:06<00:00,  1.44it/s, est. speed input: 60.28 toks/s, output: 734.72 toks/s]


- batch size 9: 734.43 tokens/sec (9 x 81.60)


Processed prompts: 100%|█████████| 10/10 [00:06<00:00,  1.59it/s, est. speed input: 66.78 toks/s, output: 806.88 toks/s]


- batch size 10: 806.50 tokens/sec (10 x 80.65)


Processed prompts: 100%|█████████| 11/11 [00:06<00:00,  1.69it/s, est. speed input: 71.30 toks/s, output: 857.11 toks/s]


- batch size 11: 856.78 tokens/sec (11 x 77.89)


Processed prompts: 100%|█████████| 12/12 [00:06<00:00,  1.87it/s, est. speed input: 78.49 toks/s, output: 956.26 toks/s]


- batch size 12: 955.85 tokens/sec (12 x 79.65)


Processed prompts: 100%|████████| 13/13 [00:06<00:00,  2.01it/s, est. speed input: 84.51 toks/s, output: 1017.83 toks/s]


- batch size 13: 1017.28 tokens/sec (13 x 78.25)


Processed prompts: 100%|████████| 14/14 [00:06<00:00,  2.15it/s, est. speed input: 90.58 toks/s, output: 1087.73 toks/s]


- batch size 14: 1087.24 tokens/sec (14 x 77.66)


Processed prompts: 100%|████████| 15/15 [00:06<00:00,  2.29it/s, est. speed input: 96.63 toks/s, output: 1162.61 toks/s]


- batch size 15: 1162.08 tokens/sec (15 x 77.47)


Processed prompts: 100%|███████| 16/16 [00:06<00:00,  2.46it/s, est. speed input: 103.28 toks/s, output: 1235.94 toks/s]


- batch size 16: 1235.28 tokens/sec (16 x 77.20)


Processed prompts: 100%|████████| 17/17 [00:07<00:00,  2.37it/s, est. speed input: 99.74 toks/s, output: 1199.24 toks/s]


- batch size 17: 1198.74 tokens/sec (17 x 70.51)


Processed prompts: 100%|███████| 18/18 [00:07<00:00,  2.48it/s, est. speed input: 104.09 toks/s, output: 1259.25 toks/s]


- batch size 18: 1258.70 tokens/sec (18 x 69.93)


Processed prompts: 100%|███████| 19/19 [00:07<00:00,  2.61it/s, est. speed input: 109.93 toks/s, output: 1317.82 toks/s]


- batch size 19: 1317.21 tokens/sec (19 x 69.33)


Processed prompts: 100%|███████| 20/20 [00:07<00:00,  2.67it/s, est. speed input: 112.21 toks/s, output: 1365.50 toks/s]


- batch size 20: 1364.70 tokens/sec (20 x 68.24)


Processed prompts: 100%|███████| 21/21 [00:07<00:00,  2.86it/s, est. speed input: 119.95 toks/s, output: 1436.85 toks/s]


- batch size 21: 1436.10 tokens/sec (21 x 68.39)


Processed prompts: 100%|███████| 22/22 [00:07<00:00,  2.98it/s, est. speed input: 125.10 toks/s, output: 1498.98 toks/s]


- batch size 22: 1498.23 tokens/sec (22 x 68.10)


Processed prompts: 100%|███████| 23/23 [00:07<00:00,  3.09it/s, est. speed input: 130.16 toks/s, output: 1557.75 toks/s]


- batch size 23: 1557.03 tokens/sec (23 x 67.70)


Processed prompts: 100%|███████| 24/24 [00:07<00:00,  3.21it/s, est. speed input: 134.87 toks/s, output: 1614.59 toks/s]


- batch size 24: 1613.82 tokens/sec (24 x 67.24)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.34it/s, est. speed input: 140.42 toks/s, output: 1693.10 toks/s]


- batch size 25: 1692.18 tokens/sec (25 x 67.69)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.39it/s, est. speed input: 142.59 toks/s, output: 1714.22 toks/s]


- batch size 26: 1713.37 tokens/sec (26 x 65.90)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.57it/s, est. speed input: 150.38 toks/s, output: 1784.17 toks/s]


- batch size 27: 1783.18 tokens/sec (27 x 66.04)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.64it/s, est. speed input: 152.80 toks/s, output: 1857.85 toks/s]


- batch size 28: 1856.88 tokens/sec (28 x 66.32)


Processed prompts: 100%|███████| 29/29 [00:07<00:00,  3.73it/s, est. speed input: 156.48 toks/s, output: 1885.38 toks/s]


- batch size 29: 1884.27 tokens/sec (29 x 64.97)


Processed prompts: 100%|███████| 30/30 [00:07<00:00,  3.85it/s, est. speed input: 161.69 toks/s, output: 1931.98 toks/s]


- batch size 30: 1930.94 tokens/sec (30 x 64.36)


Processed prompts: 100%|███████| 31/31 [00:08<00:00,  3.87it/s, est. speed input: 162.87 toks/s, output: 1967.71 toks/s]


- batch size 31: 1966.60 tokens/sec (31 x 63.44)


Processed prompts: 100%|███████| 32/32 [00:07<00:00,  4.06it/s, est. speed input: 170.60 toks/s, output: 2071.56 toks/s]


- batch size 32: 2070.27 tokens/sec (32 x 64.70)


Processed prompts: 100%|███████| 33/33 [00:08<00:00,  3.99it/s, est. speed input: 167.64 toks/s, output: 2013.23 toks/s]


- batch size 33: 2012.02 tokens/sec (33 x 60.97)


Processed prompts: 100%|███████| 34/34 [00:08<00:00,  4.06it/s, est. speed input: 170.75 toks/s, output: 2057.06 toks/s]


- batch size 34: 2055.94 tokens/sec (34 x 60.47)


Processed prompts: 100%|███████| 35/35 [00:08<00:00,  4.08it/s, est. speed input: 171.43 toks/s, output: 2072.74 toks/s]


- batch size 35: 2071.45 tokens/sec (35 x 59.18)


Processed prompts: 100%|███████| 36/36 [00:08<00:00,  4.22it/s, est. speed input: 177.45 toks/s, output: 2139.55 toks/s]


- batch size 36: 2138.32 tokens/sec (36 x 59.40)


Processed prompts: 100%|███████| 37/37 [00:08<00:00,  4.32it/s, est. speed input: 181.51 toks/s, output: 2183.74 toks/s]


- batch size 37: 2182.29 tokens/sec (37 x 58.98)


Processed prompts: 100%|███████| 38/38 [00:08<00:00,  4.40it/s, est. speed input: 184.81 toks/s, output: 2239.70 toks/s]


- batch size 38: 2238.40 tokens/sec (38 x 58.91)


Processed prompts: 100%|███████| 39/39 [00:08<00:00,  4.45it/s, est. speed input: 186.98 toks/s, output: 2260.57 toks/s]


- batch size 39: 2259.25 tokens/sec (39 x 57.93)


Processed prompts: 100%|███████| 40/40 [00:08<00:00,  4.62it/s, est. speed input: 193.99 toks/s, output: 2339.56 toks/s]


- batch size 40: 2338.13 tokens/sec (40 x 58.45)


Processed prompts: 100%|███████| 41/41 [00:08<00:00,  4.67it/s, est. speed input: 196.33 toks/s, output: 2359.76 toks/s]


- batch size 41: 2358.16 tokens/sec (41 x 57.52)


Processed prompts: 100%|███████| 42/42 [00:08<00:00,  4.72it/s, est. speed input: 198.40 toks/s, output: 2396.78 toks/s]


- batch size 42: 2395.28 tokens/sec (42 x 57.03)


Processed prompts: 100%|███████| 43/43 [00:09<00:00,  4.71it/s, est. speed input: 198.19 toks/s, output: 2383.91 toks/s]


- batch size 43: 2382.54 tokens/sec (43 x 55.41)


Processed prompts: 100%|███████| 44/44 [00:09<00:00,  4.88it/s, est. speed input: 204.93 toks/s, output: 2442.57 toks/s]


- batch size 44: 2440.99 tokens/sec (44 x 55.48)


Processed prompts: 100%|███████| 45/45 [00:09<00:00,  4.94it/s, est. speed input: 207.50 toks/s, output: 2503.28 toks/s]


- batch size 45: 2501.75 tokens/sec (45 x 55.59)


Processed prompts: 100%|███████| 46/46 [00:09<00:00,  4.95it/s, est. speed input: 207.94 toks/s, output: 2512.30 toks/s]


- batch size 46: 2510.82 tokens/sec (46 x 54.58)


Processed prompts: 100%|███████| 47/47 [00:09<00:00,  5.10it/s, est. speed input: 214.43 toks/s, output: 2596.01 toks/s]


- batch size 47: 2594.32 tokens/sec (47 x 55.20)


Processed prompts: 100%|███████| 48/48 [00:09<00:00,  5.20it/s, est. speed input: 218.22 toks/s, output: 2615.13 toks/s]


- batch size 48: 2613.30 tokens/sec (48 x 54.44)


Processed prompts: 100%|███████| 49/49 [00:09<00:00,  5.39it/s, est. speed input: 226.31 toks/s, output: 2720.64 toks/s]


- batch size 49: 2718.71 tokens/sec (49 x 55.48)


Processed prompts: 100%|███████| 50/50 [00:09<00:00,  5.55it/s, est. speed input: 233.29 toks/s, output: 2790.36 toks/s]


- batch size 50: 2788.50 tokens/sec (50 x 55.77)


Processed prompts: 100%|███████| 51/51 [00:09<00:00,  5.17it/s, est. speed input: 217.31 toks/s, output: 2616.69 toks/s]


- batch size 51: 2615.17 tokens/sec (51 x 51.28)


Processed prompts: 100%|███████| 52/52 [00:09<00:00,  5.32it/s, est. speed input: 223.62 toks/s, output: 2702.69 toks/s]


- batch size 52: 2700.85 tokens/sec (52 x 51.94)


Processed prompts: 100%|███████| 53/53 [00:09<00:00,  5.41it/s, est. speed input: 227.09 toks/s, output: 2724.34 toks/s]


- batch size 53: 2722.50 tokens/sec (53 x 51.37)


Processed prompts: 100%|███████| 54/54 [00:09<00:00,  5.40it/s, est. speed input: 227.07 toks/s, output: 2739.90 toks/s]


- batch size 54: 2738.10 tokens/sec (54 x 50.71)


Processed prompts: 100%|███████| 55/55 [00:09<00:00,  5.52it/s, est. speed input: 231.88 toks/s, output: 2794.47 toks/s]


- batch size 55: 2792.75 tokens/sec (55 x 50.78)


Processed prompts: 100%|███████| 56/56 [00:10<00:00,  5.50it/s, est. speed input: 230.98 toks/s, output: 2791.16 toks/s]


- batch size 56: 2789.30 tokens/sec (56 x 49.81)


Processed prompts: 100%|███████| 57/57 [00:10<00:00,  5.62it/s, est. speed input: 236.13 toks/s, output: 2836.53 toks/s]


- batch size 57: 2834.65 tokens/sec (57 x 49.73)


Processed prompts: 100%|███████| 58/58 [00:10<00:00,  5.71it/s, est. speed input: 239.74 toks/s, output: 2868.93 toks/s]


- batch size 58: 2866.93 tokens/sec (58 x 49.43)


Processed prompts: 100%|███████| 59/59 [00:10<00:00,  5.69it/s, est. speed input: 239.28 toks/s, output: 2891.93 toks/s]


- batch size 59: 2890.12 tokens/sec (59 x 48.99)


Processed prompts: 100%|███████| 60/60 [00:10<00:00,  5.85it/s, est. speed input: 245.72 toks/s, output: 2945.32 toks/s]


- batch size 60: 2943.27 tokens/sec (60 x 49.05)


Processed prompts: 100%|███████| 61/61 [00:10<00:00,  5.85it/s, est. speed input: 245.85 toks/s, output: 2951.12 toks/s]


- batch size 61: 2949.07 tokens/sec (61 x 48.35)


Processed prompts: 100%|███████| 62/62 [00:10<00:00,  5.93it/s, est. speed input: 249.18 toks/s, output: 2990.77 toks/s]


- batch size 62: 2988.63 tokens/sec (62 x 48.20)


Processed prompts: 100%|███████| 63/63 [00:10<00:00,  5.92it/s, est. speed input: 248.93 toks/s, output: 2987.75 toks/s]


- batch size 63: 2985.80 tokens/sec (63 x 47.39)


Processed prompts: 100%|███████| 64/64 [00:10<00:00,  5.93it/s, est. speed input: 249.17 toks/s, output: 2987.57 toks/s]


- batch size 64: 2985.45 tokens/sec (64 x 46.65)


Processed prompts: 100%|███████| 65/65 [00:13<00:00,  4.89it/s, est. speed input: 205.33 toks/s, output: 2475.45 toks/s]


- batch size 65: 2474.07 tokens/sec (65 x 38.06)


Processed prompts: 100%|███████| 66/66 [00:13<00:00,  4.85it/s, est. speed input: 203.57 toks/s, output: 2432.96 toks/s]


- batch size 66: 2431.67 tokens/sec (66 x 36.84)


Processed prompts: 100%|███████| 67/67 [00:13<00:00,  4.87it/s, est. speed input: 204.60 toks/s, output: 2458.98 toks/s]


- batch size 67: 2457.61 tokens/sec (67 x 36.68)


Processed prompts: 100%|███████| 68/68 [00:14<00:00,  4.75it/s, est. speed input: 199.54 toks/s, output: 2413.53 toks/s]


- batch size 68: 2412.29 tokens/sec (68 x 35.47)


Processed prompts:   0%|                     | 0/69 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### fp8

Note: the "FP8-dynamic" version is much slower than the "FP8" version (per-sequence tokens/sec, FP8-dynamic vs FP8):
- batch size 16: 61 tokens/sec vs 77 tokens/sec
- batch size 32: 42 tokens/sec vs 62 tokens/sec
- batch size 48: 03 tokens/sec vs 52 tokens/sec !

In [8]:
# Load the checkpoint registered under the "llama-3.1:fp8" key of test_models
# and bind the result to the notebook-global `llm` used by the benchmark cell.
# vllm_load is defined earlier in the notebook.
# NOTE(review): every load cell shows the same execution count, which suggests
# the kernel was restarted per model -- confirm before relying on Run All.
llm = vllm_load(test_models["llama-3.1:fp8"])

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

INFO 09-22 13:39:33 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8, use_v2_block_manager=Fal

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

INFO 09-22 13:39:35 model_runner.py:997] Starting to load model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8...
INFO 09-22 13:39:36 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 13:47:57 model_runner.py:1008] Loading model weights took 8.4889 GB
INFO 09-22 13:47:58 gpu_executor.py:122] # GPU blocks: 6856, # CPU blocks: 2048
INFO 09-22 13:47:58 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 13:47:58 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 13:48:07 model_runner.py:1430] Graph capturing finished in 9 secs.


In [9]:
# Run the vLLM performance test on the engine loaded above.
# NOTE(review): assuming test_messages is a list, `*16` repeats it 16 times;
# presumably the resulting length caps the batch-size sweep that
# vllm_generate reports -- confirm against its definition.
vllm_generate(test_messages*16, llm)

vLLM performance test:


Processed prompts: 100%|████████████| 1/1 [00:06<00:00,  6.11s/it, est. speed input: 10.32 toks/s, output: 83.84 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt plus élevés sur les comptes d'épargne comparés aux banques traditionnelles.\n2. **Frais de gestion réduits** : Les frais de gestion pour les comptes courants et les prêts sont souvent moins élevés chez le Crédit Mutuel que dans les banques conventionnelles.\n3. **Prêts personnalisés** : Le Crédit Mutuel prend en compte la situation personnelle et financière de chaque client pour proposer des prêts adaptés à ses besoins.\n4. **Soutien à l'entrepreneuriat** : Le Crédit Mutuel propose des solutions financières spécifiques pour les entrepreneurs et les PME, notamment des prêts et des services de financement.\n5. **Services bancaires complets** : Le Crédit Mutuel propose une gamme complète de services bancaires, y compris les co

Processed prompts: 100%|████████████| 1/1 [00:05<00:00,  5.93s/it, est. speed input: 10.62 toks/s, output: 86.31 toks/s]


- batch size 1: 86.27 tokens/sec (1 x 86.27)


Processed prompts: 100%|███████████| 2/2 [00:06<00:00,  3.04s/it, est. speed input: 20.88 toks/s, output: 168.32 toks/s]


- batch size 2: 168.27 tokens/sec (2 x 84.14)


Processed prompts: 100%|███████████| 3/3 [00:06<00:00,  2.03s/it, est. speed input: 31.39 toks/s, output: 252.42 toks/s]


- batch size 3: 252.33 tokens/sec (3 x 84.11)


Processed prompts: 100%|███████████| 4/4 [00:06<00:00,  1.53s/it, est. speed input: 41.17 toks/s, output: 334.58 toks/s]


- batch size 4: 334.44 tokens/sec (4 x 83.61)


Processed prompts: 100%|███████████| 5/5 [00:06<00:00,  1.24s/it, est. speed input: 51.01 toks/s, output: 411.35 toks/s]


- batch size 5: 411.19 tokens/sec (5 x 82.24)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.03s/it, est. speed input: 61.37 toks/s, output: 497.41 toks/s]


- batch size 6: 497.24 tokens/sec (6 x 82.87)


Processed prompts: 100%|███████████| 7/7 [00:06<00:00,  1.12it/s, est. speed input: 71.09 toks/s, output: 575.11 toks/s]


- batch size 7: 574.88 tokens/sec (7 x 82.13)


Processed prompts: 100%|███████████| 8/8 [00:06<00:00,  1.28it/s, est. speed input: 80.54 toks/s, output: 654.54 toks/s]


- batch size 8: 654.25 tokens/sec (8 x 81.78)


Processed prompts: 100%|███████████| 9/9 [00:06<00:00,  1.43it/s, est. speed input: 90.25 toks/s, output: 729.52 toks/s]


- batch size 9: 729.21 tokens/sec (9 x 81.02)


Processed prompts: 100%|█████████| 10/10 [00:06<00:00,  1.57it/s, est. speed input: 99.38 toks/s, output: 806.05 toks/s]


- batch size 10: 805.72 tokens/sec (10 x 80.57)


Processed prompts: 100%|████████| 11/11 [00:06<00:00,  1.74it/s, est. speed input: 109.74 toks/s, output: 889.25 toks/s]


- batch size 11: 888.85 tokens/sec (11 x 80.80)


Processed prompts: 100%|████████| 12/12 [00:06<00:00,  1.88it/s, est. speed input: 118.63 toks/s, output: 964.09 toks/s]


- batch size 12: 963.66 tokens/sec (12 x 80.31)


Processed prompts: 100%|███████| 13/13 [00:06<00:00,  2.03it/s, est. speed input: 127.69 toks/s, output: 1026.37 toks/s]


- batch size 13: 1025.87 tokens/sec (13 x 78.91)


Processed prompts: 100%|███████| 14/14 [00:06<00:00,  2.14it/s, est. speed input: 134.68 toks/s, output: 1086.59 toks/s]


- batch size 14: 1086.05 tokens/sec (14 x 77.57)


Processed prompts: 100%|███████| 15/15 [00:06<00:00,  2.29it/s, est. speed input: 144.63 toks/s, output: 1172.89 toks/s]


- batch size 15: 1172.37 tokens/sec (15 x 78.16)


Processed prompts: 100%|███████| 16/16 [00:06<00:00,  2.42it/s, est. speed input: 152.37 toks/s, output: 1232.30 toks/s]


- batch size 16: 1231.70 tokens/sec (16 x 76.98)


Processed prompts: 100%|███████| 17/17 [00:06<00:00,  2.47it/s, est. speed input: 155.67 toks/s, output: 1264.67 toks/s]


- batch size 17: 1264.05 tokens/sec (17 x 74.36)


Processed prompts: 100%|███████| 18/18 [00:06<00:00,  2.61it/s, est. speed input: 164.44 toks/s, output: 1327.22 toks/s]


- batch size 18: 1326.57 tokens/sec (18 x 73.70)


Processed prompts: 100%|███████| 19/19 [00:06<00:00,  2.74it/s, est. speed input: 173.17 toks/s, output: 1394.73 toks/s]


- batch size 19: 1394.01 tokens/sec (19 x 73.37)


Processed prompts: 100%|███████| 20/20 [00:07<00:00,  2.84it/s, est. speed input: 179.14 toks/s, output: 1443.90 toks/s]


- batch size 20: 1443.12 tokens/sec (20 x 72.16)


Processed prompts: 100%|███████| 21/21 [00:06<00:00,  3.02it/s, est. speed input: 190.27 toks/s, output: 1545.87 toks/s]


- batch size 21: 1545.00 tokens/sec (21 x 73.57)


Processed prompts: 100%|███████| 22/22 [00:07<00:00,  3.13it/s, est. speed input: 197.52 toks/s, output: 1604.08 toks/s]


- batch size 22: 1603.21 tokens/sec (22 x 72.87)


Processed prompts: 100%|███████| 23/23 [00:07<00:00,  3.26it/s, est. speed input: 205.49 toks/s, output: 1667.17 toks/s]


- batch size 23: 1666.15 tokens/sec (23 x 72.44)


Processed prompts: 100%|███████| 24/24 [00:07<00:00,  3.40it/s, est. speed input: 214.05 toks/s, output: 1732.91 toks/s]


- batch size 24: 1731.86 tokens/sec (24 x 72.16)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.23it/s, est. speed input: 203.68 toks/s, output: 1641.56 toks/s]


- batch size 25: 1640.63 tokens/sec (25 x 65.63)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.36it/s, est. speed input: 212.02 toks/s, output: 1720.50 toks/s]


- batch size 26: 1719.50 tokens/sec (26 x 66.13)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.46it/s, est. speed input: 218.23 toks/s, output: 1758.78 toks/s]


- batch size 27: 1757.78 tokens/sec (27 x 65.10)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.57it/s, est. speed input: 225.07 toks/s, output: 1820.45 toks/s]


- batch size 28: 1819.32 tokens/sec (28 x 64.98)


Processed prompts: 100%|███████| 29/29 [00:07<00:00,  3.64it/s, est. speed input: 229.44 toks/s, output: 1856.89 toks/s]


- batch size 29: 1855.89 tokens/sec (29 x 64.00)


Processed prompts: 100%|███████| 30/30 [00:08<00:00,  3.67it/s, est. speed input: 231.08 toks/s, output: 1874.22 toks/s]


- batch size 30: 1873.13 tokens/sec (30 x 62.44)


Processed prompts: 100%|███████| 31/31 [00:08<00:00,  3.80it/s, est. speed input: 239.93 toks/s, output: 1946.91 toks/s]


- batch size 31: 1945.79 tokens/sec (31 x 62.77)


Processed prompts: 100%|███████| 32/32 [00:08<00:00,  3.88it/s, est. speed input: 244.64 toks/s, output: 1988.15 toks/s]


- batch size 32: 1986.91 tokens/sec (32 x 62.09)


Processed prompts: 100%|███████| 33/33 [00:08<00:00,  3.89it/s, est. speed input: 244.79 toks/s, output: 1959.25 toks/s]


- batch size 33: 1958.11 tokens/sec (33 x 59.34)


Processed prompts: 100%|███████| 34/34 [00:08<00:00,  3.96it/s, est. speed input: 249.55 toks/s, output: 2020.37 toks/s]


- batch size 34: 2019.27 tokens/sec (34 x 59.39)


Processed prompts: 100%|███████| 35/35 [00:08<00:00,  4.08it/s, est. speed input: 257.34 toks/s, output: 2081.09 toks/s]


- batch size 35: 2079.73 tokens/sec (35 x 59.42)


Processed prompts: 100%|███████| 36/36 [00:08<00:00,  4.17it/s, est. speed input: 262.86 toks/s, output: 2136.26 toks/s]


- batch size 36: 2134.90 tokens/sec (36 x 59.30)


Processed prompts: 100%|███████| 37/37 [00:08<00:00,  4.23it/s, est. speed input: 266.38 toks/s, output: 2150.39 toks/s]


- batch size 37: 2149.06 tokens/sec (37 x 58.08)


Processed prompts: 100%|███████| 38/38 [00:08<00:00,  4.36it/s, est. speed input: 274.64 toks/s, output: 2227.99 toks/s]


- batch size 38: 2226.55 tokens/sec (38 x 58.59)


Processed prompts: 100%|███████| 39/39 [00:08<00:00,  4.64it/s, est. speed input: 292.55 toks/s, output: 2370.01 toks/s]


- batch size 39: 2368.33 tokens/sec (39 x 60.73)


Processed prompts: 100%|███████| 40/40 [00:08<00:00,  4.47it/s, est. speed input: 281.69 toks/s, output: 2281.73 toks/s]


- batch size 40: 2280.26 tokens/sec (40 x 57.01)


Processed prompts: 100%|███████| 41/41 [00:08<00:00,  4.57it/s, est. speed input: 287.97 toks/s, output: 2330.75 toks/s]


- batch size 41: 2329.16 tokens/sec (41 x 56.81)


Processed prompts: 100%|███████| 42/42 [00:09<00:00,  4.61it/s, est. speed input: 290.65 toks/s, output: 2353.57 toks/s]


- batch size 42: 2351.93 tokens/sec (42 x 56.00)


Processed prompts: 100%|███████| 43/43 [00:09<00:00,  4.64it/s, est. speed input: 292.56 toks/s, output: 2369.20 toks/s]


- batch size 43: 2367.68 tokens/sec (43 x 55.06)


Processed prompts: 100%|███████| 44/44 [00:09<00:00,  4.75it/s, est. speed input: 299.25 toks/s, output: 2423.04 toks/s]


- batch size 44: 2421.30 tokens/sec (44 x 55.03)


Processed prompts: 100%|███████| 45/45 [00:09<00:00,  4.75it/s, est. speed input: 299.00 toks/s, output: 2421.10 toks/s]


- batch size 45: 2419.48 tokens/sec (45 x 53.77)


Processed prompts: 100%|███████| 46/46 [00:09<00:00,  4.84it/s, est. speed input: 305.23 toks/s, output: 2474.15 toks/s]


- batch size 46: 2472.53 tokens/sec (46 x 53.75)


Processed prompts: 100%|███████| 47/47 [00:09<00:00,  4.92it/s, est. speed input: 310.48 toks/s, output: 2512.15 toks/s]


- batch size 47: 2510.48 tokens/sec (47 x 53.41)


Processed prompts: 100%|███████| 48/48 [00:09<00:00,  4.90it/s, est. speed input: 309.00 toks/s, output: 2507.15 toks/s]


- batch size 48: 2505.45 tokens/sec (48 x 52.20)


Processed prompts: 100%|███████| 49/49 [00:10<00:00,  4.81it/s, est. speed input: 303.06 toks/s, output: 2448.22 toks/s]


- batch size 49: 2446.60 tokens/sec (49 x 49.93)


Processed prompts: 100%|███████| 50/50 [00:10<00:00,  4.81it/s, est. speed input: 302.88 toks/s, output: 2459.86 toks/s]


- batch size 50: 2458.17 tokens/sec (50 x 49.16)


Processed prompts: 100%|███████| 51/51 [00:10<00:00,  4.89it/s, est. speed input: 308.39 toks/s, output: 2497.62 toks/s]


- batch size 51: 2496.01 tokens/sec (51 x 48.94)


Processed prompts: 100%|███████| 52/52 [00:10<00:00,  5.09it/s, est. speed input: 320.59 toks/s, output: 2598.08 toks/s]


- batch size 52: 2596.31 tokens/sec (52 x 49.93)


Processed prompts: 100%|███████| 53/53 [00:10<00:00,  4.95it/s, est. speed input: 311.58 toks/s, output: 2519.97 toks/s]


- batch size 53: 2518.22 tokens/sec (53 x 47.51)


Processed prompts: 100%|███████| 54/54 [00:10<00:00,  5.08it/s, est. speed input: 319.88 toks/s, output: 2592.56 toks/s]


- batch size 54: 2590.62 tokens/sec (54 x 47.97)


Processed prompts: 100%|███████| 55/55 [00:10<00:00,  5.10it/s, est. speed input: 321.73 toks/s, output: 2603.96 toks/s]


- batch size 55: 2602.18 tokens/sec (55 x 47.31)


Processed prompts: 100%|███████| 56/56 [00:28<00:00,  1.99it/s, est. speed input: 125.12 toks/s, output: 1013.03 toks/s]


- batch size 56: 1012.78 tokens/sec (56 x 18.09)


Processed prompts: 100%|████████| 57/57 [00:29<00:00,  1.96it/s, est. speed input: 123.31 toks/s, output: 997.82 toks/s]


- batch size 57: 997.55 tokens/sec (57 x 17.50)


Processed prompts: 100%|████████| 58/58 [00:30<00:00,  1.90it/s, est. speed input: 119.90 toks/s, output: 971.78 toks/s]


- batch size 58: 971.53 tokens/sec (58 x 16.75)


Processed prompts: 100%|████████| 59/59 [00:31<00:00,  1.90it/s, est. speed input: 119.68 toks/s, output: 970.45 toks/s]


- batch size 59: 970.21 tokens/sec (59 x 16.44)


Processed prompts: 100%|████████| 60/60 [00:31<00:00,  1.90it/s, est. speed input: 119.84 toks/s, output: 972.38 toks/s]


- batch size 60: 972.13 tokens/sec (60 x 16.20)


Processed prompts: 100%|████████| 61/61 [00:32<00:00,  1.91it/s, est. speed input: 120.05 toks/s, output: 974.06 toks/s]


- batch size 61: 973.83 tokens/sec (61 x 15.96)


Processed prompts: 100%|████████| 62/62 [00:32<00:00,  1.91it/s, est. speed input: 120.49 toks/s, output: 973.36 toks/s]


- batch size 62: 973.11 tokens/sec (62 x 15.70)


Processed prompts: 100%|████████| 63/63 [00:32<00:00,  1.94it/s, est. speed input: 122.35 toks/s, output: 989.52 toks/s]


- batch size 63: 989.27 tokens/sec (63 x 15.70)


Processed prompts: 100%|████████| 64/64 [00:33<00:00,  1.94it/s, est. speed input: 121.93 toks/s, output: 986.78 toks/s]


- batch size 64: 986.53 tokens/sec (64 x 15.41)


Processed prompts: 100%|████████| 65/65 [00:34<00:00,  1.91it/s, est. speed input: 120.07 toks/s, output: 972.32 toks/s]


- batch size 65: 972.07 tokens/sec (65 x 14.95)


Processed prompts: 100%|████████| 66/66 [00:34<00:00,  1.92it/s, est. speed input: 120.88 toks/s, output: 978.95 toks/s]


- batch size 66: 978.71 tokens/sec (66 x 14.83)


Processed prompts:   0%|                     | 0/67 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### w8a8

In [8]:
# Load the checkpoint registered under the "llama-3.1:w8a8" key of test_models
# and bind the result to the notebook-global `llm` used by the benchmark cell.
# vllm_load is defined earlier in the notebook.
# NOTE(review): every load cell shows the same execution count, which suggests
# the kernel was restarted per model -- confirm before relying on Run All.
llm = vllm_load(test_models["llama-3.1:w8a8"])

config.json:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

INFO 09-22 14:08:30 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantiz

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

INFO 09-22 14:08:33 model_runner.py:997] Starting to load model neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8...
INFO 09-22 14:08:33 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00002-of-00002.safetensors:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/43.5k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 14:16:47 model_runner.py:1008] Loading model weights took 8.4939 GB
INFO 09-22 14:16:48 gpu_executor.py:122] # GPU blocks: 6759, # CPU blocks: 2048
INFO 09-22 14:16:48 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 14:16:48 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 14:16:57 model_runner.py:1430] Graph capturing finished in 9 secs.


In [9]:
# Run the vLLM performance test on the engine loaded above.
# NOTE(review): assuming test_messages is a list, `*18` repeats it 18 times;
# the fp8 run uses `*16` instead -- confirm whether the different multiplier
# is intentional, since it presumably changes the upper bound of the sweep.
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:06<00:00,  6.90s/it, est. speed input: 6.09 toks/s, output: 74.24 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus faibles** : Le Crédit Mutuel propose des taux d'intérêt compétitifs pour les prêts, les comptes courants et les épargnes.\n2. **Accès à des services bancaires complets** : Le Crédit Mutuel offre une gamme complète de services bancaires, y compris des prêts, des comptes courants, des épargnes, des cartes de crédit, des assurances et des investissements.\n3. **Conseils personnalisés** : Les conseillers du Crédit Mutuel peuvent vous aider à gérer votre patrimoine, à élaborer un plan financier et à atteindre vos objectifs économiques.\n4. **Transparence et sécurité** : Le Crédit Mutuel est une banque coopérative, ce qui signifie que les décisions sont prises collectivement par les membres et non par des actionnaires. Cela assure une transparence et une sécurité supplémentaires.\n5. **Services en li

Processed prompts: 100%|█████████████| 1/1 [00:06<00:00,  6.63s/it, est. speed input: 6.33 toks/s, output: 77.21 toks/s]


- batch size 1: 77.18 tokens/sec (1 x 77.18)


Processed prompts: 100%|███████████| 2/2 [00:06<00:00,  3.39s/it, est. speed input: 12.53 toks/s, output: 149.72 toks/s]


- batch size 2: 149.67 tokens/sec (2 x 74.84)


Processed prompts: 100%|███████████| 3/3 [00:06<00:00,  2.27s/it, est. speed input: 18.83 toks/s, output: 221.19 toks/s]


- batch size 3: 221.12 tokens/sec (3 x 73.71)


Processed prompts: 100%|███████████| 4/4 [00:06<00:00,  1.70s/it, est. speed input: 24.69 toks/s, output: 277.91 toks/s]


- batch size 4: 277.82 tokens/sec (4 x 69.45)


Processed prompts: 100%|███████████| 5/5 [00:06<00:00,  1.38s/it, est. speed input: 30.48 toks/s, output: 360.57 toks/s]


- batch size 5: 360.46 tokens/sec (5 x 72.09)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.16s/it, est. speed input: 36.45 toks/s, output: 442.11 toks/s]


- batch size 6: 441.98 tokens/sec (6 x 73.66)


Processed prompts: 100%|███████████| 7/7 [00:06<00:00,  1.00it/s, est. speed input: 42.40 toks/s, output: 494.18 toks/s]


- batch size 7: 493.99 tokens/sec (7 x 70.57)


Processed prompts: 100%|███████████| 8/8 [00:06<00:00,  1.14it/s, est. speed input: 48.02 toks/s, output: 580.86 toks/s]


- batch size 8: 580.67 tokens/sec (8 x 72.58)


Processed prompts: 100%|███████████| 9/9 [00:07<00:00,  1.27it/s, est. speed input: 53.51 toks/s, output: 649.17 toks/s]


- batch size 9: 648.93 tokens/sec (9 x 72.10)


Processed prompts: 100%|█████████| 10/10 [00:07<00:00,  1.42it/s, est. speed input: 59.71 toks/s, output: 715.61 toks/s]


- batch size 10: 715.38 tokens/sec (10 x 71.54)


Processed prompts: 100%|█████████| 11/11 [00:07<00:00,  1.55it/s, est. speed input: 65.46 toks/s, output: 785.15 toks/s]


- batch size 11: 784.85 tokens/sec (11 x 71.35)


Processed prompts: 100%|█████████| 12/12 [00:07<00:00,  1.68it/s, est. speed input: 70.58 toks/s, output: 849.79 toks/s]


- batch size 12: 849.42 tokens/sec (12 x 70.79)


Processed prompts: 100%|█████████| 13/13 [00:07<00:00,  1.81it/s, est. speed input: 75.96 toks/s, output: 914.40 toks/s]


- batch size 13: 914.05 tokens/sec (13 x 70.31)


Processed prompts: 100%|█████████| 14/14 [00:07<00:00,  1.90it/s, est. speed input: 79.95 toks/s, output: 952.74 toks/s]


- batch size 14: 952.35 tokens/sec (14 x 68.02)


Processed prompts: 100%|████████| 15/15 [00:07<00:00,  2.05it/s, est. speed input: 86.48 toks/s, output: 1028.98 toks/s]


- batch size 15: 1028.55 tokens/sec (15 x 68.57)


Processed prompts: 100%|████████| 16/16 [00:07<00:00,  2.17it/s, est. speed input: 91.31 toks/s, output: 1107.42 toks/s]


- batch size 16: 1106.96 tokens/sec (16 x 69.18)


Processed prompts: 100%|████████| 17/17 [00:07<00:00,  2.25it/s, est. speed input: 94.65 toks/s, output: 1123.09 toks/s]


- batch size 17: 1122.60 tokens/sec (17 x 66.04)


Processed prompts: 100%|███████| 18/18 [00:07<00:00,  2.40it/s, est. speed input: 100.96 toks/s, output: 1194.59 toks/s]


- batch size 18: 1194.09 tokens/sec (18 x 66.34)


Processed prompts: 100%|███████| 19/19 [00:07<00:00,  2.51it/s, est. speed input: 105.88 toks/s, output: 1262.74 toks/s]


- batch size 19: 1262.16 tokens/sec (19 x 66.43)


Processed prompts: 100%|███████| 20/20 [00:07<00:00,  2.61it/s, est. speed input: 109.41 toks/s, output: 1300.31 toks/s]


- batch size 20: 1299.72 tokens/sec (20 x 64.99)


Processed prompts: 100%|███████| 21/21 [00:07<00:00,  2.75it/s, est. speed input: 115.49 toks/s, output: 1377.52 toks/s]


- batch size 21: 1376.91 tokens/sec (21 x 65.57)


Processed prompts: 100%|███████| 22/22 [00:07<00:00,  2.84it/s, est. speed input: 119.28 toks/s, output: 1429.30 toks/s]


- batch size 22: 1428.64 tokens/sec (22 x 64.94)


Processed prompts: 100%|███████| 23/23 [00:07<00:00,  2.95it/s, est. speed input: 124.36 toks/s, output: 1456.42 toks/s]


- batch size 23: 1455.72 tokens/sec (23 x 63.29)


Processed prompts: 100%|███████| 24/24 [00:07<00:00,  3.09it/s, est. speed input: 129.65 toks/s, output: 1500.97 toks/s]


- batch size 24: 1500.26 tokens/sec (24 x 62.51)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.20it/s, est. speed input: 134.22 toks/s, output: 1593.73 toks/s]


- batch size 25: 1592.90 tokens/sec (25 x 63.72)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.26it/s, est. speed input: 137.04 toks/s, output: 1645.39 toks/s]


- batch size 26: 1644.59 tokens/sec (26 x 63.25)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.42it/s, est. speed input: 143.74 toks/s, output: 1724.50 toks/s]


- batch size 27: 1723.58 tokens/sec (27 x 63.84)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.52it/s, est. speed input: 147.78 toks/s, output: 1722.77 toks/s]


- batch size 28: 1721.85 tokens/sec (28 x 61.49)


Processed prompts: 100%|█████████| 29/29 [00:18<00:00,  1.54it/s, est. speed input: 64.72 toks/s, output: 759.85 toks/s]


- batch size 29: 759.69 tokens/sec (29 x 26.20)


Processed prompts: 100%|█████████| 30/30 [00:31<00:00,  1.06s/it, est. speed input: 39.75 toks/s, output: 467.74 toks/s]


- batch size 30: 467.67 tokens/sec (30 x 15.59)


Processed prompts: 100%|█████████| 31/31 [00:46<00:00,  1.49s/it, est. speed input: 28.29 toks/s, output: 337.61 toks/s]


- batch size 31: 337.58 tokens/sec (31 x 10.89)


Processed prompts: 100%|█████████| 32/32 [00:59<00:00,  1.87s/it, est. speed input: 22.40 toks/s, output: 269.94 toks/s]


- batch size 32: 269.92 tokens/sec (32 x 8.44)


Processed prompts:   0%|                     | 0/33 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### w4a16

In [8]:
# Load the w4a16-quantized Llama 3.1 entry of test_models through the vllm_load
# helper (defined outside this section) and keep the result as `llm` for the
# benchmark cell that follows.
llm = vllm_load(test_models["llama-3.1:w4a16"])

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

INFO 09-22 14:25:08 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-22 14:25:08 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_tim

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

INFO 09-22 14:25:10 model_runner.py:997] Starting to load model neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16...
INFO 09-22 14:25:11 weight_utils.py:242] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.74G [00:00<?, ?B/s]

INFO 09-22 14:30:25 weight_utils.py:287] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 09-22 14:30:26 model_runner.py:1008] Loading model weights took 5.3812 GB
INFO 09-22 14:30:28 gpu_executor.py:122] # GPU blocks: 8325, # CPU blocks: 2048
INFO 09-22 14:30:30 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 14:30:30 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 14:30:40 model_runner.py:1430] Graph capturing finished in 10 secs.


In [9]:
# Run the vLLM benchmark helper on test_messages repeated 18 times.
# The recorded output below reports throughput for batch sizes 1 through 72.
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|████████████| 1/1 [00:04<00:00,  4.38s/it, est. speed input: 9.58 toks/s, output: 116.78 toks/s]


Generated text: "Le Crédit Mutuel est une banque mutuelle française qui offre divers avantages à ses clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts sur les comptes courants** : Le Crédit Mutuel offre des intérêts sur les comptes courants, ce qui permet aux clients de gagner de l'argent sans avoir à investir leurs fonds.\n2. **Prêt à usage** : Le Crédit Mutuel propose des prêts à usage avec des taux d'intérêt compétitifs, ce qui peut aider les clients à financer leurs besoins ou leurs projets.\n3. **Assurance de prêt** : Le Crédit Mutuel offre une assurance de prêt qui couvre les risques de défaut de remboursement, ce qui donne aux emprunteurs une sécurité supplémentaire.\n4. **Services bancaires complets** : Le Crédit Mutuel propose une gamme complète de services bancaires, y compris des cartes de crédit, des chèques, des virements et des paiements en ligne.\n5. **Sécurité et protection** : Le Crédit Mutuel met en place des mesures de sécurité pour protéger les

Processed prompts: 100%|███████████| 1/1 [00:04<00:00,  4.08s/it, est. speed input: 10.30 toks/s, output: 125.62 toks/s]


- batch size 1: 125.55 tokens/sec (1 x 125.55)


Processed prompts: 100%|███████████| 2/2 [00:04<00:00,  2.12s/it, est. speed input: 20.02 toks/s, output: 241.18 toks/s]


- batch size 2: 241.06 tokens/sec (2 x 120.53)


Processed prompts: 100%|███████████| 3/3 [00:04<00:00,  1.40s/it, est. speed input: 30.41 toks/s, output: 355.68 toks/s]


- batch size 3: 355.46 tokens/sec (3 x 118.49)


Processed prompts: 100%|███████████| 4/4 [00:04<00:00,  1.06s/it, est. speed input: 39.45 toks/s, output: 478.56 toks/s]


- batch size 4: 478.33 tokens/sec (4 x 119.58)


Processed prompts: 100%|███████████| 5/5 [00:04<00:00,  1.16it/s, est. speed input: 48.76 toks/s, output: 570.07 toks/s]


- batch size 5: 569.82 tokens/sec (5 x 113.96)


Processed prompts: 100%|███████████| 6/6 [00:04<00:00,  1.39it/s, est. speed input: 58.42 toks/s, output: 697.80 toks/s]


- batch size 6: 697.43 tokens/sec (6 x 116.24)


Processed prompts: 100%|███████████| 7/7 [00:04<00:00,  1.59it/s, est. speed input: 67.28 toks/s, output: 787.57 toks/s]


- batch size 7: 787.10 tokens/sec (7 x 112.44)


Processed prompts: 100%|███████████| 8/8 [00:04<00:00,  1.80it/s, est. speed input: 75.68 toks/s, output: 881.61 toks/s]


- batch size 8: 881.11 tokens/sec (8 x 110.14)


Processed prompts: 100%|███████████| 9/9 [00:04<00:00,  2.00it/s, est. speed input: 84.02 toks/s, output: 988.23 toks/s]


- batch size 9: 987.65 tokens/sec (9 x 109.74)


Processed prompts: 100%|████████| 10/10 [00:04<00:00,  2.20it/s, est. speed input: 92.81 toks/s, output: 1067.00 toks/s]


- batch size 10: 1066.43 tokens/sec (10 x 106.64)


Processed prompts: 100%|███████| 11/11 [00:04<00:00,  2.39it/s, est. speed input: 100.90 toks/s, output: 1190.30 toks/s]


- batch size 11: 1189.65 tokens/sec (11 x 108.15)


Processed prompts: 100%|███████| 12/12 [00:04<00:00,  2.60it/s, est. speed input: 109.12 toks/s, output: 1288.86 toks/s]


- batch size 12: 1288.05 tokens/sec (12 x 107.34)


Processed prompts: 100%|███████| 13/13 [00:04<00:00,  2.78it/s, est. speed input: 116.89 toks/s, output: 1381.02 toks/s]


- batch size 13: 1380.25 tokens/sec (13 x 106.17)


Processed prompts: 100%|███████| 14/14 [00:04<00:00,  2.91it/s, est. speed input: 122.61 toks/s, output: 1435.34 toks/s]


- batch size 14: 1434.50 tokens/sec (14 x 102.46)


Processed prompts: 100%|███████| 15/15 [00:04<00:00,  3.15it/s, est. speed input: 132.66 toks/s, output: 1549.33 toks/s]


- batch size 15: 1548.31 tokens/sec (15 x 103.22)


Processed prompts: 100%|███████| 16/16 [00:04<00:00,  3.34it/s, est. speed input: 140.13 toks/s, output: 1636.07 toks/s]


- batch size 16: 1634.87 tokens/sec (16 x 102.18)


Processed prompts: 100%|███████| 17/17 [00:05<00:00,  3.16it/s, est. speed input: 132.65 toks/s, output: 1581.27 toks/s]


- batch size 17: 1580.32 tokens/sec (17 x 92.96)


Processed prompts: 100%|███████| 18/18 [00:05<00:00,  3.29it/s, est. speed input: 138.55 toks/s, output: 1630.24 toks/s]


- batch size 18: 1629.18 tokens/sec (18 x 90.51)


Processed prompts: 100%|███████| 19/19 [00:05<00:00,  3.50it/s, est. speed input: 147.55 toks/s, output: 1711.78 toks/s]


- batch size 19: 1710.76 tokens/sec (19 x 90.04)


Processed prompts: 100%|███████| 20/20 [00:05<00:00,  3.70it/s, est. speed input: 155.47 toks/s, output: 1786.38 toks/s]


- batch size 20: 1785.25 tokens/sec (20 x 89.26)


Processed prompts: 100%|███████| 21/21 [00:05<00:00,  3.77it/s, est. speed input: 158.44 toks/s, output: 1844.72 toks/s]


- batch size 21: 1843.56 tokens/sec (21 x 87.79)


Processed prompts: 100%|███████| 22/22 [00:05<00:00,  3.96it/s, est. speed input: 166.44 toks/s, output: 1955.59 toks/s]


- batch size 22: 1954.32 tokens/sec (22 x 88.83)


Processed prompts: 100%|███████| 23/23 [00:05<00:00,  4.12it/s, est. speed input: 173.21 toks/s, output: 2013.39 toks/s]


- batch size 23: 2011.92 tokens/sec (23 x 87.47)


Processed prompts: 100%|███████| 24/24 [00:05<00:00,  4.24it/s, est. speed input: 178.23 toks/s, output: 2090.27 toks/s]


- batch size 24: 2088.91 tokens/sec (24 x 87.04)


Processed prompts: 100%|███████| 25/25 [00:05<00:00,  4.43it/s, est. speed input: 186.27 toks/s, output: 2175.76 toks/s]


- batch size 25: 2174.08 tokens/sec (25 x 86.96)


Processed prompts: 100%|███████| 26/26 [00:05<00:00,  4.47it/s, est. speed input: 188.00 toks/s, output: 2199.89 toks/s]


- batch size 26: 2198.47 tokens/sec (26 x 84.56)


Processed prompts: 100%|███████| 27/27 [00:05<00:00,  4.69it/s, est. speed input: 197.45 toks/s, output: 2314.77 toks/s]


- batch size 27: 2313.17 tokens/sec (27 x 85.67)


Processed prompts: 100%|███████| 28/28 [00:05<00:00,  4.84it/s, est. speed input: 203.44 toks/s, output: 2359.25 toks/s]


- batch size 28: 2357.53 tokens/sec (28 x 84.20)


Processed prompts: 100%|███████| 29/29 [00:05<00:00,  4.91it/s, est. speed input: 206.42 toks/s, output: 2453.69 toks/s]


- batch size 29: 2452.02 tokens/sec (29 x 84.55)


Processed prompts: 100%|███████| 30/30 [00:06<00:00,  4.96it/s, est. speed input: 208.42 toks/s, output: 2441.66 toks/s]


- batch size 30: 2439.96 tokens/sec (30 x 81.33)


Processed prompts: 100%|███████| 31/31 [00:05<00:00,  5.17it/s, est. speed input: 217.64 toks/s, output: 2538.39 toks/s]


- batch size 31: 2536.60 tokens/sec (31 x 81.83)


Processed prompts: 100%|███████| 32/32 [00:06<00:00,  5.30it/s, est. speed input: 222.50 toks/s, output: 2605.98 toks/s]


- batch size 32: 2604.17 tokens/sec (32 x 81.38)


Processed prompts: 100%|███████| 33/33 [00:06<00:00,  5.14it/s, est. speed input: 215.97 toks/s, output: 2520.17 toks/s]


- batch size 33: 2518.36 tokens/sec (33 x 76.31)


Processed prompts: 100%|███████| 34/34 [00:06<00:00,  5.15it/s, est. speed input: 216.40 toks/s, output: 2504.99 toks/s]


- batch size 34: 2503.17 tokens/sec (34 x 73.62)


Processed prompts: 100%|███████| 35/35 [00:06<00:00,  5.24it/s, est. speed input: 220.19 toks/s, output: 2595.18 toks/s]


- batch size 35: 2593.18 tokens/sec (35 x 74.09)


Processed prompts: 100%|███████| 36/36 [00:06<00:00,  5.36it/s, est. speed input: 225.03 toks/s, output: 2650.47 toks/s]


- batch size 36: 2648.47 tokens/sec (36 x 73.57)


Processed prompts: 100%|███████| 37/37 [00:06<00:00,  5.45it/s, est. speed input: 229.04 toks/s, output: 2696.51 toks/s]


- batch size 37: 2694.60 tokens/sec (37 x 72.83)


Processed prompts: 100%|███████| 38/38 [00:06<00:00,  5.50it/s, est. speed input: 231.22 toks/s, output: 2670.48 toks/s]


- batch size 38: 2668.55 tokens/sec (38 x 70.22)


Processed prompts: 100%|███████| 39/39 [00:06<00:00,  5.69it/s, est. speed input: 239.27 toks/s, output: 2815.79 toks/s]


- batch size 39: 2813.51 tokens/sec (39 x 72.14)


Processed prompts: 100%|███████| 40/40 [00:06<00:00,  5.84it/s, est. speed input: 245.40 toks/s, output: 2855.39 toks/s]


- batch size 40: 2848.50 tokens/sec (40 x 71.21)


Processed prompts: 100%|███████| 41/41 [00:07<00:00,  5.80it/s, est. speed input: 243.61 toks/s, output: 2846.18 toks/s]


- batch size 41: 2843.88 tokens/sec (41 x 69.36)


Processed prompts: 100%|███████| 42/42 [00:07<00:00,  5.96it/s, est. speed input: 250.50 toks/s, output: 2936.79 toks/s]


- batch size 42: 2934.41 tokens/sec (42 x 69.87)


Processed prompts: 100%|███████| 43/43 [00:07<00:00,  6.09it/s, est. speed input: 255.91 toks/s, output: 2955.13 toks/s]


- batch size 43: 2952.75 tokens/sec (43 x 68.67)


Processed prompts: 100%|███████| 44/44 [00:07<00:00,  6.00it/s, est. speed input: 251.98 toks/s, output: 2956.01 toks/s]


- batch size 44: 2953.65 tokens/sec (44 x 67.13)


Processed prompts: 100%|███████| 45/45 [00:07<00:00,  6.20it/s, est. speed input: 260.36 toks/s, output: 3044.16 toks/s]


- batch size 45: 3041.90 tokens/sec (45 x 67.60)


Processed prompts: 100%|███████| 46/46 [00:07<00:00,  6.26it/s, est. speed input: 263.08 toks/s, output: 3097.65 toks/s]


- batch size 46: 3095.31 tokens/sec (46 x 67.29)


Processed prompts: 100%|███████| 47/47 [00:07<00:00,  6.23it/s, est. speed input: 261.82 toks/s, output: 3090.93 toks/s]


- batch size 47: 3036.79 tokens/sec (47 x 64.61)


Processed prompts: 100%|███████| 48/48 [00:07<00:00,  6.43it/s, est. speed input: 270.23 toks/s, output: 3176.00 toks/s]


- batch size 48: 3173.57 tokens/sec (48 x 66.12)


Processed prompts: 100%|███████| 49/49 [00:08<00:00,  5.85it/s, est. speed input: 245.66 toks/s, output: 2914.78 toks/s]


- batch size 49: 2912.62 tokens/sec (49 x 59.44)


Processed prompts: 100%|███████| 50/50 [00:08<00:00,  5.91it/s, est. speed input: 248.53 toks/s, output: 2905.07 toks/s]


- batch size 50: 2903.08 tokens/sec (50 x 58.06)


Processed prompts: 100%|███████| 51/51 [00:09<00:00,  5.66it/s, est. speed input: 238.12 toks/s, output: 2783.81 toks/s]


- batch size 51: 2781.90 tokens/sec (51 x 54.55)


Processed prompts: 100%|███████| 52/52 [00:09<00:00,  5.62it/s, est. speed input: 235.93 toks/s, output: 2762.60 toks/s]


- batch size 52: 2760.68 tokens/sec (52 x 53.09)


Processed prompts: 100%|███████| 53/53 [00:09<00:00,  5.71it/s, est. speed input: 239.64 toks/s, output: 2830.30 toks/s]


- batch size 53: 2828.41 tokens/sec (53 x 53.37)


Processed prompts: 100%|███████| 54/54 [00:09<00:00,  5.73it/s, est. speed input: 240.64 toks/s, output: 2820.32 toks/s]


- batch size 54: 2818.25 tokens/sec (54 x 52.19)


Processed prompts: 100%|███████| 55/55 [00:09<00:00,  5.77it/s, est. speed input: 242.54 toks/s, output: 2883.49 toks/s]


- batch size 55: 2881.63 tokens/sec (55 x 52.39)


Processed prompts: 100%|███████| 56/56 [00:09<00:00,  5.78it/s, est. speed input: 242.64 toks/s, output: 2857.54 toks/s]


- batch size 56: 2855.51 tokens/sec (56 x 50.99)


Processed prompts: 100%|███████| 57/57 [00:09<00:00,  5.80it/s, est. speed input: 243.43 toks/s, output: 2879.73 toks/s]


- batch size 57: 2877.61 tokens/sec (57 x 50.48)


Processed prompts: 100%|███████| 58/58 [00:09<00:00,  5.91it/s, est. speed input: 248.43 toks/s, output: 2923.29 toks/s]


- batch size 58: 2921.28 tokens/sec (58 x 50.37)


Processed prompts: 100%|███████| 59/59 [00:09<00:00,  5.93it/s, est. speed input: 249.22 toks/s, output: 2939.09 toks/s]


- batch size 59: 2937.12 tokens/sec (59 x 49.78)


Processed prompts: 100%|███████| 60/60 [00:09<00:00,  6.09it/s, est. speed input: 255.81 toks/s, output: 2973.72 toks/s]


- batch size 60: 2971.65 tokens/sec (60 x 49.53)


Processed prompts: 100%|███████| 61/61 [00:10<00:00,  5.99it/s, est. speed input: 251.65 toks/s, output: 2963.72 toks/s]


- batch size 61: 2961.58 tokens/sec (61 x 48.55)


Processed prompts: 100%|███████| 62/62 [00:09<00:00,  6.22it/s, est. speed input: 261.29 toks/s, output: 3035.77 toks/s]


- batch size 62: 3033.71 tokens/sec (62 x 48.93)


Processed prompts: 100%|███████| 63/63 [00:10<00:00,  6.29it/s, est. speed input: 264.39 toks/s, output: 3088.26 toks/s]


- batch size 63: 3085.80 tokens/sec (63 x 48.98)


Processed prompts: 100%|███████| 64/64 [00:10<00:00,  6.22it/s, est. speed input: 261.25 toks/s, output: 3032.34 toks/s]


- batch size 64: 3030.35 tokens/sec (64 x 47.35)


Processed prompts: 100%|███████| 65/65 [00:11<00:00,  5.62it/s, est. speed input: 236.04 toks/s, output: 2794.32 toks/s]


- batch size 65: 2792.49 tokens/sec (65 x 42.96)


Processed prompts: 100%|███████| 66/66 [00:11<00:00,  5.80it/s, est. speed input: 243.59 toks/s, output: 2826.58 toks/s]


- batch size 66: 2824.80 tokens/sec (66 x 42.80)


Processed prompts: 100%|███████| 67/67 [00:11<00:00,  5.71it/s, est. speed input: 239.81 toks/s, output: 2794.38 toks/s]


- batch size 67: 2792.58 tokens/sec (67 x 41.68)


Processed prompts: 100%|███████| 68/68 [00:11<00:00,  5.79it/s, est. speed input: 243.15 toks/s, output: 2831.61 toks/s]


- batch size 68: 2829.74 tokens/sec (68 x 41.61)


Processed prompts: 100%|███████| 69/69 [00:11<00:00,  5.84it/s, est. speed input: 245.27 toks/s, output: 2880.51 toks/s]


- batch size 69: 2878.64 tokens/sec (69 x 41.72)


Processed prompts: 100%|███████| 70/70 [00:11<00:00,  5.84it/s, est. speed input: 245.45 toks/s, output: 2876.64 toks/s]


- batch size 70: 2874.84 tokens/sec (70 x 41.07)


Processed prompts: 100%|███████| 71/71 [00:11<00:00,  5.99it/s, est. speed input: 251.64 toks/s, output: 2927.52 toks/s]


- batch size 71: 2925.52 tokens/sec (71 x 41.20)


Processed prompts: 100%|███████| 72/72 [00:12<00:00,  5.91it/s, est. speed input: 248.35 toks/s, output: 2909.87 toks/s]

- batch size 72: 2907.97 tokens/sec (72 x 40.39)





## SGLang

### fp16

In [7]:
# Load the base "llama-3.1" entry of test_models through the sglang_load helper
# (defined outside this section) and keep the result as `runtime`.
runtime = sglang_load(test_models["llama-3.1"])

INFO 09-22 14:43:27 weight_utils.py:242] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


In [8]:
# Run the SGLang benchmark helper on test_messages repeated 8 times.
# NOTE(review): the saved output shows only the sample generated text and no
# per-batch throughput lines -- the run looks incomplete; confirm and re-run.
sglang_generate(test_messages*8, runtime)

SGLang performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre divers avantages à ses membres, notamment :\n\n1. **Intérêts plus élevés sur les épargnes** : Les membres du Crédit Mutuel bénéficient souvent de taux d'intérêt plus élevés que ceux proposés par les banques traditionnelles sur leurs comptes d'épargne.\n2. **Prêts à des conditions favorables** : Les membres peuvent obtenir des prêts à des taux d'intérêt attractifs et avec des conditions de remboursement flexibles.\n3. **Sécurité et stabilité** : En tant que banque coopérative, le Crédit Mutuel est contrôlée par ses membres, ce qui garantit une gestion responsable et durable de l'argent.\n4. **Services personnalisés** : Les conseillers du Crédit Mutuel prennent le temps de comprendre les besoins de chaque membre pour offrir des solutions adaptées à leur situation financière.\n5. **Économies sur les frais** : Les membres du Crédit Mutuel peuvent bénéficier de réductions sur les frais 

### w8a16

In [7]:
# Load the w8a16-quantized Llama 3.1 entry of test_models through sglang_load.
# NOTE(review): the saved output contains truncated tracebacks from SGLang
# child processes; their cause is not visible here -- confirm before relying on it.
runtime = sglang_load(test_models["llama-3.1:w8a16"])

INFO 09-22 14:52:36 weight_utils.py:242] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Process Process-1:2:
Traceback (most recent call last):
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/workspace/wordslab-llms/.venv/lib/python3.11/site-packages/sglang/srt/managers/detokenizer_manager.py", line 185, in start_detokenizer_process
    loop.run_until_complete(manager.handle_loop())
  File "uvloop/loop.pyx", line 1511, in uvloop.loop.Loop.run_until_complete
  File "uvloop/loop.pyx", line 1504, in uvloop.loop.Loop.run_until_complete
  File "uvloop/loop.pyx", line 1377, in uvloop.loop.Loop.run_forever
  File "uvloop/loop.pyx", line 555, in uvloop.loop.Loop._run
Process Process-1:1:
Traceback (most recent call last):
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 314, in _bootstra

In [8]:
# Run the SGLang benchmark helper on test_messages repeated 10 times.
# NOTE(review): the saved output ends with KeyboardInterrupt and contains no
# per-batch throughput lines, so this run was stopped before producing results.
sglang_generate(test_messages*10, runtime)

SGLang performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses adhérents et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts attractifs** : Le Crédit Mutuel propose des taux d'intérêt compétitifs pour les prêts, les comptes courants et les épargnes.\n2. **Services personnalisés** : En tant que banque coopérative, le Crédit Mutuel se concentre sur la relation avec ses adhérents et clients, offrant des services personnalisés et adaptés à leurs besoins.\n3. **Sécurité et confidentialité** : Comme banque coopérative, le Crédit Mutuel est soumis à des règles de confidentialité et de sécurité strictes pour protéger les données personnelles et financières de ses adhérents et clients.\n4. **Prêts sans frais de dossier** : Le Crédit Mutuel propose des prêts sans frais de dossier pour certains types de prêts, ce qui peut aider les clients à économiser de l'argent.\n5. **Cartes de crédit et de débit** : Le

KeyboardInterrupt: 

### fp8

In [None]:
# Attempt to load the FP8 Llama 3.1 entry of test_models through sglang_load.
# NOTE(review): this cell has no execution count in the saved notebook; the
# markdown note below it records an internal error during the FP8 model load.
runtime = sglang_load(test_models["llama-3.1:fp8"])

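**INTERNAL ERROR** during FP8 model load: SGLang failed while loading the FP8 checkpoint, so no benchmark results are recorded for this configuration.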

### w8a8

In [7]:
# Load the w8a8-quantized Llama 3.1 entry of test_models through sglang_load.
# NOTE(review): the saved output contains truncated tracebacks from SGLang
# child processes; their cause is not visible here -- confirm before relying on it.
runtime = sglang_load(test_models["llama-3.1:w8a8"])

INFO 09-22 15:04:44 weight_utils.py:242] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Process Process-1:2:
Traceback (most recent call last):
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/workspace/wordslab-llms/.venv/lib/python3.11/site-packages/sglang/srt/managers/detokenizer_manager.py", line 185, in start_detokenizer_process
    loop.run_until_complete(manager.handle_loop())
  File "uvloop/loop.pyx", line 1511, in uvloop.loop.Loop.run_until_complete
  File "uvloop/loop.pyx", line 1504, in uvloop.loop.Loop.run_until_complete
  File "uvloop/loop.pyx", line 1377, in uvloop.loop.Loop.run_forever
  File "uvloop/loop.pyx", line 555, in uvloop.loop.Loop._run
  File "uvloop/handles/poll.pyx", line 216, in uvloop.loop.__on_uvpoll_event
  File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
  File "uvloop/cbhandles.py

In [8]:
# Run the SGLang benchmark helper on test_messages repeated 8 times.
# NOTE(review): the saved output ends with KeyboardInterrupt and contains no
# per-batch throughput lines, so this run was stopped before producing results.
sglang_generate(test_messages*8, runtime)

SGLang performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre divers avantages à ses adhérents et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts attractifs** : Le Crédit Mutuel propose des taux d'intérêt compétitifs pour les prêts, les comptes courants et les épargnes.\n2. **Service personnalisé** : En tant que banque coopérative, le Crédit Mutuel se concentre sur la relation avec ses clients et adhérents, offrant un service personnalisé et attentif.\n3. **Produits financiers adaptés** : Le Crédit Mutuel propose une large gamme de produits financiers adaptés aux besoins de ses clients, tels que les prêts immobiliers, les prêts personnels, les comptes courants, les comptes d'épargne, etc.\n4. **Sécurité et protection** : Le Crédit Mutuel met en place des mesures de sécurité et de protection pour protéger les données et les comptes de ses clients.\n5. **Transparence et responsabilité** : En tant que banque coopérative, 

KeyboardInterrupt: 

### w4a16

In [7]:
# Load the w4a16-quantized Llama 3.1 entry of test_models through sglang_load.
# The recorded log below reports that the gptq_marlin kernel is used for this model.
runtime = sglang_load(test_models["llama-3.1:w4a16"])

INFO 09-22 15:13:32 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-22 15:13:33 weight_utils.py:242] Using model weights format ['*.safetensors']
INFO 09-22 15:13:33 weight_utils.py:287] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Process Process-1:2:
Traceback (most recent call last):
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/workspace/wordslab-llms/.venv/lib/python3.11/site-packages/sglang/srt/managers/detokenizer_manager.py", line 185, in start_detokenizer_process
    loop.run_until_complete(manager.handle_loop())
  File "uvloop/loop.pyx", line 1511, in uvloop.loop.Loop.run_until_complete
  File "uvloop/loop.pyx", line 1504, in uvloop.loop.Loop.run_until_complete
  File "uvloop/loop.pyx", line 1377, in uvloop.loop.Loop.run_forever
  File "uvloop/loop.pyx", line 555, in uvloop.loop.Loop._run
  File "uvloop/handles/poll.pyx", line 216, in uvloop.loop.__on_uvpoll_event
  File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
  File "uvloop/cbhandles.py

In [8]:
# Run the SGLang benchmark helper on test_messages repeated 18 times.
# NOTE(review): the saved output ends with KeyboardInterrupt and contains no
# per-batch throughput lines, so this run was stopped before producing results.
sglang_generate(test_messages*18, runtime)

SGLang performance test:
Generated text: "Le Crédit Mutuel est une banque mutuelle française qui offre divers services financiers à ses clients. Voici quelques-uns des principaux avantages du Crédit Mutuel :\n\n1. **Sécurité et stabilité** : Le Crédit Mutuel est une banque mutuelle, ce qui signifie qu'il est géré par ses membres (clients) et non par des actionnaires. Cela garantit sa stabilité et sa sécurité.\n2. **Services personnalisés** : Le Crédit Mutuel propose des services personnalisés adaptés aux besoins de ses clients, qu'ils soient particuliers ou professionnels.\n3. **Offres de produits variées** : La banque propose une gamme de produits et services, notamment des comptes courants, des épargnes, des crédits, des assurances et des services d'investissement.\n4. **Prêt rapide et à bon taux d'intérêt** : Le Crédit Mutuel offre des prêts à des taux d'intérêt compétitifs et avec des conditions de remboursement flexibles.\n5. **Assistance et conseil** : Les clients du Crédit Mutue

KeyboardInterrupt: 

## ollama

### fp16

In [11]:
# Load the "llama-3.1" entry of ollama_test_models (a mapping defined outside
# this section, separate from test_models) through the ollama_load helper.
model = ollama_load(ollama_test_models["llama-3.1"])

In [17]:
# Run the ollama benchmark helper on the unrepeated test_messages.
# The third argument is the "llama-3.1" entry of test_models; presumably it is
# used for tokenization / token counting -- confirm against ollama_generate.
ollama_generate(test_messages, model, test_models["llama-3.1"])

ollama performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Les membres du Crédit Mutuel bénéficient souvent d'intérêts plus élevés sur leurs épargnes comparés aux banques traditionnelles.\n2. **Prêt à taux réduit** : Le Crédit Mutuel propose des prêts à taux réduits pour les membres, ce qui peut être avantageux pour les personnes qui cherchent à financer une acquisition ou un projet.\n3. **Services personnalisés** : Les conseillers du Crédit Mutuel sont souvent plus accessibles et plus proches de leurs clients que ceux des banques traditionnelles, ce qui permet un service personnalisé et plus adapté aux besoins de chaque membre.\n4. **Participation aux bénéfices** : Les membres du Crédit Mutuel participent aux bénéfices de l'organisme, ce qui peut leur permettre de bénéficier de dividendes ou de primes.\n5. **S

### int8

In [18]:
# Load the "llama-3.1:int8" entry of ollama_test_models through ollama_load,
# rebinding `model` for the benchmark cell that follows.
model = ollama_load(ollama_test_models["llama-3.1:int8"])

In [19]:
# Run the ollama benchmark helper on the unrepeated test_messages with the int8 model.
# NOTE(review): the third argument is still the base "llama-3.1" entry of
# test_models, not an int8-specific id; presumably only its tokenizer is
# needed -- confirm against ollama_generate.
ollama_generate(test_messages, model, test_models["llama-3.1"])

ollama performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre divers avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les comptes courants** : Le Crédit Mutuel propose des taux d'intérêt plus élevés que les banques traditionnelles pour les comptes courants, ce qui peut aider les clients à gagner de l'argent en fonction de leurs dépôts.\n2. **Services personnalisés** : En tant que banque coopérative, le Crédit Mutuel se concentre sur la relation client et offre des services personnalisés pour répondre aux besoins de ses membres et clients.\n3. **Prêts à taux réduits** : Le Crédit Mutuel propose des prêts à des taux d'intérêt réduits, ce qui peut aider les clients à acheter une maison ou à financer un projet important.\n4. **Assurance de crédit groupé** : Le Crédit Mutuel offre une assurance de crédit groupé qui permet aux clients de bénéficier d'une protection en cas de difficul

### int4

In [20]:
# Load the "llama-3.1:int4" entry of ollama_test_models through ollama_load,
# rebinding `model` for the benchmark cell that follows.
model = ollama_load(ollama_test_models["llama-3.1:int4"])

In [21]:
# Run the ollama benchmark helper on the unrepeated test_messages with the int4 model.
# NOTE(review): the third argument is still the base "llama-3.1" entry of
# test_models, not an int4-specific id; presumably only its tokenizer is
# needed -- confirm against ollama_generate.
ollama_generate(test_messages, model, test_models["llama-3.1"])

ollama performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre diverses avantages à ses adhérents et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés** : Le Crédit Mutuel propose des taux d'intérêt plus attractifs que les banques traditionnelles pour les comptes courants, les prêts et les épargnes.\n2. **Système coopératif** : En tant que banque coopérative, le Crédit Mutuel est géré par ses adhérents, qui sont également des actionnaires. Cela signifie que les décisions stratégiques sont prises collectivement, ce qui peut conduire à des avantages pour les membres.\n3. **Services personnalisés** : Le Crédit Mutuel se préoccupe de la personnalisation des services bancaires pour répondre aux besoins spécifiques de ses clients.\n4. **Épargne et épargne à long terme** : Le Crédit Mutuel propose des produits d'épargne à long terme, tels que les livrets et les comptes d'épargne retraite, qui peuvent aider les client

# 4. Performance tests on RTX 4090 - qwen-2.5

## vLLM

### FP16

In [9]:
# Load the "qwen-2.5" entry of test_models through the vllm_load helper
# (defined outside this section) and keep the result as `llm`.
# NOTE(review): this section is titled FP16, but the recorded engine log
# reports dtype=torch.bfloat16 for this model -- confirm which precision is intended.
llm = vllm_load(test_models["qwen-2.5"])

INFO 09-23 23:09:00 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_caching=False, use_async_ou

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-23 23:09:13 model_runner.py:1008] Loading model weights took 14.2487 GB
INFO 09-23 23:09:15 gpu_executor.py:122] # GPU blocks: 8760, # CPU blocks: 4681
INFO 09-23 23:09:16 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-23 23:09:16 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-23 23:09:25 model_runner.py:1430] Graph capturing finished in 9 secs.


In [10]:
# Run the vLLM benchmark helper on test_messages repeated 12 times;
# the recorded output below lists throughput per batch size, starting at 1.
vllm_generate(test_messages*12, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:08<00:00,  8.84s/it, est. speed input: 4.52 toks/s, output: 57.91 toks/s]


Generated text: "Le Crédit Mutuel est une coopérative financière qui propose divers produits et services bancaires. Voici quelques-uns des principaux avantages qu'il offre :\n\n1. **Structure Coopérative** : Le Crédit Mutuel est une coopérative de crédit, ce qui signifie que les clients sont également propriétaires. Cette structure permet une plus grande transparence et responsabilité vis-à-vis des membres.\n\n2. **Service Client Qualifié** : Les conseillers du Crédit Mutuel sont souvent connus pour leur expertise et leur disponibilité. Ils peuvent offrir un service personnalisé et adaptatif selon les besoins spécifiques de chaque client.\n\n3. **Large Gamme de Produits Financiers** : Le Crédit Mutuel propose une large gamme de produits financiers, y compris des comptes courants, des épargnes, des prêts immobiliers, des cartes de crédit, des assurances et des placements.\n\n4. **Réseau de Bureaux Étendu** : Avec un réseau de succursales étendu dans toute la France, le Crédit Mutuel off

Processed prompts: 100%|█████████████| 1/1 [00:08<00:00,  8.67s/it, est. speed input: 4.62 toks/s, output: 58.74 toks/s]


- batch size 1: 58.72 tokens/sec (1 x 58.72)


Processed prompts: 100%|████████████| 2/2 [00:08<00:00,  4.19s/it, est. speed input: 9.67 toks/s, output: 109.95 toks/s]


- batch size 2: 109.91 tokens/sec (2 x 54.95)


Processed prompts: 100%|███████████| 3/3 [00:09<00:00,  3.05s/it, est. speed input: 13.32 toks/s, output: 162.04 toks/s]


- batch size 3: 162.00 tokens/sec (3 x 54.00)


Processed prompts: 100%|███████████| 4/4 [00:09<00:00,  2.30s/it, est. speed input: 17.42 toks/s, output: 203.80 toks/s]


- batch size 4: 203.76 tokens/sec (4 x 50.94)


Processed prompts: 100%|███████████| 5/5 [00:09<00:00,  1.84s/it, est. speed input: 21.78 toks/s, output: 262.65 toks/s]


- batch size 5: 262.59 tokens/sec (5 x 52.52)


Processed prompts: 100%|███████████| 6/6 [00:09<00:00,  1.53s/it, est. speed input: 26.24 toks/s, output: 301.49 toks/s]


- batch size 6: 301.43 tokens/sec (6 x 50.24)


Processed prompts: 100%|███████████| 7/7 [00:09<00:00,  1.32s/it, est. speed input: 30.46 toks/s, output: 369.44 toks/s]


- batch size 7: 369.36 tokens/sec (7 x 52.77)


Processed prompts: 100%|███████████| 8/8 [00:09<00:00,  1.16s/it, est. speed input: 34.51 toks/s, output: 411.00 toks/s]


- batch size 8: 410.91 tokens/sec (8 x 51.36)


Processed prompts: 100%|███████████| 9/9 [00:09<00:00,  1.03s/it, est. speed input: 38.71 toks/s, output: 436.05 toks/s]


- batch size 9: 435.94 tokens/sec (9 x 48.44)


Processed prompts: 100%|█████████| 10/10 [00:09<00:00,  1.07it/s, est. speed input: 42.79 toks/s, output: 520.97 toks/s]


- batch size 10: 520.83 tokens/sec (10 x 52.08)


Processed prompts: 100%|█████████| 11/11 [00:09<00:00,  1.17it/s, est. speed input: 47.03 toks/s, output: 553.62 toks/s]


- batch size 11: 553.48 tokens/sec (11 x 50.32)


Processed prompts: 100%|█████████| 12/12 [00:09<00:00,  1.27it/s, est. speed input: 50.90 toks/s, output: 598.44 toks/s]


- batch size 12: 598.29 tokens/sec (12 x 49.86)


Processed prompts: 100%|█████████| 13/13 [00:09<00:00,  1.38it/s, est. speed input: 55.01 toks/s, output: 653.27 toks/s]


- batch size 13: 653.06 tokens/sec (13 x 50.24)


Processed prompts: 100%|█████████| 14/14 [00:09<00:00,  1.46it/s, est. speed input: 58.59 toks/s, output: 675.58 toks/s]


- batch size 14: 675.36 tokens/sec (14 x 48.24)


Processed prompts: 100%|█████████| 15/15 [00:09<00:00,  1.57it/s, est. speed input: 63.06 toks/s, output: 750.07 toks/s]


- batch size 15: 749.82 tokens/sec (15 x 49.99)


Processed prompts: 100%|█████████| 16/16 [00:09<00:00,  1.67it/s, est. speed input: 66.84 toks/s, output: 797.11 toks/s]


- batch size 16: 796.87 tokens/sec (16 x 49.80)


Processed prompts: 100%|█████████| 17/17 [00:10<00:00,  1.70it/s, est. speed input: 67.90 toks/s, output: 810.12 toks/s]


- batch size 17: 809.87 tokens/sec (17 x 47.64)


Processed prompts: 100%|█████████| 18/18 [00:10<00:00,  1.77it/s, est. speed input: 70.96 toks/s, output: 873.11 toks/s]


- batch size 18: 872.85 tokens/sec (18 x 48.49)


Processed prompts: 100%|█████████| 19/19 [00:10<00:00,  1.86it/s, est. speed input: 74.41 toks/s, output: 853.88 toks/s]


- batch size 19: 853.60 tokens/sec (19 x 44.93)


Processed prompts: 100%|█████████| 20/20 [00:10<00:00,  1.93it/s, est. speed input: 77.09 toks/s, output: 930.31 toks/s]


- batch size 20: 930.01 tokens/sec (20 x 46.50)


Processed prompts: 100%|█████████| 21/21 [00:10<00:00,  2.06it/s, est. speed input: 82.39 toks/s, output: 908.74 toks/s]


- batch size 21: 908.38 tokens/sec (21 x 43.26)


Processed prompts: 100%|█████████| 22/22 [00:10<00:00,  2.10it/s, est. speed input: 84.07 toks/s, output: 989.05 toks/s]


- batch size 22: 988.70 tokens/sec (22 x 44.94)


Processed prompts: 100%|████████| 23/23 [00:10<00:00,  2.13it/s, est. speed input: 85.25 toks/s, output: 1005.10 toks/s]


- batch size 23: 1004.76 tokens/sec (23 x 43.69)


Processed prompts: 100%|████████| 24/24 [00:10<00:00,  2.21it/s, est. speed input: 88.34 toks/s, output: 1039.43 toks/s]


- batch size 24: 1039.11 tokens/sec (24 x 43.30)


Processed prompts: 100%|████████| 25/25 [00:10<00:00,  2.28it/s, est. speed input: 91.28 toks/s, output: 1083.07 toks/s]


- batch size 25: 1082.68 tokens/sec (25 x 43.31)


Processed prompts: 100%|████████| 26/26 [00:11<00:00,  2.36it/s, est. speed input: 94.36 toks/s, output: 1133.52 toks/s]


- batch size 26: 1133.12 tokens/sec (26 x 43.58)


Processed prompts: 100%|████████| 27/27 [00:11<00:00,  2.41it/s, est. speed input: 96.64 toks/s, output: 1129.28 toks/s]


- batch size 27: 1128.84 tokens/sec (27 x 41.81)


Processed prompts: 100%|████████| 28/28 [00:11<00:00,  2.48it/s, est. speed input: 99.40 toks/s, output: 1196.06 toks/s]


- batch size 28: 1195.60 tokens/sec (28 x 42.70)


Processed prompts: 100%|████████| 29/29 [00:11<00:00,  2.48it/s, est. speed input: 99.17 toks/s, output: 1175.92 toks/s]


- batch size 29: 1175.48 tokens/sec (29 x 40.53)


Processed prompts: 100%|███████| 30/30 [00:11<00:00,  2.55it/s, est. speed input: 102.02 toks/s, output: 1234.46 toks/s]


- batch size 30: 1233.98 tokens/sec (30 x 41.13)


Processed prompts: 100%|███████| 31/31 [00:11<00:00,  2.62it/s, est. speed input: 105.03 toks/s, output: 1216.06 toks/s]


- batch size 31: 1215.60 tokens/sec (31 x 39.21)


Processed prompts: 100%|███████| 32/32 [00:11<00:00,  2.71it/s, est. speed input: 108.30 toks/s, output: 1280.88 toks/s]


- batch size 32: 1280.40 tokens/sec (32 x 40.01)


Processed prompts: 100%|███████| 33/33 [00:12<00:00,  2.74it/s, est. speed input: 109.73 toks/s, output: 1304.56 toks/s]


- batch size 33: 1304.05 tokens/sec (33 x 39.52)


Processed prompts: 100%|███████| 34/34 [00:12<00:00,  2.83it/s, est. speed input: 113.29 toks/s, output: 1310.63 toks/s]


- batch size 34: 1310.15 tokens/sec (34 x 38.53)


Processed prompts: 100%|███████| 35/35 [00:12<00:00,  2.91it/s, est. speed input: 116.47 toks/s, output: 1347.38 toks/s]


- batch size 35: 1346.84 tokens/sec (35 x 38.48)


Processed prompts: 100%|███████| 36/36 [00:12<00:00,  2.91it/s, est. speed input: 116.57 toks/s, output: 1407.31 toks/s]


- batch size 36: 1406.76 tokens/sec (36 x 39.08)


Processed prompts: 100%|███████| 37/37 [00:12<00:00,  2.99it/s, est. speed input: 119.49 toks/s, output: 1423.49 toks/s]


- batch size 37: 1422.93 tokens/sec (37 x 38.46)


Processed prompts: 100%|███████| 38/38 [00:12<00:00,  3.05it/s, est. speed input: 122.12 toks/s, output: 1451.83 toks/s]


- batch size 38: 1451.29 tokens/sec (38 x 38.19)


Processed prompts: 100%|███████| 39/39 [00:12<00:00,  3.11it/s, est. speed input: 124.53 toks/s, output: 1477.35 toks/s]


- batch size 39: 1476.76 tokens/sec (39 x 37.87)


Processed prompts: 100%|███████| 40/40 [00:12<00:00,  3.22it/s, est. speed input: 128.61 toks/s, output: 1464.79 toks/s]


- batch size 40: 1464.14 tokens/sec (40 x 36.60)


Processed prompts: 100%|███████| 41/41 [00:12<00:00,  3.29it/s, est. speed input: 131.62 toks/s, output: 1585.34 toks/s]


- batch size 41: 1584.62 tokens/sec (41 x 38.65)


Processed prompts: 100%|███████| 42/42 [00:12<00:00,  3.35it/s, est. speed input: 134.18 toks/s, output: 1547.25 toks/s]


- batch size 42: 1546.53 tokens/sec (42 x 36.82)


Processed prompts: 100%|███████| 43/43 [00:12<00:00,  3.46it/s, est. speed input: 138.75 toks/s, output: 1578.20 toks/s]


- batch size 43: 1577.56 tokens/sec (43 x 36.69)


Processed prompts: 100%|███████| 44/44 [00:12<00:00,  3.49it/s, est. speed input: 139.77 toks/s, output: 1639.35 toks/s]


- batch size 44: 1638.66 tokens/sec (44 x 37.24)


Processed prompts: 100%|███████| 45/45 [00:12<00:00,  3.59it/s, est. speed input: 143.46 toks/s, output: 1663.17 toks/s]


- batch size 45: 1662.36 tokens/sec (45 x 36.94)


Processed prompts: 100%|███████| 46/46 [00:18<00:00,  2.54it/s, est. speed input: 101.48 toks/s, output: 1169.99 toks/s]


- batch size 46: 1169.62 tokens/sec (46 x 25.43)


Processed prompts: 100%|█████████| 47/47 [00:24<00:00,  1.90it/s, est. speed input: 75.92 toks/s, output: 864.81 toks/s]


- batch size 47: 864.60 tokens/sec (47 x 18.40)


Processed prompts: 100%|█████████| 48/48 [00:25<00:00,  1.86it/s, est. speed input: 74.53 toks/s, output: 832.82 toks/s]

- batch size 48: 832.64 tokens/sec (48 x 17.35)





### w8a16

In [8]:
# Load the 8-bit weight-quantized variant; the engine log below shows this key
# resolving to Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8 with the gptq_marlin kernel.
# NOTE(review): the execution count restarts at this point (In [8] after In [10]),
# which suggests a fresh kernel per model -- earlier setup cells must be re-run first.
llm = vllm_load(test_models["qwen-2.5:w8a16"])

INFO 09-23 23:25:51 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-23 23:25:51 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fals

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 09-23 23:26:07 model_runner.py:1008] Loading model weights took 8.3072 GB
INFO 09-23 23:26:09 gpu_executor.py:122] # GPU blocks: 15179, # CPU blocks: 4681
INFO 09-23 23:26:13 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-23 23:26:13 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-23 23:26:23 model_runner.py:1430] Graph capturing finished in 10 secs.


In [9]:
# Throughput sweep for the w8a16 model; the recorded output runs from
# batch size 1 to 72 (presumably 4 prompts x 18 -- TODO confirm).
# NOTE(review): throughput plateaus near 2900 tokens/sec around batch size 64 and
# falls to roughly 2500 from batch size 65 onward in this run; cause not shown here.
vllm_generate(test_messages*18, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:05<00:00,  5.84s/it, est. speed input: 6.86 toks/s, output: 87.75 toks/s]


Generated text: "Le Crédit Mutuel est une coopérative financière qui propose divers produits et services bancaires. Voici quelques-uns des principaux avantages qu'il offre :\n\n1. **Structure Coopérative** : Le Crédit Mutuel est une coopérative de crédit, ce qui signifie que ses actionnaires sont ses clients. Cette structure permet une gestion plus démocratique et une répartition des bénéfices entre les clients.\n\n2. **Services Personnalisés** : Les conseillers du Crédit Mutuel sont formés pour offrir des conseils personnalisés et adaptés aux besoins spécifiques de chaque client. Cela peut être particulièrement avantageux pour les petites entreprises ou les particuliers ayant des situations financières complexes.\n\n3. **Large Portefeuille de Produits Financiers** : Le Crédit Mutuel propose une gamme étendue de produits financiers, allant des comptes courants et épargne à des prêts immobiliers, des cartes bancaires, des assurances et des placements.\n\n4. **Tarifs Avantageux** : Les m

Processed prompts: 100%|█████████████| 1/1 [00:04<00:00,  4.74s/it, est. speed input: 8.44 toks/s, output: 90.52 toks/s]


- batch size 1: 90.46 tokens/sec (1 x 90.46)


Processed prompts: 100%|███████████| 2/2 [00:05<00:00,  2.77s/it, est. speed input: 14.61 toks/s, output: 171.52 toks/s]


- batch size 2: 171.42 tokens/sec (2 x 85.71)


Processed prompts: 100%|███████████| 3/3 [00:05<00:00,  1.93s/it, est. speed input: 21.08 toks/s, output: 265.36 toks/s]


- batch size 3: 265.26 tokens/sec (3 x 88.42)


Processed prompts: 100%|███████████| 4/4 [00:05<00:00,  1.45s/it, est. speed input: 27.58 toks/s, output: 323.73 toks/s]


- batch size 4: 323.60 tokens/sec (4 x 80.90)


Processed prompts: 100%|███████████| 5/5 [00:05<00:00,  1.16s/it, est. speed input: 34.50 toks/s, output: 421.99 toks/s]


- batch size 5: 421.83 tokens/sec (5 x 84.37)


Processed prompts: 100%|███████████| 6/6 [00:05<00:00,  1.03it/s, est. speed input: 41.25 toks/s, output: 496.07 toks/s]


- batch size 6: 495.79 tokens/sec (6 x 82.63)


Processed prompts: 100%|███████████| 7/7 [00:05<00:00,  1.19it/s, est. speed input: 48.02 toks/s, output: 584.91 toks/s]


- batch size 7: 584.65 tokens/sec (7 x 83.52)


Processed prompts: 100%|███████████| 8/8 [00:05<00:00,  1.36it/s, est. speed input: 54.26 toks/s, output: 679.10 toks/s]


- batch size 8: 678.80 tokens/sec (8 x 84.85)


Processed prompts: 100%|███████████| 9/9 [00:05<00:00,  1.52it/s, est. speed input: 60.68 toks/s, output: 723.98 toks/s]


- batch size 9: 723.70 tokens/sec (9 x 80.41)


Processed prompts: 100%|█████████| 10/10 [00:05<00:00,  1.69it/s, est. speed input: 67.64 toks/s, output: 782.67 toks/s]


- batch size 10: 782.31 tokens/sec (10 x 78.23)


Processed prompts: 100%|█████████| 11/11 [00:05<00:00,  1.84it/s, est. speed input: 73.89 toks/s, output: 890.72 toks/s]


- batch size 11: 890.29 tokens/sec (11 x 80.94)


Processed prompts: 100%|█████████| 12/12 [00:06<00:00,  2.00it/s, est. speed input: 79.86 toks/s, output: 946.12 toks/s]


- batch size 12: 945.70 tokens/sec (12 x 78.81)


Processed prompts: 100%|████████| 13/13 [00:06<00:00,  2.12it/s, est. speed input: 84.69 toks/s, output: 1000.19 toks/s]


- batch size 13: 999.55 tokens/sec (13 x 76.89)


Processed prompts: 100%|████████| 14/14 [00:06<00:00,  2.32it/s, est. speed input: 92.78 toks/s, output: 1044.03 toks/s]


- batch size 14: 1043.56 tokens/sec (14 x 74.54)


Processed prompts: 100%|████████| 15/15 [00:06<00:00,  2.47it/s, est. speed input: 99.01 toks/s, output: 1119.38 toks/s]


- batch size 15: 1118.82 tokens/sec (15 x 74.59)


Processed prompts: 100%|███████| 16/16 [00:06<00:00,  2.62it/s, est. speed input: 104.88 toks/s, output: 1224.31 toks/s]


- batch size 16: 1223.61 tokens/sec (16 x 76.48)


Processed prompts: 100%|███████| 17/17 [00:06<00:00,  2.75it/s, est. speed input: 109.91 toks/s, output: 1205.50 toks/s]


- batch size 17: 1204.88 tokens/sec (17 x 70.88)


Processed prompts: 100%|███████| 18/18 [00:06<00:00,  2.73it/s, est. speed input: 109.43 toks/s, output: 1283.44 toks/s]


- batch size 18: 1282.77 tokens/sec (18 x 71.26)


Processed prompts: 100%|███████| 19/19 [00:06<00:00,  2.83it/s, est. speed input: 113.59 toks/s, output: 1333.45 toks/s]


- batch size 19: 1332.77 tokens/sec (19 x 70.15)


Processed prompts: 100%|███████| 20/20 [00:06<00:00,  2.95it/s, est. speed input: 118.17 toks/s, output: 1405.37 toks/s]


- batch size 20: 1404.66 tokens/sec (20 x 70.23)


Processed prompts: 100%|███████| 21/21 [00:06<00:00,  3.07it/s, est. speed input: 122.64 toks/s, output: 1334.92 toks/s]


- batch size 21: 1334.16 tokens/sec (21 x 63.53)


Processed prompts: 100%|███████| 22/22 [00:06<00:00,  3.20it/s, est. speed input: 128.05 toks/s, output: 1493.27 toks/s]


- batch size 22: 1492.36 tokens/sec (22 x 67.83)


Processed prompts: 100%|███████| 23/23 [00:06<00:00,  3.39it/s, est. speed input: 135.84 toks/s, output: 1528.53 toks/s]


- batch size 23: 1527.67 tokens/sec (23 x 66.42)


Processed prompts: 100%|███████| 24/24 [00:06<00:00,  3.49it/s, est. speed input: 139.53 toks/s, output: 1619.51 toks/s]


- batch size 24: 1618.57 tokens/sec (24 x 67.44)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.51it/s, est. speed input: 140.32 toks/s, output: 1616.94 toks/s]


- batch size 25: 1615.98 tokens/sec (25 x 64.64)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.62it/s, est. speed input: 145.06 toks/s, output: 1748.70 toks/s]


- batch size 26: 1747.77 tokens/sec (26 x 67.22)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.79it/s, est. speed input: 152.05 toks/s, output: 1794.28 toks/s]


- batch size 27: 1793.25 tokens/sec (27 x 66.42)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.94it/s, est. speed input: 157.69 toks/s, output: 1796.21 toks/s]


- batch size 28: 1795.13 tokens/sec (28 x 64.11)


Processed prompts: 100%|███████| 29/29 [00:07<00:00,  4.07it/s, est. speed input: 162.87 toks/s, output: 1917.83 toks/s]


- batch size 29: 1916.74 tokens/sec (29 x 66.09)


Processed prompts: 100%|███████| 30/30 [00:07<00:00,  4.16it/s, est. speed input: 166.46 toks/s, output: 1950.58 toks/s]


- batch size 30: 1949.48 tokens/sec (30 x 64.98)


Processed prompts: 100%|███████| 31/31 [00:07<00:00,  4.24it/s, est. speed input: 169.76 toks/s, output: 1970.44 toks/s]


- batch size 31: 1969.22 tokens/sec (31 x 63.52)


Processed prompts: 100%|███████| 32/32 [00:07<00:00,  4.43it/s, est. speed input: 177.37 toks/s, output: 2055.64 toks/s]


- batch size 32: 2054.31 tokens/sec (32 x 64.20)


Processed prompts: 100%|███████| 33/33 [00:07<00:00,  4.33it/s, est. speed input: 173.29 toks/s, output: 2064.23 toks/s]


- batch size 33: 2062.89 tokens/sec (33 x 62.51)


Processed prompts: 100%|███████| 34/34 [00:07<00:00,  4.39it/s, est. speed input: 175.59 toks/s, output: 2062.32 toks/s]


- batch size 34: 2060.81 tokens/sec (34 x 60.61)


Processed prompts: 100%|███████| 35/35 [00:07<00:00,  4.47it/s, est. speed input: 178.92 toks/s, output: 2083.23 toks/s]


- batch size 35: 2081.90 tokens/sec (35 x 59.48)


Processed prompts: 100%|███████| 36/36 [00:07<00:00,  4.63it/s, est. speed input: 185.37 toks/s, output: 2108.89 toks/s]


- batch size 36: 2107.48 tokens/sec (36 x 58.54)


Processed prompts: 100%|███████| 37/37 [00:07<00:00,  4.70it/s, est. speed input: 187.97 toks/s, output: 2201.31 toks/s]


- batch size 37: 2199.58 tokens/sec (37 x 59.45)


Processed prompts: 100%|███████| 38/38 [00:08<00:00,  4.71it/s, est. speed input: 188.44 toks/s, output: 2226.25 toks/s]


- batch size 38: 2224.91 tokens/sec (38 x 58.55)


Processed prompts: 100%|███████| 39/39 [00:08<00:00,  4.85it/s, est. speed input: 194.21 toks/s, output: 2309.98 toks/s]


- batch size 39: 2308.53 tokens/sec (39 x 59.19)


Processed prompts: 100%|███████| 40/40 [00:08<00:00,  4.99it/s, est. speed input: 199.41 toks/s, output: 2338.26 toks/s]


- batch size 40: 2336.66 tokens/sec (40 x 58.42)


Processed prompts: 100%|███████| 41/41 [00:08<00:00,  5.09it/s, est. speed input: 203.48 toks/s, output: 2364.34 toks/s]


- batch size 41: 2362.76 tokens/sec (41 x 57.63)


Processed prompts: 100%|███████| 42/42 [00:08<00:00,  5.11it/s, est. speed input: 204.50 toks/s, output: 2345.51 toks/s]


- batch size 42: 2343.89 tokens/sec (42 x 55.81)


Processed prompts: 100%|███████| 43/43 [00:08<00:00,  5.23it/s, est. speed input: 209.59 toks/s, output: 2484.87 toks/s]


- batch size 43: 2483.26 tokens/sec (43 x 57.75)


Processed prompts: 100%|███████| 44/44 [00:08<00:00,  5.40it/s, est. speed input: 216.17 toks/s, output: 2405.47 toks/s]


- batch size 44: 2403.64 tokens/sec (44 x 54.63)


Processed prompts: 100%|███████| 45/45 [00:08<00:00,  5.39it/s, est. speed input: 215.67 toks/s, output: 2469.73 toks/s]


- batch size 45: 2468.09 tokens/sec (45 x 54.85)


Processed prompts: 100%|███████| 46/46 [00:08<00:00,  5.52it/s, est. speed input: 220.92 toks/s, output: 2581.12 toks/s]


- batch size 46: 2578.76 tokens/sec (46 x 56.06)


Processed prompts: 100%|███████| 47/47 [00:08<00:00,  5.59it/s, est. speed input: 223.84 toks/s, output: 2626.40 toks/s]


- batch size 47: 2624.11 tokens/sec (47 x 55.83)


Processed prompts: 100%|███████| 48/48 [00:08<00:00,  5.59it/s, est. speed input: 223.65 toks/s, output: 2648.52 toks/s]


- batch size 48: 2646.72 tokens/sec (48 x 55.14)


Processed prompts: 100%|███████| 49/49 [00:08<00:00,  5.76it/s, est. speed input: 230.36 toks/s, output: 2773.78 toks/s]


- batch size 49: 2771.97 tokens/sec (49 x 56.57)


Processed prompts: 100%|███████| 50/50 [00:08<00:00,  5.82it/s, est. speed input: 232.76 toks/s, output: 2715.12 toks/s]


- batch size 50: 2712.97 tokens/sec (50 x 54.26)


Processed prompts: 100%|███████| 51/51 [00:08<00:00,  6.07it/s, est. speed input: 243.19 toks/s, output: 2775.67 toks/s]


- batch size 51: 2773.41 tokens/sec (51 x 54.38)


Processed prompts: 100%|███████| 52/52 [00:08<00:00,  6.03it/s, est. speed input: 241.14 toks/s, output: 2851.45 toks/s]


- batch size 52: 2849.18 tokens/sec (52 x 54.79)


Processed prompts: 100%|███████| 53/53 [00:09<00:00,  5.73it/s, est. speed input: 229.38 toks/s, output: 2745.91 toks/s]


- batch size 53: 2743.73 tokens/sec (53 x 51.77)


Processed prompts: 100%|███████| 54/54 [00:09<00:00,  5.56it/s, est. speed input: 222.44 toks/s, output: 2631.25 toks/s]


- batch size 54: 2629.36 tokens/sec (54 x 48.69)


Processed prompts: 100%|███████| 55/55 [00:09<00:00,  5.54it/s, est. speed input: 221.78 toks/s, output: 2626.22 toks/s]


- batch size 55: 2624.44 tokens/sec (55 x 47.72)


Processed prompts: 100%|███████| 56/56 [00:09<00:00,  5.85it/s, est. speed input: 234.03 toks/s, output: 2671.34 toks/s]


- batch size 56: 2669.41 tokens/sec (56 x 47.67)


Processed prompts: 100%|███████| 57/57 [00:09<00:00,  5.92it/s, est. speed input: 236.89 toks/s, output: 2761.86 toks/s]


- batch size 57: 2759.92 tokens/sec (57 x 48.42)


Processed prompts: 100%|███████| 58/58 [00:09<00:00,  5.90it/s, est. speed input: 235.99 toks/s, output: 2755.38 toks/s]


- batch size 58: 2753.53 tokens/sec (58 x 47.47)


Processed prompts: 100%|███████| 59/59 [00:09<00:00,  6.13it/s, est. speed input: 245.53 toks/s, output: 2810.64 toks/s]


- batch size 59: 2808.66 tokens/sec (59 x 47.60)


Processed prompts: 100%|███████| 60/60 [00:10<00:00,  5.99it/s, est. speed input: 239.68 toks/s, output: 2848.30 toks/s]


- batch size 60: 2846.16 tokens/sec (60 x 47.44)


Processed prompts: 100%|███████| 61/61 [00:09<00:00,  6.16it/s, est. speed input: 246.57 toks/s, output: 2849.55 toks/s]


- batch size 61: 2847.53 tokens/sec (61 x 46.68)


Processed prompts: 100%|███████| 62/62 [00:10<00:00,  6.14it/s, est. speed input: 245.51 toks/s, output: 2901.63 toks/s]


- batch size 62: 2899.53 tokens/sec (62 x 46.77)


Processed prompts: 100%|███████| 63/63 [00:10<00:00,  6.23it/s, est. speed input: 249.34 toks/s, output: 2916.67 toks/s]


- batch size 63: 2914.42 tokens/sec (63 x 46.26)


Processed prompts: 100%|███████| 64/64 [00:10<00:00,  6.36it/s, est. speed input: 254.36 toks/s, output: 2892.61 toks/s]


- batch size 64: 2890.26 tokens/sec (64 x 45.16)


Processed prompts: 100%|███████| 65/65 [00:12<00:00,  5.27it/s, est. speed input: 210.88 toks/s, output: 2477.30 toks/s]


- batch size 65: 2475.51 tokens/sec (65 x 38.08)


Processed prompts: 100%|███████| 66/66 [00:12<00:00,  5.37it/s, est. speed input: 215.08 toks/s, output: 2543.39 toks/s]


- batch size 66: 2541.69 tokens/sec (66 x 38.51)


Processed prompts: 100%|███████| 67/67 [00:12<00:00,  5.44it/s, est. speed input: 217.89 toks/s, output: 2507.16 toks/s]


- batch size 67: 2505.37 tokens/sec (67 x 37.39)


Processed prompts: 100%|███████| 68/68 [00:12<00:00,  5.44it/s, est. speed input: 217.55 toks/s, output: 2524.82 toks/s]


- batch size 68: 2523.16 tokens/sec (68 x 37.11)


Processed prompts: 100%|███████| 69/69 [00:12<00:00,  5.43it/s, est. speed input: 217.27 toks/s, output: 2541.87 toks/s]


- batch size 69: 2514.45 tokens/sec (69 x 36.44)


Processed prompts: 100%|███████| 70/70 [00:13<00:00,  5.29it/s, est. speed input: 211.59 toks/s, output: 2524.35 toks/s]


- batch size 70: 2522.71 tokens/sec (70 x 36.04)


Processed prompts: 100%|███████| 71/71 [00:12<00:00,  5.48it/s, est. speed input: 219.54 toks/s, output: 2532.34 toks/s]


- batch size 71: 2530.42 tokens/sec (71 x 35.64)


Processed prompts: 100%|███████| 72/72 [00:13<00:00,  5.41it/s, est. speed input: 216.43 toks/s, output: 2560.34 toks/s]

- batch size 72: 2558.47 tokens/sec (72 x 35.53)





### w4a16

In [8]:
# Load the 4-bit weight-quantized variant; the engine log below shows this key
# resolving to Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4 with the gptq_marlin kernel.
llm = vllm_load(test_models["qwen-2.5:w4a16"])

INFO 09-23 23:36:52 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-23 23:36:52 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fals

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-23 23:37:08 model_runner.py:1008] Loading model weights took 5.2048 GB
INFO 09-23 23:37:12 gpu_executor.py:122] # GPU blocks: 18623, # CPU blocks: 4681
INFO 09-23 23:37:22 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-23 23:37:22 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-23 23:37:33 model_runner.py:1430] Graph capturing finished in 10 secs.


In [9]:
# Throughput sweep for the w4a16 model with test_messages repeated 32 times;
# the upper batch-size bound is presumably 4 x 32 = 128 -- TODO confirm.
vllm_generate(test_messages*32, llm)

vLLM performance test:


Processed prompts: 100%|███████████| 1/1 [00:03<00:00,  3.76s/it, est. speed input: 10.64 toks/s, output: 118.64 toks/s]


Generated text: "Le Crédit Mutuel est une institution financière coopérative, ce qui signifie que ses clients sont également ses actionnaires. Voici quelques avantages qu'il peut offrir :\n\n1. **Coopération et solidarité** : En tant que coopérative, le Crédit Mutuel favorise la solidarité entre ses clients et ses employés. Les décisions sont prises de manière collective et les bénéfices sont répartis entre les membres.\n\n2. **Service client personnalisé** : Le Crédit Mutuel propose souvent des services personnalisés adaptés aux besoins spécifiques de chaque client, en particulier pour les clients de longue date.\n\n3. **Transparence** : Les actions du Crédit Mutuel sont cotées sur les marchés financiers, ce qui permet une transparence accrue sur les performances financières de l'institution.\n\n4. **Tarifs avantageux** : Parce que le Crédit Mutuel est une coopérative, il peut proposer des taux d'intérêt plus bas sur les prêts et des commissions moins élevées sur les comptes bancaires

Processed prompts: 100%|███████████| 1/1 [00:02<00:00,  2.99s/it, est. speed input: 13.38 toks/s, output: 137.44 toks/s]


- batch size 1: 137.34 tokens/sec (1 x 137.34)


Processed prompts: 100%|███████████| 2/2 [00:03<00:00,  1.99s/it, est. speed input: 20.40 toks/s, output: 257.91 toks/s]


- batch size 2: 257.79 tokens/sec (2 x 128.90)


Processed prompts: 100%|███████████| 3/3 [00:03<00:00,  1.31s/it, est. speed input: 31.11 toks/s, output: 383.57 toks/s]


- batch size 3: 383.37 tokens/sec (3 x 127.79)


Processed prompts: 100%|███████████| 4/4 [00:03<00:00,  1.02it/s, est. speed input: 40.89 toks/s, output: 483.51 toks/s]


- batch size 4: 483.10 tokens/sec (4 x 120.78)


Processed prompts: 100%|███████████| 5/5 [00:03<00:00,  1.31it/s, est. speed input: 52.40 toks/s, output: 592.35 toks/s]


- batch size 5: 591.96 tokens/sec (5 x 118.39)


Processed prompts: 100%|███████████| 6/6 [00:03<00:00,  1.51it/s, est. speed input: 60.61 toks/s, output: 712.19 toks/s]


- batch size 6: 711.81 tokens/sec (6 x 118.63)


Processed prompts: 100%|███████████| 7/7 [00:03<00:00,  1.75it/s, est. speed input: 70.59 toks/s, output: 800.81 toks/s]


- batch size 7: 800.27 tokens/sec (7 x 114.32)


Processed prompts: 100%|███████████| 8/8 [00:04<00:00,  1.98it/s, est. speed input: 79.33 toks/s, output: 956.87 toks/s]


- batch size 8: 956.24 tokens/sec (8 x 119.53)


Processed prompts: 100%|███████████| 9/9 [00:04<00:00,  2.23it/s, est. speed input: 89.01 toks/s, output: 975.13 toks/s]


- batch size 9: 974.50 tokens/sec (9 x 108.28)


Processed prompts: 100%|████████| 10/10 [00:04<00:00,  2.44it/s, est. speed input: 97.99 toks/s, output: 1087.17 toks/s]


- batch size 10: 1086.46 tokens/sec (10 x 108.65)


Processed prompts: 100%|███████| 11/11 [00:04<00:00,  2.67it/s, est. speed input: 107.24 toks/s, output: 1249.57 toks/s]


- batch size 11: 1248.85 tokens/sec (11 x 113.53)


Processed prompts: 100%|███████| 12/12 [00:04<00:00,  2.89it/s, est. speed input: 115.73 toks/s, output: 1361.70 toks/s]


- batch size 12: 1360.79 tokens/sec (12 x 113.40)


Processed prompts: 100%|███████| 13/13 [00:04<00:00,  3.04it/s, est. speed input: 121.52 toks/s, output: 1472.05 toks/s]


- batch size 13: 1470.86 tokens/sec (13 x 113.14)


Processed prompts: 100%|███████| 14/14 [00:04<00:00,  3.32it/s, est. speed input: 132.88 toks/s, output: 1604.49 toks/s]


- batch size 14: 1603.42 tokens/sec (14 x 114.53)


Processed prompts: 100%|███████| 15/15 [00:04<00:00,  3.54it/s, est. speed input: 142.24 toks/s, output: 1654.37 toks/s]


- batch size 15: 1652.95 tokens/sec (15 x 110.20)


Processed prompts: 100%|███████| 16/16 [00:04<00:00,  3.76it/s, est. speed input: 150.55 toks/s, output: 1750.16 toks/s]


- batch size 16: 1748.92 tokens/sec (16 x 109.31)


Processed prompts: 100%|███████| 17/17 [00:04<00:00,  3.64it/s, est. speed input: 145.74 toks/s, output: 1739.70 toks/s]


- batch size 17: 1738.42 tokens/sec (17 x 102.26)


Processed prompts: 100%|███████| 18/18 [00:04<00:00,  3.83it/s, est. speed input: 153.37 toks/s, output: 1776.04 toks/s]


- batch size 18: 1774.80 tokens/sec (18 x 98.60)


Processed prompts: 100%|███████| 19/19 [00:04<00:00,  4.03it/s, est. speed input: 161.62 toks/s, output: 1841.91 toks/s]


- batch size 19: 1840.65 tokens/sec (19 x 96.88)


Processed prompts: 100%|███████| 20/20 [00:04<00:00,  4.18it/s, est. speed input: 167.39 toks/s, output: 1924.33 toks/s]


- batch size 20: 1923.07 tokens/sec (20 x 96.15)


Processed prompts: 100%|███████| 21/21 [00:04<00:00,  4.28it/s, est. speed input: 171.14 toks/s, output: 1904.73 toks/s]


- batch size 21: 1903.44 tokens/sec (21 x 90.64)


Processed prompts: 100%|███████| 22/22 [00:04<00:00,  4.48it/s, est. speed input: 179.39 toks/s, output: 2133.93 toks/s]


- batch size 22: 2132.40 tokens/sec (22 x 96.93)


Processed prompts: 100%|███████| 23/23 [00:04<00:00,  4.65it/s, est. speed input: 186.39 toks/s, output: 2186.50 toks/s]


- batch size 23: 2185.02 tokens/sec (23 x 95.00)


Processed prompts: 100%|███████| 24/24 [00:04<00:00,  4.84it/s, est. speed input: 193.69 toks/s, output: 2249.39 toks/s]


- batch size 24: 2247.62 tokens/sec (24 x 93.65)


Processed prompts: 100%|███████| 25/25 [00:05<00:00,  4.95it/s, est. speed input: 198.02 toks/s, output: 2307.50 toks/s]


- batch size 25: 2305.76 tokens/sec (25 x 92.23)


Processed prompts: 100%|███████| 26/26 [00:05<00:00,  4.96it/s, est. speed input: 198.73 toks/s, output: 2362.66 toks/s]


- batch size 26: 2361.01 tokens/sec (26 x 90.81)


Processed prompts: 100%|███████| 27/27 [00:05<00:00,  5.23it/s, est. speed input: 209.78 toks/s, output: 2467.77 toks/s]


- batch size 27: 2465.87 tokens/sec (27 x 91.33)


Processed prompts: 100%|███████| 28/28 [00:05<00:00,  5.40it/s, est. speed input: 216.01 toks/s, output: 2508.40 toks/s]


- batch size 28: 2506.53 tokens/sec (28 x 89.52)


Processed prompts: 100%|███████| 29/29 [00:05<00:00,  5.48it/s, est. speed input: 219.29 toks/s, output: 2644.91 toks/s]


- batch size 29: 2642.54 tokens/sec (29 x 91.12)


Processed prompts: 100%|███████| 30/30 [00:05<00:00,  5.65it/s, est. speed input: 226.25 toks/s, output: 2638.53 toks/s]


- batch size 30: 2635.74 tokens/sec (30 x 87.86)


Processed prompts: 100%|███████| 31/31 [00:05<00:00,  5.68it/s, est. speed input: 227.48 toks/s, output: 2706.46 toks/s]


- batch size 31: 2704.26 tokens/sec (31 x 87.23)


Processed prompts: 100%|███████| 32/32 [00:05<00:00,  5.95it/s, est. speed input: 238.16 toks/s, output: 2813.11 toks/s]


- batch size 32: 2810.80 tokens/sec (32 x 87.84)


Processed prompts: 100%|███████| 33/33 [00:05<00:00,  5.69it/s, est. speed input: 227.46 toks/s, output: 2672.01 toks/s]


- batch size 33: 2669.86 tokens/sec (33 x 80.90)


Processed prompts: 100%|███████| 34/34 [00:05<00:00,  5.82it/s, est. speed input: 232.94 toks/s, output: 2770.51 toks/s]


- batch size 34: 2768.25 tokens/sec (34 x 81.42)


Processed prompts: 100%|███████| 35/35 [00:05<00:00,  5.88it/s, est. speed input: 235.59 toks/s, output: 2690.12 toks/s]


- batch size 35: 2687.94 tokens/sec (35 x 76.80)


Processed prompts: 100%|███████| 36/36 [00:05<00:00,  6.05it/s, est. speed input: 242.10 toks/s, output: 2813.86 toks/s]


- batch size 36: 2811.54 tokens/sec (36 x 78.10)


Processed prompts: 100%|███████| 37/37 [00:05<00:00,  6.19it/s, est. speed input: 247.58 toks/s, output: 2877.06 toks/s]


- batch size 37: 2874.82 tokens/sec (37 x 77.70)


Processed prompts: 100%|███████| 38/38 [00:06<00:00,  6.33it/s, est. speed input: 253.48 toks/s, output: 2907.14 toks/s]


- batch size 38: 2904.42 tokens/sec (38 x 76.43)


Processed prompts: 100%|███████| 39/39 [00:06<00:00,  6.22it/s, est. speed input: 249.09 toks/s, output: 2935.31 toks/s]


- batch size 39: 2932.93 tokens/sec (39 x 75.20)


Processed prompts: 100%|███████| 40/40 [00:06<00:00,  6.48it/s, est. speed input: 259.33 toks/s, output: 3071.90 toks/s]


- batch size 40: 3068.59 tokens/sec (40 x 76.71)


Processed prompts: 100%|███████| 41/41 [00:06<00:00,  6.63it/s, est. speed input: 265.36 toks/s, output: 3073.45 toks/s]


- batch size 41: 3071.04 tokens/sec (41 x 74.90)


Processed prompts: 100%|███████| 42/42 [00:06<00:00,  6.61it/s, est. speed input: 264.63 toks/s, output: 3032.90 toks/s]


- batch size 42: 3030.36 tokens/sec (42 x 72.15)


Processed prompts: 100%|███████| 43/43 [00:06<00:00,  6.85it/s, est. speed input: 274.32 toks/s, output: 3167.53 toks/s]


- batch size 43: 3164.29 tokens/sec (43 x 73.59)


Processed prompts: 100%|███████| 44/44 [00:06<00:00,  6.84it/s, est. speed input: 273.74 toks/s, output: 3260.48 toks/s]


- batch size 44: 3257.81 tokens/sec (44 x 74.04)


Processed prompts: 100%|███████| 45/45 [00:06<00:00,  6.83it/s, est. speed input: 273.12 toks/s, output: 3266.18 toks/s]


- batch size 45: 3263.34 tokens/sec (45 x 72.52)


Processed prompts: 100%|███████| 46/46 [00:06<00:00,  7.18it/s, est. speed input: 287.40 toks/s, output: 3300.00 toks/s]


- batch size 46: 3296.88 tokens/sec (46 x 71.67)


Processed prompts: 100%|███████| 47/47 [00:06<00:00,  7.13it/s, est. speed input: 285.58 toks/s, output: 3387.90 toks/s]


- batch size 47: 3384.62 tokens/sec (47 x 72.01)


Processed prompts: 100%|███████| 48/48 [00:06<00:00,  7.20it/s, est. speed input: 288.18 toks/s, output: 3312.70 toks/s]


- batch size 48: 3309.65 tokens/sec (48 x 68.95)


Processed prompts: 100%|███████| 49/49 [00:07<00:00,  6.56it/s, est. speed input: 262.56 toks/s, output: 3043.37 toks/s]


- batch size 49: 3041.00 tokens/sec (49 x 62.06)


Processed prompts: 100%|███████| 50/50 [00:07<00:00,  6.83it/s, est. speed input: 273.53 toks/s, output: 3179.24 toks/s]


- batch size 50: 3176.82 tokens/sec (50 x 63.54)


Processed prompts: 100%|███████| 51/51 [00:07<00:00,  7.05it/s, est. speed input: 282.22 toks/s, output: 3255.49 toks/s]


- batch size 51: 3252.75 tokens/sec (51 x 63.78)


Processed prompts: 100%|███████| 52/52 [00:07<00:00,  6.71it/s, est. speed input: 268.41 toks/s, output: 3163.84 toks/s]


- batch size 52: 3161.25 tokens/sec (52 x 60.79)


Processed prompts: 100%|███████| 53/53 [00:08<00:00,  6.46it/s, est. speed input: 258.27 toks/s, output: 3008.44 toks/s]


- batch size 53: 3006.03 tokens/sec (53 x 56.72)


Processed prompts: 100%|███████| 54/54 [00:08<00:00,  6.39it/s, est. speed input: 255.68 toks/s, output: 2987.06 toks/s]


- batch size 54: 2984.63 tokens/sec (54 x 55.27)


Processed prompts: 100%|███████| 55/55 [00:08<00:00,  6.34it/s, est. speed input: 253.89 toks/s, output: 2910.54 toks/s]


- batch size 55: 2908.44 tokens/sec (55 x 52.88)


Processed prompts: 100%|███████| 56/56 [00:08<00:00,  6.40it/s, est. speed input: 256.19 toks/s, output: 3037.47 toks/s]


- batch size 56: 3034.97 tokens/sec (56 x 54.20)


Processed prompts: 100%|███████| 57/57 [00:08<00:00,  6.56it/s, est. speed input: 262.44 toks/s, output: 3055.00 toks/s]


- batch size 57: 3052.63 tokens/sec (57 x 53.55)


Processed prompts: 100%|███████| 58/58 [00:08<00:00,  6.48it/s, est. speed input: 259.30 toks/s, output: 3038.55 toks/s]


- batch size 58: 3035.95 tokens/sec (58 x 52.34)


Processed prompts: 100%|███████| 59/59 [00:08<00:00,  6.60it/s, est. speed input: 264.17 toks/s, output: 3073.91 toks/s]


- batch size 59: 3071.15 tokens/sec (59 x 52.05)


Processed prompts: 100%|███████| 60/60 [00:09<00:00,  6.49it/s, est. speed input: 259.64 toks/s, output: 3045.03 toks/s]


- batch size 60: 3042.15 tokens/sec (60 x 50.70)


Processed prompts: 100%|███████| 61/61 [00:09<00:00,  6.66it/s, est. speed input: 266.34 toks/s, output: 3130.84 toks/s]


- batch size 61: 3128.01 tokens/sec (61 x 51.28)


Processed prompts: 100%|███████| 62/62 [00:09<00:00,  6.65it/s, est. speed input: 266.21 toks/s, output: 3119.49 toks/s]


- batch size 62: 3116.78 tokens/sec (62 x 50.27)


Processed prompts: 100%|███████| 63/63 [00:09<00:00,  6.72it/s, est. speed input: 269.08 toks/s, output: 3222.29 toks/s]


- batch size 63: 3219.51 tokens/sec (63 x 51.10)


Processed prompts: 100%|███████| 64/64 [00:09<00:00,  6.81it/s, est. speed input: 272.52 toks/s, output: 3203.51 toks/s]


- batch size 64: 3200.48 tokens/sec (64 x 50.01)


Processed prompts: 100%|███████| 65/65 [00:10<00:00,  5.98it/s, est. speed input: 239.05 toks/s, output: 2796.84 toks/s]


- batch size 65: 2794.87 tokens/sec (65 x 43.00)


Processed prompts: 100%|███████| 66/66 [00:10<00:00,  6.13it/s, est. speed input: 245.12 toks/s, output: 2889.71 toks/s]


- batch size 66: 2887.63 tokens/sec (66 x 43.75)


Processed prompts: 100%|███████| 67/67 [00:10<00:00,  6.25it/s, est. speed input: 250.20 toks/s, output: 2971.73 toks/s]


- batch size 67: 2969.55 tokens/sec (67 x 44.32)


Processed prompts: 100%|███████| 68/68 [00:11<00:00,  6.10it/s, est. speed input: 244.17 toks/s, output: 2857.74 toks/s]


- batch size 68: 2855.63 tokens/sec (68 x 41.99)


Processed prompts: 100%|███████| 69/69 [00:11<00:00,  6.17it/s, est. speed input: 246.73 toks/s, output: 2910.35 toks/s]


- batch size 69: 2908.37 tokens/sec (69 x 42.15)


Processed prompts: 100%|███████| 70/70 [00:11<00:00,  6.05it/s, est. speed input: 241.98 toks/s, output: 2872.72 toks/s]


- batch size 70: 2870.37 tokens/sec (70 x 41.01)


Processed prompts: 100%|███████| 71/71 [00:11<00:00,  6.33it/s, est. speed input: 253.50 toks/s, output: 2975.56 toks/s]


- batch size 71: 2973.39 tokens/sec (71 x 41.88)


Processed prompts: 100%|███████| 72/72 [00:11<00:00,  6.20it/s, est. speed input: 248.07 toks/s, output: 2891.84 toks/s]


- batch size 72: 2889.52 tokens/sec (72 x 40.13)


Processed prompts: 100%|███████| 73/73 [00:11<00:00,  6.23it/s, est. speed input: 249.14 toks/s, output: 2819.00 toks/s]


- batch size 73: 2817.01 tokens/sec (73 x 38.59)


Processed prompts: 100%|███████| 74/74 [00:12<00:00,  5.73it/s, est. speed input: 229.12 toks/s, output: 2694.05 toks/s]


- batch size 74: 2692.41 tokens/sec (74 x 36.38)


Processed prompts:   0%|                     | 0/75 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### 14 B - 8 bits

In [None]:
# Build the vLLM engine for the checkpoint registered under the
# "qwen-2.5-14b:w8a16" alias and bind it to `llm` for the benchmark cell below.
llm = vllm_load(
    test_models["qwen-2.5-14b:w8a16"]
)

In [None]:
# Run the vLLM throughput benchmark on the engine currently bound to `llm`.
# NOTE(review): `test_messages*18` presumably repeats a prompt list so that
# enough prompts exist for the larger batch sizes — confirm against vllm_generate.
vllm_generate(
    test_messages * 18,
    llm,
)

### 14 B - 4 bits

In [8]:
# Build the vLLM engine for the 4-bit 14B checkpoint; in the recorded run this
# alias resolved to 'Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4' (gptq_marlin kernel).
# Rebinds `llm`, so the benchmark cell below measures this model.
llm = vllm_load(
    test_models["qwen-2.5-14b:w4a16"]
)

INFO 09-23 23:47:35 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-23 23:47:35 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', speculative_config=None, tokenizer='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fa

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 09-23 23:47:46 model_runner.py:1008] Loading model weights took 9.3813 GB
INFO 09-23 23:47:48 gpu_executor.py:122] # GPU blocks: 4153, # CPU blocks: 1365
INFO 09-23 23:48:00 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-23 23:48:00 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-23 23:48:11 model_runner.py:1430] Graph capturing finished in 11 secs.


In [9]:
# Run the vLLM throughput benchmark on the engine currently bound to `llm`.
# The recorded run prints one sample generation and then one throughput line
# per batch size (1, 2, 3, ...); it was stopped manually (KeyboardInterrupt).
# NOTE(review): `test_messages*18` presumably repeats a prompt list so that
# enough prompts exist for the larger batch sizes — confirm against vllm_generate.
vllm_generate(
    test_messages * 18,
    llm,
)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:05<00:00,  5.47s/it, est. speed input: 7.31 toks/s, output: 74.38 toks/s]


Generated text: "Le Crédit Mutuel offre plusieurs avantages à ses clients :\n\n1. **Taux d'intérêt compétitifs** : Le Crédit Mutuel propose souvent des taux d'intérêt attractifs pour les prêts, les comptes et les produits d'épargne.\n\n2. **Produits variés** : La banque propose une large gamme de services et produits adaptés aux besoins de chaque client, que ce soit pour l'entrepreneuriat, la gestion de patrimoine, ou encore le financement immobilier.\n\n3. **Réseau étendu** : Avec son réseau de plus de 2000 agences, il est facile de trouver un point de service proche de chez soi ou de son lieu de travail.\n\n4. **Flexibilité et personnalisation** : Les offres du Crédit Mutuel sont généralement très flexibles et peuvent être personnalisées selon les besoins spécifiques du client.\n\n5. **Services digitaux** : L'institut bancaire offre des solutions numériques performantes comme le Crédit Mutuel Mobile et Crédit Mutuel Online, permettant aux clients de gérer leurs comptes où qu'ils soie

Processed prompts: 100%|█████████████| 1/1 [00:04<00:00,  4.43s/it, est. speed input: 9.03 toks/s, output: 80.14 toks/s]


- batch size 1: 80.10 tokens/sec (1 x 80.10)


Processed prompts: 100%|███████████| 2/2 [00:05<00:00,  2.77s/it, est. speed input: 14.62 toks/s, output: 142.95 toks/s]


- batch size 2: 142.89 tokens/sec (2 x 71.44)


Processed prompts: 100%|███████████| 3/3 [00:05<00:00,  1.67s/it, est. speed input: 24.39 toks/s, output: 219.51 toks/s]


- batch size 3: 219.40 tokens/sec (3 x 73.13)


Processed prompts: 100%|███████████| 4/4 [00:04<00:00,  1.15s/it, est. speed input: 34.73 toks/s, output: 294.95 toks/s]


- batch size 4: 294.82 tokens/sec (4 x 73.70)


Processed prompts: 100%|███████████| 5/5 [00:05<00:00,  1.08s/it, est. speed input: 36.88 toks/s, output: 352.35 toks/s]


- batch size 5: 352.17 tokens/sec (5 x 70.43)


Processed prompts: 100%|███████████| 6/6 [00:06<00:00,  1.06s/it, est. speed input: 38.03 toks/s, output: 374.16 toks/s]


- batch size 6: 374.00 tokens/sec (6 x 62.33)


Processed prompts: 100%|███████████| 7/7 [00:05<00:00,  1.30it/s, est. speed input: 52.24 toks/s, output: 465.49 toks/s]


- batch size 7: 465.27 tokens/sec (7 x 66.47)


Processed prompts: 100%|███████████| 8/8 [00:05<00:00,  1.43it/s, est. speed input: 57.32 toks/s, output: 515.48 toks/s]


- batch size 8: 515.16 tokens/sec (8 x 64.39)


Processed prompts: 100%|███████████| 9/9 [00:05<00:00,  1.65it/s, est. speed input: 66.10 toks/s, output: 607.56 toks/s]


- batch size 9: 607.24 tokens/sec (9 x 67.47)


Processed prompts: 100%|█████████| 10/10 [00:06<00:00,  1.64it/s, est. speed input: 65.94 toks/s, output: 613.34 toks/s]


- batch size 10: 613.02 tokens/sec (10 x 61.30)


Processed prompts: 100%|█████████| 11/11 [00:05<00:00,  1.90it/s, est. speed input: 76.48 toks/s, output: 700.43 toks/s]


- batch size 11: 700.08 tokens/sec (11 x 63.64)


Processed prompts: 100%|█████████| 12/12 [00:06<00:00,  2.00it/s, est. speed input: 79.89 toks/s, output: 762.63 toks/s]


- batch size 12: 762.25 tokens/sec (12 x 63.52)


Processed prompts: 100%|█████████| 13/13 [00:05<00:00,  2.36it/s, est. speed input: 94.24 toks/s, output: 863.36 toks/s]


- batch size 13: 862.81 tokens/sec (13 x 66.37)


Processed prompts: 100%|█████████| 14/14 [00:06<00:00,  2.29it/s, est. speed input: 91.89 toks/s, output: 881.08 toks/s]


- batch size 14: 880.58 tokens/sec (14 x 62.90)


Processed prompts: 100%|████████| 15/15 [00:05<00:00,  2.57it/s, est. speed input: 103.24 toks/s, output: 946.09 toks/s]


- batch size 15: 945.60 tokens/sec (15 x 63.04)


Processed prompts: 100%|███████| 16/16 [00:05<00:00,  2.67it/s, est. speed input: 106.75 toks/s, output: 1003.90 toks/s]


- batch size 16: 1003.33 tokens/sec (16 x 62.71)


Processed prompts: 100%|█████████| 17/17 [00:07<00:00,  2.33it/s, est. speed input: 93.01 toks/s, output: 873.79 toks/s]


- batch size 17: 873.38 tokens/sec (17 x 51.38)


Processed prompts: 100%|███████| 18/18 [00:06<00:00,  2.71it/s, est. speed input: 108.37 toks/s, output: 1009.19 toks/s]


- batch size 18: 1008.69 tokens/sec (18 x 56.04)


Processed prompts: 100%|███████| 19/19 [00:06<00:00,  2.98it/s, est. speed input: 119.32 toks/s, output: 1071.35 toks/s]


- batch size 19: 1070.72 tokens/sec (19 x 56.35)


Processed prompts: 100%|███████| 20/20 [00:06<00:00,  2.95it/s, est. speed input: 117.98 toks/s, output: 1078.93 toks/s]


- batch size 20: 1078.30 tokens/sec (20 x 53.92)


Processed prompts: 100%|███████| 21/21 [00:06<00:00,  3.36it/s, est. speed input: 134.57 toks/s, output: 1190.97 toks/s]


- batch size 21: 1190.29 tokens/sec (21 x 56.68)


Processed prompts: 100%|███████| 22/22 [00:06<00:00,  3.20it/s, est. speed input: 128.32 toks/s, output: 1194.21 toks/s]


- batch size 22: 1193.51 tokens/sec (22 x 54.25)


Processed prompts: 100%|███████| 23/23 [00:06<00:00,  3.33it/s, est. speed input: 133.68 toks/s, output: 1252.70 toks/s]


- batch size 23: 1251.99 tokens/sec (23 x 54.43)


Processed prompts: 100%|███████| 24/24 [00:06<00:00,  3.43it/s, est. speed input: 137.19 toks/s, output: 1273.75 toks/s]


- batch size 24: 1272.97 tokens/sec (24 x 53.04)


Processed prompts: 100%|███████| 25/25 [00:07<00:00,  3.17it/s, est. speed input: 126.62 toks/s, output: 1187.19 toks/s]


- batch size 25: 1186.57 tokens/sec (25 x 47.46)


Processed prompts: 100%|███████| 26/26 [00:07<00:00,  3.54it/s, est. speed input: 141.69 toks/s, output: 1342.83 toks/s]


- batch size 26: 1341.99 tokens/sec (26 x 51.62)


Processed prompts: 100%|███████| 27/27 [00:07<00:00,  3.67it/s, est. speed input: 147.04 toks/s, output: 1355.16 toks/s]


- batch size 27: 1354.31 tokens/sec (27 x 50.16)


Processed prompts: 100%|███████| 28/28 [00:07<00:00,  3.80it/s, est. speed input: 152.04 toks/s, output: 1467.63 toks/s]


- batch size 28: 1466.79 tokens/sec (28 x 52.39)


Processed prompts: 100%|███████| 29/29 [00:07<00:00,  4.14it/s, est. speed input: 165.69 toks/s, output: 1525.47 toks/s]


- batch size 29: 1524.46 tokens/sec (29 x 52.57)


Processed prompts: 100%|███████| 30/30 [00:07<00:00,  3.98it/s, est. speed input: 159.34 toks/s, output: 1494.52 toks/s]


- batch size 30: 1493.66 tokens/sec (30 x 49.79)


Processed prompts: 100%|███████| 31/31 [00:07<00:00,  4.05it/s, est. speed input: 162.35 toks/s, output: 1557.71 toks/s]


- batch size 31: 1556.69 tokens/sec (31 x 50.22)


Processed prompts: 100%|███████| 32/32 [00:07<00:00,  4.08it/s, est. speed input: 163.11 toks/s, output: 1568.23 toks/s]


- batch size 32: 1567.02 tokens/sec (32 x 48.97)


Processed prompts: 100%|███████| 33/33 [00:08<00:00,  4.09it/s, est. speed input: 163.41 toks/s, output: 1479.35 toks/s]


- batch size 33: 1478.57 tokens/sec (33 x 44.81)


Processed prompts: 100%|███████| 34/34 [00:08<00:00,  4.23it/s, est. speed input: 169.48 toks/s, output: 1606.61 toks/s]


- batch size 34: 1605.70 tokens/sec (34 x 47.23)


Processed prompts: 100%|███████| 35/35 [00:08<00:00,  4.34it/s, est. speed input: 174.02 toks/s, output: 1553.49 toks/s]


- batch size 35: 1552.39 tokens/sec (35 x 44.35)


Processed prompts: 100%|███████| 36/36 [00:08<00:00,  4.24it/s, est. speed input: 169.56 toks/s, output: 1571.02 toks/s]


- batch size 36: 1570.01 tokens/sec (36 x 43.61)


Processed prompts: 100%|███████| 37/37 [00:08<00:00,  4.48it/s, est. speed input: 179.21 toks/s, output: 1634.30 toks/s]


- batch size 37: 1633.28 tokens/sec (37 x 44.14)


Processed prompts: 100%|███████| 38/38 [00:08<00:00,  4.66it/s, est. speed input: 186.37 toks/s, output: 1731.27 toks/s]


- batch size 38: 1729.33 tokens/sec (38 x 45.51)


Processed prompts: 100%|███████| 39/39 [00:08<00:00,  4.50it/s, est. speed input: 180.35 toks/s, output: 1662.63 toks/s]


- batch size 39: 1661.66 tokens/sec (39 x 42.61)


Processed prompts: 100%|███████| 40/40 [00:08<00:00,  4.77it/s, est. speed input: 190.84 toks/s, output: 1746.20 toks/s]


- batch size 40: 1745.06 tokens/sec (40 x 43.63)


Processed prompts: 100%|███████| 41/41 [00:08<00:00,  4.68it/s, est. speed input: 187.15 toks/s, output: 1748.12 toks/s]


- batch size 41: 1746.98 tokens/sec (41 x 42.61)


Processed prompts: 100%|███████| 42/42 [00:08<00:00,  5.02it/s, est. speed input: 200.74 toks/s, output: 1821.95 toks/s]


- batch size 42: 1820.58 tokens/sec (42 x 43.35)


Processed prompts: 100%|███████| 43/43 [00:08<00:00,  5.00it/s, est. speed input: 200.09 toks/s, output: 1839.14 toks/s]


- batch size 43: 1837.78 tokens/sec (43 x 42.74)


Processed prompts: 100%|███████| 44/44 [00:09<00:00,  4.85it/s, est. speed input: 193.92 toks/s, output: 1799.72 toks/s]


- batch size 44: 1798.06 tokens/sec (44 x 40.87)


Processed prompts: 100%|███████| 45/45 [00:08<00:00,  5.14it/s, est. speed input: 205.66 toks/s, output: 1916.34 toks/s]


- batch size 45: 1915.05 tokens/sec (45 x 42.56)


Processed prompts: 100%|███████| 46/46 [00:09<00:00,  5.02it/s, est. speed input: 200.72 toks/s, output: 1875.53 toks/s]


- batch size 46: 1874.20 tokens/sec (46 x 40.74)


Processed prompts: 100%|███████| 47/47 [00:09<00:00,  5.00it/s, est. speed input: 200.37 toks/s, output: 1853.17 toks/s]


- batch size 47: 1852.00 tokens/sec (47 x 39.40)


Processed prompts: 100%|███████| 48/48 [00:09<00:00,  4.84it/s, est. speed input: 193.57 toks/s, output: 1845.19 toks/s]


- batch size 48: 1843.98 tokens/sec (48 x 38.42)


Processed prompts: 100%|███████| 49/49 [00:10<00:00,  4.84it/s, est. speed input: 193.66 toks/s, output: 1771.83 toks/s]


- batch size 49: 1770.48 tokens/sec (49 x 36.13)


Processed prompts: 100%|███████| 50/50 [00:10<00:00,  4.76it/s, est. speed input: 190.52 toks/s, output: 1679.34 toks/s]


- batch size 50: 1677.66 tokens/sec (50 x 33.55)


Processed prompts: 100%|███████| 51/51 [00:11<00:00,  4.48it/s, est. speed input: 179.34 toks/s, output: 1662.83 toks/s]


- batch size 51: 1661.91 tokens/sec (51 x 32.59)


Processed prompts: 100%|███████| 52/52 [00:10<00:00,  4.90it/s, est. speed input: 196.09 toks/s, output: 1790.20 toks/s]


- batch size 52: 1789.01 tokens/sec (52 x 34.40)


Processed prompts: 100%|███████| 53/53 [00:11<00:00,  4.62it/s, est. speed input: 184.69 toks/s, output: 1672.65 toks/s]


- batch size 53: 1671.29 tokens/sec (53 x 31.53)


Processed prompts: 100%|███████| 54/54 [00:10<00:00,  5.05it/s, est. speed input: 201.92 toks/s, output: 1880.18 toks/s]


- batch size 54: 1878.82 tokens/sec (54 x 34.79)


Processed prompts: 100%|███████| 55/55 [00:11<00:00,  4.88it/s, est. speed input: 195.24 toks/s, output: 1801.03 toks/s]


- batch size 55: 1799.96 tokens/sec (55 x 32.73)


Processed prompts: 100%|███████| 56/56 [00:12<00:00,  4.61it/s, est. speed input: 184.27 toks/s, output: 1762.44 toks/s]


- batch size 56: 1761.49 tokens/sec (56 x 31.46)


Processed prompts: 100%|███████| 57/57 [00:11<00:00,  4.78it/s, est. speed input: 191.31 toks/s, output: 1778.63 toks/s]


- batch size 57: 1777.61 tokens/sec (57 x 31.19)


Processed prompts: 100%|███████| 58/58 [00:12<00:00,  4.74it/s, est. speed input: 189.66 toks/s, output: 1771.94 toks/s]


- batch size 58: 1770.90 tokens/sec (58 x 30.53)


Processed prompts: 100%|███████| 59/59 [00:12<00:00,  4.81it/s, est. speed input: 192.71 toks/s, output: 1810.96 toks/s]


- batch size 59: 1809.98 tokens/sec (59 x 30.68)


Processed prompts: 100%|███████| 60/60 [00:12<00:00,  4.83it/s, est. speed input: 193.06 toks/s, output: 1831.53 toks/s]


- batch size 60: 1830.46 tokens/sec (60 x 30.51)


Processed prompts: 100%|███████| 61/61 [00:11<00:00,  5.12it/s, est. speed input: 204.93 toks/s, output: 1933.12 toks/s]


- batch size 61: 1931.93 tokens/sec (61 x 31.67)


Processed prompts: 100%|███████| 62/62 [00:12<00:00,  5.05it/s, est. speed input: 202.01 toks/s, output: 1857.90 toks/s]


- batch size 62: 1856.78 tokens/sec (62 x 29.95)


Processed prompts: 100%|███████| 63/63 [00:12<00:00,  5.12it/s, est. speed input: 205.01 toks/s, output: 1926.46 toks/s]


- batch size 63: 1925.28 tokens/sec (63 x 30.56)


Processed prompts: 100%|█████████| 64/64 [00:30<00:00,  2.12it/s, est. speed input: 84.92 toks/s, output: 782.97 toks/s]


- batch size 64: 782.78 tokens/sec (64 x 12.23)


Processed prompts: 100%|█████████| 65/65 [00:32<00:00,  2.02it/s, est. speed input: 80.73 toks/s, output: 741.98 toks/s]


- batch size 65: 741.81 tokens/sec (65 x 11.41)


Processed prompts:   0%|                     | 0/66 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

### 32 B - 4 bits

In [8]:
# Build the vLLM engine for the 4-bit 32B checkpoint; in the recorded run this
# alias resolved to 'Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4' (gptq_marlin kernel).
# Rebinds `llm`, so the benchmark cell below measures this model.
llm = vllm_load(
    test_models["qwen-2.5-32b:w4a16"]
)

INFO 09-23 23:58:35 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-23 23:58:35 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', speculative_config=None, tokenizer='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fa

Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]


INFO 09-24 00:01:20 model_runner.py:1008] Loading model weights took 18.1477 GB
INFO 09-24 00:01:26 gpu_executor.py:122] # GPU blocks: 786, # CPU blocks: 1024
INFO 09-24 00:01:31 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-24 00:01:31 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-24 00:01:44 model_runner.py:1430] Graph capturing finished in 13 secs.


In [11]:
# Run the vLLM throughput benchmark on the engine currently bound to `llm`;
# the recorded run prints one throughput line per batch size (1, 2, 3, ...).
# NOTE(review): `test_messages*18` presumably repeats a prompt list so that
# enough prompts exist for the larger batch sizes — confirm against vllm_generate.
vllm_generate(
    test_messages * 18,
    llm,
)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:10<00:00, 10.67s/it, est. speed input: 3.75 toks/s, output: 40.86 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses clients :\n\n1. **Service personnalisé** : En tant que banque coopérative, le Crédit Mutuel met l'accent sur le service client de proximité. Les conseillers sont généralement bien formés pour comprendre les besoins spécifiques des clients.\n\n2. **Tarification avantageuse** : Le Crédit Mutuel propose souvent des tarifs compétitifs pour ses produits et services, ce qui peut être particulièrement intéressant pour les emprunts immobiliers ou les placements financiers.\n\n3. **Engagement sociétal et environnemental** : La banque s'engage dans diverses initiatives pour soutenir l'économie locale, promouvoir la durabilité et le développement durable. Cela peut être important pour les clients soucieux de ces questions.\n\n4. **Réseau étendu** : Avec plus de 6000 agences réparties en France, le Crédit Mutuel offre une grande couverture géographique, facilitant ainsi l'accès aux services b

Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.83s/it, est. speed input: 4.07 toks/s, output: 42.82 toks/s]


- batch size 1: 42.81 tokens/sec (1 x 42.81)


Processed prompts: 100%|█████████████| 2/2 [00:09<00:00,  4.79s/it, est. speed input: 8.46 toks/s, output: 82.93 toks/s]


- batch size 2: 82.91 tokens/sec (2 x 41.46)


Processed prompts: 100%|███████████| 3/3 [00:10<00:00,  3.51s/it, est. speed input: 11.59 toks/s, output: 117.27 toks/s]


- batch size 3: 117.22 tokens/sec (3 x 39.07)


Processed prompts: 100%|███████████| 4/4 [00:10<00:00,  2.53s/it, est. speed input: 15.79 toks/s, output: 150.83 toks/s]


- batch size 4: 150.77 tokens/sec (4 x 37.69)


Processed prompts: 100%|███████████| 5/5 [00:10<00:00,  2.16s/it, est. speed input: 18.54 toks/s, output: 198.50 toks/s]


- batch size 5: 198.46 tokens/sec (5 x 39.69)


Processed prompts: 100%|███████████| 6/6 [00:10<00:00,  1.79s/it, est. speed input: 22.45 toks/s, output: 227.79 toks/s]


- batch size 6: 227.73 tokens/sec (6 x 37.96)


Processed prompts: 100%|███████████| 7/7 [00:12<00:00,  1.78s/it, est. speed input: 22.60 toks/s, output: 250.26 toks/s]


- batch size 7: 250.20 tokens/sec (7 x 35.74)


Processed prompts: 100%|███████████| 8/8 [00:12<00:00,  1.54s/it, est. speed input: 26.01 toks/s, output: 285.07 toks/s]


- batch size 8: 284.99 tokens/sec (8 x 35.62)


Processed prompts: 100%|███████████| 9/9 [00:11<00:00,  1.33s/it, est. speed input: 30.03 toks/s, output: 324.76 toks/s]


- batch size 9: 324.68 tokens/sec (9 x 36.08)


Processed prompts: 100%|█████████| 10/10 [00:12<00:00,  1.27s/it, est. speed input: 31.58 toks/s, output: 338.69 toks/s]


- batch size 10: 338.62 tokens/sec (10 x 33.86)


Processed prompts: 100%|█████████| 11/11 [00:11<00:00,  1.07s/it, est. speed input: 37.54 toks/s, output: 394.56 toks/s]


- batch size 11: 394.44 tokens/sec (11 x 35.86)


Processed prompts: 100%|█████████| 12/12 [00:12<00:00,  1.06s/it, est. speed input: 37.66 toks/s, output: 388.32 toks/s]


- batch size 12: 388.22 tokens/sec (12 x 32.35)


Processed prompts: 100%|█████████| 13/13 [00:12<00:00,  1.03it/s, est. speed input: 41.20 toks/s, output: 436.05 toks/s]


- batch size 13: 435.92 tokens/sec (13 x 33.53)


Processed prompts: 100%|█████████| 14/14 [00:12<00:00,  1.10it/s, est. speed input: 44.01 toks/s, output: 431.60 toks/s]


- batch size 14: 431.48 tokens/sec (14 x 30.82)


Processed prompts: 100%|█████████| 15/15 [00:12<00:00,  1.19it/s, est. speed input: 47.81 toks/s, output: 511.34 toks/s]


- batch size 15: 511.21 tokens/sec (15 x 34.08)


Processed prompts: 100%|█████████| 16/16 [00:12<00:00,  1.26it/s, est. speed input: 50.45 toks/s, output: 507.65 toks/s]


- batch size 16: 507.51 tokens/sec (16 x 31.72)


Processed prompts: 100%|█████████| 17/17 [00:14<00:00,  1.21it/s, est. speed input: 48.56 toks/s, output: 504.19 toks/s]


- batch size 17: 504.06 tokens/sec (17 x 29.65)


Processed prompts: 100%|█████████| 18/18 [00:13<00:00,  1.30it/s, est. speed input: 51.93 toks/s, output: 522.20 toks/s]


- batch size 18: 522.07 tokens/sec (18 x 29.00)


Processed prompts: 100%|█████████| 19/19 [00:14<00:00,  1.35it/s, est. speed input: 54.10 toks/s, output: 562.59 toks/s]


- batch size 19: 562.43 tokens/sec (19 x 29.60)


Processed prompts: 100%|█████████| 20/20 [00:14<00:00,  1.38it/s, est. speed input: 55.29 toks/s, output: 587.85 toks/s]


- batch size 20: 587.67 tokens/sec (20 x 29.38)


Processed prompts: 100%|█████████| 21/21 [00:14<00:00,  1.43it/s, est. speed input: 57.28 toks/s, output: 604.76 toks/s]


- batch size 21: 604.58 tokens/sec (21 x 28.79)


Processed prompts: 100%|█████████| 22/22 [00:14<00:00,  1.56it/s, est. speed input: 62.58 toks/s, output: 634.73 toks/s]


- batch size 22: 634.55 tokens/sec (22 x 28.84)


Processed prompts: 100%|█████████| 23/23 [00:14<00:00,  1.64it/s, est. speed input: 65.68 toks/s, output: 661.96 toks/s]


- batch size 23: 661.76 tokens/sec (23 x 28.77)


Processed prompts: 100%|█████████| 24/24 [00:14<00:00,  1.66it/s, est. speed input: 66.24 toks/s, output: 689.57 toks/s]


- batch size 24: 689.38 tokens/sec (24 x 28.72)


Processed prompts: 100%|█████████| 25/25 [00:14<00:00,  1.67it/s, est. speed input: 66.78 toks/s, output: 694.57 toks/s]


- batch size 25: 694.39 tokens/sec (25 x 27.78)


Processed prompts: 100%|█████████| 26/26 [00:13<00:00,  1.94it/s, est. speed input: 77.53 toks/s, output: 773.54 toks/s]


- batch size 26: 773.28 tokens/sec (26 x 29.74)


Processed prompts: 100%|█████████| 27/27 [00:14<00:00,  1.85it/s, est. speed input: 74.02 toks/s, output: 747.24 toks/s]


- batch size 27: 747.01 tokens/sec (27 x 27.67)


Processed prompts: 100%|█████████| 28/28 [00:15<00:00,  1.87it/s, est. speed input: 74.62 toks/s, output: 756.42 toks/s]


- batch size 28: 756.22 tokens/sec (28 x 27.01)


Processed prompts: 100%|█████████| 29/29 [00:14<00:00,  1.96it/s, est. speed input: 78.31 toks/s, output: 782.75 toks/s]


- batch size 29: 782.52 tokens/sec (29 x 26.98)


Processed prompts: 100%|█████████| 30/30 [00:15<00:00,  2.00it/s, est. speed input: 79.98 toks/s, output: 811.81 toks/s]


- batch size 30: 811.55 tokens/sec (30 x 27.05)


Processed prompts: 100%|█████████| 31/31 [00:15<00:00,  2.04it/s, est. speed input: 81.89 toks/s, output: 850.78 toks/s]


- batch size 31: 850.43 tokens/sec (31 x 27.43)


Processed prompts: 100%|█████████| 32/32 [00:15<00:00,  2.12it/s, est. speed input: 84.78 toks/s, output: 874.51 toks/s]


- batch size 32: 874.22 tokens/sec (32 x 27.32)


Processed prompts:   0%|                     | 0/33 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|█████████| 33/33 [00:16<00:00,  2.02it/s, est. speed input: 80.87 toks/s, output: 852.82 toks/s]


- batch size 33: 852.54 tokens/sec (33 x 25.83)


Processed prompts: 100%|█████████| 34/34 [00:17<00:00,  1.99it/s, est. speed input: 79.76 toks/s, output: 828.85 toks/s]


- batch size 34: 828.55 tokens/sec (34 x 24.37)


Processed prompts: 100%|█████████| 35/35 [00:17<00:00,  2.06it/s, est. speed input: 82.44 toks/s, output: 860.64 toks/s]


- batch size 35: 860.38 tokens/sec (35 x 24.58)


Processed prompts: 100%|█████████| 36/36 [00:17<00:00,  2.07it/s, est. speed input: 82.77 toks/s, output: 871.29 toks/s]


- batch size 36: 871.03 tokens/sec (36 x 24.20)


Processed prompts: 100%|█████████| 37/37 [00:19<00:00,  1.91it/s, est. speed input: 76.32 toks/s, output: 796.70 toks/s]


- batch size 37: 796.46 tokens/sec (37 x 21.53)


Processed prompts: 100%|█████████| 38/38 [00:19<00:00,  1.98it/s, est. speed input: 79.35 toks/s, output: 839.02 toks/s]


- batch size 38: 838.76 tokens/sec (38 x 22.07)


Processed prompts:  18%|█▊        | 7/39 [00:14<00:26,  1.21it/s, est. speed input: 20.03 toks/s, output: 181.70 toks/s]



Processed prompts: 100%|█████████| 39/39 [00:19<00:00,  1.97it/s, est. speed input: 78.76 toks/s, output: 826.32 toks/s]


- batch size 39: 826.10 tokens/sec (39 x 21.18)


Processed prompts: 100%|█████████| 40/40 [00:18<00:00,  2.13it/s, est. speed input: 85.21 toks/s, output: 873.00 toks/s]


- batch size 40: 872.71 tokens/sec (40 x 21.82)


Processed prompts: 100%|█████████| 41/41 [00:21<00:00,  1.91it/s, est. speed input: 76.43 toks/s, output: 816.04 toks/s]


- batch size 41: 815.82 tokens/sec (41 x 19.90)


Processed prompts: 100%|█████████| 42/42 [00:19<00:00,  2.11it/s, est. speed input: 84.32 toks/s, output: 873.24 toks/s]


- batch size 42: 872.95 tokens/sec (42 x 20.78)


Processed prompts:  14%|█▍        | 6/43 [00:13<00:36,  1.01it/s, est. speed input: 17.48 toks/s, output: 157.83 toks/s]



Processed prompts: 100%|█████████| 43/43 [00:21<00:00,  2.00it/s, est. speed input: 80.19 toks/s, output: 839.08 toks/s]


- batch size 43: 838.87 tokens/sec (43 x 19.51)


Processed prompts: 100%|█████████| 44/44 [00:21<00:00,  2.08it/s, est. speed input: 83.24 toks/s, output: 853.70 toks/s]


- batch size 44: 853.50 tokens/sec (44 x 19.40)


Processed prompts: 100%|█████████| 45/45 [00:21<00:00,  2.05it/s, est. speed input: 82.00 toks/s, output: 851.21 toks/s]


- batch size 45: 850.97 tokens/sec (45 x 18.91)


Processed prompts:  13%|█▎        | 6/46 [00:12<00:47,  1.19s/it, est. speed input: 18.19 toks/s, output: 155.16 toks/s]



Processed prompts: 100%|█████████| 46/46 [00:21<00:00,  2.14it/s, est. speed input: 85.78 toks/s, output: 865.08 toks/s]


- batch size 46: 864.81 tokens/sec (46 x 18.80)


Processed prompts: 100%|█████████| 47/47 [00:23<00:00,  1.99it/s, est. speed input: 79.80 toks/s, output: 830.91 toks/s]


- batch size 47: 830.67 tokens/sec (47 x 17.67)


Processed prompts: 100%|█████████| 48/48 [00:24<00:00,  1.97it/s, est. speed input: 78.90 toks/s, output: 833.97 toks/s]


- batch size 48: 833.73 tokens/sec (48 x 17.37)


Processed prompts:   0%|                     | 0/49 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|█████████| 49/49 [00:25<00:00,  1.93it/s, est. speed input: 77.34 toks/s, output: 768.74 toks/s]


- batch size 49: 768.52 tokens/sec (49 x 15.68)


Processed prompts:   0%|                     | 0/50 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

## SGLang

### fp16

In [None]:
# Load the baseline checkpoint keyed "llama-3.1" in test_models
# (meta-llama/Meta-Llama-3.1-8B-Instruct) through sglang_load; the result is
# kept in `runtime` and handed to sglang_generate in the next cell.
runtime = sglang_load(test_models["llama-3.1"])

In [None]:
# Run generation on `test_messages*8` (the prompt set repeated 8x, assuming
# test_messages is a list — TODO confirm).
# NOTE(review): the multiplier 8 is not explained here — presumably the batch
# size picked for fp16; confirm and consider naming it as a constant.
sglang_generate(test_messages*8, runtime)

### w8a16

In [None]:
# Rebind `runtime` to the w8a16 checkpoint
# (neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16 in test_models).
# NOTE(review): any previously loaded runtime is overwritten without an explicit
# shutdown in this cell — confirm the GPU memory is released before loading.
runtime = sglang_load(test_models["llama-3.1:w8a16"])

In [None]:
# Run generation on `test_messages*10` with the w8a16 runtime.
# NOTE(review): the multiplier 10 is unexplained here — presumably a batch size
# chosen for this quantization; confirm.
sglang_generate(test_messages*10, runtime)

In [None]:
# Second w8a16 run, this time with `test_messages*8` — the same multiplier the
# fp16 section uses.
# NOTE(review): presumably kept for a like-for-like comparison with fp16; confirm.
sglang_generate(test_messages*8, runtime)

### w4a16

In [None]:
# Rebind `runtime` to the test_models entry keyed "llama-3.1:w4a16".
# NOTE(review): any previously loaded runtime is overwritten without an explicit
# shutdown in this cell — confirm the GPU memory is released before loading.
runtime = sglang_load(test_models["llama-3.1:w4a16"])

In [None]:
# Run generation on `test_messages*18` with the w4a16 runtime.
# NOTE(review): the multiplier 18 is unexplained here — presumably a batch size
# chosen for this quantization; confirm.
sglang_generate(test_messages*18, runtime)

## ollama

### fp16

In [11]:
# Load the ollama model registered under "llama-3.1" in ollama_test_models (the
# fp16 variant per this section's heading); the result is kept in `model` and
# passed to ollama_generate in the next cell.
model = ollama_load(ollama_test_models["llama-3.1"])

In [None]:
# Run the un-repeated prompt set through the loaded ollama model.
# The third argument is the Hugging Face id stored in test_models["llama-3.1"].
# NOTE(review): presumably used for tokenization / token counting — confirm
# against ollama_generate's definition.
ollama_generate(test_messages, model, test_models["llama-3.1"])

### int8

In [18]:
# Rebind `model` to the ollama entry keyed "llama-3.1:int8" in ollama_test_models.
model = ollama_load(ollama_test_models["llama-3.1:int8"])

In [None]:
# Run the prompt set through the int8 ollama model.
# NOTE(review): the third argument is still test_models["llama-3.1"] (the base
# Hugging Face id), not an int8-specific entry — presumably only the tokenizer
# is taken from it; confirm this is intentional.
ollama_generate(test_messages, model, test_models["llama-3.1"])

### int4

In [20]:
# Rebind `model` to the ollama entry keyed "llama-3.1:int4" in ollama_test_models.
model = ollama_load(ollama_test_models["llama-3.1:int4"])

In [None]:
# Run the prompt set through the int4 ollama model.
# NOTE(review): the third argument is still test_models["llama-3.1"] (the base
# Hugging Face id), not an int4-specific entry — presumably only the tokenizer
# is taken from it; confirm this is intentional.
ollama_generate(test_messages, model, test_models["llama-3.1"])

# Installing Open WebUI

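https://docs.openwebui.com/

Note: the command below publishes the UI on host port 8000 and sets `WEBUI_AUTH='False'`, which disables the login screen — run it only on a trusted network.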

```bash
docker run -d -p 8000:8080 --gpus all --add-host=host.docker.internal:host-gateway -v /workspace/open-webui:/app/backend/data --name open-webui -e WEBUI_AUTH='False' --restart always ghcr.io/open-webui/open-webui:cuda
```

# Models to explore

- Qwen/Qwen2-VL-7B-Instruct
- Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8
- Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4
---
- unsloth/Mistral-Small-Instruct-2409-bnb-4bit [ready]
- bartowski/Mistral-Small-Instruct-2409-GGUF
---
- mistralai/Pixtral-12B-2409
- DewEfresh/pixtral-12b-8bit

In [3]:
# Load the Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4 checkpoint through vllm_load.
# NOTE(review): the return value is not bound to a name, so the engine object is
# only reachable through IPython's output history (`_` / `Out[n]`); assign it to
# a variable if it is meant to be used for generation or released afterwards.
vllm_load("Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4")

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

INFO 09-22 20:19:44 gptq_marlin.py:108] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 09-22 20:19:44 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=Fals

tokenizer_config.json:   0%|          | 0.00/7.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

INFO 09-22 20:19:48 model_runner.py:997] Starting to load model Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4...
INFO 09-22 20:19:49 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/75.4k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-22 20:24:59 model_runner.py:1008] Loading model weights took 5.2048 GB
INFO 09-22 20:25:00 gpu_executor.py:122] # GPU blocks: 18623, # CPU blocks: 4681
INFO 09-22 20:25:00 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-22 20:25:00 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-22 20:25:09 model_runner.py:1430] Graph capturing finished in 9 secs.


<vllm.entrypoints.llm.LLM at 0x7f6ef476f910>