## Install prerequisites

In [27]:
from importlib.metadata import version
# Record the installed torch version; the flashinfer wheel index used later in
# this section (cu124/torch2.4) is specific to a torch / CUDA build.
version('torch')

'2.4.0'

In [None]:
pip install --upgrade transformers

vLLM documentation: https://docs.vllm.ai/en/latest/

In [12]:
# Record the installed transformers version after the upgrade.
version('transformers')

'4.44.2'

In [None]:
pip install --upgrade vllm

In [3]:
# Record the installed vllm version.
# NOTE(review): the engine start-up log recorded further down reports v0.5.5,
# not the version shown here — re-run this cell after all installs to confirm
# which vllm build the benchmark actually used.
version('vllm')

'0.6.1.post2'

SGLang documentation: https://sglang.readthedocs.io/en/latest/

In [None]:
pip install --upgrade "sglang[all]"

In [None]:
pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/

In [7]:
# Record the installed sglang version.
version('sglang')

'0.3.1.post2'

In [28]:
# Record the installed flashinfer version; the local version suffix shows
# which CUDA / torch build was picked from the wheel index.
version('flashinfer')

'0.1.6+cu124torch2.4'

Ollama Python client: https://github.com/ollama/ollama-python

Ollama documentation: https://github.com/ollama/ollama/tree/main/docs

In [None]:
pip install --upgrade ollama

In [10]:
# Record the installed ollama client version.
version('ollama')

'0.3.3'

## Test installation

In [1]:
# Every test conversation shares the same French system prompt; only the user
# question changes from one conversation to the next.
_SYSTEM_PROMPT = "Tu es un assistant utile et professionnel qui répond toujours en français."

_USER_QUESTIONS = (
    "Quels sont les avantages du Crédit Mutuel ?",
    "Quels sont les avantages du Crédit Agricole ?",
    "Quels sont les avantages de la Société Générale ?",
    "Quels sont les avantages de la BNP ?",
)

# One [system, user] conversation per question, in the order listed above.
test_messages = [
    [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    for question in _USER_QUESTIONS
]

In [2]:
# Short alias -> model identifier handed to the tokenizer and engine loaders.
test_models = {"llama-3.1": "meta-llama/Meta-Llama-3.1-8B-Instruct"}

In [3]:
from transformers import AutoTokenizer

def format_prompt(messages, model):
    """Render chat messages as prompt text using the model's chat template.

    Args:
        messages: Chat messages passed unchanged to ``apply_chat_template``;
            in this notebook, a list of conversations, each a list of
            ``{"role": ..., "content": ...}`` dicts.
        model: Model identifier handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The result of ``apply_chat_template`` with ``tokenize=False`` and the
        generation prompt appended; for the list of conversations used in
        this notebook, a list with one prompt string per conversation.
    """
    chat_tokenizer = AutoTokenizer.from_pretrained(model)
    return chat_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

In [4]:
# Sanity check: render the test conversations with the Llama 3.1 chat template
# and inspect the resulting prompt strings.
format_prompt(test_messages, test_models["llama-3.1"])

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nTu es un assistant utile et professionnel qui répond toujours en français.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuels sont les avantages du Crédit Mutuel ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nTu es un assistant utile et professionnel qui répond toujours en français.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuels sont les avantages du Crédit Agricole ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nTu es un assistant utile et professionnel qui répond toujours en français.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuels sont 

### vLLM

In [5]:
# Authenticate vLLM with the Hugging Face Hub.
import os

# The access token is read from a file so that it never appears in the
# notebook itself; surrounding whitespace (e.g. a trailing newline) is removed.
with open("/workspace/hftoken", 'r') as token_file:
    myhftoken = token_file.read().strip()

# Expose the token through the HF_TOKEN environment variable — presumably this
# is what the Hugging Face libraries read when downloading gated model weights
# (TODO confirm).
os.environ["HF_TOKEN"] = myhftoken

In [6]:
import time
from vllm import LLM, SamplingParams

def vllm_load(model, gpu_memory_utilization=0.99, max_model_len=8192, **engine_kwargs):
    """Create a vLLM engine for ``model`` and remember which model it serves.

    Args:
        model: Model identifier passed to ``LLM``.
        gpu_memory_utilization: Value forwarded to ``LLM``. Defaults to the
            0.99 used for the benchmark in this notebook; the engine log
            suggests lowering it when start-up runs out of memory.
        max_model_len: Value forwarded to ``LLM``. Defaults to the 8192 used
            for the benchmark in this notebook.
        **engine_kwargs: Any additional keyword arguments, forwarded unchanged
            to ``LLM``.

    Returns:
        The ``LLM`` instance, with an extra ``_model`` attribute holding
        ``model`` so that callers can later load the matching tokenizer.
    """
    llm = LLM(
        model,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        **engine_kwargs,
    )
    # Stash the model identifier on the engine; vllm_generate reads it back to
    # format prompts with the right chat template.
    llm._model = model
    return llm

def vllm_generate(messages, llm):
    """Benchmark vLLM generation throughput for increasing batch sizes.

    The conversations are rendered to prompts with ``format_prompt``, one
    untimed warm-up generation is run (its text is printed), and then the
    first ``batch_size`` prompts are generated together for every batch size
    from 1 to ``len(messages)``. For each batch size one line is printed with
    the total tokens per second and the per-request average.

    Args:
        messages: List of conversations, as accepted by ``format_prompt``.
        llm: Engine returned by ``vllm_load``; its ``_model`` attribute is
            used to select the chat template.

    Returns:
        None. Results are printed.
    """
    print("vLLM performance test:")

    prompts = format_prompt(messages, llm._model)
    sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

    # Warm-up: one untimed request so the first measured batch is not charged
    # for one-off start-up costs; the text doubles as a sanity check.
    outputs = llm.generate(prompts[0], sampling_params)
    print(f"Generated text: {outputs[0].outputs[0].text!r}")

    for batch_size in range(1, len(messages) + 1):
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which follows the wall clock and can jump if the clock is adjusted.
        start_time = time.perf_counter()
        outputs = llm.generate(prompts[:batch_size], sampling_params)
        elapsed = time.perf_counter() - start_time

        # Count the tokens of the first completion of every request.
        tokenscount = sum(len(output.outputs[0].token_ids) for output in outputs)

        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size {batch_size}: {tokens_per_sec:.2f} tokens/sec ({batch_size} x {tokens_per_sec/batch_size:.2f})")

In [7]:
# Start the vLLM engine for the Llama 3.1 test model (loads the weights onto the GPU).
llm = vllm_load(test_models["llama-3.1"])

INFO 09-21 14:59:48 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 09-21 14:59:49 model_runne

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-21 14:59:52 model_runner.py:890] Loading model weights took 14.9888 GB
INFO 09-21 14:59:53 gpu_executor.py:121] # GPU blocks: 3610, # CPU blocks: 2048
INFO 09-21 14:59:54 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-21 14:59:54 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-21 15:00:05 model_runner.py:1300] Graph capturing finished in 11 secs.


In [8]:
# Repeat the 4 test conversations 8 times to get 32 prompts, so throughput is
# measured for every batch size from 1 to 32.
vllm_generate(test_messages*8, llm)

vLLM performance test:


Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.43s/it, est. speed input: 6.68 toks/s, output: 54.32 toks/s]


Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre plusieurs avantages à ses membres et clients. Voici quelques-uns des principaux avantages :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt plus élevés sur les comptes d'épargne que de nombreux autres établissements bancaires.\n2. **Taux d'emprunt compétitifs** : Les prêts personnels, les prêts immobiliers et les prêts pour la mobilité sont proposés avec des taux d'intérêt attractifs.\n3. **Services personnalisés** : Le Crédit Mutuel offre des services personnalisés et adaptés aux besoins de ses membres et clients, grâce à une approche relationnelle et à une connaissance approfondie de leurs situations financières.\n4. **Sécurité et confidentialité** : Le Crédit Mutuel s'engage à protéger les données personnelles et financières de ses membres et clients, conformément aux règles de protection des données.\n5. **Participation aux décisions** : En tant que membre 

Processed prompts: 100%|█████████████| 1/1 [00:09<00:00,  9.29s/it, est. speed input: 6.78 toks/s, output: 55.13 toks/s]


- batch size 1: 55.11 tokens/sec (1 x 55.11)


Processed prompts: 100%|███████████| 2/2 [00:09<00:00,  4.80s/it, est. speed input: 13.24 toks/s, output: 106.76 toks/s]


- batch size 2: 106.74 tokens/sec (2 x 53.37)


Processed prompts: 100%|███████████| 3/3 [00:09<00:00,  3.20s/it, est. speed input: 19.90 toks/s, output: 160.03 toks/s]


- batch size 3: 159.98 tokens/sec (3 x 53.33)


Processed prompts: 100%|███████████| 4/4 [00:09<00:00,  2.42s/it, est. speed input: 26.08 toks/s, output: 211.15 toks/s]


- batch size 4: 211.10 tokens/sec (4 x 52.77)


Processed prompts: 100%|███████████| 5/5 [00:09<00:00,  1.95s/it, est. speed input: 32.33 toks/s, output: 262.77 toks/s]


- batch size 5: 262.71 tokens/sec (5 x 52.54)


Processed prompts: 100%|███████████| 6/6 [00:09<00:00,  1.63s/it, est. speed input: 38.80 toks/s, output: 314.48 toks/s]


- batch size 6: 314.39 tokens/sec (6 x 52.40)


Processed prompts: 100%|███████████| 7/7 [00:09<00:00,  1.40s/it, est. speed input: 45.23 toks/s, output: 365.90 toks/s]


- batch size 7: 365.80 tokens/sec (7 x 52.26)


Processed prompts: 100%|███████████| 8/8 [00:09<00:00,  1.23s/it, est. speed input: 51.25 toks/s, output: 416.53 toks/s]


- batch size 8: 416.44 tokens/sec (8 x 52.05)


Processed prompts: 100%|███████████| 9/9 [00:09<00:00,  1.11s/it, est. speed input: 56.98 toks/s, output: 463.06 toks/s]


- batch size 9: 462.94 tokens/sec (9 x 51.44)


Processed prompts: 100%|█████████| 10/10 [00:10<00:00,  1.00s/it, est. speed input: 63.07 toks/s, output: 505.20 toks/s]


- batch size 10: 505.04 tokens/sec (10 x 50.50)


Processed prompts: 100%|█████████| 11/11 [00:10<00:00,  1.09it/s, est. speed input: 69.00 toks/s, output: 559.12 toks/s]


- batch size 11: 558.94 tokens/sec (11 x 50.81)


Processed prompts: 100%|█████████| 12/12 [00:10<00:00,  1.19it/s, est. speed input: 74.67 toks/s, output: 606.83 toks/s]


- batch size 12: 606.66 tokens/sec (12 x 50.56)


Processed prompts: 100%|█████████| 13/13 [00:10<00:00,  1.28it/s, est. speed input: 80.69 toks/s, output: 647.62 toks/s]


- batch size 13: 647.42 tokens/sec (13 x 49.80)


Processed prompts: 100%|█████████| 14/14 [00:10<00:00,  1.37it/s, est. speed input: 86.45 toks/s, output: 696.47 toks/s]


- batch size 14: 696.27 tokens/sec (14 x 49.73)


Processed prompts: 100%|█████████| 15/15 [00:10<00:00,  1.46it/s, est. speed input: 92.41 toks/s, output: 736.12 toks/s]


- batch size 15: 735.89 tokens/sec (15 x 49.06)


Processed prompts: 100%|█████████| 16/16 [00:10<00:00,  1.53it/s, est. speed input: 96.50 toks/s, output: 784.27 toks/s]


- batch size 16: 784.04 tokens/sec (16 x 49.00)


Processed prompts: 100%|█████████| 17/17 [00:10<00:00,  1.56it/s, est. speed input: 98.10 toks/s, output: 787.19 toks/s]


- batch size 17: 786.94 tokens/sec (17 x 46.29)


Processed prompts: 100%|████████| 18/18 [00:11<00:00,  1.62it/s, est. speed input: 102.19 toks/s, output: 825.41 toks/s]


- batch size 18: 825.16 tokens/sec (18 x 45.84)


Processed prompts: 100%|████████| 19/19 [00:11<00:00,  1.70it/s, est. speed input: 106.98 toks/s, output: 859.23 toks/s]


- batch size 19: 858.96 tokens/sec (19 x 45.21)


Processed prompts: 100%|████████| 20/20 [00:11<00:00,  1.77it/s, est. speed input: 111.41 toks/s, output: 902.62 toks/s]


- batch size 20: 902.31 tokens/sec (20 x 45.12)


Processed prompts: 100%|████████| 21/21 [00:11<00:00,  1.84it/s, est. speed input: 116.22 toks/s, output: 936.93 toks/s]


- batch size 21: 936.60 tokens/sec (21 x 44.60)


Processed prompts: 100%|████████| 22/22 [00:11<00:00,  1.92it/s, est. speed input: 121.24 toks/s, output: 982.60 toks/s]


- batch size 22: 982.27 tokens/sec (22 x 44.65)


Processed prompts: 100%|████████| 23/23 [00:11<00:00,  1.93it/s, est. speed input: 121.52 toks/s, output: 980.62 toks/s]


- batch size 23: 980.30 tokens/sec (23 x 42.62)


Processed prompts: 100%|███████| 24/24 [00:12<00:00,  1.98it/s, est. speed input: 124.75 toks/s, output: 1002.42 toks/s]


- batch size 24: 1002.09 tokens/sec (24 x 41.75)


Processed prompts: 100%|███████| 25/25 [00:12<00:00,  2.07it/s, est. speed input: 130.63 toks/s, output: 1054.14 toks/s]


- batch size 25: 1053.77 tokens/sec (25 x 42.15)


Processed prompts: 100%|███████| 26/26 [00:12<00:00,  2.15it/s, est. speed input: 135.23 toks/s, output: 1091.22 toks/s]


- batch size 26: 1090.81 tokens/sec (26 x 41.95)


Processed prompts: 100%|███████| 27/27 [00:12<00:00,  2.21it/s, est. speed input: 139.65 toks/s, output: 1133.60 toks/s]


- batch size 27: 1133.13 tokens/sec (27 x 41.97)


Processed prompts: 100%|███████| 28/28 [00:12<00:00,  2.31it/s, est. speed input: 145.68 toks/s, output: 1179.29 toks/s]


- batch size 28: 1178.82 tokens/sec (28 x 42.10)


Processed prompts: 100%|███████| 29/29 [00:12<00:00,  2.33it/s, est. speed input: 146.56 toks/s, output: 1183.64 toks/s]


- batch size 29: 1183.22 tokens/sec (29 x 40.80)


Processed prompts: 100%|███████| 30/30 [00:12<00:00,  2.41it/s, est. speed input: 151.92 toks/s, output: 1222.63 toks/s]


- batch size 30: 1222.15 tokens/sec (30 x 40.74)


Processed prompts: 100%|███████| 31/31 [00:12<00:00,  2.47it/s, est. speed input: 155.98 toks/s, output: 1259.03 toks/s]


- batch size 31: 1258.56 tokens/sec (31 x 40.60)


Processed prompts: 100%|███████| 32/32 [00:12<00:00,  2.52it/s, est. speed input: 158.80 toks/s, output: 1288.18 toks/s]

- batch size 32: 1287.65 tokens/sec (32 x 40.24)





### SGLang

In [18]:
import json, time
import sglang

def sglang_load(model):
    """Start an SGLang runtime for ``model`` and remember which model it serves.

    Args:
        model: Model identifier passed to ``sglang.Runtime`` as ``model_path``.

    Returns:
        The ``sglang.Runtime`` instance, with an extra ``_model`` attribute
        holding ``model`` so that callers can later load the matching tokenizer.
    """
    engine = sglang.Runtime(model_path=model)
    # Stash the model identifier on the runtime; sglang_generate reads it back
    # to format prompts with the right chat template.
    engine._model = model
    return engine

def sglang_generate(messages, runtime):
    """Benchmark SGLang generation throughput for increasing batch sizes.

    The conversations are rendered to prompts with ``format_prompt``, one
    untimed warm-up generation is run (its text is printed), and then the
    first ``batch_size`` prompts are generated together for every batch size
    from 1 to ``len(messages)``. For each batch size one line is printed with
    the total tokens per second and the per-request average.

    Args:
        messages: List of conversations, as accepted by ``format_prompt``.
        runtime: Runtime returned by ``sglang_load``; its ``_model`` attribute
            is used to select the chat template.

    Returns:
        None. Results are printed.
    """
    print("SGLang performance test:")

    prompts = format_prompt(messages, runtime._model)
    sampling_params = { "temperature":0.7, "top_p":0.8, "repetition_penalty":1.05, "max_new_tokens":512 }

    # Warm-up: one untimed request so the first measured batch is not charged
    # for one-off start-up costs; the text doubles as a sanity check.
    output = json.loads(runtime.generate(prompt=prompts[0], sampling_params=sampling_params))
    print(f"Generated text: {output['text']!r}")

    for batch_size in range(1, len(messages) + 1):
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which follows the wall clock and can jump if the clock is adjusted.
        # Decoding the JSON response stays inside the timed region, as before.
        start_time = time.perf_counter()
        outputs = json.loads(runtime.generate(prompt=prompts[:batch_size], sampling_params=sampling_params))
        elapsed = time.perf_counter() - start_time

        # Sum the completion token counts reported in each response's metadata.
        tokenscount = sum(result["meta_info"]["completion_tokens"] for result in outputs)

        tokens_per_sec = tokenscount / elapsed
        print(f"- batch size {batch_size}: {tokens_per_sec:.2f} tokens/sec ({batch_size} x {tokens_per_sec/batch_size:.2f})")

In [6]:
# Start the SGLang runtime for the same Llama 3.1 model used in the vLLM test.
# NOTE(review): the execution counts show this cell (In [6]) ran before the
# current version of the definitions cell (In [18]) — re-run top to bottom on
# a fresh kernel to confirm the recorded output.
runtime = sglang_load(test_models["llama-3.1"])

INFO 09-21 14:36:03 weight_utils.py:236] Using model weights format ['*.safetensors']


[rank0]:W0921 14:36:03.593000 140239112718016 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit
Traceback (most recent call last):
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/site-packages/torch/_inductor/compile_worker/__main__.py", line 45, in <module>
    main()
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/site-packages/torch/_inductor/compile_worker/__main__.py", line 38, in main
    pre_fork_setup()
  File "/root/miniconda3/envs/wordslab-notebooks/lib/python3.11/site-packages/torch/_inductor/async_compile.py", line 62, in pre_fork_setup
    from triton.compiler.compiler import triton_key
ImportError: cannot import name 'triton_key' from 'triton.compiler.compiler' (/workspace/wordslab-llms/.venv/lib/python3.11/site-packages/triton/compiler/compiler.py)


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


In [21]:
# Same workload as the vLLM run: the 4 test conversations repeated 8 times
# (32 prompts), measured for every batch size from 1 to 32.
sglang_generate(test_messages*8, runtime)

SGLang performance test:
Generated text: "Le Crédit Mutuel est une banque coopérative française qui offre divers avantages à ses membres, notamment :\n\n1. **Intérêts plus élevés sur les épargnes** : Le Crédit Mutuel propose des taux d'intérêt attractifs sur les comptes d'épargne, ce qui permet aux membres de gagner plus sur leurs économies.\n2. **Prêts à des conditions avantageuses** : Les prêts accordés par le Crédit Mutuel sont souvent moins chers que ceux proposés par les banques traditionnelles, avec des taux d'intérêt compétitifs et des conditions de remboursement flexibles.\n3. **Services personnalisés** : Le Crédit Mutuel offre des services personnalisés à ses membres, notamment en termes de conseil financier et de suivi de leur situation financière.\n4. **Participation aux décisions** : En tant que membre du Crédit Mutuel, vous avez la possibilité de participer aux décisions stratégiques de l'organisation et de voter pour les dirigeants.\n5. **Soutien à l'économie locale** : L

vLLM FP8 quantization process: https://docs.vllm.ai/en/latest/quantization/fp8.html#quantization-process