From cb90bc6bb02fdc0fbb3db9cb3c3d0ff7b6c94fa7 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 10:36:46 +0800 Subject: [PATCH 1/7] Support qwen1.5 --- xinference/model/llm/llm_family.json | 251 ++++++++++++++++ .../model/llm/llm_family_modelscope.json | 274 ++++++++++++++++++ xinference/model/llm/pytorch/core.py | 2 +- xinference/model/llm/pytorch/utils.py | 2 +- xinference/model/llm/vllm/core.py | 1 + 5 files changed, 528 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 3cfb53a047..6e6c29d415 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -1340,6 +1340,257 @@ ] } }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen1.5-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_8", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-1.8B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-1.8B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-1.8B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 
7, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat-AWQ" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_8", + "quantizations": [ + "q8_0" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat-GGUF", + "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "q8_0" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat-GGUF", + "model_file_name_template": "qwen1_5-4b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q5_k_m" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat-GGUF", + "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF", + "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant.", + "roles": [ + "user", + "assistant" + ], + "intra_message_sep": "\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + } + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index f017614baf..5f2bd6d293 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1624,6 +1624,280 @@ ] } }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen1.5-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_8", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-1.8B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-4B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-7B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-14B-Chat", + 
"model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-72B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-1.8B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-4B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-1.8B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-4B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-14B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-72B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_8", + "quantizations": [ + "q8_0" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "q8_0" + ], + "model_id": "qwen/Qwen1.5-4B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-4b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q5_k_m" + ], + 
"model_id": "qwen/Qwen1.5-14B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k" + ], + "model_id": "qwen/Qwen1.5-72B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant.", + "roles": [ + "user", + "assistant" + ], + "intra_message_sep": "\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + } + }, { "version": 1, "context_length": 4096, diff --git a/xinference/model/llm/pytorch/core.py b/xinference/model/llm/pytorch/core.py index 9bbcacf3d0..462824596c 100644 --- a/xinference/model/llm/pytorch/core.py +++ b/xinference/model/llm/pytorch/core.py @@ -448,7 +448,7 @@ def chat( generate_config = self._sanitize_generate_config(generate_config) # TODO(codingl2k1): qwen hacky to set stop for function call. model_family = self.model_family.model_family or self.model_family.model_name - if tools and "qwen-chat" == model_family: + if tools and model_family in ["qwen-chat", "qwen1.5-chat"]: stop = generate_config.get("stop") if isinstance(stop, str): generate_config["stop"] = [stop, "Observation:"] diff --git a/xinference/model/llm/pytorch/utils.py b/xinference/model/llm/pytorch/utils.py index 637530f85b..390e6dff81 100644 --- a/xinference/model/llm/pytorch/utils.py +++ b/xinference/model/llm/pytorch/utils.py @@ -122,7 +122,7 @@ def generate_stream( temperature, repetition_penalty, top_p, top_k ) - if "qwen" in str(type(model)).lower(): + if "qwen" in str(type(model)).lower() and "1.5" not in str(type(model)): # TODO: hacky input_ids = tokenizer(prompt, allowed_special="all").input_ids else: diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 501e655dd3..38b7fc0735 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -88,6 +88,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "internlm-chat-8k", "internlm-chat-20b", "qwen-chat", + "qwen1.5-chat", "Yi", "Yi-chat", "code-llama", From 835c5c836e487bc32ab968835d4d89a35d59b578 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 10:38:55 +0800 Subject: [PATCH 2/7] Add docs --- doc/source/models/builtin/llm/index.rst | 7 + .../models/builtin/llm/llama-2-chat.rst | 45 +++ .../models/builtin/llm/qwen1.5-chat.rst | 375 ++++++++++++++++++ xinference/model/llm/llm_family.json | 4 +- 4 files changed, 428 insertions(+), 3 deletions(-) create mode 100644 doc/source/models/builtin/llm/qwen1.5-chat.rst diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index 8f1c309f27..78bcf3ba8a 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -216,6 +216,11 @@ The following is a list of built-in LLM in Xinference: - 4096 - Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities. + * - :ref:`qwen1.5-chat ` + - chat, tools + - 32768 + - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. 
+ * - :ref:`skywork ` - generate - 4096 @@ -401,6 +406,8 @@ The following is a list of built-in LLM in Xinference: qwen-vl-chat + qwen1.5-chat + skywork skywork-math diff --git a/doc/source/models/builtin/llm/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst index 144af09a72..730509d47e 100644 --- a/doc/source/models/builtin/llm/llama-2-chat.rst +++ b/doc/source/models/builtin/llm/llama-2-chat.rst @@ -103,3 +103,48 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format pytorch --quantization ${quantization} + +Model Spec 7 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Llama-2-7B-Chat-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 8 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Llama-2-13B-chat-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name llama-2-chat --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 9 (ggufv2, 70 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 70 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Llama-2-70B-Chat-GGUF +- **Model Hubs**: `Hugging Face `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen1.5-chat.rst b/doc/source/models/builtin/llm/qwen1.5-chat.rst new file mode 100644 index 0000000000..8ba5a44c1e --- /dev/null +++ b/doc/source/models/builtin/llm/qwen1.5-chat.rst @@ -0,0 +1,375 @@ +.. _models_llm_qwen1.5-chat: + +======================================== +qwen1.5-chat +======================================== + +- **Context Length:** 32768 +- **Model Name:** qwen1.5-chat +- **Languages:** en, zh +- **Abilities:** chat, tools +- **Description:** Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 0_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-0.5B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 1_8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_8 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-1.8B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 4 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 4 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-4B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-7B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 5 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-14B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 6 (pytorch, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 72 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-72B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format pytorch --quantization ${quantization} + + +Model Spec 7 (gptq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model 
Format:** gptq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-0.5B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format gptq --quantization ${quantization} + + +Model Spec 8 (gptq, 1_8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 1_8 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format gptq --quantization ${quantization} + + +Model Spec 9 (gptq, 4 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 4 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-4B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format gptq --quantization ${quantization} + + +Model Spec 10 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-7B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 11 (gptq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format gptq --quantization ${quantization} + + +Model Spec 12 (gptq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format gptq --quantization ${quantization} + + +Model Spec 13 (awq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4 +- **Model ID:** 
Qwen/Qwen1.5-0.5B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format awq --quantization ${quantization}
+
+
+Model Spec 14 (awq, 1_8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 1_8
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format awq --quantization ${quantization}
+
+
+Model Spec 15 (awq, 4 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 4
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-4B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format awq --quantization ${quantization}
+
+
+Model Spec 16 (awq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-7B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format awq --quantization ${quantization}
+
+
+Model Spec 17 (awq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-14B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format awq --quantization ${quantization}
+
+
+Model Spec 18 (awq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-72B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format awq --quantization ${quantization}
+
+
+Model Spec 19 (ggufv2, 1_8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 1_8
+- **Quantizations:** q8_0
+- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 20 (ggufv2, 4 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 4
+- **Quantizations:** q8_0
+- **Model ID:** Qwen/Qwen1.5-4B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 21 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** q5_k_m
+- **Model ID:** Qwen/Qwen1.5-7B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 22 (ggufv2, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 14
+- **Quantizations:** q5_k_m
+- **Model ID:** Qwen/Qwen1.5-14B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 23 (ggufv2, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 72
+- **Quantizations:** q2_k
+- **Model ID:** Qwen/Qwen1.5-72B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 6e6c29d415..4cd143eaeb 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -881,9 +881,7 @@
           "Q4_K_M",
           "Q5_0",
           "Q5_K_S",
-          "Q5_K_M",
-          "Q6_K",
-          "Q8_0"
+          "Q5_K_M"
        ],
        "model_id": "TheBloke/Llama-2-70B-Chat-GGUF",
        "model_file_name_template": "llama-2-70b-chat.{quantization}.gguf"

From 4fe61d1f3c7fcdec7d1c71c844bc655413089884 Mon Sep 17 00:00:00 2001
From: aresnow 
Date: Tue, 6 Feb 2024 11:13:41 +0800
Subject: [PATCH 3/7] Fix

---
 xinference/model/llm/pytorch/utils.py | 2 +-
 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/pytorch/utils.py b/xinference/model/llm/pytorch/utils.py index 390e6dff81..195dcbc16b 100644 --- a/xinference/model/llm/pytorch/utils.py +++ b/xinference/model/llm/pytorch/utils.py @@ -122,7 +122,7 @@ def generate_stream( temperature, repetition_penalty, top_p, top_k ) - if "qwen" in str(type(model)).lower() and "1.5" not in str(type(model)): + if ".modeling_qwen." in str(type(model)).lower(): # TODO: hacky input_ids = tokenizer(prompt, allowed_special="all").input_ids else: From 8e5f83872165c5e3f04398ad8b42e9d5e9726baf Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 6 Feb 2024 11:24:31 +0800 Subject: [PATCH 4/7] Fix qwen 1.5 gguf modelscope --- .../model/llm/llm_family_modelscope.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 5f2bd6d293..5eadaa15b6 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1819,33 +1819,33 @@ }, { "model_format": "ggufv2", - "model_size_in_billions": "1_8", + "model_size_in_billions": "0_5", "quantizations": [ "q8_0" ], "model_id": "qwen/Qwen1.5-0.5B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-0.5b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", - "model_size_in_billions": 4, + "model_size_in_billions": "1_8", "quantizations": [ "q8_0" ], - "model_id": "qwen/Qwen1.5-4B-Chat-GGUF", + "model_id": "qwen/Qwen1.5-1.8B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-4b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-1_8b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", - "model_size_in_billions": 7, + "model_size_in_billions": 4, "quantizations": [ - "q5_k_m" + "q8_0" ], - "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", + "model_id": "qwen/Qwen1.5-4B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-4b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -1855,7 +1855,7 @@ ], "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-7b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -1865,7 +1865,7 @@ ], "model_id": "qwen/Qwen1.5-14B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-14b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -1875,7 +1875,7 @@ ], "model_id": "qwen/Qwen1.5-72B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-72b-chat-{quantization}.gguf" } ], "prompt_style": { From 6e40f3cbe94cf5d08ea992b35724e68c77bed793 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 11:43:33 +0800 Subject: [PATCH 5/7] Fix vllm --- xinference/model/llm/vllm/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 38b7fc0735..7b2b1f7812 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -56,6 +56,7 @@ class VLLMModelConfig(TypedDict, total=False): 
max_num_batched_tokens: int max_num_seqs: int quantization: Optional[str] + max_model_len: Optional[int] class VLLMGenerateConfig(TypedDict, total=False): @@ -88,7 +89,6 @@ class VLLMGenerateConfig(TypedDict, total=False): "internlm-chat-8k", "internlm-chat-20b", "qwen-chat", - "qwen1.5-chat", "Yi", "Yi-chat", "code-llama", @@ -99,6 +99,8 @@ class VLLMGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "chatglm3", ] +if VLLM_INSTALLED and vllm.__version__ >= "0.3.0": + VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat") class VLLMModel(LLM): @@ -152,6 +154,7 @@ def _sanitize_model_config( model_config.setdefault("gpu_memory_utilization", 0.90) model_config.setdefault("max_num_seqs", 256) model_config.setdefault("quantization", None) + model_config.setdefault("max_model_len", 4096) return model_config From 70567afd40a720e2f76ecd3912871bd334401fa0 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 6 Feb 2024 12:08:56 +0800 Subject: [PATCH 6/7] Fix function call --- xinference/api/restful_api.py | 7 ++++++- xinference/model/llm/ggml/llamacpp.py | 2 +- xinference/model/llm/utils.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index f26bb211f6..509d3dc48c 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -1159,7 +1159,12 @@ async def create_chat_completion(self, request: Request) -> Response: raise HTTPException(status_code=500, detail=str(e)) model_family = desc.get("model_family", "") - function_call_models = ["chatglm3", "gorilla-openfunctions-v1", "qwen-chat"] + function_call_models = [ + "chatglm3", + "gorilla-openfunctions-v1", + "qwen-chat", + "qwen1.5-chat", + ] is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py index e859f45cb6..cafce34114 100644 --- a/xinference/model/llm/ggml/llamacpp.py +++ b/xinference/model/llm/ggml/llamacpp.py @@ -307,7 +307,7 @@ def chat( generate_config = self._sanitize_generate_config(generate_config) # TODO(codingl2k1): qwen hacky to set stop for function call. 
model_family = self.model_family.model_family or self.model_family.model_name - if tools and "qwen-chat" == model_family: + if tools and model_family in ["qwen-chat", "qwen1.5-chat"]: stop = generate_config.get("stop") if isinstance(stop, str): generate_config["stop"] = [stop, "Observation:"] diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 7a5d5210da..d5e6df175c 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -556,7 +556,7 @@ def _tool_calls_completion(cls, model_family, model_uid, c, tools): content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools) elif "chatglm3" == family: content, func, args = cls._eval_chatglm3_arguments(c, tools) - elif "qwen-chat" == family: + elif family in ["qwen-chat", "qwen1.5-chat"]: content, func, args = cls._eval_qwen_chat_arguments(c, tools) else: raise Exception( From 986032e8deb10a05dbe7c400b2374940149e1393 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 12:31:01 +0800 Subject: [PATCH 7/7] Fix json file --- xinference/model/llm/llm_family.json | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 4cd143eaeb..259a75636e 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -1520,7 +1520,7 @@ "quantizations": [ "q8_0" ], - "model_id": "Qwen/Qwen1.5-0.5B-Chat-GGUF", + "model_id": "Qwen/Qwen1.5-1.8B-Chat-GGUF", "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" }, { @@ -1541,15 +1541,6 @@ "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" }, - { - "model_format": "ggufv2", - "model_size_in_billions": 7, - "quantizations": [ - "q5_k_m" - ], - "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", - "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" - }, { "model_format": "ggufv2", "model_size_in_billions": 14,
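
A quick end-to-end check of the tool-call path these patches wire up — a minimal sketch, not part of the series itself. It assumes a local Xinference supervisor on the default port 9997, that launching pulls the ``Qwen/Qwen1.5-4B-Chat`` weights registered in PATCH 1/7, and a made-up weather-tool schema; only the model name, the ``function_call_models`` entry, and the ``Observation:`` stop word come from the patches above::

   # Smoke test for qwen1.5-chat tool calls (assumptions: a local server
   # at 127.0.0.1:9997; get_current_weather is a hypothetical example tool).
   import requests

   from xinference.client import RESTfulClient

   client = RESTfulClient("http://127.0.0.1:9997")
   model_uid = client.launch_model(
       model_name="qwen1.5-chat",  # family added by PATCH 1/7
       model_format="pytorch",
       model_size_in_billions=4,
       quantization="none",
   )

   tools = [
       {
           "type": "function",
           "function": {
               "name": "get_current_weather",
               "description": "Get the current weather in a given location.",
               "parameters": {
                   "type": "object",
                   "properties": {"location": {"type": "string"}},
                   "required": ["location"],
               },
           },
       }
   ]

   # Call the OpenAI-compatible route served by create_chat_completion(),
   # which PATCH 6/7 extends to treat qwen1.5-chat as a function-call model.
   resp = requests.post(
       "http://127.0.0.1:9997/v1/chat/completions",
       json={
           "model": model_uid,
           "messages": [{"role": "user", "content": "What is the weather in Beijing?"}],
           "tools": tools,
       },
   )
   # With the "Observation:" stop word set in chat() (PATCH 1/7 and 6/7),
   # the reply should carry tool_calls parsed by _eval_qwen_chat_arguments.
   print(resp.json()["choices"][0]["message"])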