From cb90bc6bb02fdc0fbb3db9cb3c3d0ff7b6c94fa7 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 10:36:46 +0800 Subject: [PATCH 1/7] Support qwen1.5 --- xinference/model/llm/llm_family.json | 251 ++++++++++++++++ .../model/llm/llm_family_modelscope.json | 274 ++++++++++++++++++ xinference/model/llm/pytorch/core.py | 2 +- xinference/model/llm/pytorch/utils.py | 2 +- xinference/model/llm/vllm/core.py | 1 + 5 files changed, 528 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 3cfb53a047..6e6c29d415 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -1340,6 +1340,257 @@ ] } }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen1.5-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_8", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-1.8B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-1.8B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-1.8B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 
7, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat-AWQ" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_8", + "quantizations": [ + "q8_0" + ], + "model_id": "Qwen/Qwen1.5-0.5B-Chat-GGUF", + "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "q8_0" + ], + "model_id": "Qwen/Qwen1.5-4B-Chat-GGUF", + "model_file_name_template": "qwen1_5-4b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q5_k_m" + ], + "model_id": "Qwen/Qwen1.5-14B-Chat-GGUF", + "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k" + ], + "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF", + "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant.", + "roles": [ + "user", + "assistant" + ], + "intra_message_sep": "\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + } + }, { "version": 1, "context_length": 8192, diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index f017614baf..5f2bd6d293 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1624,6 +1624,280 @@ ] } }, + { + "version": 1, + "context_length": 32768, + "model_name": "qwen1.5-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "tools" + ], + "model_description": "Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat" + }, + { + "model_format": "pytorch", + "model_size_in_billions": "1_8", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-1.8B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 4, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-4B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-7B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-14B-Chat", + 
"model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 72, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen1.5-72B-Chat", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-1.8B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-4B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_8", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-1.8B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 4, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-4B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-14B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 72, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen1.5-72B-Chat-AWQ", + "model_hub": "modelscope" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "1_8", + "quantizations": [ + "q8_0" + ], + "model_id": "qwen/Qwen1.5-0.5B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 4, + "quantizations": [ + "q8_0" + ], + "model_id": "qwen/Qwen1.5-4B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-4b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 7, + "quantizations": [ + "q5_k_m" + ], + "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 14, + "quantizations": [ + "q5_k_m" + ], + 
"model_id": "qwen/Qwen1.5-14B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf" + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 72, + "quantizations": [ + "q2_k" + ], + "model_id": "qwen/Qwen1.5-72B-Chat-GGUF", + "model_hub": "modelscope", + "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant.", + "roles": [ + "user", + "assistant" + ], + "intra_message_sep": "\n", + "stop_token_ids": [ + 151643, + 151644, + 151645 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>" + ] + } + }, { "version": 1, "context_length": 4096, diff --git a/xinference/model/llm/pytorch/core.py b/xinference/model/llm/pytorch/core.py index 9bbcacf3d0..462824596c 100644 --- a/xinference/model/llm/pytorch/core.py +++ b/xinference/model/llm/pytorch/core.py @@ -448,7 +448,7 @@ def chat( generate_config = self._sanitize_generate_config(generate_config) # TODO(codingl2k1): qwen hacky to set stop for function call. model_family = self.model_family.model_family or self.model_family.model_name - if tools and "qwen-chat" == model_family: + if tools and model_family in ["qwen-chat", "qwen1.5-chat"]: stop = generate_config.get("stop") if isinstance(stop, str): generate_config["stop"] = [stop, "Observation:"] diff --git a/xinference/model/llm/pytorch/utils.py b/xinference/model/llm/pytorch/utils.py index 637530f85b..390e6dff81 100644 --- a/xinference/model/llm/pytorch/utils.py +++ b/xinference/model/llm/pytorch/utils.py @@ -122,7 +122,7 @@ def generate_stream( temperature, repetition_penalty, top_p, top_k ) - if "qwen" in str(type(model)).lower(): + if "qwen" in str(type(model)).lower() and "1.5" not in str(type(model)): # TODO: hacky input_ids = tokenizer(prompt, allowed_special="all").input_ids else: diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 501e655dd3..38b7fc0735 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -88,6 +88,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "internlm-chat-8k", "internlm-chat-20b", "qwen-chat", + "qwen1.5-chat", "Yi", "Yi-chat", "code-llama", From 835c5c836e487bc32ab968835d4d89a35d59b578 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 10:38:55 +0800 Subject: [PATCH 2/7] Add docs --- doc/source/models/builtin/llm/index.rst | 7 + .../models/builtin/llm/llama-2-chat.rst | 45 +++ .../models/builtin/llm/qwen1.5-chat.rst | 375 ++++++++++++++++++ xinference/model/llm/llm_family.json | 4 +- 4 files changed, 428 insertions(+), 3 deletions(-) create mode 100644 doc/source/models/builtin/llm/qwen1.5-chat.rst diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index 8f1c309f27..78bcf3ba8a 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -216,6 +216,11 @@ The following is a list of built-in LLM in Xinference: - 4096 - Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities. + * - :ref:`qwen1.5-chat ` + - chat, tools + - 32768 + - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. 
+ * - :ref:`skywork ` - generate - 4096 @@ -401,6 +406,8 @@ The following is a list of built-in LLM in Xinference: qwen-vl-chat + qwen1.5-chat + skywork skywork-math diff --git a/doc/source/models/builtin/llm/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst index 144af09a72..730509d47e 100644 --- a/doc/source/models/builtin/llm/llama-2-chat.rst +++ b/doc/source/models/builtin/llm/llama-2-chat.rst @@ -103,3 +103,48 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format pytorch --quantization ${quantization} + +Model Spec 7 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Llama-2-7B-Chat-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 8 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Llama-2-13B-chat-GGUF +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name llama-2-chat --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 9 (ggufv2, 70 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 70 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Llama-2-70B-Chat-GGUF +- **Model Hubs**: `Hugging Face `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen1.5-chat.rst b/doc/source/models/builtin/llm/qwen1.5-chat.rst new file mode 100644 index 0000000000..8ba5a44c1e --- /dev/null +++ b/doc/source/models/builtin/llm/qwen1.5-chat.rst @@ -0,0 +1,375 @@ +.. _models_llm_qwen1.5-chat: + +======================================== +qwen1.5-chat +======================================== + +- **Context Length:** 32768 +- **Model Name:** qwen1.5-chat +- **Languages:** en, zh +- **Abilities:** chat, tools +- **Description:** Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 0_5 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-0.5B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 1_8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_8 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-1.8B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 4 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 4 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-4B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-7B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 5 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-14B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 6 (pytorch, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 72 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen1.5-72B-Chat +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format pytorch --quantization ${quantization} + + +Model Spec 7 (gptq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model 
Format:** gptq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-0.5B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format gptq --quantization ${quantization} + + +Model Spec 8 (gptq, 1_8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 1_8 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format gptq --quantization ${quantization} + + +Model Spec 9 (gptq, 4 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 4 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-4B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format gptq --quantization ${quantization} + + +Model Spec 10 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-7B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 11 (gptq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format gptq --quantization ${quantization} + + +Model Spec 12 (gptq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization} +- **Model Hubs**: `Hugging Face `__, `ModelScope `__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format gptq --quantization ${quantization} + + +Model Spec 13 (awq, 0_5 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 0_5 +- **Quantizations:** Int4 +- **Model ID:** 
Qwen/Qwen1.5-0.5B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format awq --quantization ${quantization}
+
+
+Model Spec 14 (awq, 1_8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 1_8
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format awq --quantization ${quantization}
+
+
+Model Spec 15 (awq, 4 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 4
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-4B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format awq --quantization ${quantization}
+
+
+Model Spec 16 (awq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-7B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format awq --quantization ${quantization}
+
+
+Model Spec 17 (awq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-14B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format awq --quantization ${quantization}
+
+
+Model Spec 18 (awq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4
+- **Model ID:** Qwen/Qwen1.5-72B-Chat-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format awq --quantization ${quantization}
+
+
+Model Spec 19 (ggufv2, 1_8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 1_8
+- **Quantizations:** q8_0
+- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 20 (ggufv2, 4 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 4
+- **Quantizations:** q8_0
+- **Model ID:** Qwen/Qwen1.5-4B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 21 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** q5_k_m
+- **Model ID:** Qwen/Qwen1.5-7B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 22 (ggufv2, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 14
+- **Quantizations:** q5_k_m
+- **Model ID:** Qwen/Qwen1.5-14B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 14 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 23 (ggufv2, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 72
+- **Quantizations:** q2_k
+- **Model ID:** Qwen/Qwen1.5-72B-Chat-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 6e6c29d415..4cd143eaeb 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -881,9 +881,7 @@
           "Q4_K_M",
           "Q5_0",
           "Q5_K_S",
-          "Q5_K_M",
-          "Q6_K",
-          "Q8_0"
+          "Q5_K_M"
        ],
        "model_id": "TheBloke/Llama-2-70B-Chat-GGUF",
        "model_file_name_template": "llama-2-70b-chat.{quantization}.gguf"

From 4fe61d1f3c7fcdec7d1c71c844bc655413089884 Mon Sep 17 00:00:00 2001
From: aresnow 
Date: Tue, 6 Feb 2024 11:13:41 +0800
Subject: [PATCH 3/7] Fix

---
 xinference/model/llm/pytorch/utils.py | 2 +-
 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/pytorch/utils.py b/xinference/model/llm/pytorch/utils.py index 390e6dff81..195dcbc16b 100644 --- a/xinference/model/llm/pytorch/utils.py +++ b/xinference/model/llm/pytorch/utils.py @@ -122,7 +122,7 @@ def generate_stream( temperature, repetition_penalty, top_p, top_k ) - if "qwen" in str(type(model)).lower() and "1.5" not in str(type(model)): + if ".modeling_qwen." in str(type(model)).lower(): # TODO: hacky input_ids = tokenizer(prompt, allowed_special="all").input_ids else: From 8e5f83872165c5e3f04398ad8b42e9d5e9726baf Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 6 Feb 2024 11:24:31 +0800 Subject: [PATCH 4/7] Fix qwen 1.5 gguf modelscope --- .../model/llm/llm_family_modelscope.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 5f2bd6d293..5eadaa15b6 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1819,33 +1819,33 @@ }, { "model_format": "ggufv2", - "model_size_in_billions": "1_8", + "model_size_in_billions": "0_5", "quantizations": [ "q8_0" ], "model_id": "qwen/Qwen1.5-0.5B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-0.5b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", - "model_size_in_billions": 4, + "model_size_in_billions": "1_8", "quantizations": [ "q8_0" ], - "model_id": "qwen/Qwen1.5-4B-Chat-GGUF", + "model_id": "qwen/Qwen1.5-1.8B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-4b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-1_8b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", - "model_size_in_billions": 7, + "model_size_in_billions": 4, "quantizations": [ - "q5_k_m" + "q8_0" ], - "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", + "model_id": "qwen/Qwen1.5-4B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-4b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -1855,7 +1855,7 @@ ], "model_id": "qwen/Qwen1.5-7B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-7b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -1865,7 +1865,7 @@ ], "model_id": "qwen/Qwen1.5-14B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-14b-chat-{quantization}.gguf" }, { "model_format": "ggufv2", @@ -1875,7 +1875,7 @@ ], "model_id": "qwen/Qwen1.5-72B-Chat-GGUF", "model_hub": "modelscope", - "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf" + "model_file_name_template": "qwen1.5-72b-chat-{quantization}.gguf" } ], "prompt_style": { From 6e40f3cbe94cf5d08ea992b35724e68c77bed793 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 11:43:33 +0800 Subject: [PATCH 5/7] Fix vllm --- xinference/model/llm/vllm/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 38b7fc0735..7b2b1f7812 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -56,6 +56,7 @@ class VLLMModelConfig(TypedDict, total=False): 
max_num_batched_tokens: int max_num_seqs: int quantization: Optional[str] + max_model_len: Optional[int] class VLLMGenerateConfig(TypedDict, total=False): @@ -88,7 +89,6 @@ class VLLMGenerateConfig(TypedDict, total=False): "internlm-chat-8k", "internlm-chat-20b", "qwen-chat", - "qwen1.5-chat", "Yi", "Yi-chat", "code-llama", @@ -99,6 +99,8 @@ class VLLMGenerateConfig(TypedDict, total=False): "mixtral-instruct-v0.1", "chatglm3", ] +if VLLM_INSTALLED and vllm.__version__ >= "0.3.0": + VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat") class VLLMModel(LLM): @@ -152,6 +154,7 @@ def _sanitize_model_config( model_config.setdefault("gpu_memory_utilization", 0.90) model_config.setdefault("max_num_seqs", 256) model_config.setdefault("quantization", None) + model_config.setdefault("max_model_len", 4096) return model_config From 70567afd40a720e2f76ecd3912871bd334401fa0 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 6 Feb 2024 12:08:56 +0800 Subject: [PATCH 6/7] Fix function call --- xinference/api/restful_api.py | 7 ++++++- xinference/model/llm/ggml/llamacpp.py | 2 +- xinference/model/llm/utils.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index f26bb211f6..509d3dc48c 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -1159,7 +1159,12 @@ async def create_chat_completion(self, request: Request) -> Response: raise HTTPException(status_code=500, detail=str(e)) model_family = desc.get("model_family", "") - function_call_models = ["chatglm3", "gorilla-openfunctions-v1", "qwen-chat"] + function_call_models = [ + "chatglm3", + "gorilla-openfunctions-v1", + "qwen-chat", + "qwen1.5-chat", + ] is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py index e859f45cb6..cafce34114 100644 --- a/xinference/model/llm/ggml/llamacpp.py +++ b/xinference/model/llm/ggml/llamacpp.py @@ -307,7 +307,7 @@ def chat( generate_config = self._sanitize_generate_config(generate_config) # TODO(codingl2k1): qwen hacky to set stop for function call. 
model_family = self.model_family.model_family or self.model_family.model_name - if tools and "qwen-chat" == model_family: + if tools and model_family in ["qwen-chat", "qwen1.5-chat"]: stop = generate_config.get("stop") if isinstance(stop, str): generate_config["stop"] = [stop, "Observation:"] diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 7a5d5210da..d5e6df175c 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -556,7 +556,7 @@ def _tool_calls_completion(cls, model_family, model_uid, c, tools): content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools) elif "chatglm3" == family: content, func, args = cls._eval_chatglm3_arguments(c, tools) - elif "qwen-chat" == family: + elif family in ["qwen-chat", "qwen1.5-chat"]: content, func, args = cls._eval_qwen_chat_arguments(c, tools) else: raise Exception( From 986032e8deb10a05dbe7c400b2374940149e1393 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 6 Feb 2024 12:31:01 +0800 Subject: [PATCH 7/7] Fix json file --- xinference/model/llm/llm_family.json | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 4cd143eaeb..259a75636e 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -1520,7 +1520,7 @@ "quantizations": [ "q8_0" ], - "model_id": "Qwen/Qwen1.5-0.5B-Chat-GGUF", + "model_id": "Qwen/Qwen1.5-1.8B-Chat-GGUF", "model_file_name_template": "qwen1_5-1_8b-chat-{quantization}.gguf" }, { @@ -1541,15 +1541,6 @@ "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" }, - { - "model_format": "ggufv2", - "model_size_in_billions": 7, - "quantizations": [ - "q5_k_m" - ], - "model_id": "Qwen/Qwen1.5-7B-Chat-GGUF", - "model_file_name_template": "qwen1_5-7b-chat-{quantization}.gguf" - }, { "model_format": "ggufv2", "model_size_in_billions": 14,
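
A quick end-to-end check of the tool-call path these patches wire up — a minimal sketch, not part of the series itself. It assumes a local Xinference supervisor on the default port 9997, that launching pulls the ``Qwen/Qwen1.5-4B-Chat`` weights registered in PATCH 1/7, and a made-up weather-tool schema; only the model name, the ``function_call_models`` entry, and the ``Observation:`` stop word come from the patches above::

   # Smoke test for qwen1.5-chat tool calls (assumptions: a local server
   # at 127.0.0.1:9997; get_current_weather is a hypothetical example tool).
   import requests

   from xinference.client import RESTfulClient

   client = RESTfulClient("http://127.0.0.1:9997")
   model_uid = client.launch_model(
       model_name="qwen1.5-chat",  # family added by PATCH 1/7
       model_format="pytorch",
       model_size_in_billions=4,
       quantization="none",
   )

   tools = [
       {
           "type": "function",
           "function": {
               "name": "get_current_weather",
               "description": "Get the current weather in a given location.",
               "parameters": {
                   "type": "object",
                   "properties": {"location": {"type": "string"}},
                   "required": ["location"],
               },
           },
       }
   ]

   # Call the OpenAI-compatible route served by create_chat_completion(),
   # which PATCH 6/7 extends to treat qwen1.5-chat as a function-call model.
   resp = requests.post(
       "http://127.0.0.1:9997/v1/chat/completions",
       json={
           "model": model_uid,
           "messages": [{"role": "user", "content": "What is the weather in Beijing?"}],
           "tools": tools,
       },
   )
   # With the "Observation:" stop word set in chat() (PATCH 1/7 and 6/7),
   # the reply should carry tool_calls parsed by _eval_qwen_chat_arguments.
   print(resp.json()["choices"][0]["message"])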