FEAT: Support Qwen MoE model for huggingface and modelscope (#1263)
Co-authored-by: JunJun-Liu <4424137@qq.com>
Co-authored-by: qinxuye <qinxuye@gmail.com>
3 people committed Apr 9, 2024
1 parent 43c72a6 commit c0dbe48
Showing 5 changed files with 161 additions and 0 deletions.
7 changes: 7 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -301,6 +301,11 @@ The following is a list of built-in LLM in Xinference:
     - 32768
     - Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data.

   * - :ref:`qwen1.5-moe-chat <models_llm_qwen1.5-moe-chat>`
     - chat
     - 32768
     - Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.

   * - :ref:`skywork <models_llm_skywork>`
     - generate
     - 4096
@@ -520,6 +525,8 @@ The following is a list of built-in LLM in Xinference:

   qwen1.5-chat

   qwen1.5-moe-chat

   skywork

   skywork-math
45 changes: 45 additions & 0 deletions doc/source/models/builtin/llm/qwen1.5-moe-chat.rst
@@ -0,0 +1,45 @@
.. _models_llm_qwen1.5-moe-chat:

========================================
qwen1.5-moe-chat
========================================

- **Context Length:** 32768
- **Model Name:** qwen1.5-moe-chat
- **Languages:** en, zh
- **Abilities:** chat
- **Description:** Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (pytorch, 2_7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 2_7
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** Qwen/Qwen1.5-MoE-A2.7B-Chat
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

   xinference launch --model-name qwen1.5-moe-chat --size-in-billions 2_7 --model-format pytorch --quantization ${quantization}
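
The same launch can be done from Python through Xinference's RESTful client. A
minimal sketch, assuming a local supervisor on the default endpoint
``http://localhost:9997`` and this client API (the exact keyword set can vary
across Xinference versions); note that ``2_7`` is the underscore spelling of
2.7 billion, used because the size value cannot contain a dot::

    from xinference.client import Client

    # Connect to a running Xinference supervisor (default port is 9997).
    client = Client("http://localhost:9997")

    # Programmatic equivalent of the CLI command above; pick one of the
    # quantizations listed in this spec: "4-bit", "8-bit" or "none".
    model_uid = client.launch_model(
        model_name="qwen1.5-moe-chat",
        model_format="pytorch",
        model_size_in_billions="2_7",
        quantization="none",
    )
    print(model_uid)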


Model Spec 2 (gptq, 2_7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** gptq
- **Model Size (in billions):** 2_7
- **Quantizations:** Int4
- **Model ID:** Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

   xinference launch --model-name qwen1.5-moe-chat --size-in-billions 2_7 --model-format gptq --quantization ${quantization}
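
Once launched, the model can be queried through the same client. A sketch of a
chat round trip; the ``chat`` signature below matches the prompt/system-prompt
style Xinference used around the time of this commit and has since changed, so
treat it as illustrative::

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model = client.get_model(model_uid)  # uid returned by launch_model / CLI

    # The reply arrives as an OpenAI-style dict.
    response = model.chat(
        prompt="Briefly, what makes a mixture-of-experts model efficient?",
        system_prompt="You are a helpful assistant.",
        generate_config={"max_tokens": 256},
    )
    print(response["choices"][0]["message"]["content"])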

52 changes: 52 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -1878,6 +1878,58 @@
            ]
        }
    },
    {
        "version": 1,
        "context_length": 32768,
        "model_name": "qwen1.5-moe-chat",
        "model_lang": [
            "en",
            "zh"
        ],
        "model_ability": [
            "chat"
        ],
        "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": "2_7",
                "quantizations": [
                    "4-bit",
                    "8-bit",
                    "none"
                ],
                "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat"
            },
            {
                "model_format": "gptq",
                "model_size_in_billions": "2_7",
                "quantizations": [
                    "Int4"
                ],
                "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4"
            }
        ],
        "prompt_style": {
            "style_name": "QWEN",
            "system_prompt": "You are a helpful assistant.",
            "roles": [
                "user",
                "assistant"
            ],
            "intra_message_sep": "\n",
            "stop_token_ids": [
                151643,
                151644,
                151645
            ],
            "stop": [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>"
            ]
        }
    },
    {
        "version": 1,
        "context_length": 8192,
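The ``prompt_style`` block above is Qwen's ChatML dialect: every turn is
wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers, which is also why those
markers and ``<|endoftext|>`` appear as stop strings, alongside the matching
Qwen token ids 151644, 151645 and 151643. The following hand-written sketch
shows how such a template expands (it is illustrative, not Xinference's actual
formatting code)::

    def render_chatml(system_prompt, messages):
        """Render (role, content) pairs in Qwen's ChatML format."""
        sep = "\n"  # intra_message_sep from the spec above
        parts = ["<|im_start|>system" + sep + system_prompt + "<|im_end|>"]
        for role, content in messages:  # roles are "user" / "assistant"
            parts.append("<|im_start|>" + role + sep + content + "<|im_end|>")
        # Leave an open assistant turn for the model to complete; generation
        # then stops at <|im_end|>, one of the stop tokens listed above.
        parts.append("<|im_start|>assistant" + sep)
        return sep.join(parts)

    print(render_chatml("You are a helpful assistant.", [("user", "Hello!")]))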
54 changes: 54 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -2121,6 +2121,60 @@
            ]
        }
    },
    {
        "version": 1,
        "context_length": 32768,
        "model_name": "qwen1.5-moe-chat",
        "model_lang": [
            "en",
            "zh"
        ],
        "model_ability": [
            "chat"
        ],
        "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": "2_7",
                "quantizations": [
                    "4-bit",
                    "8-bit",
                    "none"
                ],
                "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
                "model_hub": "modelscope"
            },
            {
                "model_format": "gptq",
                "model_size_in_billions": "2_7",
                "quantizations": [
                    "Int4"
                ],
                "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
                "model_hub": "modelscope"
            }
        ],
        "prompt_style": {
            "style_name": "QWEN",
            "system_prompt": "You are a helpful assistant.",
            "roles": [
                "user",
                "assistant"
            ],
            "intra_message_sep": "\n",
            "stop_token_ids": [
                151643,
                151644,
                151645
            ],
            "stop": [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>"
            ]
        }
    },
    {
        "version": 1,
        "context_length": 4096,
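The ModelScope entry mirrors the Hugging Face one; the only differences are
the lowercase ``qwen`` namespace in ``model_id`` and the explicit
``"model_hub": "modelscope"`` field, which tells Xinference to download from
ModelScope. Setting the environment variable ``XINFERENCE_MODEL_SRC=modelscope``
is the usual way to prefer that hub, e.g. on hosts where Hugging Face is slow
to reach.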
3 changes: 3 additions & 0 deletions xinference/model/llm/vllm/core.py
@@ -120,6 +120,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")

if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")


class VLLMModel(LLM):
    def __init__(
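One caveat on the new guard: ``vllm.__version__ >= "0.4.0"`` compares strings
lexicographically, which misorders release numbers such as ``"0.10.0"`` (it
sorts before ``"0.4.0"``). A more robust variant, offered as a suggestion
rather than what this commit ships, parses the versions first::

    from packaging import version  # small PyPI package: pip install packaging

    if VLLM_INSTALLED and version.parse(vllm.__version__) >= version.parse("0.4.0"):
        VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")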
