diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index 8a17a37445..2b51e27bba 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -43,7 +43,7 @@ Currently, supported models include: - ``baichuan``, ``baichuan-chat``, ``baichuan-2-chat`` - ``internlm-16k``, ``internlm-chat-7b``, ``internlm-chat-8k``, ``internlm-chat-20b`` - ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2`` -- ``Yi``, ``Yi-chat`` +- ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` - ``c4ai-command-r-v01``, ``c4ai-command-r-v01-4bit`` - ``vicuna-v1.3``, ``vicuna-v1.5`` diff --git a/doc/source/models/builtin/llm/codeqwen1.5-chat.rst b/doc/source/models/builtin/llm/codeqwen1.5-chat.rst index ff6b7f20b2..2a5ef3cf32 100644 --- a/doc/source/models/builtin/llm/codeqwen1.5-chat.rst +++ b/doc/source/models/builtin/llm/codeqwen1.5-chat.rst @@ -4,7 +4,7 @@ codeqwen1.5-chat ======================================== -- **Context Length:** 32768 +- **Context Length:** 65536 - **Model Name:** codeqwen1.5-chat - **Languages:** en, zh - **Abilities:** chat diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index f371139ad5..030eb69797 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -108,7 +108,7 @@ The following is a list of built-in LLM in Xinference: * - :ref:`codeqwen1.5-chat <models_llm_codeqwen1.5-chat>` - chat - - 32768 + - 65536 - CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes. * - :ref:`codeshell <models_llm_codeshell>` @@ -381,6 +381,11 @@ The following is a list of built-in LLM in Xinference: - 8192 - Starcoderplus is an open-source LLM trained by fine-tuning Starcoder on RedefinedWeb and StarCoderData datasets.
+ * - :ref:`starling-lm <models_llm_starling-lm>` + - chat + - 4096 + - We introduce Starling-7B, an open large language model (LLM) trained by Reinforcement Learning from AI Feedback (RLAIF). The model harnesses the power of our new GPT-4 labeled ranking dataset + * - :ref:`tiny-llama <models_llm_tiny-llama>` - generate - 2048 @@ -431,19 +436,29 @@ The following is a list of built-in LLM in Xinference: - 4096 - The Yi series models are large language models trained from scratch by developers at 01.AI. + * - :ref:`yi-1.5 <models_llm_yi-1.5>` + - generate + - 4096 + - Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples. + + * - :ref:`yi-1.5-chat <models_llm_yi-1.5-chat>` + - chat + - 4096 + - Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples. + * - :ref:`yi-200k <models_llm_yi-200k>` - generate - - 204800 + - 262144 - The Yi series models are large language models trained from scratch by developers at 01.AI. * - :ref:`yi-chat <models_llm_yi-chat>` - chat - - 204800 + - 4096 - The Yi series models are large language models trained from scratch by developers at 01.AI. * - :ref:`yi-vl-chat <models_llm_yi-vl-chat>` - chat, vision - - 204800 + - 4096 - Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images. * - :ref:`zephyr-7b-alpha <models_llm_zephyr-7b-alpha>` @@ -607,6 +622,8 @@ The following is a list of built-in LLM in Xinference: starcoderplus + starling-lm + tiny-llama vicuna-v1.3 @@ -627,6 +644,10 @@ The following is a list of built-in LLM in Xinference: yi + yi-1.5 + + yi-1.5-chat + yi-200k yi-chat diff --git a/doc/source/models/builtin/llm/yi-1.5-chat.rst b/doc/source/models/builtin/llm/yi-1.5-chat.rst new file mode 100644 index 0000000000..df77f84b9a --- /dev/null +++ b/doc/source/models/builtin/llm/yi-1.5-chat.rst @@ -0,0 +1,60 @@ +..
_models_llm_yi-1.5-chat: + +======================================== +Yi-1.5-chat +======================================== + +- **Context Length:** 4096 +- **Model Name:** Yi-1.5-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 6 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-1.5-6B-Chat +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-1.5-6B-Chat>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-1.5-6B-Chat>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-1.5-chat --size-in-billions 6 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 9 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 9 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-1.5-9B-Chat +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-1.5-9B-Chat>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-1.5-chat --size-in-billions 9 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-1.5-34B-Chat +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-1.5-34B-Chat>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method
from the options listed above:: + + xinference launch --model-name Yi-1.5-chat --size-in-billions 34 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/yi-1.5.rst b/doc/source/models/builtin/llm/yi-1.5.rst new file mode 100644 index 0000000000..a0b3727e53 --- /dev/null +++ b/doc/source/models/builtin/llm/yi-1.5.rst @@ -0,0 +1,60 @@ +.. _models_llm_yi-1.5: + +======================================== +Yi-1.5 +======================================== + +- **Context Length:** 4096 +- **Model Name:** Yi-1.5 +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 6 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-1.5-6B +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-1.5-6B>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-1.5-6B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-1.5 --size-in-billions 6 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 9 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 9 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-1.5-9B +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-1.5-9B>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-1.5-9B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-1.5 --size-in-billions 9 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion)
+++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-1.5-34B +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-1.5-34B>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-1.5-34B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-1.5 --size-in-billions 34 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/yi-200k.rst b/doc/source/models/builtin/llm/yi-200k.rst index 15dcca708b..acb8aa1750 100644 --- a/doc/source/models/builtin/llm/yi-200k.rst +++ b/doc/source/models/builtin/llm/yi-200k.rst @@ -4,7 +4,7 @@ Yi-200k ======================================== -- **Context Length:** 204800 +- **Context Length:** 262144 - **Model Name:** Yi-200k - **Languages:** en, zh - **Abilities:** generate diff --git a/doc/source/models/builtin/llm/yi-chat.rst b/doc/source/models/builtin/llm/yi-chat.rst index 859e8526bc..4d5df8065b 100644 --- a/doc/source/models/builtin/llm/yi-chat.rst +++ b/doc/source/models/builtin/llm/yi-chat.rst @@ -4,7 +4,7 @@ Yi-chat ======================================== -- **Context Length:** 204800 +- **Context Length:** 4096 - **Model Name:** Yi-chat - **Languages:** en, zh - **Abilities:** chat @@ -29,7 +29,22 @@ chosen quantization method from the options listed above:: xinference launch --model-name Yi-chat --size-in-billions 34 --model-format gptq --quantization ${quantization} -Model Spec 2 (pytorch, 34 Billion) +Model Spec 2 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 6 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-6B-Chat +- **Model Hubs**: `Hugging Face <https://huggingface.co/01-ai/Yi-6B-Chat>`__, `ModelScope <https://modelscope.cn/models/01ai/Yi-6B-Chat>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen
quantization method from the options listed above:: + + xinference launch --model-name Yi-chat --size-in-billions 6 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) ++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch @@ -44,7 +59,7 @@ chosen quantization method from the options listed above:: xinference launch --model-name Yi-chat --size-in-billions 34 --model-format pytorch --quantization ${quantization} -Model Spec 3 (ggufv2, 34 Billion) +Model Spec 4 (ggufv2, 34 Billion) ++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 diff --git a/doc/source/models/builtin/llm/yi-vl-chat.rst b/doc/source/models/builtin/llm/yi-vl-chat.rst index ea2f192c14..96fa6c64d1 100644 --- a/doc/source/models/builtin/llm/yi-vl-chat.rst +++ b/doc/source/models/builtin/llm/yi-vl-chat.rst @@ -4,7 +4,7 @@ yi-vl-chat ======================================== -- **Context Length:** 204800 +- **Context Length:** 4096 - **Model Name:** yi-vl-chat - **Languages:** en, zh - **Abilities:** chat, vision diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst index 2cef3cde4f..477e6f4e2f 100644 --- a/doc/source/user_guide/backends.rst +++ b/doc/source/user_guide/backends.rst @@ -50,7 +50,7 @@ Currently, supported model includes: - ``baichuan``, ``baichuan-chat``, ``baichuan-2-chat`` - ``internlm-16k``, ``internlm-chat-7b``, ``internlm-chat-8k``, ``internlm-chat-20b`` - ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2`` -- ``Yi``, ``Yi-chat`` +- ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat`` - ``code-llama``, ``code-llama-python``, ``code-llama-instruct`` - ``c4ai-command-r-v01``, ``c4ai-command-r-v01-4bit`` - ``vicuna-v1.3``, ``vicuna-v1.5`` diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 082fa33c12..02981c3838 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -3651,7 +3651,7 @@ }, { 
"version": 1, - "context_length": 204800, + "context_length": 262144, "model_name": "Yi-200k", "model_lang": [ "en", @@ -3688,7 +3688,7 @@ }, { "version": 1, - "context_length": 204800, + "context_length": 4096, "model_name": "Yi-chat", "model_lang": [ "en", @@ -3707,6 +3707,17 @@ ], "model_id": "01-ai/Yi-34B-Chat-{quantization}" }, + { + "model_format": "pytorch", + "model_size_in_billions": 6, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-6B-Chat", + "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b" + }, { "model_format": "pytorch", "model_size_in_billions": 34, @@ -3762,6 +3773,124 @@ ] } }, + { + "version": 1, + "context_length": 4096, + "model_name": "Yi-1.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 6, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-1.5-6B", + "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-1.5-9B", + "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 34, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-1.5-34B", + "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5" + } + ] + }, + { + "version": 1, + "context_length": 4096, + "model_name": "Yi-1.5-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "Yi-1.5 is an upgraded version of Yi. 
It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 6, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-1.5-6B-Chat", + "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-1.5-9B-Chat", + "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 34, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "01-ai/Yi-1.5-34B-Chat", + "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e" + } + ], + "prompt_style": { + "style_name": "CHATML", + "system_prompt": "", + "roles": [ + "<|im_start|>user", + "<|im_start|>assistant" + ], + "intra_message_sep": "<|im_end|>", + "inter_message_sep": "", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] + } + }, { "version": 1, "context_length": 2048, @@ -4684,7 +4813,7 @@ }, { "version": 1, - "context_length": 204800, + "context_length": 4096, "model_name": "yi-vl-chat", "model_lang": [ "en", diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index f34d2ad1b7..149391d717 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1289,7 +1289,7 @@ }, { "version": 1, - "context_length": 204800, + "context_length": 262144, "model_name": "Yi-200k", "model_lang": [ "en", @@ -1328,7 +1328,7 @@ }, { "version": 1, - "context_length": 204800, + "context_length": 4096, "model_name": "Yi-chat", "model_lang": [ "en", @@ -1349,6 +1349,18 @@ "model_id": "01ai/Yi-34B-Chat-{quantization}", "model_revision": "master" 
}, + { + "model_format": "pytorch", + "model_size_in_billions": 6, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-6B-Chat", + "model_revision": "master" + }, { "model_format": "pytorch", "model_size_in_billions": 34, @@ -1385,6 +1397,130 @@ ] } }, + { + "version": 1, + "context_length": 4096, + "model_name": "Yi-1.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "generate" + ], + "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 6, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-1.5-6B", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-1.5-9B", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 34, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-1.5-34B", + "model_revision": "master" + } + ] + }, + { + "version": 1, + "context_length": 4096, + "model_name": "Yi-1.5-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "Yi-1.5 is an upgraded version of Yi. 
It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 6, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-1.5-6B-Chat", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-1.5-9B-Chat", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 34, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_hub": "modelscope", + "model_id": "01ai/Yi-1.5-34B-Chat", + "model_revision": "master" + } + ], + "prompt_style": { + "style_name": "CHATML", + "system_prompt": "", + "roles": [ + "<|im_start|>user", + "<|im_start|>assistant" + ], + "intra_message_sep": "<|im_end|>", + "inter_message_sep": "", + "stop_token_ids": [ + 2, + 6, + 7, + 8 + ], + "stop": [ + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|im_sep|>" + ] + } + }, { "version": 1, "context_length": 2048, @@ -2755,7 +2891,7 @@ }, { "version": 1, - "context_length": 204800, + "context_length": 4096, "model_name": "yi-vl-chat", "model_lang": [ "en", diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 961362b125..bc68286188 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -91,6 +91,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "internlm-16k", "mistral-v0.1", "Yi", + "Yi-1.5", "code-llama", "code-llama-python", ] @@ -107,6 +108,7 @@ class VLLMGenerateConfig(TypedDict, total=False): "internlm2-chat", "qwen-chat", "Yi-chat", + "Yi-1.5-chat", "code-llama-instruct", "mistral-instruct-v0.1", "mistral-instruct-v0.2",