FEAT: support llama-3 family #1332

Merged 12 commits on Apr 22, 2024
2 changes: 1 addition & 1 deletion doc/source/getting_started/installation.rst
@@ -36,7 +36,7 @@ Currently, supported models include:

.. vllm_start

- ``llama-2``, ``llama-2-chat``
- ``llama-2``, ``llama-3``, ``llama-2-chat``, ``llama-3-instruct``
- ``baichuan``, ``baichuan-chat``, ``baichuan-2-chat``
- ``internlm-16k``, ``internlm-chat-7b``, ``internlm-chat-8k``, ``internlm-chat-20b``
- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``
14 changes: 14 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -206,6 +206,16 @@ The following is a list of built-in LLM in Xinference:
- 4096
- Llama-2-Chat is a fine-tuned version of the Llama-2 LLM, specializing in chatting.

* - :ref:`llama-3 <models_llm_llama-3>`
- generate
- 8192
- Llama 3 is an auto-regressive language model that uses an optimized transformer architecture.

* - :ref:`llama-3-instruct <models_llm_llama-3-instruct>`
- chat
- 8192
- The Llama 3 instruction-tuned models are optimized for dialogue use cases and outperform many of the available open-source chat models on common industry benchmarks.

* - :ref:`minicpm-2b-dpo-bf16 <models_llm_minicpm-2b-dpo-bf16>`
- chat
- 4096
@@ -512,6 +522,10 @@ The following is a list of built-in LLM in Xinference:

llama-2-chat

llama-3

llama-3-instruct

minicpm-2b-dpo-bf16

minicpm-2b-dpo-fp16
75 changes: 75 additions & 0 deletions doc/source/models/builtin/llm/llama-3-instruct.rst
@@ -0,0 +1,75 @@
.. _models_llm_llama-3-instruct:

========================================
llama-3-instruct
========================================

- **Context Length:** 8192
- **Model Name:** llama-3-instruct
- **Languages:** en
- **Abilities:** chat
- **Description:** The Llama 3 instruction-tuned models are optimized for dialogue use cases and outperform many of the available open-source chat models on common industry benchmarks.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (ggufv2, 8 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
- **Model Size (in billions):** 8
- **Quantizations:** IQ3_M, Q4_K_M, Q5_K_M, Q6_K, Q8_0
- **Model ID:** lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF
- **Model Hubs**: `Hugging Face <https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3-instruct --size-in-billions 8 --model-format ggufv2 --quantization ${quantization}
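
Once the model is launched, you can talk to it from Python. Below is a minimal
sketch using Xinference's client API; it assumes the server runs on the default
endpoint ``http://127.0.0.1:9997``, and ``<model_uid>`` is a placeholder for the
model UID printed by the launch command::

    from xinference.client import Client

    # Connect to the running Xinference server (adjust the endpoint if needed).
    client = Client("http://127.0.0.1:9997")

    # Replace <model_uid> with the UID reported by `xinference launch`.
    model = client.get_model("<model_uid>")

    response = model.chat(
        prompt="What is the capital of France?",
        generate_config={"max_tokens": 128},
    )
    # The reply follows the OpenAI chat-completion shape.
    print(response["choices"][0]["message"]["content"])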


Model Spec 2 (pytorch, 8 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 8
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** meta-llama/Meta-Llama-3-8B-Instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3-instruct --size-in-billions 8 --model-format pytorch --quantization ${quantization}


Model Spec 3 (ggufv2, 70 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
- **Model Size (in billions):** 70
- **Quantizations:** IQ1_M, IQ2_XS, Q4_K_M
- **Model ID:** lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF
- **Model Hubs**: `Hugging Face <https://huggingface.co/lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3-instruct --size-in-billions 70 --model-format ggufv2 --quantization ${quantization}


Model Spec 4 (pytorch, 70 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 70
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** meta-llama/Meta-Llama-3-70B-Instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3-instruct --size-in-billions 70 --model-format pytorch --quantization ${quantization}
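
With ``generate_config={"stream": True}``, the same ``chat`` call yields the
reply incrementally. A hedged sketch, assuming the chunks follow the
OpenAI-style streaming format::

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model = client.get_model("<model_uid>")  # UID from `xinference launch`

    # Stream tokens as they are produced instead of waiting for the full reply.
    for chunk in model.chat(
        prompt="Write a haiku about llamas.",
        generate_config={"stream": True, "max_tokens": 64},
    ):
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)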

75 changes: 75 additions & 0 deletions doc/source/models/builtin/llm/llama-3.rst
@@ -0,0 +1,75 @@
.. _models_llm_llama-3:

========================================
llama-3
========================================

- **Context Length:** 8192
- **Model Name:** llama-3
- **Languages:** en
- **Abilities:** generate
- **Description:** Llama 3 is an auto-regressive language model that uses an optimized transformer architecture.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (pytorch, 8 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 8
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** meta-llama/Meta-Llama-3-8B
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3 --size-in-billions 8 --model-format pytorch --quantization ${quantization}
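
Once launched, the base model supports text completion (``generate``) rather
than chat. A minimal sketch under the same assumptions as above: default
endpoint ``http://127.0.0.1:9997`` and ``<model_uid>`` standing in for the UID
printed by the launch command::

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model = client.get_model("<model_uid>")  # UID from `xinference launch`

    # llama-3 is a generate-only model, so we ask for a completion.
    completion = model.generate(
        prompt="The capital of France is",
        generate_config={"max_tokens": 32},
    )
    print(completion["choices"][0]["text"])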


Model Spec 2 (ggufv2, 8 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
- **Model Size (in billions):** 8
- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0
- **Model ID:** QuantFactory/Meta-Llama-3-8B-GGUF
- **Model Hubs**: `Hugging Face <https://huggingface.co/QuantFactory/Meta-Llama-3-8B-GGUF>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3 --size-in-billions 8 --model-format ggufv2 --quantization ${quantization}


Model Spec 3 (pytorch, 70 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 70
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** meta-llama/Meta-Llama-3-70B
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3-70B>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3 --size-in-billions 70 --model-format pytorch --quantization ${quantization}


Model Spec 4 (ggufv2, 70 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
- **Model Size (in billions):** 70
- **Quantizations:** Q4_K_M, Q5_K_M
- **Model ID:** NousResearch/Meta-Llama-3-70B-GGUF
- **Model Hubs**: `Hugging Face <https://huggingface.co/NousResearch/Meta-Llama-3-70B-GGUF>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name llama-3 --size-in-billions 70 --model-format ggufv2 --quantization ${quantization}

2 changes: 1 addition & 1 deletion doc/source/user_guide/backends.rst
@@ -45,7 +45,7 @@ Currently, supported models include:

.. vllm_start

- ``llama-2``, ``llama-2-chat``
- ``llama-2``, ``llama-3``, ``llama-2-chat``, ``llama-3-instruct``
- ``baichuan``, ``baichuan-chat``, ``baichuan-2-chat``
- ``internlm-16k``, ``internlm-chat-7b``, ``internlm-chat-8k``, ``internlm-chat-20b``
- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``
142 changes: 142 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -1220,6 +1220,148 @@
}
]
},
{
"version": 1,
"context_length": 8192,
"model_name": "llama-3",
"model_lang": [
"en"
],
"model_ability": [
"generate"
],
"model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 8,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "meta-llama/Meta-Llama-3-8B"
},
{
"model_format": "ggufv2",
"model_size_in_billions": 8,
"quantizations": [
"Q2_K",
"Q3_K_L",
"Q3_K_M",
"Q3_K_S",
"Q4_0",
"Q4_1",
"Q4_K_M",
"Q4_K_S",
"Q5_0",
"Q5_1",
"Q5_K_M",
"Q5_K_S",
"Q6_K",
"Q8_0"
],
"model_id": "QuantFactory/Meta-Llama-3-8B-GGUF",
"model_file_name_template": "Meta-Llama-3-8B.{quantization}.gguf"
},
{
"model_format": "pytorch",
"model_size_in_billions": 70,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "meta-llama/Meta-Llama-3-70B"
},
{
"model_format": "ggufv2",
"model_size_in_billions": 70,
"quantizations": [
"Q4_K_M",
"Q5_K_M"
],
"model_id": "NousResearch/Meta-Llama-3-70B-GGUF",
"model_file_name_template": "Meta-Llama-3-70B-{quantization}.gguf"
}
]
},
{
"version": 1,
"context_length": 8192,
"model_name": "llama-3-instruct",
"model_lang": [
"en"
],
"model_ability": [
"chat"
],
"model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
"model_specs": [
{
"model_format": "ggufv2",
"model_size_in_billions": 8,
"quantizations": [
"IQ3_M",
"Q4_K_M",
"Q5_K_M",
"Q6_K",
"Q8_0"
],
"model_id": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
"model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
},
{
"model_format": "pytorch",
"model_size_in_billions": 8,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "meta-llama/Meta-Llama-3-8B-Instruct"
},
{
"model_format": "ggufv2",
"model_size_in_billions": 70,
"quantizations": [
"IQ1_M",
"IQ2_XS",
"Q4_K_M"
],
"model_id": "lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF",
"model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
},
{
"model_format": "pytorch",
"model_size_in_billions": 70,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "meta-llama/Meta-Llama-3-70B-Instruct"
}
],
"prompt_style": {
"style_name": "LLAMA3",
"system_prompt": "You are a helpful assistant.",
"roles": [
"user",
"assistant"
],
"intra_message_sep": "\n\n",
"inter_message_sep": "<|eot_id|>",
"stop_token_ids": [
128001,
128009
],
"stop": [
"<|end_of_text|>",
"<|eot_id|>"
]
}
},
{
"version": 1,
"context_length": 2048,
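The ``prompt_style`` block above encodes the Llama 3 chat format: each turn is
wrapped in header tokens and terminated by ``<|eot_id|>``. As a rough
illustration of how a conversation is assembled under that style (a simplified
sketch, not Xinference's actual formatting code)::

    def render_llama3(system_prompt: str, messages: list[tuple[str, str]]) -> str:
        # Each turn is wrapped in header tokens and closed with <|eot_id|>,
        # matching inter_message_sep and the stop tokens declared above.
        parts = ["<|begin_of_text|>"]
        for role, content in [("system", system_prompt)] + messages:
            parts.append(
                f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
            )
        # Open an assistant header so the model generates the next reply.
        parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
        return "".join(parts)

    print(render_llama3("You are a helpful assistant.", [("user", "Hello!")]))

Generation stops when the model emits token 128001 (``<|end_of_text|>``) or
128009 (``<|eot_id|>``), matching the ``stop_token_ids`` and ``stop`` entries above.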