diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py
index bb739c1e41..ec58594cc3 100644
--- a/xinference/api/restful_api.py
+++ b/xinference/api/restful_api.py
@@ -275,16 +275,6 @@ def serve(self, logging_conf: Optional[dict] = None):
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
-        self._router.add_api_route(
-            "/v1/engines/{model_name}",
-            self.query_engines_by_model_name,
-            methods=["GET"],
-            dependencies=(
-                [Security(self._auth_service, scopes=["models:list"])]
-                if self.is_authenticated()
-                else None
-            ),
-        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",
@@ -1428,19 +1418,6 @@ async def stream_results():
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
-    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
-        try:
-            content = await (
-                await self._get_supervisor_ref()
-            ).query_engines_by_model_name(model_name)
-            return JSONResponse(content=content)
-        except ValueError as re:
-            logger.error(re, exc_info=True)
-            raise HTTPException(status_code=400, detail=str(re))
-        except Exception as e:
-            logger.error(e, exc_info=True)
-            raise HTTPException(status_code=500, detail=str(e))
-
     async def register_model(self, model_type: str, request: Request) -> JSONResponse:
         body = RegisterModelRequest.parse_obj(await request.json())
         model = body.model
diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py
index 8da7d1947f..115baf1e73 100644
--- a/xinference/core/supervisor.py
+++ b/xinference/core/supervisor.py
@@ -591,24 +591,6 @@ def get_model_registration(self, model_type: str, model_name: str) -> Any:
         else:
             raise ValueError(f"Unsupported model type: {model_type}")
 
-    @log_async(logger=logger)
-    async def query_engines_by_model_name(self, model_name: str):
-        from copy import deepcopy
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
-        if model_name not in LLM_ENGINES:
-            raise ValueError(f"Model {model_name} not found")
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
-
     @log_async(logger=logger)
     async def register_model(self, model_type: str, model: str, persist: bool):
         if model_type in self._custom_register_type_to_cls:
diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index c6af541362..ec25095c3d 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -30,14 +30,8 @@
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
-    LLAMA_CLASSES,
     LLM_CLASSES,
-    LLM_ENGINES,
     PEFT_SUPPORTED_CLASSES,
-    PYTORCH_CLASSES,
-    SGLANG_CLASSES,
-    SUPPORTED_ENGINES,
-    VLLM_CLASSES,
     CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
@@ -53,50 +47,6 @@
 )
 
 
-def generate_engine_config_by_model_family(model_family):
-    model_name = model_family.model_name
-    specs = model_family.model_specs
-    engines = {}  # structure for engine query
-    for spec in specs:
-        model_format = spec.model_format
-        model_size_in_billions = spec.model_size_in_billions
-        quantizations = spec.quantizations
-        for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and quatization of model
-            for engine in SUPPORTED_ENGINES:
-                CLASSES = SUPPORTED_ENGINES[engine]
-                for cls in CLASSES:
-                    if cls.match(model_family, spec, quantization):
-                        engine_params = engines.get(engine, [])
-                        already_exists = False
-                        # if the name, format and size in billions of model already exists in the structure, add the new quantization
-                        for param in engine_params:
-                            if (
-                                model_name == param["model_name"]
-                                and model_format == param["model_format"]
-                                and model_size_in_billions
-                                == param["model_size_in_billions"]
-                                and quantization not in param["quantizations"]
-                            ):
-                                param["quantizations"].append(quantization)
-                                already_exists = True
-                                break
-                        # successfully match the params for the first time, add to the structure
-                        if not already_exists:
-                            engine_params.append(
-                                {
-                                    "model_name": model_name,
-                                    "model_format": model_format,
-                                    "model_size_in_billions": model_size_in_billions,
-                                    "quantizations": [quantization],
-                                    "llm_class": cls,
-                                }
-                            )
-                        engines[engine] = engine_params
-                        break
-    LLM_ENGINES[model_name] = engines
-
-
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
@@ -126,17 +76,8 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLAMA_CLASSES.extend(
-        [
-            ChatglmCppChatModel,
-            LlamaCppChatModel,
-            LlamaCppModel,
-        ]
-    )
     LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
-    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
     LLM_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -155,24 +96,6 @@ def _install():
             PytorchModel,
         ]
     )
-    PYTORCH_CLASSES.extend(
-        [
-            BaichuanPytorchChatModel,
-            VicunaPytorchChatModel,
-            FalconPytorchChatModel,
-            ChatglmPytorchChatModel,
-            LlamaPytorchModel,
-            LlamaPytorchChatModel,
-            PytorchChatModel,
-            FalconPytorchModel,
-            Internlm2PytorchChatModel,
-            QwenVLChatModel,
-            OmniLMMModel,
-            YiVLChatModel,
-            DeepSeekVLChatModel,
-            PytorchModel,
-        ]
-    )
     PEFT_SUPPORTED_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -190,12 +113,6 @@ def _install():
         ]
     )
 
-    # support 4 engines for now
-    SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
-    SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
-    SUPPORTED_ENGINES["PyTorch"] = PYTORCH_CLASSES
-    SUPPORTED_ENGINES["llama-cpp-python"] = LLAMA_CLASSES
-
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
     )
@@ -246,11 +163,6 @@ def _install():
         if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
             LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))
 
-    # traverse all families and add engine parameters corresponding to the model name
-    for families in [BUILTIN_LLM_FAMILIES, BUILTIN_MODELSCOPE_LLM_FAMILIES]:
-        for family in families:
-            generate_engine_config_by_model_family(family)
-
     from ...constants import XINFERENCE_MODEL_DIR
 
     user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
index ecf2e3071e..6f5a01d6e7 100644
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -227,25 +227,16 @@ def parse_raw(
 CustomLLMFamilyV1.update_forward_refs()
 
 
-LLAMA_CLASSES: List[Type[LLM]] = []
 LLM_CLASSES: List[Type[LLM]] = []
 PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
-SGLANG_CLASSES: List[Type[LLM]] = []
-PYTORCH_CLASSES: List[Type[LLM]] = []
-
 UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 UD_LLM_FAMILIES_LOCK = Lock()
 
-VLLM_CLASSES: List[Type[LLM]] = []
-
-LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
-SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
-
 LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
 
 
@@ -913,7 +904,6 @@ def _apply_format_to_model_id(spec: LLMSpecV1, q: str) -> LLMSpecV1:
 
 def register_llm(llm_family: LLMFamilyV1, persist: bool):
     from ..utils import is_valid_model_name
-    from . import generate_engine_config_by_model_family
 
     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")
@@ -926,7 +916,6 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
         )
 
         UD_LLM_FAMILIES.append(llm_family)
-        generate_engine_config_by_model_family(llm_family)
 
     if persist:
         # We only validate model URL when persist is True.
@@ -952,7 +941,6 @@ def unregister_llm(model_name: str, raise_error: bool = True):
             break
     if llm_family:
         UD_LLM_FAMILIES.remove(llm_family)
-        del LLM_ENGINES[model_name]
 
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
@@ -1002,31 +990,3 @@ def match_llm_cls(
         if cls.match(family, llm_spec, quantization):
             return cls
     return None
-
-
-def check_engine_by_spec_parameters(
-    model_engine: str,
-    model_name: str,
-    model_format: str,
-    model_size_in_billions: Union[str, int],
-    quantization: str,
-) -> Optional[Type[LLM]]:
-    if model_name not in LLM_ENGINES:
-        logger.debug(f"Cannot find model {model_name}.")
-        return None
-    if model_engine not in LLM_ENGINES[model_name]:
-        logger.debug(f"Model {model_name} cannot be run on engine {model_engine}.")
-        return None
-    match_params = LLM_ENGINES[model_name][model_engine]
-    for param in match_params:
-        if (
-            model_name == param["model_name"]
-            and model_format == param["model_format"]
-            and model_size_in_billions == param["model_size_in_billions"]
-            and quantization in param["quantizations"]
-        ):
-            return param["llm_class"]
-    logger.debug(
-        f"Model {model_name} with format {model_format}, size {model_size_in_billions} and quantization {quantization} cannot be run on engine {model_engine}."
-    )
-    return None
diff --git a/xinference/model/llm/tests/test_llm_family.py b/xinference/model/llm/tests/test_llm_family.py
index 81dc586e38..ec119b43ce 100644
--- a/xinference/model/llm/tests/test_llm_family.py
+++ b/xinference/model/llm/tests/test_llm_family.py
@@ -14,19 +14,13 @@
 import codecs
 import json
 import os
-import platform
 import shutil
 import tempfile
 from unittest.mock import MagicMock, Mock, patch
 
 import pytest
 
-from ....constants import (
-    XINFERENCE_DISABLE_VLLM,
-    XINFERENCE_ENABLE_SGLANG,
-    XINFERENCE_ENV_MODEL_SRC,
-)
-from ....utils import cuda_count
+from ....constants import XINFERENCE_ENV_MODEL_SRC
 from ...utils import is_locale_chinese_simplified, valid_model_revision
 from ..llm_family import (
     AWSRegion,
@@ -45,8 +39,6 @@
     match_model_size,
     parse_uri,
 )
-from ..sglang.core import SGLANG_INSTALLED
-from ..vllm.core import VLLM_INSTALLED
 
 
 def test_deserialize_llm_family_v1():
@@ -1072,251 +1064,3 @@ def test_match_model_size():
     assert not match_model_size("1.8", 18)
     assert not match_model_size("1.8", 1)
     assert match_model_size("001", 1)
-
-
-@pytest.mark.skipif(
-    XINFERENCE_DISABLE_VLLM
-    or platform.system() != "Linux"
-    or cuda_count() <= 0
-    or not VLLM_INSTALLED,
-    reason="Current system does not support vLLM",
-)
-def test_quert_engine_vLLM():
-    from ..llm_family import LLM_ENGINES, check_engine_by_spec_parameters
-
-    model_name = "qwen1.5-chat"
-    assert model_name in LLM_ENGINES
-
-    assert (
-        "vLLM" in LLM_ENGINES[model_name] and len(LLM_ENGINES[model_name]["vLLM"]) == 21
-    )
-
-    assert check_engine_by_spec_parameters(
-        model_engine="vLLM",
-        model_name=model_name,
-        model_format="gptq",
-        model_size_in_billions="1_8",
-        quantization="Int4",
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="vLLM",
-            model_name=model_name,
-            model_format="gptq",
-            model_size_in_billions="1_8",
-            quantization="Int8",
-        )
-        is None
-    )
-    assert check_engine_by_spec_parameters(
-        model_engine="vLLM",
-        model_name=model_name,
-        model_format="pytorch",
-        model_size_in_billions="1_8",
-        quantization="none",
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="vLLM",
-            model_name=model_name,
-            model_format="pytorch",
-            model_size_in_billions="1_8",
-            quantization="4-bit",
-        )
-        is None
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="vLLM",
-            model_name=model_name,
-            model_format="ggmlv3",
-            model_size_in_billions="1_8",
-            quantization="q2_k",
-        )
-        is None
-    )
-
-
-@pytest.mark.skipif(
-    not XINFERENCE_ENABLE_SGLANG
-    or platform.system() != "Linux"
-    or cuda_count() <= 0
-    or not SGLANG_INSTALLED,
-    reason="Current system does not support SGLang",
-)
-def test_quert_engine_SGLang():
-    from ..llm_family import LLM_ENGINES, check_engine_by_spec_parameters
-
-    model_name = "qwen1.5-chat"
-    assert model_name in LLM_ENGINES
-
-    assert (
-        "SGLang" in LLM_ENGINES[model_name]
-        and len(LLM_ENGINES[model_name]["SGLang"]) == 21
-    )
-
-    assert check_engine_by_spec_parameters(
-        model_engine="SGLang",
-        model_name=model_name,
-        model_format="gptq",
-        model_size_in_billions="1_8",
-        quantization="Int4",
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="SGLang",
-            model_name=model_name,
-            model_format="gptq",
-            model_size_in_billions="1_8",
-            quantization="Int8",
-        )
-        is None
-    )
-    assert check_engine_by_spec_parameters(
-        model_engine="SGLang",
-        model_name=model_name,
-        model_format="pytorch",
-        model_size_in_billions="1_8",
-        quantization="none",
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="SGLang",
-            model_name=model_name,
-            model_format="pytorch",
-            model_size_in_billions="1_8",
-            quantization="4-bit",
-        )
-        is None
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="SGLang",
-            model_name=model_name,
-            model_format="ggmlv3",
-            model_size_in_billions="1_8",
-            quantization="q2_k",
-        )
-        is None
-    )
-
-
-def test_query_engine_general():
-    from ..ggml.chatglm import ChatglmCppChatModel
-    from ..ggml.llamacpp import LlamaCppChatModel
-    from ..llm_family import (
-        LLM_ENGINES,
-        check_engine_by_spec_parameters,
-        get_user_defined_llm_families,
-        register_llm,
-        unregister_llm,
-    )
-
-    model_name = "qwen1.5-chat"
-    assert model_name in LLM_ENGINES
-
-    assert (
-        "PyTorch" in LLM_ENGINES[model_name]
-        and len(LLM_ENGINES[model_name]["PyTorch"]) == 28
-    )
-    assert (
-        "llama-cpp-python" in LLM_ENGINES[model_name]
-        and len(LLM_ENGINES[model_name]["llama-cpp-python"]) == 7
-    )
-
-    assert check_engine_by_spec_parameters(
-        model_engine="PyTorch",
-        model_name=model_name,
-        model_format="gptq",
-        model_size_in_billions="1_8",
-        quantization="Int4",
-    )
-    assert check_engine_by_spec_parameters(
-        model_engine="PyTorch",
-        model_name=model_name,
-        model_format="gptq",
-        model_size_in_billions="1_8",
-        quantization="Int8",
-    )
-    assert check_engine_by_spec_parameters(
-        model_engine="PyTorch",
-        model_name=model_name,
-        model_format="pytorch",
-        model_size_in_billions="1_8",
-        quantization="none",
-    )
-    assert check_engine_by_spec_parameters(
-        model_engine="PyTorch",
-        model_name=model_name,
-        model_format="pytorch",
-        model_size_in_billions="1_8",
-        quantization="4-bit",
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="llama-cpp-python",
-            model_name=model_name,
-            model_format="ggufv2",
-            model_size_in_billions="1_8",
-            quantization="q2_k",
-        )
-        is LlamaCppChatModel
-    )
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="llama-cpp-python",
-            model_name=model_name,
-            model_format="ggmlv3",
-            model_size_in_billions="1_8",
-            quantization="q2_k",
-        )
-        is None
-    )
-
-    assert (
-        check_engine_by_spec_parameters(
-            model_engine="llama-cpp-python",
-            model_name="chatglm",
-            model_format="ggmlv3",
-            model_size_in_billions=6,
-            quantization="q4_0",
-        )
-        is ChatglmCppChatModel
-    )
-
-    spec = GgmlLLMSpecV1(
-        model_format="ggmlv3",
-        model_size_in_billions=3,
-        model_id="TheBloke/orca_mini_3B-GGML",
-        quantizations=[""],
-        model_file_name_template="README.md",
-    )
-    family = LLMFamilyV1(
-        version=1,
-        context_length=2048,
-        model_type="LLM",
-        model_name="custom_model",
-        model_lang=["en"],
-        model_ability=["embed", "chat"],
-        model_specs=[spec],
-        prompt_style=None,
-    )
-
-    register_llm(family, False)
-
-    assert family in get_user_defined_llm_families()
-    assert (
-        "custom_model" in LLM_ENGINES
-        and "llama-cpp-python" in LLM_ENGINES["custom_model"]
-    )
-    assert check_engine_by_spec_parameters(
-        model_engine="llama-cpp-python",
-        model_name="custom_model",
-        model_format="ggmlv3",
-        model_size_in_billions=3,
-        quantization="",
-    )
-
-    unregister_llm(family.model_name)
-    assert family not in get_user_defined_llm_families()
-    assert "custom_model" not in LLM_ENGINES