Revert "REF: support query for engine feature" #1329

Merged · 1 commit · Apr 19, 2024

23 changes: 0 additions & 23 deletions xinference/api/restful_api.py
@@ -275,16 +275,6 @@ def serve(self, logging_conf: Optional[dict] = None):
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
-        self._router.add_api_route(
-            "/v1/engines/{model_name}",
-            self.query_engines_by_model_name,
-            methods=["GET"],
-            dependencies=(
-                [Security(self._auth_service, scopes=["models:list"])]
-                if self.is_authenticated()
-                else None
-            ),
-        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",
@@ -1428,19 +1418,6 @@ async def stream_results():
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
-    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
-        try:
-            content = await (
-                await self._get_supervisor_ref()
-            ).query_engines_by_model_name(model_name)
-            return JSONResponse(content=content)
-        except ValueError as re:
-            logger.error(re, exc_info=True)
-            raise HTTPException(status_code=400, detail=str(re))
-        except Exception as e:
-            logger.error(e, exc_info=True)
-            raise HTTPException(status_code=500, detail=str(e))
-
     async def register_model(self, model_type: str, request: Request) -> JSONResponse:
         body = RegisterModelRequest.parse_obj(await request.json())
         model = body.model
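For context, the route removed above exposed the engine-query feature over HTTP. A minimal sketch of how a client would have called it before this revert; the host, port, and model name below are illustrative assumptions, not taken from this PR:

    # Hypothetical client call against the removed endpoint; assumes a local
    # Xinference server on 127.0.0.1:9997 and a registered model named
    # "demo-model" (both are assumptions for illustration).
    import requests

    resp = requests.get("http://127.0.0.1:9997/v1/engines/demo-model")
    resp.raise_for_status()
    # The payload maps engine name -> list of parameter combinations, e.g.:
    # {"vLLM": [{"model_name": "demo-model", "model_format": "pytorch", ...}]}
    print(resp.json())
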
18 changes: 0 additions & 18 deletions xinference/core/supervisor.py
@@ -591,24 +591,6 @@ def get_model_registration(self, model_type: str, model_name: str) -> Any:
         else:
             raise ValueError(f"Unsupported model type: {model_type}")
 
-    @log_async(logger=logger)
-    async def query_engines_by_model_name(self, model_name: str):
-        from copy import deepcopy
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
-        if model_name not in LLM_ENGINES:
-            raise ValueError(f"Model {model_name} not found")
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
-
     @log_async(logger=logger)
     async def register_model(self, model_type: str, model: str, persist: bool):
         if model_type in self._custom_register_type_to_cls:
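The supervisor method deleted here deep-copies the registry entry before deleting each "llm_class" key: that field holds a Python class object, which cannot be serialized into the JSON response, and mutating the shared LLM_ENGINES structure in place would corrupt it for later lookups. A standalone sketch of that filtering, with made-up registry contents:

    from copy import deepcopy

    # Hypothetical entry shaped like LLM_ENGINES[model_name].
    registry_entry = {
        "vLLM": [
            {
                "model_name": "demo-model",
                "model_format": "pytorch",
                "model_size_in_billions": 7,
                "quantizations": ["none"],
                "llm_class": object,  # stand-in for an LLM subclass
            }
        ]
    }

    engine_params = deepcopy(registry_entry)
    for engine, params in engine_params.items():
        for param in params:
            del param["llm_class"]  # drop the non-JSON-serializable class object

    print(engine_params)  # now safe to return as a JSON payload
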
88 changes: 0 additions & 88 deletions xinference/model/llm/__init__.py
@@ -30,14 +30,8 @@
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
-    LLAMA_CLASSES,
     LLM_CLASSES,
-    LLM_ENGINES,
     PEFT_SUPPORTED_CLASSES,
-    PYTORCH_CLASSES,
-    SGLANG_CLASSES,
-    SUPPORTED_ENGINES,
-    VLLM_CLASSES,
     CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
@@ -53,50 +47,6 @@
 )
 
 
-def generate_engine_config_by_model_family(model_family):
-    model_name = model_family.model_name
-    specs = model_family.model_specs
-    engines = {}  # structure for engine query
-    for spec in specs:
-        model_format = spec.model_format
-        model_size_in_billions = spec.model_size_in_billions
-        quantizations = spec.quantizations
-        for quantization in quantizations:
-            # traverse all supported engines to match the model's name, format, size in billions and quantization
-            for engine in SUPPORTED_ENGINES:
-                CLASSES = SUPPORTED_ENGINES[engine]
-                for cls in CLASSES:
-                    if cls.match(model_family, spec, quantization):
-                        engine_params = engines.get(engine, [])
-                        already_exists = False
-                        # if the name, format and size in billions already exist in the structure, add the new quantization
-                        for param in engine_params:
-                            if (
-                                model_name == param["model_name"]
-                                and model_format == param["model_format"]
-                                and model_size_in_billions
-                                == param["model_size_in_billions"]
-                                and quantization not in param["quantizations"]
-                            ):
-                                param["quantizations"].append(quantization)
-                                already_exists = True
-                                break
-                        # first successful match for these params: add a new entry to the structure
-                        if not already_exists:
-                            engine_params.append(
-                                {
-                                    "model_name": model_name,
-                                    "model_format": model_format,
-                                    "model_size_in_billions": model_size_in_billions,
-                                    "quantizations": [quantization],
-                                    "llm_class": cls,
-                                }
-                            )
-                        engines[engine] = engine_params
-                        break
-    LLM_ENGINES[model_name] = engines
-
-
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
@@ -126,17 +76,8 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLAMA_CLASSES.extend(
-        [
-            ChatglmCppChatModel,
-            LlamaCppChatModel,
-            LlamaCppModel,
-        ]
-    )
     LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
-    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
     LLM_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -155,24 +96,6 @@ def _install():
             PytorchModel,
         ]
     )
-    PYTORCH_CLASSES.extend(
-        [
-            BaichuanPytorchChatModel,
-            VicunaPytorchChatModel,
-            FalconPytorchChatModel,
-            ChatglmPytorchChatModel,
-            LlamaPytorchModel,
-            LlamaPytorchChatModel,
-            PytorchChatModel,
-            FalconPytorchModel,
-            Internlm2PytorchChatModel,
-            QwenVLChatModel,
-            OmniLMMModel,
-            YiVLChatModel,
-            DeepSeekVLChatModel,
-            PytorchModel,
-        ]
-    )
     PEFT_SUPPORTED_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -190,12 +113,6 @@ def _install():
         ]
     )
 
-    # support 4 engines for now
-    SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
-    SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
-    SUPPORTED_ENGINES["PyTorch"] = PYTORCH_CLASSES
-    SUPPORTED_ENGINES["llama-cpp-python"] = LLAMA_CLASSES
-
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
     )
@@ -246,11 +163,6 @@ def _install():
         if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
             LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))
 
-    # traverse all families and add engine parameters corresponding to the model name
-    for families in [BUILTIN_LLM_FAMILIES, BUILTIN_MODELSCOPE_LLM_FAMILIES]:
-        for family in families:
-            generate_engine_config_by_model_family(family)
-
     from ...constants import XINFERENCE_MODEL_DIR
 
     user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
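Taken together, the deletions in this file revert the engine registry that generate_engine_config_by_model_family built at import time: for every builtin family it matched each (format, size, quantization) combination against the classes registered per engine and recorded the results in LLM_ENGINES. A sketch of the shape that registry had; the model name and values below are invented, only the nesting follows the reverted code:

    # Illustrative shape of the reverted LLM_ENGINES registry.
    LLM_ENGINES = {
        "demo-model": {
            "vLLM": [
                {
                    "model_name": "demo-model",
                    "model_format": "pytorch",
                    "model_size_in_billions": 7,
                    "quantizations": ["none"],
                    "llm_class": object,  # the matched LLM subclass
                }
            ],
            # ...one list of parameter dicts per matching engine, e.g.
            # "PyTorch", "SGLang", "llama-cpp-python"
        }
    }
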
40 changes: 0 additions & 40 deletions xinference/model/llm/llm_family.py
@@ -227,25 +227,16 @@ def parse_raw(
 CustomLLMFamilyV1.update_forward_refs()
 
 
-LLAMA_CLASSES: List[Type[LLM]] = []
 LLM_CLASSES: List[Type[LLM]] = []
 PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
-SGLANG_CLASSES: List[Type[LLM]] = []
-PYTORCH_CLASSES: List[Type[LLM]] = []
-
 UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 UD_LLM_FAMILIES_LOCK = Lock()
 
-VLLM_CLASSES: List[Type[LLM]] = []
-
-LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
-SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
-
 LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
 
 
@@ -913,7 +904,6 @@ def _apply_format_to_model_id(spec: LLMSpecV1, q: str) -> LLMSpecV1:
 
 def register_llm(llm_family: LLMFamilyV1, persist: bool):
     from ..utils import is_valid_model_name
-    from . import generate_engine_config_by_model_family
 
     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")
@@ -926,7 +916,6 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
     )
 
     UD_LLM_FAMILIES.append(llm_family)
-    generate_engine_config_by_model_family(llm_family)
 
     if persist:
         # We only validate model URL when persist is True.
@@ -952,7 +941,6 @@ def unregister_llm(model_name: str, raise_error: bool = True):
             break
     if llm_family:
         UD_LLM_FAMILIES.remove(llm_family)
-        del LLM_ENGINES[model_name]
 
     persist_path = os.path.join(
         XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
@@ -1002,31 +990,3 @@ def match_llm_cls(
         if cls.match(family, llm_spec, quantization):
             return cls
     return None
-
-
-def check_engine_by_spec_parameters(
-    model_engine: str,
-    model_name: str,
-    model_format: str,
-    model_size_in_billions: Union[str, int],
-    quantization: str,
-) -> Optional[Type[LLM]]:
-    if model_name not in LLM_ENGINES:
-        logger.debug(f"Cannot find model {model_name}.")
-        return None
-    if model_engine not in LLM_ENGINES[model_name]:
-        logger.debug(f"Model {model_name} cannot be run on engine {model_engine}.")
-        return None
-    match_params = LLM_ENGINES[model_name][model_engine]
-    for param in match_params:
-        if (
-            model_name == param["model_name"]
-            and model_format == param["model_format"]
-            and model_size_in_billions == param["model_size_in_billions"]
-            and quantization in param["quantizations"]
-        ):
-            return param["llm_class"]
-    logger.debug(
-        f"Model {model_name} with format {model_format}, size {model_size_in_billions} and quantization {quantization} cannot be run on engine {model_engine}."
-    )
-    return None
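
Before the revert, check_engine_by_spec_parameters was the lookup that resolved a concrete LLM implementation for a launch request. A hedged usage sketch; the argument values are hypothetical:

    # Pre-revert import path for the removed helper.
    from xinference.model.llm.llm_family import check_engine_by_spec_parameters

    # Returns the registered LLM subclass when the (engine, name, format,
    # size, quantization) tuple matches a registry entry, otherwise None.
    cls = check_engine_by_spec_parameters(
        model_engine="vLLM",
        model_name="demo-model",
        model_format="pytorch",
        model_size_in_billions=7,
        quantization="none",
    )
    if cls is None:
        raise ValueError("demo-model cannot run on vLLM with these parameters")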