From 77d49543d22ab2caa61eeab2d0ec08d8ee05f0b9 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 11:18:11 +0800 Subject: [PATCH 01/61] fix rebase --- xinference/model/llm/__init__.py | 2 + xinference/model/llm/ggml/ctransformer.py | 184 ++++++++++++++++++ .../model/llm/ggml/ctransformers_util.py | 143 ++++++++++++++ xinference/model/llm/llm_family.json | 29 ++- 4 files changed, 357 insertions(+), 1 deletion(-) create mode 100644 xinference/model/llm/ggml/ctransformer.py create mode 100644 xinference/model/llm/ggml/ctransformers_util.py diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 89d9c1a0b4..977bd81f31 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -35,6 +35,7 @@ def _install(): from .ggml.chatglm import ChatglmCppChatModel + from .ggml.ctransformer import CtransformerModel from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel from .pytorch.baichuan import BaichuanPytorchChatModel from .pytorch.chatglm import ChatglmPytorchChatModel @@ -54,6 +55,7 @@ def _install(): FalconPytorchModel, FalconPytorchChatModel, ChatglmPytorchChatModel, + CtransformerModel, ] ) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py new file mode 100644 index 0000000000..8191a57da7 --- /dev/null +++ b/xinference/model/llm/ggml/ctransformer.py @@ -0,0 +1,184 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from typing import Iterator, Optional, Sequence, TypedDict, Union + +from ctransformers import AutoConfig + +from xinference.model.llm.ggml.ctransformers_util import generate_stream +from xinference.types import Completion, CompletionChunk + +from ..core import LLM +from ..llm_family import LLMFamilyV1, LLMSpecV1 +from .llamacpp import SIZE_TO_GPU_LAYERS + +logger = logging.getLogger(__name__) + + +# class AutoConfig(TypedDict, total=False): +# top_k: int +# top_p: float +# temperature: float +# repetition_penalty: float +# last_n_tokens: float +# seed: int +# max_new_tokens: int +# stop: List[str] +# stream: bool +# reset: bool +# batch_size: int +# threads: int +# context_length: int +# gpu_layers: int + + +class CtransformerGenerateConfig(TypedDict, total=False): + max_new_tokens: Optional[int] + top_k: Optional[int] + top_p: Optional[float] + temperature: Optional[float] + repetition_penalty: Optional[float] + last_n_tokens: Optional[int] + seed: Optional[int] + batch_size: Optional[int] + threads: Optional[int] + stop: Optional[Sequence[str]] + stream: Optional[bool] + reset: Optional[bool] + + +class CtransformerModel(LLM): + def __init__( + self, + model_uid: str, + model_family: "LLMFamilyV1", + model_spec: "LLMSpecV1", + quantization: str, + model_path: str, + ctransformerModelConfig: Optional[AutoConfig] = None, + ): + super().__init__(model_uid, model_family, model_spec, quantization, model_path) + + closest_size = min( + SIZE_TO_GPU_LAYERS.keys(), + key=lambda x: abs(x - model_spec.model_size_in_billions), + ) + self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size] + self._ctransformer_model_config: AutoConfig = self._sanitize_model_config( + model_path, ctransformerModelConfig + ) + self._llm = None + + def _sanitize_model_config( + self, model_path, ctransformerModelConfig: Optional[AutoConfig] + ) -> AutoConfig: + if ctransformerModelConfig is None: + ctransformerModelConfig = AutoConfig.from_pretrained( + model_path, + local_files_only=False, + ) + + return ctransformerModelConfig + + def _sanitize_generate_config( + self, + ctransformerGenerateConfig: Optional[CtransformerGenerateConfig], + ) -> CtransformerGenerateConfig: + if ctransformerGenerateConfig is None: + ctransformerGenerateConfig = CtransformerGenerateConfig() + ctransformerGenerateConfig.setdefault("top_k", 40) + ctransformerGenerateConfig.setdefault("top_p", 0.95) + ctransformerGenerateConfig.setdefault("temperature", 0.8) + ctransformerGenerateConfig.setdefault("repetition_penalty", 1.1) + ctransformerGenerateConfig.setdefault("last_n_tokens", 64) + ctransformerGenerateConfig.setdefault("seed", -1) + ctransformerGenerateConfig.setdefault("batch_size", 8) + ctransformerGenerateConfig.setdefault("threads", -1) + ctransformerGenerateConfig.setdefault("stop", None) + ctransformerGenerateConfig.setdefault("stream", None) + ctransformerGenerateConfig.setdefault("reset", True) + + return ctransformerGenerateConfig + + def load(self): + try: + from ctransformers import AutoModelForCausalLM + except ImportError: + error_message = "Failed to import module 'ctransformers'" + if self._is_darwin_and_apple_silicon(): + system = "Metal" + else: + system = "CUDA" + + installation_guide = [ + f"Please make sure 'ctransformers' is installed and {system} accelerator is provided.", + f"You can install it by checking out the repository for command for {system} platform:" + f"https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + + self._llm = 
AutoModelForCausalLM.from_pretrained( + model_path_or_repo_id=self._model_path, + model_type=self._model_type, + model_file=self._model_file, + config=self._ctransformer_model_config, + ) + + @classmethod + def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: + if llm_spec.model_format != "ggmlv3": + return False + if llm_spec.model_id not in ["TheBloke/starcoder-GGML"]: + return False + if "chatglm" in llm_family.model_name: + return False + if "generate" not in llm_family.model_ability: + return False + return True + + def generate( + self, prompt: str, generate_config: CtransformerGenerateConfig + ) -> Union[Completion, Iterator[CompletionChunk]]: + def generator_wrapper( + _prompt: str, + _generate_config: CtransformerGenerateConfig, + ) -> Iterator[CompletionChunk]: + assert self._llm is not None + for _completion_chunk, _ in generate_stream( + model=self._llm, prompt=_prompt, **_generate_config + ): + yield _completion_chunk + + generate_config = self._sanitize_generate_config(generate_config) + + stream_or_not = generate_config.get("stream", False) + if stream_or_not: + return generator_wrapper(_prompt=prompt, _generate_config=generate_config) + else: + for completion_chunk, completion_usage in generate_stream( + self._model, prompt=prompt, **generate_config + ): + pass + + completion = Completion( + id=completion_chunk["id"], + object=completion_chunk["object"], + created=completion_chunk["created"], + model=completion_chunk["model"], + choices=completion_chunk["choices"], + usage=completion_usage, + ) + return completion diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py new file mode 100644 index 0000000000..6b510348fe --- /dev/null +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -0,0 +1,143 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import re +import time +import uuid +from typing import Iterator, Optional, Sequence, Tuple + +from ctransformers.utils import utf8_split_incomplete + +from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage + + +def _get(*values): + for value in values: + if value is not None: + return value + + +def generate_stream( + model, + prompt: str, + *, + max_new_tokens: Optional[int] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + temperature: Optional[float] = None, + repetition_penalty: Optional[float] = None, + last_n_tokens: Optional[int] = None, + seed: Optional[int] = None, + batch_size: Optional[int] = None, + stream: Optional[bool] = True, + threads: Optional[int] = None, + stop: Optional[Sequence[str]] = None, + reset: Optional[bool] = None, +) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]: + max_new_tokens = _get(max_new_tokens) + stop = _get(stop) or [] + if isinstance(stop, str): + stop = [stop] + + tokens = model.tokenize(prompt) + + stop_regex = re.compile("|".join(map(re.escape, stop))) + count = 0 + text = "" + incomplete = b"" + + # parameters needed for Xinference. + finish_reason = None + + for token in model.generate( + tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + last_n_tokens=last_n_tokens, + seed=seed, + batch_size=batch_size, + threads=threads, + reset=reset, + ): + # Handle incomplete UTF-8 multi-byte characters. + incomplete += model.detokenize([token], decode=False) + complete, incomplete = utf8_split_incomplete(incomplete) + output = complete.decode(errors="ignore") + text += output + + # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706 + # Check if one of the stop sequences is part of the text. + # Note that the stop sequence may not always be at the end of text. + if stop: + match = stop_regex.search(text) + if match: + text = text[: match.start()] + finish_reason = "stop" + break + + # Avoid sending the longest suffix of text which is also a prefix + # of a stop sequence, as it can form a stop sequence with the text + # generated later. 
+ longest = 0 + for s in stop: + for i in range(len(s), 0, -1): + if text.endswith(s[:i]): + longest = max(i, longest) + break + + end = len(text) - longest + if end > 0: + output = text[:end] + completion_choice = CompletionChoice( + text=output, index=0, logprobs=None, finish_reason=None + ) + completion_chunk = CompletionChunk( + id=str(uuid.uuid1()), + object="text_completion", + created=int(time.time()), + model=model, + choices=[completion_choice], + ) + completion_usage = CompletionUsage( + prompt_tokens=len(tokens), + completion_tokens=count + 1, + total_tokens=count + 1 + len(tokens), + ) + + yield completion_chunk, completion_usage + text = text[end:] + + count += 1 + if max_new_tokens is not None and count >= max_new_tokens: + finish_reason = "length" + break + + completion_choice = CompletionChoice( + text=text, index=0, logprobs=None, finish_reason=finish_reason + ) + completion_chunk = CompletionChunk( + id=str(uuid.uuid1()), + object="text_completion", + created=int(time.time()), + model=model, + choices=[completion_choice], + ) + completion_usage = CompletionUsage( + prompt_tokens=len(tokens), + completion_tokens=count, + total_tokens=count + len(tokens), + ) + + yield completion_chunk, completion_usage diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index ae481f2245..cdb2845221 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -744,7 +744,8 @@ "version": 1, "model_name": "qwen-chat", "model_lang": [ - "en", "zh" + "en", + "zh" ], "model_ability": [ "embed", @@ -774,5 +775,31 @@ 151643 ] } + }, + { + "version": 1, + "model_name": "starcoder", + "model_lang": [ + "en" + ], + "model_ability":[ + "generate" + ], + "model_specs": [ + { + "model_format": "ggmlv3", + "model_size_in_billions": 16, + "quantizations": [ + "q4_0", + "q4_1", + "q5_0", + "q5_1", + "q8_0" + ], + "model_id": "TheBloke/starcoder-GGML", + "model_file_name_template": "starcoder.ggmlv3.{quantization}.bin" + } + ], + "prompt_style": null } ] From 382225291673db4813556f75e62038c93a52ee14 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 11:31:15 +0800 Subject: [PATCH 02/61] small edit --- xinference/model/llm/ggml/ctransformer.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 8191a57da7..f6341cbb46 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import os from typing import Iterator, Optional, Sequence, TypedDict, Union from ctransformers import AutoConfig @@ -130,10 +131,19 @@ def load(self): raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + # handle legacy cache. 
+ model_path = os.path.join( + self.model_path, + self.model_spec.model_file_name_template.format( + quantization=self.quantization + ), + ) + legacy_model_file_path = os.path.join(self.model_path, "model.bin") + if os.path.exists(legacy_model_file_path): + model_path = legacy_model_file_path + self._llm = AutoModelForCausalLM.from_pretrained( - model_path_or_repo_id=self._model_path, - model_type=self._model_type, - model_file=self._model_file, + model_path_or_repo_id=model_path, config=self._ctransformer_model_config, ) @@ -143,8 +153,6 @@ def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: return False if llm_spec.model_id not in ["TheBloke/starcoder-GGML"]: return False - if "chatglm" in llm_family.model_name: - return False if "generate" not in llm_family.model_ability: return False return True From 09d9e8e5bc1982070b60f52f2a2d67bc8275ad12 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 12:05:48 +0800 Subject: [PATCH 03/61] fix lint --- xinference/model/llm/ggml/ctransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index f6341cbb46..adeaab626a 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -177,7 +177,7 @@ def generator_wrapper( return generator_wrapper(_prompt=prompt, _generate_config=generate_config) else: for completion_chunk, completion_usage in generate_stream( - self._model, prompt=prompt, **generate_config + self._llm, prompt=prompt, **generate_config ): pass From f218a7dabdb92e8340263cfa16ff2b03bf3ca98f Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 12:15:47 +0800 Subject: [PATCH 04/61] fix model family match --- xinference/model/llm/ggml/ctransformer.py | 2 ++ xinference/model/llm/ggml/llamacpp.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index adeaab626a..31a9cc768e 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -151,6 +151,8 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False + if "starcoder" not in llm_spec.model_name: + return False if llm_spec.model_id not in ["TheBloke/starcoder-GGML"]: return False if "generate" not in llm_family.model_ability: diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py index bea3e4744e..7af6918591 100644 --- a/xinference/model/llm/ggml/llamacpp.py +++ b/xinference/model/llm/ggml/llamacpp.py @@ -187,7 +187,7 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "chatglm" in llm_family.model_name: + if "chatglm" in llm_family.model_name or "starcoder" in llm_family.model_name: return False if "generate" not in llm_family.model_ability: return False From 99b61172500b2be777bf381dc85e4a0298f61144 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 12:18:31 +0800 Subject: [PATCH 05/61] fix model family match --- xinference/model/llm/ggml/ctransformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 31a9cc768e..08065fa6ab 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -153,8 +153,6 @@ def 
match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: return False if "starcoder" not in llm_spec.model_name: return False - if llm_spec.model_id not in ["TheBloke/starcoder-GGML"]: - return False if "generate" not in llm_family.model_ability: return False return True From c4cf3b92854e6d864e6848f2f157156d3369abef Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 12:23:43 +0800 Subject: [PATCH 06/61] small edit on ctransformer.py --- xinference/model/llm/ggml/ctransformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 08065fa6ab..3fffa3f2b4 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -176,6 +176,7 @@ def generator_wrapper( if stream_or_not: return generator_wrapper(_prompt=prompt, _generate_config=generate_config) else: + assert self._llm is not None for completion_chunk, completion_usage in generate_stream( self._llm, prompt=prompt, **generate_config ): From 721a540584ee4b018970665a9d4d890aef23871d Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 13:26:24 +0800 Subject: [PATCH 07/61] small edit on ctransformer.py --- xinference/model/llm/ggml/ctransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 3fffa3f2b4..81200f292d 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -151,7 +151,7 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "starcoder" not in llm_spec.model_name: + if "starcoder" not in llm_family.model_name: return False if "generate" not in llm_family.model_ability: return False From 87333ef0cb8c8ee565958d844e82ba889c8073b7 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 13:49:47 +0800 Subject: [PATCH 08/61] add model-type list --- xinference/model/llm/ggml/ctransformer.py | 30 ++++++++++++++++++++++- xinference/model/llm/llm_family.json | 2 +- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 81200f292d..f3447569f2 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -44,6 +44,23 @@ # context_length: int # gpu_layers: int +# all supported models for Ctransformers with their model type. 
+model_type_for_ctransformer = { + "GPT-2": "gpt2", + "GPT-J": "gptj", + "GPT4All-J": "gptj", + "GPT-NeoX": "gpt_neox", + "StableLM": "gpt_neox", + "LLaMA": "llama", + "LLaMA-2": "llama", + "MPT": "mpt", + "Dolly-V2": "dolly-v2", + "Replit": "replit", + "StarCoder": "starcoder", + "StarChat": "starcoder", + "Falcon": "falcon", +} + class CtransformerGenerateConfig(TypedDict, total=False): max_new_tokens: Optional[int] @@ -72,6 +89,7 @@ def __init__( ): super().__init__(model_uid, model_family, model_spec, quantization, model_path) + self._model_type = None closest_size = min( SIZE_TO_GPU_LAYERS.keys(), key=lambda x: abs(x - model_spec.model_size_in_billions), @@ -80,6 +98,7 @@ def __init__( self._ctransformer_model_config: AutoConfig = self._sanitize_model_config( model_path, ctransformerModelConfig ) + self._model_family = model_family self._llm = None def _sanitize_model_config( @@ -142,8 +161,10 @@ def load(self): if os.path.exists(legacy_model_file_path): model_path = legacy_model_file_path + self._model_type = self._determine_model_type() self._llm = AutoModelForCausalLM.from_pretrained( model_path_or_repo_id=model_path, + model_type=self._model_type, config=self._ctransformer_model_config, ) @@ -151,12 +172,19 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "starcoder" not in llm_family.model_name: + if "StarCoder" not in llm_family.model_name: return False if "generate" not in llm_family.model_ability: return False return True + def _determine_model_type(self): + if self._model_family.model_name not in model_type_for_ctransformer: + raise ValueError( + "The current model is not supported, check your model name. " + ) + return model_type_for_ctransformer[self._model_family.model_name] + def generate( self, prompt: str, generate_config: CtransformerGenerateConfig ) -> Union[Completion, Iterator[CompletionChunk]]: diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index cdb2845221..2b5f47020d 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -778,7 +778,7 @@ }, { "version": 1, - "model_name": "starcoder", + "model_name": "StarCoder", "model_lang": [ "en" ], From 115db12a73628934ae19013f6a23cc360f95ab5c Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 13:54:01 +0800 Subject: [PATCH 09/61] edit llama cpp --- xinference/model/llm/ggml/llamacpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py index 7af6918591..3506fb31ea 100644 --- a/xinference/model/llm/ggml/llamacpp.py +++ b/xinference/model/llm/ggml/llamacpp.py @@ -258,7 +258,7 @@ def __init__( def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "chatglm" in llm_family.model_name: + if "chatglm" in llm_family.model_name or "StarCoder" in llm_family.model_name: return False if "chat" not in llm_family.model_ability: return False From e291cb19aee77187b24daf83dea6235580d2105c Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 13:58:25 +0800 Subject: [PATCH 10/61] edit llama cpp --- xinference/model/llm/ggml/llamacpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py index 3506fb31ea..a55c9d717b 100644 --- a/xinference/model/llm/ggml/llamacpp.py +++ 
b/xinference/model/llm/ggml/llamacpp.py @@ -187,7 +187,7 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "chatglm" in llm_family.model_name or "starcoder" in llm_family.model_name: + if "chatglm" in llm_family.model_name or "StarCoder" in llm_family.model_name: return False if "generate" not in llm_family.model_ability: return False From 9b33537766388fc01682dc909a12b9af70b409a8 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 14:24:07 +0800 Subject: [PATCH 11/61] edit llama cpp --- xinference/model/llm/ggml/ctransformers_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 6b510348fe..40647976f3 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -43,6 +43,7 @@ def generate_stream( threads: Optional[int] = None, stop: Optional[Sequence[str]] = None, reset: Optional[bool] = None, + **kwargs ) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]: max_new_tokens = _get(max_new_tokens) stop = _get(stop) or [] From 2b1159fad88e256986c9d75082976e75d4fccca1 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 14:31:44 +0800 Subject: [PATCH 12/61] edit ctransformer cpp --- xinference/model/llm/ggml/ctransformer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index f3447569f2..7835941bcd 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -200,6 +200,10 @@ def generator_wrapper( generate_config = self._sanitize_generate_config(generate_config) + logger.error( + "Enter generate, prompt: %s, generate config: %s", prompt, generate_config + ) + stream_or_not = generate_config.get("stream", False) if stream_or_not: return generator_wrapper(_prompt=prompt, _generate_config=generate_config) From 9346de1a8e54e963d1fafc95bfa020ca44d3a882 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 15:00:59 +0800 Subject: [PATCH 13/61] edit ctransformer cpp --- xinference/model/llm/ggml/ctransformer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 7835941bcd..2868d502c3 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -222,4 +222,7 @@ def generator_wrapper( choices=completion_chunk["choices"], usage=completion_usage, ) + + logger.error("Generated", completion, generate_config) + return completion From 2edc68735c2a4d9286cbf6a5d1fd7e148d1f84d1 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 15:08:29 +0800 Subject: [PATCH 14/61] edit ctransformer cpp --- xinference/model/llm/ggml/ctransformer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 2868d502c3..730a870940 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -223,6 +223,10 @@ def generator_wrapper( usage=completion_usage, ) - logger.error("Generated", completion, generate_config) + logger.error( + "Generated, completion: %s, generate config: %s", + completion, + generate_config, + ) return completion From 8189c23f81dd356bb913274096af4ecabd606f8b Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 
4 Aug 2023 15:17:46 +0800 Subject: [PATCH 15/61] edit ctransformer cpp --- xinference/model/llm/ggml/ctransformer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 730a870940..7f45cfcba2 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -99,6 +99,7 @@ def __init__( model_path, ctransformerModelConfig ) self._model_family = model_family + self._model_uid = model_uid self._llm = None def _sanitize_model_config( @@ -192,9 +193,9 @@ def generator_wrapper( _prompt: str, _generate_config: CtransformerGenerateConfig, ) -> Iterator[CompletionChunk]: - assert self._llm is not None + assert self._model_uid is not None for _completion_chunk, _ in generate_stream( - model=self._llm, prompt=_prompt, **_generate_config + model=self._model_uid, prompt=_prompt, **_generate_config ): yield _completion_chunk @@ -208,12 +209,16 @@ def generator_wrapper( if stream_or_not: return generator_wrapper(_prompt=prompt, _generate_config=generate_config) else: - assert self._llm is not None + assert self.model_uid is not None + completion_chunk = None + completion_usage = None for completion_chunk, completion_usage in generate_stream( - self._llm, prompt=prompt, **generate_config + self.model_uid, prompt=prompt, **generate_config ): pass + assert completion_chunk is not None + assert completion_usage is not None completion = Completion( id=completion_chunk["id"], object=completion_chunk["object"], From 153ae6c278fb733a516940576dc0e66a1f079e3c Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 15:23:03 +0800 Subject: [PATCH 16/61] edit ctransformer cpp --- xinference/model/llm/ggml/ctransformer.py | 10 ++++++++-- xinference/model/llm/ggml/ctransformers_util.py | 7 ++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 7f45cfcba2..a72fa48d99 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -195,7 +195,10 @@ def generator_wrapper( ) -> Iterator[CompletionChunk]: assert self._model_uid is not None for _completion_chunk, _ in generate_stream( - model=self._model_uid, prompt=_prompt, **_generate_config + model=self._model_uid, + model_ref=self._llm, + prompt=_prompt, + **_generate_config, ): yield _completion_chunk @@ -213,7 +216,10 @@ def generator_wrapper( completion_chunk = None completion_usage = None for completion_chunk, completion_usage in generate_stream( - self.model_uid, prompt=prompt, **generate_config + model=self.model_uid, + model_ref=self._llm, + prompt=prompt, + **generate_config, ): pass diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 40647976f3..16ab0da9d3 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -29,6 +29,7 @@ def _get(*values): def generate_stream( model, + model_ref, prompt: str, *, max_new_tokens: Optional[int] = None, @@ -50,7 +51,7 @@ def generate_stream( if isinstance(stop, str): stop = [stop] - tokens = model.tokenize(prompt) + tokens = model_ref.tokenize(prompt) stop_regex = re.compile("|".join(map(re.escape, stop))) count = 0 @@ -60,7 +61,7 @@ def generate_stream( # parameters needed for Xinference. 
finish_reason = None - for token in model.generate( + for token in model_ref.generate( tokens, top_k=top_k, top_p=top_p, @@ -73,7 +74,7 @@ def generate_stream( reset=reset, ): # Handle incomplete UTF-8 multi-byte characters. - incomplete += model.detokenize([token], decode=False) + incomplete += model_ref.detokenize([token], decode=False) complete, incomplete = utf8_split_incomplete(incomplete) output = complete.decode(errors="ignore") text += output From ff93f987622eae71fa79557ecaf7db0f65706ce9 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 15:29:53 +0800 Subject: [PATCH 17/61] edit ctransformer cpp --- xinference/model/llm/ggml/ctransformers_util.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 16ab0da9d3..b774364ddb 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging import re import time import uuid @@ -20,6 +21,8 @@ from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage +logger = logging.getLogger(__name__) + def _get(*values): for value in values: @@ -79,6 +82,8 @@ def generate_stream( output = complete.decode(errors="ignore") text += output + logger.error("Output, completion: %s", text) + # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706 # Check if one of the stop sequences is part of the text. # Note that the stop sequence may not always be at the end of text. @@ -126,6 +131,7 @@ def generate_stream( finish_reason = "length" break + logger.error("Output, completion: %s", text) completion_choice = CompletionChoice( text=text, index=0, logprobs=None, finish_reason=finish_reason ) From 86335c4f460c639252296ff1502f82c7216d4cc2 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 15:33:26 +0800 Subject: [PATCH 18/61] edit ctransformer util --- .../model/llm/ggml/ctransformers_util.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index b774364ddb..4795c06e4d 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -43,7 +43,7 @@ def generate_stream( last_n_tokens: Optional[int] = None, seed: Optional[int] = None, batch_size: Optional[int] = None, - stream: Optional[bool] = True, + stream: Optional[bool] = False, threads: Optional[int] = None, stop: Optional[Sequence[str]] = None, reset: Optional[bool] = None, @@ -59,6 +59,7 @@ def generate_stream( stop_regex = re.compile("|".join(map(re.escape, stop))) count = 0 text = "" + total_text = "" incomplete = b"" # parameters needed for Xinference. 
@@ -81,6 +82,7 @@ def generate_stream( complete, incomplete = utf8_split_incomplete(incomplete) output = complete.decode(errors="ignore") text += output + total_text += output logger.error("Output, completion: %s", text) @@ -131,10 +133,17 @@ def generate_stream( finish_reason = "length" break - logger.error("Output, completion: %s", text) - completion_choice = CompletionChoice( - text=text, index=0, logprobs=None, finish_reason=finish_reason - ) + if stream is False: + completion_choice = CompletionChoice( + text=total_text, index=0, logprobs=None, finish_reason=finish_reason + ) + else: + completion_choice = CompletionChoice( + text=total_text, index=0, logprobs=None, finish_reason=finish_reason + ) + + logger.error("Output_final, completion: %s", text) + completion_chunk = CompletionChunk( id=str(uuid.uuid1()), object="text_completion", From ce6bb781888d8f1d76ef987f24e323805d65f3c1 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 16:01:42 +0800 Subject: [PATCH 19/61] edit ctransformer util --- xinference/model/llm/ggml/ctransformer.py | 17 ----------------- xinference/model/llm/ggml/ctransformers_util.py | 2 -- 2 files changed, 19 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index a72fa48d99..9d8d0c72ca 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -27,23 +27,6 @@ logger = logging.getLogger(__name__) - -# class AutoConfig(TypedDict, total=False): -# top_k: int -# top_p: float -# temperature: float -# repetition_penalty: float -# last_n_tokens: float -# seed: int -# max_new_tokens: int -# stop: List[str] -# stream: bool -# reset: bool -# batch_size: int -# threads: int -# context_length: int -# gpu_layers: int - # all supported models for Ctransformers with their model type. 
model_type_for_ctransformer = { "GPT-2": "gpt2", diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 4795c06e4d..69c09dea6a 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -142,8 +142,6 @@ def generate_stream( text=total_text, index=0, logprobs=None, finish_reason=finish_reason ) - logger.error("Output_final, completion: %s", text) - completion_chunk = CompletionChunk( id=str(uuid.uuid1()), object="text_completion", From 3db942b40d8133e9825fd4cad65a986ef68d5cbf Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 16:15:24 +0800 Subject: [PATCH 20/61] edit ctransformer util --- xinference/model/llm/ggml/ctransformers_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 69c09dea6a..a5ccd2ac1d 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -133,13 +133,14 @@ def generate_stream( finish_reason = "length" break + logger.error("Final, completion: %s", text) if stream is False: completion_choice = CompletionChoice( text=total_text, index=0, logprobs=None, finish_reason=finish_reason ) else: completion_choice = CompletionChoice( - text=total_text, index=0, logprobs=None, finish_reason=finish_reason + text=text, index=0, logprobs=None, finish_reason=finish_reason ) completion_chunk = CompletionChunk( From 72a77f309284245b90740c0de91c983d84d65189 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 16:20:57 +0800 Subject: [PATCH 21/61] edit ctransformer util --- xinference/model/llm/ggml/ctransformers_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index a5ccd2ac1d..fc5e83001e 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -156,4 +156,4 @@ def generate_stream( total_tokens=count + len(tokens), ) - yield completion_chunk, completion_usage + return completion_chunk, completion_usage From fef3a6996ec597a7288f24c8f9a152cf018d526d Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 16:42:54 +0800 Subject: [PATCH 22/61] Ctransformer Pipeline is clear and ready to serve --- setup.cfg | 1 + xinference/model/llm/ggml/ctransformer.py | 1 + 2 files changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 143cef1acb..e4108085e2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,6 +72,7 @@ all = tiktoken ggml = llama-cpp-python>=0.1.77 + ctransformers pytorch = transformers>=4.31.0 torch diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 9d8d0c72ca..cc45afd640 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -28,6 +28,7 @@ logger = logging.getLogger(__name__) # all supported models for Ctransformers with their model type. +# Please Strictly follows this name format when inputting new model to model_family. 
model_type_for_ctransformer = { "GPT-2": "gpt2", "GPT-J": "gptj", From 2345dae696fbbc4ff399bcdea3796ba709f254bc Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 16:08:14 +0800 Subject: [PATCH 23/61] fix rebase --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index e4108085e2..75d5902ff0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,6 +60,7 @@ dev = flake8>=3.8.0 black all = + ctransformers llama-cpp-python>=0.1.77 transformers>=4.31.0 torch From ae6e5db8fadcbffcde8dd0fdf0b0efb4eff7b6e7 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 17:11:47 +0800 Subject: [PATCH 24/61] fix setup issue2. --- xinference/model/llm/ggml/ctransformer.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index cc45afd640..944135cb1d 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -16,8 +16,6 @@ import os from typing import Iterator, Optional, Sequence, TypedDict, Union -from ctransformers import AutoConfig - from xinference.model.llm.ggml.ctransformers_util import generate_stream from xinference.types import Completion, CompletionChunk @@ -25,6 +23,18 @@ from ..llm_family import LLMFamilyV1, LLMSpecV1 from .llamacpp import SIZE_TO_GPU_LAYERS +try: + from ctransformers import AutoConfig +except ImportError: + error_message = "Failed to import module 'ctransformers'" + + installation_guide = [ + "Please make sure 'ctransformers' is installed. You can install it by checking out the repository: " + "https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + logger = logging.getLogger(__name__) # all supported models for Ctransformers with their model type. From 4db685bb18b0a7b41a02b66e52168980aa91a7ca Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 17:12:28 +0800 Subject: [PATCH 25/61] fix setup issue3 --- xinference/model/llm/ggml/ctransformers_util.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index fc5e83001e..c12ea8753e 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -17,7 +17,17 @@ import uuid from typing import Iterator, Optional, Sequence, Tuple -from ctransformers.utils import utf8_split_incomplete +try: + from ctransformers.utils import utf8_split_incomplete +except ImportError: + error_message = "Failed to import module 'ctransformers'" + + installation_guide = [ + "Please make sure 'ctransformers' is installed. 
You can install it by checking out the repository: " + "https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage @@ -47,7 +57,7 @@ def generate_stream( threads: Optional[int] = None, stop: Optional[Sequence[str]] = None, reset: Optional[bool] = None, - **kwargs + **kwargs, ) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]: max_new_tokens = _get(max_new_tokens) stop = _get(stop) or [] From 5ae9ec78ba5d8f9403b28526a2f2708463ce99bd Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 17:28:02 +0800 Subject: [PATCH 26/61] fix import CI checking --- xinference/model/llm/ggml/ctransformer.py | 30 ++++++++++++------- .../model/llm/ggml/ctransformers_util.py | 26 ++++++++-------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 944135cb1d..1192878cf5 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -14,7 +14,7 @@ import logging import os -from typing import Iterator, Optional, Sequence, TypedDict, Union +from typing import TYPE_CHECKING, Iterator, Optional, Sequence, TypedDict, Union from xinference.model.llm.ggml.ctransformers_util import generate_stream from xinference.types import Completion, CompletionChunk @@ -23,17 +23,8 @@ from ..llm_family import LLMFamilyV1, LLMSpecV1 from .llamacpp import SIZE_TO_GPU_LAYERS -try: +if TYPE_CHECKING: from ctransformers import AutoConfig -except ImportError: - error_message = "Failed to import module 'ctransformers'" - - installation_guide = [ - "Please make sure 'ctransformers' is installed. You can install it by checking out the repository: " - "https://github.com/marella/ctransformers", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") logger = logging.getLogger(__name__) @@ -99,6 +90,23 @@ def __init__( def _sanitize_model_config( self, model_path, ctransformerModelConfig: Optional[AutoConfig] ) -> AutoConfig: + try: + from ctransformers import AutoConfig + except ImportError: + error_message = "Failed to import module 'ctransformers - AutoConfig'" + if self._is_darwin_and_apple_silicon(): + system = "Metal" + else: + system = "CUDA" + + installation_guide = [ + f"Please make sure 'ctransformers' is installed and {system} accelerator is provided.", + f"You can install it by checking out the repository for command for {system} platform:" + f"https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + if ctransformerModelConfig is None: ctransformerModelConfig = AutoConfig.from_pretrained( model_path, diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index c12ea8753e..35affb1575 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -17,18 +17,6 @@ import uuid from typing import Iterator, Optional, Sequence, Tuple -try: - from ctransformers.utils import utf8_split_incomplete -except ImportError: - error_message = "Failed to import module 'ctransformers'" - - installation_guide = [ - "Please make sure 'ctransformers' is installed. 
You can install it by checking out the repository: " - "https://github.com/marella/ctransformers", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage logger = logging.getLogger(__name__) @@ -88,6 +76,20 @@ def generate_stream( reset=reset, ): # Handle incomplete UTF-8 multi-byte characters. + try: + from ctransformers.utils import utf8_split_incomplete + except ImportError: + error_message = ( + "Failed to import module 'ctransformers - utf8_split_incomplete'" + ) + + installation_guide = [ + "Please make sure 'ctransformers' is installed. You can install it by checking out the repository: " + "https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + incomplete += model_ref.detokenize([token], decode=False) complete, incomplete = utf8_split_incomplete(incomplete) output = complete.decode(errors="ignore") From 6330d9536aa981996bf1110ed7e45e81ccbb973d Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 17:37:07 +0800 Subject: [PATCH 27/61] fix import CI checking 2 --- xinference/model/llm/ggml/ctransformer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 1192878cf5..7612408b88 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -63,6 +63,15 @@ class CtransformerGenerateConfig(TypedDict, total=False): class CtransformerModel(LLM): + try: + from ctransformers import AutoConfig + except ImportError: + error_message = "Failed to import module 'ctransformers - AutoConfig'" + installation_guide = [ + "Please make sure 'ctransformers' is installed, You can install it by checking out the repository for " + "command: https://github.com/marella/ctransformers", + ] + def __init__( self, model_uid: str, From fd6b70a49244c33d960c615c96ce3a921a78b2a6 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 17:43:13 +0800 Subject: [PATCH 28/61] fix import CI checking 3 --- xinference/model/llm/ggml/ctransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 7612408b88..30baefa40f 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -79,7 +79,7 @@ def __init__( model_spec: "LLMSpecV1", quantization: str, model_path: str, - ctransformerModelConfig: Optional[AutoConfig] = None, + ctransformerModelConfig, ): super().__init__(model_uid, model_family, model_spec, quantization, model_path) From 22ce5262fda84f7b83f509b18a60a5435b4ed651 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 17:54:23 +0800 Subject: [PATCH 29/61] fix import CI checking 4 --- xinference/model/llm/ggml/ctransformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 30baefa40f..1f91c835f4 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -79,7 +79,7 @@ def __init__( model_spec: "LLMSpecV1", quantization: str, model_path: str, - ctransformerModelConfig, + ctransformerModelConfig: Optional["AutoConfig"], ): super().__init__(model_uid, model_family, model_spec, quantization, model_path) @@ -97,7 +97,7 @@ def __init__( self._llm = None def _sanitize_model_config( - 
self, model_path, ctransformerModelConfig: Optional[AutoConfig] + self, model_path, ctransformerModelConfig: Optional["AutoConfig"] ) -> AutoConfig: try: from ctransformers import AutoConfig From e1711d5c44d3016bdb58d3abda80bf0a659b33ea Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 4 Aug 2023 18:02:37 +0800 Subject: [PATCH 30/61] fix import CI checking 5 --- xinference/model/llm/ggml/ctransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformer.py index 1f91c835f4..9c67713402 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformer.py @@ -98,7 +98,7 @@ def __init__( def _sanitize_model_config( self, model_path, ctransformerModelConfig: Optional["AutoConfig"] - ) -> AutoConfig: + ) -> "AutoConfig": try: from ctransformers import AutoConfig except ImportError: From ae04adb1fe33674eb7b322fcc5adc0579ddd776e Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Sun, 6 Aug 2023 18:20:15 +0800 Subject: [PATCH 31/61] add test to c-transformers --- .../llm/ggml/tests/test_ctransformers.py | 262 ++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 xinference/model/llm/ggml/tests/test_ctransformers.py diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py new file mode 100644 index 0000000000..b411c2d6aa --- /dev/null +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -0,0 +1,262 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import random +import re +import string +import time +from typing import Iterator + +import pytest +from ctransformers import AutoConfig, Config + +from xinference.model.llm import GgmlLLMSpecV1, LLMFamilyV1 +from xinference.model.llm.ggml.ctransformer import ( + CtransformerGenerateConfig, + CtransformerModel, +) +from xinference.types import ( + Completion, + CompletionChoice, + CompletionChunk, + CompletionUsage, +) + + +class MockPipeline: + def __init__(self) -> None: + pass + + +class MockCtransformersModel(CtransformerModel): + def load(self): + self._llm = MockPipeline() + + def generate_stream(self) -> Iterator[Completion]: + for i in range(5): + res = f"ctransformers_test_stream_{i}" + completion_choice = CompletionChoice( + text=res, index=0, logprobs=None, finish_reason="test_stream" + ) + completion_chunk = CompletionChunk( + id=str(f"test_{i}"), + object="text_completion", + created=int(time.time()), + model=self._model_uid, + choices=[completion_choice], + ) + completion_usage = CompletionUsage( + prompt_tokens=10, + completion_tokens=20, + total_tokens=30, + ) + completion = Completion( + id=completion_chunk["id"], + object=completion_chunk["object"], + created=completion_chunk["created"], + model=completion_chunk["model"], + choices=completion_chunk["choices"], + usage=completion_usage, + ) + yield completion + + def generate( + self, prompt: str, generate_config: CtransformerGenerateConfig + ) -> Completion: + completion_choice = CompletionChoice( + text="test_ctransformers_generate", + index=0, + logprobs=None, + finish_reason="test", + ) + completion_chunk = CompletionChunk( + id=str("test"), + object="text_completion", + created=int(time.time()), + model=self._model_uid, + choices=[completion_choice], + ) + completion_usage = CompletionUsage( + prompt_tokens=10, + completion_tokens=20, + total_tokens=30, + ) + completion = Completion( + id=completion_chunk["id"], + object=completion_chunk["object"], + created=completion_chunk["created"], + model=completion_chunk["model"], + choices=completion_chunk["choices"], + usage=completion_usage, + ) + return completion + + +mock_model_spec = GgmlLLMSpecV1( + model_format="ggmlv3", + model_size_in_billions=6, + quantizations=["q2_k", "q4_0"], + model_id="test_id", + model_file_name_template="TestModel.{quantization}.ggmlv3.bin", +) + +test_model_spec = """{ + "version":1, + "model_name":"TestModel", + "model_lang":[ + "en" + ], + "model_ability":[ + "embed", "generate" + ], + "model_specs":[ + { + "model_format":"ggmlv3", + "model_size_in_billions":6, + "quantizations": ["q2_k", "q4_0"], + "model_id":"test_id", + "model_file_name_template":"TestModel.{quantization}.ggmlv3.bin" + }, + { + "model_format":"pytorch", + "model_size_in_billions":3, + "quantizations": ["int8", "int4", "none"], + "model_id":"example/TestModel" + } + ], + "prompt_style": null +}""" + +mock_model_family = LLMFamilyV1.parse_raw(test_model_spec) + + +@pytest.fixture +def mock_AutoConfig_Pretrained(mocker): + # Create a mock of the Child.method() and set its return value + mock_from_pretrained = mocker.patch.object(AutoConfig, "from_pretrained") + config = Config() + auto_config = AutoConfig(config=config) + mock_from_pretrained.return_value = auto_config + return mock_from_pretrained + + +@pytest.mark.parametrize( + "model_spec, model_family", [(mock_model_spec, mock_model_family)] +) +def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained): + quantization = "q4_0" + uid = "".join(random.choice(string.digits) for i in range(15)) + 
path = "".join( + random.choice(string.ascii_letters + string.punctuation) for i in range(100) + ) + model = MockCtransformersModel( + model_uid=uid, + model_family=model_family, + model_spec=model_spec, + quantization=quantization, + model_path=path, + ctransformerModelConfig=None, + ) + + assert model.model_uid == uid + assert model.quantization == quantization + assert model.model_path == path + assert model._ctransformer_model_config is not None + assert isinstance(model._ctransformer_model_config, AutoConfig) + + assert isinstance(model.model_spec, GgmlLLMSpecV1) + assert isinstance(model.model_family, LLMFamilyV1) + assert isinstance(model.model_family.model_specs[0], GgmlLLMSpecV1) + + assert ( + model.model_family.model_specs[0].model_format == model.model_spec.model_format + ) + assert model.model_family.model_specs[0].model_format == model_spec.model_format + assert ( + model.model_family.model_specs[0].model_size_in_billions + == model.model_spec.model_size_in_billions + ) + assert ( + model.model_family.model_specs[0].model_size_in_billions + == model_spec.model_size_in_billions + ) + assert ( + model.model_family.model_specs[0].quantizations + == model.model_spec.quantizations + ) + assert model.model_family.model_specs[0].quantizations == model_spec.quantizations + assert model.model_family.model_specs[0].model_id == model.model_spec.model_id + assert model.model_family.model_specs[0].model_id == model_spec.model_id + assert ( + model.model_family.model_specs[0].model_file_name_template + == model.model_spec.model_file_name_template + ) + assert ( + model.model_family.model_specs[0].model_file_name_template + == model_spec.model_file_name_template + ) + assert ( + model.model_family.model_specs[0].model_local_path + == model.model_spec.model_local_path + ) + assert ( + model.model_family.model_specs[0].model_local_path + == model_spec.model_local_path + ) + + assert model._llm is None + + +@pytest.mark.parametrize( + "model_spec, model_family", [(mock_model_spec, mock_model_family)] +) +def test_model_generate(model_spec, model_family, mock_AutoConfig_Pretrained): + quantization = "q4_0" + uid = "".join(random.choice(string.digits) for i in range(100)) + path = "".join( + random.choice(string.ascii_letters + string.punctuation) for i in range(100) + ) + model = MockCtransformersModel( + model_uid=uid, + model_family=model_family, + model_spec=model_spec, + quantization=quantization, + model_path=path, + ctransformerModelConfig=None, + ) + + assert model._llm is None + + model.load() + assert isinstance(model._llm, MockPipeline) + + # generate with stream + pattern = r"[0-4]" + for completion in model.generate_stream(): + assert completion["id"].startswith("test_") + assert re.search(pattern, completion["id"]) + assert completion["choices"][0]["text"].startswith("ctransformers_test_stream_") + assert re.search(pattern, completion["choices"][0]["text"]) + assert completion["choices"][0]["finish_reason"] == "test_stream" + assert completion["usage"]["prompt_tokens"] == 10 + assert completion["usage"]["completion_tokens"] == 20 + assert completion["usage"]["total_tokens"] == 30 + + # generate without stream + responses = model.generate("def Helloworld():", generate_config={"stream": True}) + assert responses["object"] == "text_completion" + assert responses["choices"][0]["text"] == "test_ctransformers_generate" + assert responses["choices"][0]["finish_reason"] == "test" + assert responses["usage"]["prompt_tokens"] == 10 + assert responses["usage"]["completion_tokens"] == 20 + 
assert responses["usage"]["total_tokens"] == 30 From 7641b3078f174ac02d7df70f5ca9f8769d4b2aa6 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Sun, 6 Aug 2023 18:32:54 +0800 Subject: [PATCH 32/61] fix test typing issue from importing Ctransforemrs --- xinference/model/llm/ggml/tests/test_ctransformers.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index b411c2d6aa..eb601168ec 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -18,7 +18,6 @@ from typing import Iterator import pytest -from ctransformers import AutoConfig, Config from xinference.model.llm import GgmlLLMSpecV1, LLMFamilyV1 from xinference.model.llm.ggml.ctransformer import ( @@ -143,6 +142,10 @@ def generate( @pytest.fixture def mock_AutoConfig_Pretrained(mocker): # Create a mock of the Child.method() and set its return value + try: + from ctransformers import AutoConfig, Config + except ImportError: + raise ImportError("ctransformers AutoConfig or Config cannot been imported.") mock_from_pretrained = mocker.patch.object(AutoConfig, "from_pretrained") config = Config() auto_config = AutoConfig(config=config) @@ -168,6 +171,11 @@ def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained) ctransformerModelConfig=None, ) + try: + from ctransformers import AutoConfig + except ImportError: + raise ImportError("ctransformers AutoConfig or Config cannot been imported.") + assert model.model_uid == uid assert model.quantization == quantization assert model.model_path == path From 30575ed0d789227c2f624fde4178fb968120b4b2 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Sun, 6 Aug 2023 18:56:19 +0800 Subject: [PATCH 33/61] add pytest-mock to setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 75d5902ff0..6b81d1fb19 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,7 @@ dev = pytest-timeout>=1.2.0 pytest-forked>=1.0 pytest-asyncio>=0.14.0 + pytest-mock>=3.11.1 ipython>=6.5.0 sphinx>=3.0.0,<5.0.0 pydata-sphinx-theme>=0.3.0 From 4b508a0ee37b00e728d7406e9503b24c6c6b04d2 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Sun, 6 Aug 2023 19:24:29 +0800 Subject: [PATCH 34/61] fix ctransformers import issue1 --- .../llm/ggml/tests/test_ctransformers.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index eb601168ec..6309276064 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import importlib import random import re import string import time from typing import Iterator +from unittest.mock import Mock import pytest @@ -139,16 +141,23 @@ def generate( mock_model_family = LLMFamilyV1.parse_raw(test_model_spec) +class MockAutoConfig: + def __init__(self, config, *args, **kwargs): + self.config = config + + @pytest.fixture def mock_AutoConfig_Pretrained(mocker): # Create a mock of the Child.method() and set its return value - try: - from ctransformers import AutoConfig, Config - except ImportError: - raise ImportError("ctransformers AutoConfig or Config cannot been imported.") - mock_from_pretrained = mocker.patch.object(AutoConfig, "from_pretrained") - config = Config() - auto_config = AutoConfig(config=config) + ctransformers_module = importlib.import_module("ctransformers") + mock_from_pretrained = mocker.patch.object( + ctransformers_module.AutoConfig, # Target object to patch + "from_pretrained", # Attribute to patch + side_effect=MockAutoConfig, # Custom side_effect function + ) + + config = Mock() + auto_config = MockAutoConfig(config) mock_from_pretrained.return_value = auto_config return mock_from_pretrained @@ -171,16 +180,11 @@ def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained) ctransformerModelConfig=None, ) - try: - from ctransformers import AutoConfig - except ImportError: - raise ImportError("ctransformers AutoConfig or Config cannot been imported.") - assert model.model_uid == uid assert model.quantization == quantization assert model.model_path == path assert model._ctransformer_model_config is not None - assert isinstance(model._ctransformer_model_config, AutoConfig) + assert isinstance(model._ctransformer_model_config, MockAutoConfig) assert isinstance(model.model_spec, GgmlLLMSpecV1) assert isinstance(model.model_family, LLMFamilyV1) From 98669fd61544b8713e41dd1b890e32a592e10383 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Sun, 6 Aug 2023 19:25:25 +0800 Subject: [PATCH 35/61] fix ctransformers import by add ctransformers to dev --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 6b81d1fb19..ef5f0eff12 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,6 +53,7 @@ dev = pytest-forked>=1.0 pytest-asyncio>=0.14.0 pytest-mock>=3.11.1 + ctransformers ipython>=6.5.0 sphinx>=3.0.0,<5.0.0 pydata-sphinx-theme>=0.3.0 From 31c4054ca5aa0ab7a0b91654e9492aa58a4a3d78 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Sun, 6 Aug 2023 19:41:05 +0800 Subject: [PATCH 36/61] fix ctransformers test issue2 --- xinference/model/llm/ggml/tests/test_ctransformers.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 6309276064..60bc866d81 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -217,15 +217,6 @@ def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained) model.model_family.model_specs[0].model_file_name_template == model_spec.model_file_name_template ) - assert ( - model.model_family.model_specs[0].model_local_path - == model.model_spec.model_local_path - ) - assert ( - model.model_family.model_specs[0].model_local_path - == model_spec.model_local_path - ) - assert model._llm is None From 6472c84656b01b9f5e59eb6914fa624318981191 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Mon, 7 Aug 2023 10:46:13 +0800 Subject: [PATCH 37/61] fix ctransformers test issue2 --- 
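The fixture above resolves ctransformers lazily with importlib and patches AutoConfig.from_pretrained through pytest-mock, so the test module never hard-imports the package at collection time. The same pattern in a minimal, self-contained sketch; Greeter, FakeGreeter and the test function are invented names used only to illustrate mocker.patch.object with a side_effect callable:

    # illustrative sketch of the pytest-mock pattern; requires pytest and pytest-mock
    import pytest

    class Greeter:
        @classmethod
        def build(cls, name):
            raise RuntimeError("expensive real constructor we never want in tests")

    class FakeGreeter:
        def __init__(self, name, *args, **kwargs):
            self.name = name

    @pytest.fixture
    def patched_build(mocker):
        # replace Greeter.build; every call now constructs and returns a FakeGreeter
        return mocker.patch.object(Greeter, "build", side_effect=FakeGreeter)

    def test_build_is_patched(patched_build):
        obj = Greeter.build("hello")
        assert isinstance(obj, FakeGreeter)
        assert patched_build.call_count == 1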
xinference/model/llm/ggml/tests/test_ctransformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 60bc866d81..e4f143515c 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -146,9 +146,9 @@ def __init__(self, config, *args, **kwargs): self.config = config +# Mock the AutoConfig_Pretrained to not directly import ctransformers @pytest.fixture def mock_AutoConfig_Pretrained(mocker): - # Create a mock of the Child.method() and set its return value ctransformers_module = importlib.import_module("ctransformers") mock_from_pretrained = mocker.patch.object( ctransformers_module.AutoConfig, # Target object to patch From 5b7d837cffced973e9d7b94e7190c9bedc4120bb Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 12:49:58 +0800 Subject: [PATCH 38/61] refactor toward suggestions. --- setup.cfg | 1 - xinference/model/llm/__init__.py | 4 +- .../{ctransformer.py => ctransformers.py} | 92 ++++++++++++------- .../llm/ggml/tests/test_ctransformers.py | 18 ++-- 4 files changed, 70 insertions(+), 45 deletions(-) rename xinference/model/llm/ggml/{ctransformer.py => ctransformers.py} (74%) diff --git a/setup.cfg b/setup.cfg index ef5f0eff12..6b81d1fb19 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,6 @@ dev = pytest-forked>=1.0 pytest-asyncio>=0.14.0 pytest-mock>=3.11.1 - ctransformers ipython>=6.5.0 sphinx>=3.0.0,<5.0.0 pydata-sphinx-theme>=0.3.0 diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 977bd81f31..14ca84554d 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -35,7 +35,7 @@ def _install(): from .ggml.chatglm import ChatglmCppChatModel - from .ggml.ctransformer import CtransformerModel + from .ggml.ctransformers import CtransformersModel from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel from .pytorch.baichuan import BaichuanPytorchChatModel from .pytorch.chatglm import ChatglmPytorchChatModel @@ -55,7 +55,7 @@ def _install(): FalconPytorchModel, FalconPytorchChatModel, ChatglmPytorchChatModel, - CtransformerModel, + CtransformersModel, ] ) diff --git a/xinference/model/llm/ggml/ctransformer.py b/xinference/model/llm/ggml/ctransformers.py similarity index 74% rename from xinference/model/llm/ggml/ctransformer.py rename to xinference/model/llm/ggml/ctransformers.py index 9c67713402..17e8f1d43b 100644 --- a/xinference/model/llm/ggml/ctransformer.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -47,8 +47,25 @@ } -class CtransformerGenerateConfig(TypedDict, total=False): - max_new_tokens: Optional[int] +class CtransformersModelConfig(TypedDict, total=False): + top_k: int + top_p: float + temperature: float + repetition_penalty: float + last_n_tokens: int + seed: int + batch_size: int + threads: int + max_new_tokens: int + stop: Optional[Sequence[str]] + stream: bool + reset: bool + context_length: int + gpu_layers: int + + +class CtransformersGenerateConfig(TypedDict, total=False): + max_tokens: Optional[int] top_k: Optional[int] top_p: Optional[float] temperature: Optional[float] @@ -62,7 +79,7 @@ class CtransformerGenerateConfig(TypedDict, total=False): reset: Optional[bool] -class CtransformerModel(LLM): +class CtransformersModel(LLM): try: from ctransformers import AutoConfig except ImportError: @@ -79,7 +96,7 @@ def __init__( model_spec: "LLMSpecV1", quantization: str, model_path: str, - 
ctransformerModelConfig: Optional["AutoConfig"], + ctransformers_Model_Config: Optional[CtransformersModelConfig], ): super().__init__(model_uid, model_family, model_spec, quantization, model_path) @@ -90,14 +107,14 @@ def __init__( ) self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size] self._ctransformer_model_config: AutoConfig = self._sanitize_model_config( - model_path, ctransformerModelConfig + model_path, ctransformers_Model_Config ) self._model_family = model_family self._model_uid = model_uid self._llm = None def _sanitize_model_config( - self, model_path, ctransformerModelConfig: Optional["AutoConfig"] + self, model_path, ctransformers_model_config: Optional[CtransformersModelConfig] ) -> "AutoConfig": try: from ctransformers import AutoConfig @@ -116,33 +133,35 @@ def _sanitize_model_config( raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - if ctransformerModelConfig is None: - ctransformerModelConfig = AutoConfig.from_pretrained( + if ctransformers_model_config is None: + ctransformers_model_config = AutoConfig.from_pretrained( model_path, local_files_only=False, ) - return ctransformerModelConfig + return ctransformers_model_config def _sanitize_generate_config( self, - ctransformerGenerateConfig: Optional[CtransformerGenerateConfig], - ) -> CtransformerGenerateConfig: - if ctransformerGenerateConfig is None: - ctransformerGenerateConfig = CtransformerGenerateConfig() - ctransformerGenerateConfig.setdefault("top_k", 40) - ctransformerGenerateConfig.setdefault("top_p", 0.95) - ctransformerGenerateConfig.setdefault("temperature", 0.8) - ctransformerGenerateConfig.setdefault("repetition_penalty", 1.1) - ctransformerGenerateConfig.setdefault("last_n_tokens", 64) - ctransformerGenerateConfig.setdefault("seed", -1) - ctransformerGenerateConfig.setdefault("batch_size", 8) - ctransformerGenerateConfig.setdefault("threads", -1) - ctransformerGenerateConfig.setdefault("stop", None) - ctransformerGenerateConfig.setdefault("stream", None) - ctransformerGenerateConfig.setdefault("reset", True) - - return ctransformerGenerateConfig + ctransformers_generate_config: Optional[CtransformersGenerateConfig], + ) -> CtransformersGenerateConfig: + # if the bufferConfig is not None, we try to copy the selected attributes to the ctransformersGenerateConfig. + if ctransformers_generate_config is None: + ctransformers_generate_config = CtransformersGenerateConfig() + + ctransformers_generate_config.setdefault("top_k", 40) + ctransformers_generate_config.setdefault("top_p", 0.95) + ctransformers_generate_config.setdefault("temperature", 0.8) + ctransformers_generate_config.setdefault("repetition_penalty", 1.1) + ctransformers_generate_config.setdefault("last_n_tokens", 64) + ctransformers_generate_config.setdefault("seed", -1) + ctransformers_generate_config.setdefault("batch_size", 8) + ctransformers_generate_config.setdefault("threads", -1) + ctransformers_generate_config.setdefault("stop", None) + ctransformers_generate_config.setdefault("stream", None) + ctransformers_generate_config.setdefault("reset", True) + + return ctransformers_generate_config def load(self): try: @@ -162,16 +181,12 @@ def load(self): raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - # handle legacy cache. 
model_path = os.path.join( self.model_path, self.model_spec.model_file_name_template.format( quantization=self.quantization ), ) - legacy_model_file_path = os.path.join(self.model_path, "model.bin") - if os.path.exists(legacy_model_file_path): - model_path = legacy_model_file_path self._model_type = self._determine_model_type() self._llm = AutoModelForCausalLM.from_pretrained( @@ -198,22 +213,26 @@ def _determine_model_type(self): return model_type_for_ctransformer[self._model_family.model_name] def generate( - self, prompt: str, generate_config: CtransformerGenerateConfig + self, prompt: str, generate_config_raw: CtransformersGenerateConfig ) -> Union[Completion, Iterator[CompletionChunk]]: def generator_wrapper( _prompt: str, - _generate_config: CtransformerGenerateConfig, + _max_new_tokens: Union[int, None], + _generate_config: CtransformersGenerateConfig, ) -> Iterator[CompletionChunk]: assert self._model_uid is not None for _completion_chunk, _ in generate_stream( model=self._model_uid, model_ref=self._llm, prompt=_prompt, + max_new_tokens=_max_new_tokens, **_generate_config, ): yield _completion_chunk - generate_config = self._sanitize_generate_config(generate_config) + generate_config = self._sanitize_generate_config(generate_config_raw) + max_new_tokens: Union[int, None] + max_new_tokens = generate_config.pop("max_tokens") logger.error( "Enter generate, prompt: %s, generate config: %s", prompt, generate_config @@ -221,7 +240,11 @@ def generator_wrapper( stream_or_not = generate_config.get("stream", False) if stream_or_not: - return generator_wrapper(_prompt=prompt, _generate_config=generate_config) + return generator_wrapper( + _prompt=prompt, + _max_new_tokens=max_new_tokens, + _generate_config=generate_config, + ) else: assert self.model_uid is not None completion_chunk = None @@ -230,6 +253,7 @@ def generator_wrapper( model=self.model_uid, model_ref=self._llm, prompt=prompt, + max_new_tokens=max_new_tokens, **generate_config, ): pass diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index e4f143515c..e1259a27ee 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -22,9 +22,9 @@ import pytest from xinference.model.llm import GgmlLLMSpecV1, LLMFamilyV1 -from xinference.model.llm.ggml.ctransformer import ( - CtransformerGenerateConfig, - CtransformerModel, +from xinference.model.llm.ggml.ctransformers import ( + CtransformersGenerateConfig, + CtransformersModel, ) from xinference.types import ( Completion, @@ -39,7 +39,7 @@ def __init__(self) -> None: pass -class MockCtransformersModel(CtransformerModel): +class MockCtransformersModel(CtransformersModel): def load(self): self._llm = MockPipeline() @@ -72,7 +72,7 @@ def generate_stream(self) -> Iterator[Completion]: yield completion def generate( - self, prompt: str, generate_config: CtransformerGenerateConfig + self, prompt: str, generate_config_raw: CtransformersGenerateConfig ) -> Completion: completion_choice = CompletionChoice( text="test_ctransformers_generate", @@ -177,7 +177,7 @@ def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained) model_spec=model_spec, quantization=quantization, model_path=path, - ctransformerModelConfig=None, + ctransformers_Model_Config=None, ) assert model.model_uid == uid @@ -235,7 +235,7 @@ def test_model_generate(model_spec, model_family, mock_AutoConfig_Pretrained): model_spec=model_spec, quantization=quantization, 
model_path=path, - ctransformerModelConfig=None, + ctransformers_Model_Config=None, ) assert model._llm is None @@ -256,7 +256,9 @@ def test_model_generate(model_spec, model_family, mock_AutoConfig_Pretrained): assert completion["usage"]["total_tokens"] == 30 # generate without stream - responses = model.generate("def Helloworld():", generate_config={"stream": True}) + responses = model.generate( + "def Helloworld():", generate_config_raw={"stream": True} + ) assert responses["object"] == "text_completion" assert responses["choices"][0]["text"] == "test_ctransformers_generate" assert responses["choices"][0]["finish_reason"] == "test" From ac2799c70ab35aa65467cbfc40d0b688fcd9f685 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 16:04:41 +0800 Subject: [PATCH 39/61] fix test dependency --- .github/workflows/python.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 810939afbd..28741ea347 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -92,6 +92,7 @@ jobs: pip install sentencepiece pip install transformers_stream_generator pip install bitsandbytes + pip install ctransformers pip install -e ".[dev]" working-directory: . From 1c7ae949c8d6d9513bd5c04812aa07a3ed2040ed Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 16:45:38 +0800 Subject: [PATCH 40/61] update logger error --- xinference/model/llm/ggml/ctransformers_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 35affb1575..297f5d5090 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -145,7 +145,7 @@ def generate_stream( finish_reason = "length" break - logger.error("Final, completion: %s", text) + logger.error("Final, completion: %s", total_text) if stream is False: completion_choice = CompletionChoice( text=total_text, index=0, logprobs=None, finish_reason=finish_reason From 58ca6776333c3483794a1a7520ee6b06723886b6 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 17:02:35 +0800 Subject: [PATCH 41/61] update logger error --- xinference/model/llm/ggml/ctransformers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 17e8f1d43b..950e06f746 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -232,7 +232,10 @@ def generator_wrapper( generate_config = self._sanitize_generate_config(generate_config_raw) max_new_tokens: Union[int, None] - max_new_tokens = generate_config.pop("max_tokens") + if "max_tokens" in generate_config: + max_new_tokens = generate_config.pop("max_tokens") + else: + max_new_tokens = None logger.error( "Enter generate, prompt: %s, generate config: %s", prompt, generate_config @@ -260,6 +263,10 @@ def generator_wrapper( assert completion_chunk is not None assert completion_usage is not None + + logger.error( + "Generated choice, completion: %s", completion_chunk["choices"] + ) completion = Completion( id=completion_chunk["id"], object=completion_chunk["object"], From 8955272dde6e70c19044f5aeb28b1fcbac85fce0 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 17:03:51 +0800 Subject: [PATCH 42/61] update logger error --- xinference/model/llm/ggml/ctransformers_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 297f5d5090..4396289267 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -168,4 +168,6 @@ def generate_stream( total_tokens=count + len(tokens), ) + logger.error("Completionchoice: %s", completion_choice) + return completion_chunk, completion_usage From f9092386b4f1aa8f3fedaea65a528f3bd8663d31 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 17:18:22 +0800 Subject: [PATCH 43/61] update logger error --- xinference/model/llm/ggml/ctransformers_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 4396289267..d8c7a678dc 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -170,4 +170,4 @@ def generate_stream( logger.error("Completionchoice: %s", completion_choice) - return completion_chunk, completion_usage + yield completion_chunk, completion_usage From e76c576fd151ef520050c8d6e5157c6cfdf2bc93 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Tue, 8 Aug 2023 17:22:22 +0800 Subject: [PATCH 44/61] fix issue in ctransformersutil and delete redundant logger. --- xinference/model/llm/ggml/ctransformers.py | 3 --- xinference/model/llm/ggml/ctransformers_util.py | 1 - 2 files changed, 4 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 950e06f746..864d804b3d 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -264,9 +264,6 @@ def generator_wrapper( assert completion_chunk is not None assert completion_usage is not None - logger.error( - "Generated choice, completion: %s", completion_chunk["choices"] - ) completion = Completion( id=completion_chunk["id"], object=completion_chunk["object"], diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index d8c7a678dc..98c4bd11e9 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -145,7 +145,6 @@ def generate_stream( finish_reason = "length" break - logger.error("Final, completion: %s", total_text) if stream is False: completion_choice = CompletionChoice( text=total_text, index=0, logprobs=None, finish_reason=finish_reason From cfb267e6e0529bd3b29211fe2dbf04767ad270f5 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Wed, 9 Aug 2023 17:53:34 +0800 Subject: [PATCH 45/61] refactor toward suggestions. --- xinference/model/llm/ggml/ctransformers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 864d804b3d..f8caf13f7d 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -16,11 +16,11 @@ import os from typing import TYPE_CHECKING, Iterator, Optional, Sequence, TypedDict, Union -from xinference.model.llm.ggml.ctransformers_util import generate_stream from xinference.types import Completion, CompletionChunk from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 +from .ctransformers_util import generate_stream from .llamacpp import SIZE_TO_GPU_LAYERS if TYPE_CHECKING: @@ -30,7 +30,7 @@ # all supported models for Ctransformers with their model type. 
# Please Strictly follows this name format when inputting new model to model_family. -model_type_for_ctransformer = { +MODEL_TYPE_FOR_CTRANSFORMERS = { "GPT-2": "gpt2", "GPT-J": "gptj", "GPT4All-J": "gptj", @@ -206,11 +206,11 @@ def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: return True def _determine_model_type(self): - if self._model_family.model_name not in model_type_for_ctransformer: + if self._model_family.model_name not in MODEL_TYPE_FOR_CTRANSFORMERS: raise ValueError( "The current model is not supported, check your model name. " ) - return model_type_for_ctransformer[self._model_family.model_name] + return MODEL_TYPE_FOR_CTRANSFORMERS[self._model_family.model_name] def generate( self, prompt: str, generate_config_raw: CtransformersGenerateConfig From 9bca70cc05a29c035f30cd838cf80c0381e33168 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Wed, 9 Aug 2023 18:12:30 +0800 Subject: [PATCH 46/61] fix part of issues from suggestions. --- xinference/model/llm/ggml/ctransformers.py | 29 ++++++------------- .../model/llm/ggml/ctransformers_util.py | 11 ++----- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index f8caf13f7d..2fa613022d 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -14,7 +14,7 @@ import logging import os -from typing import TYPE_CHECKING, Iterator, Optional, Sequence, TypedDict, Union +from typing import Iterator, Optional, Sequence, TypedDict, Union from xinference.types import Completion, CompletionChunk @@ -23,9 +23,6 @@ from .ctransformers_util import generate_stream from .llamacpp import SIZE_TO_GPU_LAYERS -if TYPE_CHECKING: - from ctransformers import AutoConfig - logger = logging.getLogger(__name__) # all supported models for Ctransformers with their model type. @@ -106,7 +103,7 @@ def __init__( key=lambda x: abs(x - model_spec.model_size_in_billions), ) self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size] - self._ctransformer_model_config: AutoConfig = self._sanitize_model_config( + self._ctransformer_model_config = self._sanitize_model_config( model_path, ctransformers_Model_Config ) self._model_family = model_family @@ -145,7 +142,7 @@ def _sanitize_generate_config( self, ctransformers_generate_config: Optional[CtransformersGenerateConfig], ) -> CtransformersGenerateConfig: - # if the bufferConfig is not None, we try to copy the selected attributes to the ctransformersGenerateConfig. + # if the input config is not None, we try to copy the selected attributes to the ctransformersGenerateConfig. if ctransformers_generate_config is None: ctransformers_generate_config = CtransformersGenerateConfig() @@ -168,14 +165,10 @@ def load(self): from ctransformers import AutoModelForCausalLM except ImportError: error_message = "Failed to import module 'ctransformers'" - if self._is_darwin_and_apple_silicon(): - system = "Metal" - else: - system = "CUDA" installation_guide = [ - f"Please make sure 'ctransformers' is installed and {system} accelerator is provided.", - f"You can install it by checking out the repository for command for {system} platform:" + f"Please make sure 'ctransformers' is installed.", + f"You can install it by checking out the repository for command." 
f"https://github.com/marella/ctransformers", ] @@ -208,7 +201,7 @@ def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: def _determine_model_type(self): if self._model_family.model_name not in MODEL_TYPE_FOR_CTRANSFORMERS: raise ValueError( - "The current model is not supported, check your model name. " + f"The current model {self._model_family.model_name} is not supported, check your model name. " ) return MODEL_TYPE_FOR_CTRANSFORMERS[self._model_family.model_name] @@ -231,13 +224,9 @@ def generator_wrapper( yield _completion_chunk generate_config = self._sanitize_generate_config(generate_config_raw) - max_new_tokens: Union[int, None] - if "max_tokens" in generate_config: - max_new_tokens = generate_config.pop("max_tokens") - else: - max_new_tokens = None + max_new_tokens = generate_config.pop("max_tokens", None) - logger.error( + logger.debug( "Enter generate, prompt: %s, generate config: %s", prompt, generate_config ) @@ -273,7 +262,7 @@ def generator_wrapper( usage=completion_usage, ) - logger.error( + logger.debug( "Generated, completion: %s, generate config: %s", completion, generate_config, diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 98c4bd11e9..0b3956b2cb 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -17,17 +17,11 @@ import uuid from typing import Iterator, Optional, Sequence, Tuple -from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage +from ....types import CompletionChoice, CompletionChunk, CompletionUsage logger = logging.getLogger(__name__) -def _get(*values): - for value in values: - if value is not None: - return value - - def generate_stream( model, model_ref, @@ -47,8 +41,7 @@ def generate_stream( reset: Optional[bool] = None, **kwargs, ) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]: - max_new_tokens = _get(max_new_tokens) - stop = _get(stop) or [] + stop = stop or [] if isinstance(stop, str): stop = [stop] From cb66cd314bfb5bbbb2461af129391b5ffe75e67d Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 11:03:14 +0800 Subject: [PATCH 47/61] fix part of issues from suggestions. 
--- xinference/model/llm/ggml/ctransformers.py | 119 ++++++++++++--------- 1 file changed, 70 insertions(+), 49 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 2fa613022d..684370a84b 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -14,7 +14,10 @@ import logging import os -from typing import Iterator, Optional, Sequence, TypedDict, Union +from typing import TYPE_CHECKING, Iterator, Optional, Sequence, TypedDict, Union + +if TYPE_CHECKING: + from ctransformers import AutoConfig from xinference.types import Completion, CompletionChunk @@ -45,20 +48,8 @@ class CtransformersModelConfig(TypedDict, total=False): - top_k: int - top_p: float - temperature: float - repetition_penalty: float - last_n_tokens: int - seed: int - batch_size: int - threads: int - max_new_tokens: int - stop: Optional[Sequence[str]] - stream: bool - reset: bool - context_length: int - gpu_layers: int + n_ctx: int + n_gpu_layers: int class CtransformersGenerateConfig(TypedDict, total=False): @@ -77,15 +68,6 @@ class CtransformersGenerateConfig(TypedDict, total=False): class CtransformersModel(LLM): - try: - from ctransformers import AutoConfig - except ImportError: - error_message = "Failed to import module 'ctransformers - AutoConfig'" - installation_guide = [ - "Please make sure 'ctransformers' is installed, You can install it by checking out the repository for " - "command: https://github.com/marella/ctransformers", - ] - def __init__( self, model_uid: str, @@ -114,49 +96,88 @@ def _sanitize_model_config( self, model_path, ctransformers_model_config: Optional[CtransformersModelConfig] ) -> "AutoConfig": try: - from ctransformers import AutoConfig + from ctransformers import AutoConfig, Config except ImportError: - error_message = "Failed to import module 'ctransformers - AutoConfig'" - if self._is_darwin_and_apple_silicon(): - system = "Metal" - else: - system = "CUDA" + error_message = ( + "Failed to import module 'ctransformers - AutoConfig and Config'" + ) installation_guide = [ - f"Please make sure 'ctransformers' is installed and {system} accelerator is provided.", - f"You can install it by checking out the repository for command for {system} platform:" + f"Please make sure 'ctransformers' is installed.", + f"You can install it by checking out the repository for command:" f"https://github.com/marella/ctransformers", ] raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - if ctransformers_model_config is None: - ctransformers_model_config = AutoConfig.from_pretrained( - model_path, - local_files_only=False, - ) + # if the model have customized config, we update it. 
+ if ctransformers_model_config: + potential_context_length = ctransformers_model_config.pop("n_ctx", None) + potential_gpu_layers = ctransformers_model_config.pop("n_gpu_layers", None) + + if potential_context_length and potential_gpu_layers: + ctransformers_model_config_returned = Config( + context_length=potential_context_length, + gpu_layers=potential_gpu_layers, + ) + elif potential_gpu_layers: + ctransformers_model_config_returned = Config( + gpu_layers=potential_gpu_layers + ) + elif potential_context_length: + ctransformers_model_config_returned = Config( + context_length=potential_context_length + ) + else: + ctransformers_model_config_returned = Config() + else: + ctransformers_model_config_returned = Config() - return ctransformers_model_config + return AutoConfig(ctransformers_model_config_returned) def _sanitize_generate_config( self, ctransformers_generate_config: Optional[CtransformersGenerateConfig], ) -> CtransformersGenerateConfig: + try: + from ctransformers import Config + except ImportError: + error_message = "Failed to import module 'ctransformers - Config'" + + installation_guide = [ + f"Please make sure 'ctransformers' is installed.", + f"You can install it by checking out the repository for command:" + f"https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + # if the input config is not None, we try to copy the selected attributes to the ctransformersGenerateConfig. if ctransformers_generate_config is None: ctransformers_generate_config = CtransformersGenerateConfig() - ctransformers_generate_config.setdefault("top_k", 40) - ctransformers_generate_config.setdefault("top_p", 0.95) - ctransformers_generate_config.setdefault("temperature", 0.8) - ctransformers_generate_config.setdefault("repetition_penalty", 1.1) - ctransformers_generate_config.setdefault("last_n_tokens", 64) - ctransformers_generate_config.setdefault("seed", -1) - ctransformers_generate_config.setdefault("batch_size", 8) - ctransformers_generate_config.setdefault("threads", -1) - ctransformers_generate_config.setdefault("stop", None) - ctransformers_generate_config.setdefault("stream", None) - ctransformers_generate_config.setdefault("reset", True) + # get the newest configuration from ctransformers. + default_config = Config() + + ctransformers_generate_config.setdefault("top_k", default_config.top_k) + ctransformers_generate_config.setdefault("top_p", default_config.top_p) + ctransformers_generate_config.setdefault( + "temperature", default_config.temperature + ) + ctransformers_generate_config.setdefault( + "repetition_penalty", default_config.repetition_penalty + ) + ctransformers_generate_config.setdefault( + "last_n_tokens", default_config.last_n_tokens + ) + ctransformers_generate_config.setdefault("seed", default_config.seed) + ctransformers_generate_config.setdefault( + "batch_size", default_config.batch_size + ) + ctransformers_generate_config.setdefault("threads", default_config.threads) + ctransformers_generate_config.setdefault("stop", default_config.stop) + ctransformers_generate_config.setdefault("stream", default_config.stream) + ctransformers_generate_config.setdefault("reset", default_config.reset) return ctransformers_generate_config From 5ffa5fe0cb1d8693a9e35f803761ebf6475b9dcb Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 11:28:35 +0800 Subject: [PATCH 48/61] fix test issue by remove mock autoconfig. 
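_sanitize_model_config in the hunk above translates the user-facing n_ctx / n_gpu_layers keys into the backend's own config object before wrapping it in AutoConfig. The translation is essentially a pair of pops with fallbacks; here is a stand-alone sketch of the idea using a made-up dataclass instead of the real ctransformers Config (field names and defaults are illustrative, not taken from the library):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class StandInConfig:
        # illustrative defaults only; not the real ctransformers.Config values
        context_length: int = -1
        gpu_layers: int = 0

    def sanitize_model_config(user_config: Optional[dict]) -> StandInConfig:
        cfg = StandInConfig()
        if user_config:
            n_ctx = user_config.pop("n_ctx", None)
            n_gpu_layers = user_config.pop("n_gpu_layers", None)
            if n_ctx is not None:
                cfg.context_length = n_ctx
            if n_gpu_layers is not None:
                cfg.gpu_layers = n_gpu_layers
        return cfg

    print(sanitize_model_config({"n_ctx": 2048}))  # StandInConfig(context_length=2048, gpu_layers=0)
    print(sanitize_model_config(None))             # defaults untouched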
--- .../llm/ggml/tests/test_ctransformers.py | 31 +++---------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index e1259a27ee..61336945f1 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -11,13 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import importlib import random import re import string import time from typing import Iterator -from unittest.mock import Mock import pytest @@ -141,31 +139,12 @@ def generate( mock_model_family = LLMFamilyV1.parse_raw(test_model_spec) -class MockAutoConfig: - def __init__(self, config, *args, **kwargs): - self.config = config - - -# Mock the AutoConfig_Pretrained to not directly import ctransformers -@pytest.fixture -def mock_AutoConfig_Pretrained(mocker): - ctransformers_module = importlib.import_module("ctransformers") - mock_from_pretrained = mocker.patch.object( - ctransformers_module.AutoConfig, # Target object to patch - "from_pretrained", # Attribute to patch - side_effect=MockAutoConfig, # Custom side_effect function - ) - - config = Mock() - auto_config = MockAutoConfig(config) - mock_from_pretrained.return_value = auto_config - return mock_from_pretrained - - @pytest.mark.parametrize( "model_spec, model_family", [(mock_model_spec, mock_model_family)] ) -def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained): +def test_ctransformer_init(model_spec, model_family): + from ctransformers import AutoConfig + quantization = "q4_0" uid = "".join(random.choice(string.digits) for i in range(15)) path = "".join( @@ -184,7 +163,7 @@ def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained) assert model.quantization == quantization assert model.model_path == path assert model._ctransformer_model_config is not None - assert isinstance(model._ctransformer_model_config, MockAutoConfig) + assert isinstance(model._ctransformer_model_config, AutoConfig) assert isinstance(model.model_spec, GgmlLLMSpecV1) assert isinstance(model.model_family, LLMFamilyV1) @@ -223,7 +202,7 @@ def test_ctransformer_init(model_spec, model_family, mock_AutoConfig_Pretrained) @pytest.mark.parametrize( "model_spec, model_family", [(mock_model_spec, mock_model_family)] ) -def test_model_generate(model_spec, model_family, mock_AutoConfig_Pretrained): +def test_model_generate(model_spec, model_family): quantization = "q4_0" uid = "".join(random.choice(string.digits) for i in range(100)) path = "".join( From f019ace68ddfd3a216103dad19f6bdc9de8df994 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 14:43:37 +0800 Subject: [PATCH 49/61] refactor toward suggestions. 
--- xinference/model/llm/ggml/ctransformers.py | 79 +++------ .../model/llm/ggml/ctransformers_util.py | 4 +- .../llm/ggml/tests/test_ctransformers.py | 152 ++++-------------- xinference/model/llm/llm_family.json | 2 +- 4 files changed, 57 insertions(+), 180 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 684370a84b..c2bde51b25 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -19,8 +19,7 @@ if TYPE_CHECKING: from ctransformers import AutoConfig -from xinference.types import Completion, CompletionChunk - +from ....types import Completion, CompletionChunk from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 from .ctransformers_util import generate_stream @@ -41,8 +40,8 @@ "MPT": "mpt", "Dolly-V2": "dolly-v2", "Replit": "replit", - "StarCoder": "starcoder", - "StarChat": "starcoder", + "starcoder": "starcoder", + "starchat": "starcoder", "Falcon": "falcon", } @@ -92,6 +91,10 @@ def __init__( self._model_uid = model_uid self._llm = None + def _can_apply_cublas(self): + # TODO: figure out the quantizations supported. + return True + def _sanitize_model_config( self, model_path, ctransformers_model_config: Optional[CtransformersModelConfig] ) -> "AutoConfig": @@ -111,27 +114,23 @@ def _sanitize_model_config( raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") # if the model have customized config, we update it. + ctransformers_model_config_returned = Config() + potential_gpu_layers = None if ctransformers_model_config: potential_context_length = ctransformers_model_config.pop("n_ctx", None) potential_gpu_layers = ctransformers_model_config.pop("n_gpu_layers", None) - if potential_context_length and potential_gpu_layers: - ctransformers_model_config_returned = Config( - context_length=potential_context_length, - gpu_layers=potential_gpu_layers, - ) - elif potential_gpu_layers: - ctransformers_model_config_returned = Config( - gpu_layers=potential_gpu_layers - ) - elif potential_context_length: - ctransformers_model_config_returned = Config( - context_length=potential_context_length - ) - else: - ctransformers_model_config_returned = Config() - else: - ctransformers_model_config_returned = Config() + ctransformers_model_config_returned.context_length = ( + potential_context_length + ) + ctransformers_model_config_returned.gpu_layers = potential_gpu_layers + + # if user does not define gpu layers, we have to set it with our system if applicable. + if potential_gpu_layers is None: + if self._is_darwin_and_apple_silicon(): + ctransformers_model_config_returned.gpu_layers = 1 + elif self._is_linux() and self._can_apply_cublas(): + ctransformers_model_config_returned = self._gpu_layers return AutoConfig(ctransformers_model_config_returned) @@ -139,45 +138,13 @@ def _sanitize_generate_config( self, ctransformers_generate_config: Optional[CtransformersGenerateConfig], ) -> CtransformersGenerateConfig: - try: - from ctransformers import Config - except ImportError: - error_message = "Failed to import module 'ctransformers - Config'" - - installation_guide = [ - f"Please make sure 'ctransformers' is installed.", - f"You can install it by checking out the repository for command:" - f"https://github.com/marella/ctransformers", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - # if the input config is not None, we try to copy the selected attributes to the ctransformersGenerateConfig. 
if ctransformers_generate_config is None: ctransformers_generate_config = CtransformersGenerateConfig() - # get the newest configuration from ctransformers. - default_config = Config() - - ctransformers_generate_config.setdefault("top_k", default_config.top_k) - ctransformers_generate_config.setdefault("top_p", default_config.top_p) - ctransformers_generate_config.setdefault( - "temperature", default_config.temperature - ) - ctransformers_generate_config.setdefault( - "repetition_penalty", default_config.repetition_penalty - ) - ctransformers_generate_config.setdefault( - "last_n_tokens", default_config.last_n_tokens - ) - ctransformers_generate_config.setdefault("seed", default_config.seed) - ctransformers_generate_config.setdefault( - "batch_size", default_config.batch_size - ) - ctransformers_generate_config.setdefault("threads", default_config.threads) - ctransformers_generate_config.setdefault("stop", default_config.stop) - ctransformers_generate_config.setdefault("stream", default_config.stream) - ctransformers_generate_config.setdefault("reset", default_config.reset) + # for our system, the threads will have to be set to 4 + # all other parameters, if not specified, will be set to default when generate. + ctransformers_generate_config.setdefault("threads", 4) return ctransformers_generate_config diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 0b3956b2cb..33a14705be 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -89,7 +89,7 @@ def generate_stream( text += output total_text += output - logger.error("Output, completion: %s", text) + logger.debug("Output, completion: %s", text) # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706 # Check if one of the stop sequences is part of the text. @@ -160,6 +160,6 @@ def generate_stream( total_tokens=count + len(tokens), ) - logger.error("Completionchoice: %s", completion_choice) + logger.debug("Completionchoice: %s", completion_choice) yield completion_chunk, completion_usage diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 61336945f1..4146084bd7 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -12,93 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import random -import re import string -import time -from typing import Iterator import pytest -from xinference.model.llm import GgmlLLMSpecV1, LLMFamilyV1 -from xinference.model.llm.ggml.ctransformers import ( - CtransformersGenerateConfig, - CtransformersModel, -) -from xinference.types import ( - Completion, - CompletionChoice, - CompletionChunk, - CompletionUsage, -) - +from xinference.client import Client, GenerateModelHandle -class MockPipeline: - def __init__(self) -> None: - pass +from ....llm import GgmlLLMSpecV1, LLMFamilyV1 +from ..ctransformers import CtransformersModel class MockCtransformersModel(CtransformersModel): def load(self): - self._llm = MockPipeline() - - def generate_stream(self) -> Iterator[Completion]: - for i in range(5): - res = f"ctransformers_test_stream_{i}" - completion_choice = CompletionChoice( - text=res, index=0, logprobs=None, finish_reason="test_stream" - ) - completion_chunk = CompletionChunk( - id=str(f"test_{i}"), - object="text_completion", - created=int(time.time()), - model=self._model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( - prompt_tokens=10, - completion_tokens=20, - total_tokens=30, - ) - completion = Completion( - id=completion_chunk["id"], - object=completion_chunk["object"], - created=completion_chunk["created"], - model=completion_chunk["model"], - choices=completion_chunk["choices"], - usage=completion_usage, - ) - yield completion - - def generate( - self, prompt: str, generate_config_raw: CtransformersGenerateConfig - ) -> Completion: - completion_choice = CompletionChoice( - text="test_ctransformers_generate", - index=0, - logprobs=None, - finish_reason="test", - ) - completion_chunk = CompletionChunk( - id=str("test"), - object="text_completion", - created=int(time.time()), - model=self._model_uid, - choices=[completion_choice], - ) - completion_usage = CompletionUsage( - prompt_tokens=10, - completion_tokens=20, - total_tokens=30, - ) - completion = Completion( - id=completion_chunk["id"], - object=completion_chunk["object"], - created=completion_chunk["created"], - model=completion_chunk["model"], - choices=completion_chunk["choices"], - usage=completion_usage, - ) - return completion + pass mock_model_spec = GgmlLLMSpecV1( @@ -199,48 +125,32 @@ def test_ctransformer_init(model_spec, model_family): assert model._llm is None -@pytest.mark.parametrize( - "model_spec, model_family", [(mock_model_spec, mock_model_family)] -) -def test_model_generate(model_spec, model_family): - quantization = "q4_0" - uid = "".join(random.choice(string.digits) for i in range(100)) - path = "".join( - random.choice(string.ascii_letters + string.punctuation) for i in range(100) - ) - model = MockCtransformersModel( - model_uid=uid, - model_family=model_family, - model_spec=model_spec, - quantization=quantization, - model_path=path, - ctransformers_Model_Config=None, - ) +@pytest.mark.asyncio +@pytest.mark.parametrize("quantization", ["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"]) +async def test_opt_pytorch_model(setup, quantization): + endpoint, _ = setup + client = Client(endpoint) + assert len(client.list_models()) == 0 - assert model._llm is None - - model.load() - assert isinstance(model._llm, MockPipeline) - - # generate with stream - pattern = r"[0-4]" - for completion in model.generate_stream(): - assert completion["id"].startswith("test_") - assert re.search(pattern, completion["id"]) - assert completion["choices"][0]["text"].startswith("ctransformers_test_stream_") - assert re.search(pattern, 
completion["choices"][0]["text"]) - assert completion["choices"][0]["finish_reason"] == "test_stream" - assert completion["usage"]["prompt_tokens"] == 10 - assert completion["usage"]["completion_tokens"] == 20 - assert completion["usage"]["total_tokens"] == 30 - - # generate without stream - responses = model.generate( - "def Helloworld():", generate_config_raw={"stream": True} + model_uid = client.launch_model( + model_name="starcoder", + model_size_in_billions=16, + model_format="ggmlv3", + quantization=quantization, ) - assert responses["object"] == "text_completion" - assert responses["choices"][0]["text"] == "test_ctransformers_generate" - assert responses["choices"][0]["finish_reason"] == "test" - assert responses["usage"]["prompt_tokens"] == 10 - assert responses["usage"]["completion_tokens"] == 20 - assert responses["usage"]["total_tokens"] == 30 + assert len(client.list_models()) == 1 + + model = client.get_model(model_uid=model_uid) + assert isinstance(model, GenerateModelHandle) + + completion = model.generate("def HelloWorld():") + assert "id" in completion + assert "text" in completion["choices"][0] + assert len(completion["choices"][0]["text"]) > 0 + assert "finish_reason" in completion["choices"][0] + assert "prompt_tokens" in completion["usage"] + assert "completion_tokens" in completion["usage"] + assert "total_tokens" in completion["usage"] + + client.terminate_model(model_uid=model_uid) + assert len(client.list_models()) == 0 diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 2b5f47020d..cdb2845221 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -778,7 +778,7 @@ }, { "version": 1, - "model_name": "StarCoder", + "model_name": "starcoder", "model_lang": [ "en" ], From 4f962f117301ded460ad979b5a2cc7e8656dd33e Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 15:14:31 +0800 Subject: [PATCH 50/61] remove parameterize in test, only test for q4_0 --- xinference/model/llm/ggml/tests/test_ctransformers.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 4146084bd7..003ce4c222 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -16,8 +16,7 @@ import pytest -from xinference.client import Client, GenerateModelHandle - +from .....client import Client, GenerateModelHandle from ....llm import GgmlLLMSpecV1, LLMFamilyV1 from ..ctransformers import CtransformersModel @@ -126,8 +125,7 @@ def test_ctransformer_init(model_spec, model_family): @pytest.mark.asyncio -@pytest.mark.parametrize("quantization", ["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"]) -async def test_opt_pytorch_model(setup, quantization): +async def test_opt_pytorch_model(setup): endpoint, _ = setup client = Client(endpoint) assert len(client.list_models()) == 0 @@ -136,8 +134,9 @@ async def test_opt_pytorch_model(setup, quantization): model_name="starcoder", model_size_in_billions=16, model_format="ggmlv3", - quantization=quantization, + quantization="q4_0", ) + assert len(client.list_models()) == 1 model = client.get_model(model_uid=model_uid) From 973685bd0bac69935a5563d6a2f6e68e7429f6f9 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 15:58:14 +0800 Subject: [PATCH 51/61] try smaller gpt-2 model for test ctransformer --- xinference/model/llm/ggml/ctransformers.py | 22 +++++++++---------- 
.../llm/ggml/tests/test_ctransformers.py | 10 ++++----- xinference/model/llm/llm_family.json | 22 +++++++++++++++++++ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index c2bde51b25..43404b7969 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -30,19 +30,19 @@ # all supported models for Ctransformers with their model type. # Please Strictly follows this name format when inputting new model to model_family. MODEL_TYPE_FOR_CTRANSFORMERS = { - "GPT-2": "gpt2", - "GPT-J": "gptj", - "GPT4All-J": "gptj", - "GPT-NeoX": "gpt_neox", - "StableLM": "gpt_neox", - "LLaMA": "llama", - "LLaMA-2": "llama", - "MPT": "mpt", - "Dolly-V2": "dolly-v2", - "Replit": "replit", + "gpt-2": "gpt2", + "gpt-j": "gptj", + "gpt4all-j": "gptj", + "gpt-neox": "gpt_neox", + "stablelm": "gpt_neox", + "llama": "llama", + "llama-2": "llama", + "mpt": "mpt", + "dolly-v2": "dolly-v2", + "replit": "replit", "starcoder": "starcoder", "starchat": "starcoder", - "Falcon": "falcon", + "falcon": "falcon", } diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 003ce4c222..9776c55d1d 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -125,16 +125,16 @@ def test_ctransformer_init(model_spec, model_family): @pytest.mark.asyncio -async def test_opt_pytorch_model(setup): +async def test_starcoder_model(setup): endpoint, _ = setup client = Client(endpoint) assert len(client.list_models()) == 0 model_uid = client.launch_model( - model_name="starcoder", - model_size_in_billions=16, + model_name="gpt-2", + model_size_in_billions=1, model_format="ggmlv3", - quantization="q4_0", + quantization="none", ) assert len(client.list_models()) == 1 @@ -142,7 +142,7 @@ async def test_opt_pytorch_model(setup): model = client.get_model(model_uid=model_uid) assert isinstance(model, GenerateModelHandle) - completion = model.generate("def HelloWorld():") + completion = model.generate("AI is going to") assert "id" in completion assert "text" in completion["choices"][0] assert len(completion["choices"][0]["text"]) > 0 diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index cdb2845221..8e709182f2 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -801,5 +801,27 @@ } ], "prompt_style": null + }, + { + "version": 1, + "model_name": "gpt-2", + "model_lang": [ + "en" + ], + "model_ability":[ + "generate" + ], + "model_specs": [ + { + "model_format": "ggmlv3", + "model_size_in_billions": 1, + "quantizations": [ + "none" + ], + "model_id": "marella/gpt-2-ggml", + "model_file_name_template": "ggml-model.bin" + } + ], + "prompt_style": null } ] From e09350dab1ef504a26de6fbc6859bcae099791fc Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 16:29:44 +0800 Subject: [PATCH 52/61] add CTRANSFORME_SUPPORT_MODEL constant (can be expanded) --- xinference/model/llm/ggml/ctransformers.py | 13 +++++++++++-- xinference/model/llm/ggml/llamacpp.py | 11 +++++++++-- .../model/llm/ggml/tests/test_ctransformers.py | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 43404b7969..ac20f7b3e0 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ 
b/xinference/model/llm/ggml/ctransformers.py @@ -23,7 +23,6 @@ from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 from .ctransformers_util import generate_stream -from .llamacpp import SIZE_TO_GPU_LAYERS logger = logging.getLogger(__name__) @@ -45,6 +44,16 @@ "falcon": "falcon", } +CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2"] + +SIZE_TO_GPU_LAYERS = { + 3: 26, + 7: 32, + 13: 40, + 30: 60, + 65: 80, +} + class CtransformersModelConfig(TypedDict, total=False): n_ctx: int @@ -180,7 +189,7 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "StarCoder" not in llm_family.model_name: + if llm_family.model_name not in CTRANSFORMERS_SUPPORTED_MODEL: return False if "generate" not in llm_family.model_ability: return False diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py index a55c9d717b..4bcee28059 100644 --- a/xinference/model/llm/ggml/llamacpp.py +++ b/xinference/model/llm/ggml/llamacpp.py @@ -28,6 +28,7 @@ from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 from ..utils import ChatModelMixin +from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL if TYPE_CHECKING: from llama_cpp import LogitsProcessorList, StoppingCriteriaList @@ -187,7 +188,10 @@ def load(self): def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "chatglm" in llm_family.model_name or "StarCoder" in llm_family.model_name: + if ( + "chatglm" in llm_family.model_name + or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL + ): return False if "generate" not in llm_family.model_ability: return False @@ -258,7 +262,10 @@ def __init__( def match(cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1) -> bool: if llm_spec.model_format != "ggmlv3": return False - if "chatglm" in llm_family.model_name or "StarCoder" in llm_family.model_name: + if ( + "chatglm" in llm_family.model_name + or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL + ): return False if "chat" not in llm_family.model_ability: return False diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 9776c55d1d..5965ee9824 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -125,7 +125,7 @@ def test_ctransformer_init(model_spec, model_family): @pytest.mark.asyncio -async def test_starcoder_model(setup): +async def test_Ctransformer_model(setup): endpoint, _ = setup client = Client(endpoint) assert len(client.list_models()) == 0 From 016c31898d9d920e435caf4529ad13d40a72e0a3 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 16:49:35 +0800 Subject: [PATCH 53/61] fix cuda branch error. 
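Backend selection rests on these small match() predicates: every LLM implementation inspects the model family and spec, and the first class whose predicate returns True handles the model. A toy version of that dispatch, condensed from the two match() methods in the diff (pick_backend and the free functions are hypothetical names used only for illustration):

    CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2"]

    def ctransformers_match(model_name: str, model_format: str, abilities: list) -> bool:
        return (
            model_format == "ggmlv3"
            and model_name in CTRANSFORMERS_SUPPORTED_MODEL
            and "generate" in abilities
        )

    def llamacpp_match(model_name: str, model_format: str, abilities: list) -> bool:
        return (
            model_format == "ggmlv3"
            and "chatglm" not in model_name
            and model_name not in CTRANSFORMERS_SUPPORTED_MODEL
            and "generate" in abilities
        )

    def pick_backend(model_name: str, model_format: str, abilities: list):
        # first predicate that matches decides which implementation loads the model
        for name, predicate in [("ctransformers", ctransformers_match), ("llama-cpp", llamacpp_match)]:
            if predicate(model_name, model_format, abilities):
                return name
        return None

    assert pick_backend("gpt-2", "ggmlv3", ["generate"]) == "ctransformers"
    assert pick_backend("llama-2", "ggmlv3", ["generate"]) == "llama-cpp"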
--- xinference/model/llm/ggml/ctransformers.py | 2 +- .../model/llm/ggml/tests/test_ctransformers.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index ac20f7b3e0..b1c0269c93 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -139,7 +139,7 @@ def _sanitize_model_config( if self._is_darwin_and_apple_silicon(): ctransformers_model_config_returned.gpu_layers = 1 elif self._is_linux() and self._can_apply_cublas(): - ctransformers_model_config_returned = self._gpu_layers + ctransformers_model_config_returned.gpu_layers = self._gpu_layers return AutoConfig(ctransformers_model_config_returned) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 5965ee9824..d13b2377fb 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -125,7 +125,7 @@ def test_ctransformer_init(model_spec, model_family): @pytest.mark.asyncio -async def test_Ctransformer_model(setup): +async def test_ctransformers_generate(setup): endpoint, _ = setup client = Client(endpoint) assert len(client.list_models()) == 0 @@ -142,14 +142,25 @@ async def test_Ctransformer_model(setup): model = client.get_model(model_uid=model_uid) assert isinstance(model, GenerateModelHandle) - completion = model.generate("AI is going to") + completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) + print(completion) assert "id" in completion assert "text" in completion["choices"][0] assert len(completion["choices"][0]["text"]) > 0 + + assert completion["model"] == model_uid + assert "finish_reason" in completion["choices"][0] + assert completion["choices"][0]["finish_reason"] == "length" + assert "prompt_tokens" in completion["usage"] + assert completion["usage"]["prompt_tokens"] == 4 + assert "completion_tokens" in completion["usage"] + assert completion["usage"]["completion_tokens"] == 5 + assert "total_tokens" in completion["usage"] + assert completion["usage"]["total_tokens"] == 9 client.terminate_model(model_uid=model_uid) assert len(client.list_models()) == 0 From 2524747ca5258e5e7d272a378690d4f261ca82e8 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Thu, 10 Aug 2023 17:17:43 +0800 Subject: [PATCH 54/61] fix test cuda error. --- xinference/model/llm/ggml/ctransformers.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index b1c0269c93..375ef8f767 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -75,6 +75,12 @@ class CtransformersGenerateConfig(TypedDict, total=False): reset: Optional[bool] +def _has_cuda_device(): + from xorbits._mars.resource import cuda_count + + return cuda_count() > 0 + + class CtransformersModel(LLM): def __init__( self, @@ -100,10 +106,6 @@ def __init__( self._model_uid = model_uid self._llm = None - def _can_apply_cublas(self): - # TODO: figure out the quantizations supported. 
- return True - def _sanitize_model_config( self, model_path, ctransformers_model_config: Optional[CtransformersModelConfig] ) -> "AutoConfig": @@ -138,7 +140,7 @@ def _sanitize_model_config( if potential_gpu_layers is None: if self._is_darwin_and_apple_silicon(): ctransformers_model_config_returned.gpu_layers = 1 - elif self._is_linux() and self._can_apply_cublas(): + elif _has_cuda_device(): ctransformers_model_config_returned.gpu_layers = self._gpu_layers return AutoConfig(ctransformers_model_config_returned) From 740f31ab10034190190ac546410756801dd37fa6 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 10:44:29 +0800 Subject: [PATCH 55/61] add GPU check to make sure only supported model can initialize Cuda. --- xinference/model/llm/ggml/ctransformers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 375ef8f767..2fafbebb24 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -44,8 +44,11 @@ "falcon": "falcon", } +# these two constants subjects to change for future development and ctransformers updates. CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2"] +CTRANSFORMERS_GPU_SUPPORT = ["llama", "llama-2", "mpt", "falcon"] + SIZE_TO_GPU_LAYERS = { 3: 26, 7: 32, @@ -138,7 +141,9 @@ def _sanitize_model_config( # if user does not define gpu layers, we have to set it with our system if applicable. if potential_gpu_layers is None: - if self._is_darwin_and_apple_silicon(): + if self._model_family.model_name not in CTRANSFORMERS_GPU_SUPPORT: + ctransformers_model_config_returned.gpu_layers = -1 + elif self._is_darwin_and_apple_silicon(): ctransformers_model_config_returned.gpu_layers = 1 elif _has_cuda_device(): ctransformers_model_config_returned.gpu_layers = self._gpu_layers From 25d3dd454257aeb2c4520eb831f72cad47008d27 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 10:51:05 +0800 Subject: [PATCH 56/61] add GPU check to make sure only supported model can initialize Cuda. --- xinference/model/llm/ggml/ctransformers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 2fafbebb24..3a5984dac2 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -101,13 +101,15 @@ def __init__( SIZE_TO_GPU_LAYERS.keys(), key=lambda x: abs(x - model_spec.model_size_in_billions), ) + + self._model_family = model_family + self._model_uid = model_uid + self._llm = None + self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size] self._ctransformer_model_config = self._sanitize_model_config( model_path, ctransformers_Model_Config ) - self._model_family = model_family - self._model_uid = model_uid - self._llm = None def _sanitize_model_config( self, model_path, ctransformers_model_config: Optional[CtransformersModelConfig] From a9e3006991a34664b4db591c1a0c95e6a4b84ba3 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 14:24:19 +0800 Subject: [PATCH 57/61] try adding mpt model. 
--- xinference/model/llm/ggml/ctransformers.py | 2 +- xinference/model/llm/llm_family.json | 27 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 3a5984dac2..4f0d08c649 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -45,7 +45,7 @@ } # these two constants subjects to change for future development and ctransformers updates. -CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2"] +CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2", "mpt"] CTRANSFORMERS_GPU_SUPPORT = ["llama", "llama-2", "mpt", "falcon"] diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 8e709182f2..cd95caaee2 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -823,5 +823,32 @@ } ], "prompt_style": null + }, + { + "version": 1, + "model_name": "mpt", + "model_lang": [ + "en" + ], + "model_ability":[ + "generate" + ], + "model_specs": [ + { + "model_format": "ggmlv3", + "model_size_in_billions": 7, + "quantizations": [ + "q4_0", + "q4_1", + "q5_0", + "q5_1", + "q8_0", + "fp16" + ], + "model_id": "TheBloke/MPT-7B-Storywriter-GGML", + "model_file_name_template": "mpt-7b-storywriter.ggmlv3.{quantization}.bin" + } + ], + "prompt_style": null } ] From ef9a84de86c20aabf807334d34f1026dea03ea21 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 14:40:53 +0800 Subject: [PATCH 58/61] remove mpt for this pr. --- xinference/model/llm/ggml/ctransformers.py | 2 +- xinference/model/llm/llm_family.json | 27 ---------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 4f0d08c649..3a5984dac2 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -45,7 +45,7 @@ } # these two constants subjects to change for future development and ctransformers updates. -CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2", "mpt"] +CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2"] CTRANSFORMERS_GPU_SUPPORT = ["llama", "llama-2", "mpt", "falcon"] diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index cd95caaee2..8e709182f2 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -823,32 +823,5 @@ } ], "prompt_style": null - }, - { - "version": 1, - "model_name": "mpt", - "model_lang": [ - "en" - ], - "model_ability":[ - "generate" - ], - "model_specs": [ - { - "model_format": "ggmlv3", - "model_size_in_billions": 7, - "quantizations": [ - "q4_0", - "q4_1", - "q5_0", - "q5_1", - "q8_0", - "fp16" - ], - "model_id": "TheBloke/MPT-7B-Storywriter-GGML", - "model_file_name_template": "mpt-7b-storywriter.ggmlv3.{quantization}.bin" - } - ], - "prompt_style": null } ] From 0f05689b766dce5fa3a6c157345d111541d22708 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 14:53:38 +0800 Subject: [PATCH 59/61] refactor toward suggestions. 
--- xinference/model/llm/ggml/ctransformers.py | 7 ++-- .../model/llm/ggml/ctransformers_util.py | 32 ++++++++----------- .../llm/ggml/tests/test_ctransformers.py | 10 ++---- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index 3a5984dac2..af18abf072 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -92,7 +92,7 @@ def __init__( model_spec: "LLMSpecV1", quantization: str, model_path: str, - ctransformers_Model_Config: Optional[CtransformersModelConfig], + ctransformers_model_config: Optional[CtransformersModelConfig], ): super().__init__(model_uid, model_family, model_spec, quantization, model_path) @@ -108,7 +108,7 @@ def __init__( self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size] self._ctransformer_model_config = self._sanitize_model_config( - model_path, ctransformers_Model_Config + model_path, ctransformers_model_config ) def _sanitize_model_config( @@ -230,12 +230,13 @@ def generator_wrapper( yield _completion_chunk generate_config = self._sanitize_generate_config(generate_config_raw) - max_new_tokens = generate_config.pop("max_tokens", None) logger.debug( "Enter generate, prompt: %s, generate config: %s", prompt, generate_config ) + max_new_tokens = generate_config.pop("max_tokens", None) + stream_or_not = generate_config.get("stream", False) if stream_or_not: return generator_wrapper( diff --git a/xinference/model/llm/ggml/ctransformers_util.py b/xinference/model/llm/ggml/ctransformers_util.py index 33a14705be..e263a56a70 100644 --- a/xinference/model/llm/ggml/ctransformers_util.py +++ b/xinference/model/llm/ggml/ctransformers_util.py @@ -56,6 +56,20 @@ def generate_stream( # parameters needed for Xinference. finish_reason = None + try: + from ctransformers.utils import utf8_split_incomplete + except ImportError: + error_message = ( + "Failed to import module 'ctransformers - utf8_split_incomplete'" + ) + + installation_guide = [ + "Please make sure 'ctransformers' is installed. You can install it by checking out the repository: " + "https://github.com/marella/ctransformers", + ] + + raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") + for token in model_ref.generate( tokens, top_k=top_k, @@ -69,28 +83,12 @@ def generate_stream( reset=reset, ): # Handle incomplete UTF-8 multi-byte characters. - try: - from ctransformers.utils import utf8_split_incomplete - except ImportError: - error_message = ( - "Failed to import module 'ctransformers - utf8_split_incomplete'" - ) - - installation_guide = [ - "Please make sure 'ctransformers' is installed. You can install it by checking out the repository: " - "https://github.com/marella/ctransformers", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - incomplete += model_ref.detokenize([token], decode=False) complete, incomplete = utf8_split_incomplete(incomplete) output = complete.decode(errors="ignore") text += output total_text += output - logger.debug("Output, completion: %s", text) - # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706 # Check if one of the stop sequences is part of the text. # Note that the stop sequence may not always be at the end of text. 
@@ -160,6 +158,4 @@ def generate_stream( total_tokens=count + len(tokens), ) - logger.debug("Completionchoice: %s", completion_choice) - yield completion_chunk, completion_usage diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index d13b2377fb..a129a43a34 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -20,12 +20,6 @@ from ....llm import GgmlLLMSpecV1, LLMFamilyV1 from ..ctransformers import CtransformersModel - -class MockCtransformersModel(CtransformersModel): - def load(self): - pass - - mock_model_spec = GgmlLLMSpecV1( model_format="ggmlv3", model_size_in_billions=6, @@ -75,13 +69,13 @@ def test_ctransformer_init(model_spec, model_family): path = "".join( random.choice(string.ascii_letters + string.punctuation) for i in range(100) ) - model = MockCtransformersModel( + model = CtransformersModel( model_uid=uid, model_family=model_family, model_spec=model_spec, quantization=quantization, model_path=path, - ctransformers_Model_Config=None, + ctransformers_model_config=None, ) assert model.model_uid == uid From 360f7e73f5dc606ce5439ed9be2f89f66fb1aa52 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 14:59:19 +0800 Subject: [PATCH 60/61] fix lint issue. --- xinference/model/llm/ggml/ctransformers.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py index af18abf072..e04cc2ef14 100644 --- a/xinference/model/llm/ggml/ctransformers.py +++ b/xinference/model/llm/ggml/ctransformers.py @@ -130,27 +130,25 @@ def _sanitize_model_config( raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") # if the model have customized config, we update it. - ctransformers_model_config_returned = Config() + model_config_ret = Config() potential_gpu_layers = None if ctransformers_model_config: potential_context_length = ctransformers_model_config.pop("n_ctx", None) potential_gpu_layers = ctransformers_model_config.pop("n_gpu_layers", None) - ctransformers_model_config_returned.context_length = ( - potential_context_length - ) - ctransformers_model_config_returned.gpu_layers = potential_gpu_layers + model_config_ret.context_length = potential_context_length + model_config_ret.gpu_layers = potential_gpu_layers # if user does not define gpu layers, we have to set it with our system if applicable. if potential_gpu_layers is None: if self._model_family.model_name not in CTRANSFORMERS_GPU_SUPPORT: - ctransformers_model_config_returned.gpu_layers = -1 + model_config_ret.gpu_layers = -1 elif self._is_darwin_and_apple_silicon(): - ctransformers_model_config_returned.gpu_layers = 1 + model_config_ret.gpu_layers = 1 elif _has_cuda_device(): - ctransformers_model_config_returned.gpu_layers = self._gpu_layers + model_config_ret.gpu_layers = self._gpu_layers - return AutoConfig(ctransformers_model_config_returned) + return AutoConfig(model_config_ret) def _sanitize_generate_config( self, From bfecc8a24670a57ff67bc160a80dc380cdd52f85 Mon Sep 17 00:00:00 2001 From: Rui Ji Date: Fri, 11 Aug 2023 15:15:32 +0800 Subject: [PATCH 61/61] remove prompt style for generate model. 
--- xinference/model/llm/llm_family.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 8e709182f2..753b9bf2d0 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -799,8 +799,7 @@ "model_id": "TheBloke/starcoder-GGML", "model_file_name_template": "starcoder.ggmlv3.{quantization}.bin" } - ], - "prompt_style": null + ] }, { "version": 1, @@ -821,7 +820,6 @@ "model_id": "marella/gpt-2-ggml", "model_file_name_template": "ggml-model.bin" } - ], - "prompt_style": null + ] } ]
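
Taken together, patches 53 through 56 determine how gpu_layers is chosen when the caller leaves it unset. The standalone sketch below restates that decision path in one place so it can be read without jumping between hunks; the function name, the two boolean platform flags (stand-ins for _is_darwin_and_apple_silicon() and _has_cuda_device()), and the explicit None fallback for the no-accelerator case are illustrative additions, while the constants mirror those in ctransformers.py.

    from typing import Optional

    # Mirrors the constants in xinference/model/llm/ggml/ctransformers.py.
    SIZE_TO_GPU_LAYERS = {3: 26, 7: 32, 13: 40, 30: 60, 65: 80}
    CTRANSFORMERS_GPU_SUPPORT = ["llama", "llama-2", "mpt", "falcon"]


    def pick_gpu_layers(
        model_name: str,
        model_size_in_billions: int,
        user_gpu_layers: Optional[int],
        on_apple_silicon: bool,
        has_cuda_device: bool,
    ) -> Optional[int]:
        """Return the gpu_layers value the sanitized model config would end up with."""
        # A value supplied through CtransformersModelConfig always wins.
        if user_gpu_layers is not None:
            return user_gpu_layers
        # Families ctransformers cannot offload are pinned to CPU (patch 55).
        if model_name not in CTRANSFORMERS_GPU_SUPPORT:
            return -1
        # On Apple silicon a single Metal layer flag is enough.
        if on_apple_silicon:
            return 1
        # With a CUDA device, offload a size-dependent number of layers.
        if has_cuda_device:
            closest = min(SIZE_TO_GPU_LAYERS, key=lambda s: abs(s - model_size_in_billions))
            return SIZE_TO_GPU_LAYERS[closest]
        # No accelerator detected: leave gpu_layers unset (None here for clarity).
        return None


    if __name__ == "__main__":
        # A 13B llama-family model on a CUDA box offloads 40 layers.
        assert pick_gpu_layers("llama-2", 13, None, False, True) == 40
        # gpt-2 is not in the GPU support list, so it stays pinned to CPU (-1).
        assert pick_gpu_layers("gpt-2", 1, None, False, True) == -1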