From e32fb16e0801b915bf8f5055329259ad456d365c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 01:28:39 +0000
Subject: [PATCH 01/12] Add book theme as dependency

---
 docs/requirements-docs.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index c770f13ac11f..945cb4aa13af 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,3 +1,4 @@
 sphinx
+sphinx-book-theme
 sphinx-click
 sphinx-copybutton

From e3bb084196db98b8f49a35f2fda6696e5502b71e Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 01:31:29 +0000
Subject: [PATCH 02/12] Shorten the command

---
 docs/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index e1b4c6cdf681..a1d4203d0f1f 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -14,7 +14,6 @@ make html
 ## Open the docs with your browser
 
 ```bash
-cd build/html
-python -m http.server
+python -m http.server -d build/html/
 ```
 Launch your browser and open localhost:8000.

From ef7c85998501e5f0b9f95494592fd7d8727dadc2 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 04:08:41 +0000
Subject: [PATCH 03/12] Add Supported models doc

---
 docs/source/index.rst                   |  7 +++++
 docs/source/models/supported_models.rst | 40 +++++++++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 docs/source/models/supported_models.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index d31498a8b3f8..3f8cd6512dfd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -10,3 +10,10 @@ Documentation
 
    getting_started/installation
    getting_started/quickstart
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Models
+
+   models/supported_models
+   models/adding_model

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
new file mode 100644
index 000000000000..bcb1022dd40f
--- /dev/null
+++ b/docs/source/models/supported_models.rst
@@ -0,0 +1,40 @@
+.. _supported_models:
+
+Supported Models
+================
+
+CacheFlow supports a variety of generative Transformer models in `HuggingFace Transformers `_.
+The following is the list of model architectures that are currently supported by CacheFlow.
+Alongside each architecture, we have included some popular models that use it.
+
+.. list-table::
+   :widths: 25 75
+   :header-rows: 1
+
+   * - Architecture
+     - Models
+   * - :code:`GPT2LMHeadModel`
+     - GPT-2
+   * - :code:`GPTNeoXForCausalLM`
+     - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
+   * - :code:`LlamaForCausalLM`
+     - LLaMA, Vicuna, Alpaca, Koala
+   * - :code:`OPTForCausalLM`
+     - OPT, OPT-IML
+
+If your model uses one of the above model architectures, you can seamlessly run your model with CacheFlow.
+Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model.
+Alternatively, you can raise an issue on our `GitHub `_ page.
+
+.. tip::
+    The easiest way to check if your model is supported is to run the program below:
+
+    .. code-block:: python
+
+        from cacheflow import LLM
+
+        llm = LLM(model=...)  # Name or path of your model
+        output = llm.generate("Hello, my name is")
+        print(output)
+
+    If CacheFlow successfully generates text, it indicates that your model is supported.
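For context, the quick check that patch 03 adds can be expanded into a slightly fuller script. This is a sketch only: the model name and the `SamplingParams` arguments are illustrative assumptions, not part of the patch, and at this point in the series `generate` still expects a list of prompts (see patch 05 below).

```python
from cacheflow import LLM, SamplingParams

# Any model whose architecture appears in the table above should work here.
# "facebook/opt-125m" is a hypothetical example; use your own model's name or path.
llm = LLM(model="facebook/opt-125m")

# SamplingParams() with no arguments uses the defaults; the field shown here
# is an assumption for illustration.
sampling_params = SamplingParams(temperature=0.8)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:  # generate() returns a list of RequestOutput objects
    print(output)
```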
From 8268c7951a7bcf823da90b7de5191bc701383474 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 04:09:24 +0000
Subject: [PATCH 04/12] [WIP] empty doc for adding a new model

---
 docs/source/models/adding_model.rst | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 docs/source/models/adding_model.rst

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
new file mode 100644
index 000000000000..b7f7f4d6fe4f
--- /dev/null
+++ b/docs/source/models/adding_model.rst
@@ -0,0 +1,7 @@
+.. _adding_a_new_model:
+
+Adding a New Model
+==================
+
+
+Placeholder

From 41aa1ad27d352cc26d2e1fe0369d3195dc77b06b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 04:09:36 +0000
Subject: [PATCH 05/12] Minor fix for convenience

---
 cacheflow/entrypoints/llm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cacheflow/entrypoints/llm.py b/cacheflow/entrypoints/llm.py
index acb9a7473ad9..6ade19c1fb02 100644
--- a/cacheflow/entrypoints/llm.py
+++ b/cacheflow/entrypoints/llm.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from tqdm import tqdm
 
@@ -33,11 +33,13 @@ def __init__(
 
     def generate(
         self,
-        prompts: List[str],
+        prompts: Union[str, List[str]],
         sampling_params: Optional[SamplingParams] = None,
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
     ) -> List[RequestOutput]:
+        if isinstance(prompts, str):
+            prompts = [prompts]
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = SamplingParams()
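The convenience fix in patch 05 is what lets the documentation's quick-check snippet pass a bare string to `generate`. A small before/after sketch (the model argument is elided, as in the docs' own examples):

```python
from cacheflow import LLM

llm = LLM(model=...)  # Name or path of your model

# Before patch 05, generate() required a list of prompts:
outputs = llm.generate(["Hello, my name is"])

# After it, a single string is wrapped into a one-element list internally,
# so both calls return a List[RequestOutput]:
outputs = llm.generate("Hello, my name is")
print(outputs[0])
```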
From a6854a5d818b6ff954edab0ce5743964a0e1003a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 05:39:14 +0000
Subject: [PATCH 06/12] Address comments

---
 docs/source/models/supported_models.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index bcb1022dd40f..e73ae6bde653 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -5,7 +5,7 @@ Supported Models
 
 CacheFlow supports a variety of generative Transformer models in `HuggingFace Transformers `_.
 The following is the list of model architectures that are currently supported by CacheFlow.
-Alongside each architecture, we have included some popular models that use it.
+Alongside each architecture, we include some popular models that use it.
 
 .. list-table::
    :widths: 25 75
@@ -24,7 +24,7 @@ Alongside each architecture, we have included some popular models that use it.
 
 If your model uses one of the above model architectures, you can seamlessly run your model with CacheFlow.
 Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model.
-Alternatively, you can raise an issue on our `GitHub `_ page.
+Alternatively, you can raise an issue on our `GitHub `_ project.
 
 .. tip::
     The easiest way to check if your model is supported is to run the program below:

From ab2c176a6d91210bf9849e7a588f4458111c9e28 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 25 May 2023 07:59:59 +0000
Subject: [PATCH 07/12] [WIP]

---
 docs/source/models/adding_model.rst | 34 ++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index b7f7f4d6fe4f..dd18e81f0a90 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -3,5 +3,37 @@
 Adding a New Model
 ==================
 
+This document describes the steps to add a new model to CacheFlow.
+As a running example, we will walk through the process of implementing the `OPT `_ model in CacheFlow.
+
+.. note::
+    The complexity of adding a new model into CacheFlow varies based on the model's architecture.
+    If the model shares the similar architecture with an already existing one in CacheFlow, the process is considerably more straightforward.
+    However, if the model architecture includes new operators (e.g., a new attention mechanism), the process can be more challenging.
+
+.. note::
+    If you are having trouble integrating your model into CacheFlow, we encourage you to open an issue on our `GitHub `_ repository.
+    We will be happy to help you out!
+
+
+0. Fork the CacheFlow repository
+--------------------------------
+
+The first step is to fork the CacheFlow repository and :ref:`build it from source `.
+This will allow you to make changes to the codebase and test your model.
+
+
+1. Bring your model code
+------------------------
+
+
+
+
+2. Rewrite the “forward” methods of the layers
+3. Prune away unnecessary code (e.g., the code for training)
+4. Replace the Attention layer with CacheFlowAttention
+
+Replace nn.Linear with ParallelLinear for tensor parallelism support
+Write the weight loading function
+Only for the QKV linear layer
-Placeholder
From b24709fb0d909b0a8425f80844854d6a1d9cb83f Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 4 Jun 2023 02:00:24 +0000
Subject: [PATCH 08/12] Write adding_a_new_model

---
 docs/source/models/adding_model.rst | 77 ++++++++++++++++++++++++-----
 1 file changed, 65 insertions(+), 12 deletions(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index dd18e81f0a90..e9fc45256a85 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -3,15 +3,14 @@
 Adding a New Model
 ==================
 
-This document describes the steps to add a new model to CacheFlow.
-As a running example, we will walk through the process of implementing the `OPT `_ model in CacheFlow.
+This document provides a high-level guide on the process of adding a new model into CacheFlow.
 
 .. note::
-    The complexity of adding a new model into CacheFlow varies based on the model's architecture.
-    If the model shares the similar architecture with an already existing one in CacheFlow, the process is considerably more straightforward.
+    The complexity of adding a new model varies based on the model's architecture.
+    If the model shares the similar architecture with an already existing one in CacheFlow, the process is more straightforward.
     However, if the model architecture includes new operators (e.g., a new attention mechanism), the process can be more challenging.
 
-.. note::
+.. tip::
     If you are having trouble integrating your model into CacheFlow, we encourage you to open an issue on our `GitHub `_ repository.
     We will be happy to help you out!

0. Fork the CacheFlow repository
--------------------------------

The first step is to fork our `GitHub `_ repository and :ref:`build it from source `.
This will allow you to make changes to the codebase and test your model.


1. Bring your model code
------------------------

Copy the PyTorch model code from the `HuggingFace Transformers `_ repository and put it into the `cacheflow/model_executor/models `_ directory.
For example, you can use the code from the HuggingFace's `modeling_llama.py `_ file for LLaMA models.

.. warning::
    In copying the model code, make sure to review and adhere to the code's copyright and licensing terms.


2. Rewrite the :code:`forward` methods
--------------------------------------

The next step is to rewrite the :code:`forward` methods of your model by following the steps below:

1. Prune out unnecessary code. For example, you can remove the code that is only used for training.
2. Change the input parameters:

.. code-block:: diff

     def forward(
         self,
         input_ids: torch.Tensor,
    -    attention_mask: Optional[torch.Tensor] = None,
    -    position_ids: Optional[torch.LongTensor] = None,
    -    past_key_values: Optional[List[torch.FloatTensor]] = None,
    -    inputs_embeds: Optional[torch.FloatTensor] = None,
    -    labels: Optional[torch.LongTensor] = None,
    -    use_cache: Optional[bool] = None,
    -    output_attentions: Optional[bool] = None,
    -    output_hidden_states: Optional[bool] = None,
    -    return_dict: Optional[bool] = None,
    -) -> Union[Tuple, CausalLMOutputWithPast]:
    +    positions: torch.Tensor,
    +    kv_caches: List[KVCache],
    +    input_metadata: InputMetadata,
    +    cache_events: Optional[List[torch.cuda.Event]],
    +) -> Dict[int, SequenceOutputs]:

3. Fix the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors that contain the data for all input sequences.
4. Replace the attention operation with either :code:`GPTCacheFlowAttention` or :code:`GPTNeoXCacheFlowAttention` depending on the model's architecture.

.. note::
    As of now, CacheFlow supports the vanilla multi-head attention mechanism and its variant with rotary positional embeddings.
    If your model uses a different attention mechanism, you need to implement a new attention layer in CacheFlow.


3. (Optional) Add tensor parallelism support
--------------------------------------------

If your model is too large to fit into a single GPU, you can add tensor parallelism support to your model.
To do so, you need to replace your model's linear and embedding layers with their tensor-parallel counterparts.
For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`.
For the linear layers, you need to replace them with either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`.
Typically, we use :code:`ColumnParallelLinear` for QKV linear layers and the first linear layers of the MLP blocks.
We use :code:`RowParallelLinear` for the other linear layers.


4. Implement the weight loading logic
-------------------------------------

The next step is to implement :code:`load_weights` method in your :code:`*ForCausalLM` class.
This method should load the weights from the HuggingFace's checkpoint file and set them to the corresponding layers in your model.
While the process is straightforward for most layers, the tensor-parallel layers require some additional care as you need to split the weights across multiple GPUs.


5. Register your model
----------------------

Finally, add your :code:`*ForCausalLM` class to `cacheflow/model_executor/models/__init__.py `_ and register it in :code:`_MODEL_REGISTRY` in the `cacheflow/model_executor/model_loader.py `_ file.
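To make the rewritten signature in patch 08's step 2 concrete, here is a minimal sketch of a full :code:`forward` method. The body is an assumption about how such a class is typically wired together (a decoder stack plus a sampler); it is not code from the patch, and :code:`InputMetadata` and :code:`SequenceOutputs` are only named, not imported, since their module paths are not given here.

```python
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn

# Assumed alias: each layer's cache is a (key_cache, value_cache) pair.
KVCache = Tuple[torch.Tensor, torch.Tensor]

class MyModelForCausalLM(nn.Module):  # hypothetical model class
    def forward(
        self,
        input_ids: torch.Tensor,    # flattened token IDs of all sequences
        positions: torch.Tensor,    # flattened positions, same length
        kv_caches: List[KVCache],   # one cache pair per decoder layer
        input_metadata,             # cacheflow InputMetadata (path assumed)
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> Dict[int, "SequenceOutputs"]:
        # Run the decoder stack over the flattened batch.
        hidden_states = self.model(
            input_ids, positions, kv_caches, input_metadata, cache_events)
        # Sample the next token for each sequence from the final hidden states.
        return self.sampler(self.lm_head.weight, hidden_states, input_metadata)
```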
From 422acf70f602c198f9db81209dab2f2ca5c4f4ac Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 4 Jun 2023 02:11:54 +0000
Subject: [PATCH 09/12] Minor

---
 docs/source/models/adding_model.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index e9fc45256a85..1214df8e4c0e 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -61,7 +61,7 @@ The next step is to rewrite the :code:`forward` methods of your model by followi
     +    cache_events: Optional[List[torch.cuda.Event]],
     +) -> Dict[int, SequenceOutputs]:
 
-3. Fix the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors that contain the data for all input sequences.
+3. Fix the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
 4. Replace the attention operation with either :code:`GPTCacheFlowAttention` or :code:`GPTNeoXCacheFlowAttention` depending on the model's architecture.
 
 .. note::

From 5782dc6437b4c1b64e98bb38e31c842efba97dc5 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 5 Jun 2023 09:26:55 +0000
Subject: [PATCH 10/12] Address comments

---
 docs/source/models/adding_model.rst | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index 1214df8e4c0e..e754e9774e68 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -4,6 +4,7 @@ Adding a New Model
 ==================
 
 This document provides a high-level guide on the process of adding a new model into CacheFlow.
+For example, through this document you can add the `OPT model in HuggingFace `_ to `CacheFlow `_.
 
 .. note::
     The complexity of adding a new model varies based on the model's architecture.
@@ -37,7 +38,7 @@ For example, you can use the code from the HuggingFace's `modeling_llama.py
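For intuition about the "flattened tensors" mentioned in patches 08 and 09: CacheFlow packs all sequences of a batch into one dimension rather than padding them to a common length. A hypothetical illustration of the shapes only, not CacheFlow code:

```python
import torch

# Two sequences of lengths 3 and 2, packed with no padding: the tokens of
# sequence A are followed directly by the tokens of sequence B.
input_ids = torch.tensor([5, 901, 72, 5, 43])  # shape: [5]
positions = torch.tensor([0, 1, 2, 0, 1])      # shape: [5]
# The accompanying input_metadata (not shown) records where each sequence
# starts and ends, so attention can still be computed per sequence.
```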
Date: Mon, 5 Jun 2023 09:46:00 +0000
Subject: [PATCH 11/12] Polish

---
 docs/source/models/adding_model.rst | 49 ++++++++++++++---------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index e754e9774e68..f0a0530884c4 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -3,42 +3,41 @@
 Adding a New Model
 ==================
 
-This document provides a high-level guide on the process of adding a new model into CacheFlow.
-For example, through this document you can add the `OPT model in HuggingFace `_ to `CacheFlow `_.
+This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into CacheFlow.
 
 .. note::
-    The complexity of adding a new model varies based on the model's architecture.
-    If the model shares the similar architecture with an already existing one in CacheFlow, the process is more straightforward.
-    However, if the model architecture includes new operators (e.g., a new attention mechanism), the process can be more challenging.
+    The complexity of adding a new model depends heavily on the model's architecture.
+    The process is considerably more straightforward if the model shares a similar architecture with an existing model in CacheFlow.
+    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
 
 .. tip::
-    If you are having trouble integrating your model into CacheFlow, we encourage you to open an issue on our `GitHub `_ repository.
+    If you are encountering issues while integrating your model into CacheFlow, feel free to open an issue on our `GitHub `_ repository.
     We will be happy to help you out!
 
 
 0. Fork the CacheFlow repository
 --------------------------------
 
-The first step is to fork our `GitHub `_ repository and :ref:`build it from source `.
-This will allow you to make changes to the codebase and test your model.
+Start by forking our `GitHub `_ repository and then :ref:`build it from source `.
+This gives you the ability to modify the codebase and test your model.
 
 
 1. Bring your model code
 ------------------------
 
-Copy the PyTorch model code from the `HuggingFace Transformers `_ repository and put it into the `cacheflow/model_executor/models `_ directory.
-For example, you can use the code from the HuggingFace's `modeling_llama.py `_ file for LLaMA models.
+Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `cacheflow/model_executor/models `_ directory.
+For instance, you can use the code from the HuggingFace's `modeling_llama.py `_ file for the LLaMA models.
 
 .. warning::
-    In copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
+    When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
 
 
 2. Rewrite the :code:`forward` methods
 --------------------------------------
 
-The next step is to rewrite the :code:`forward` methods of your model by following the steps below:
+Next, you need to rewrite the :code:`forward` methods of your model by following these steps:
 
-1. Prune out unnecessary code. For example, you can remove the code that is only used for training.
+1. Remove any unnecessary code, such as the code only used for training.
 2. Change the input parameters:
 
 .. code-block:: diff
 
      def forward(
         self,
         input_ids: torch.Tensor,
@@ -62,21 +61,21 @@ The next step is to rewrite the :code:`forward` methods of your model by followi
     +    cache_events: Optional[List[torch.cuda.Event]],
     +) -> Dict[int, SequenceOutputs]:
 
-3. Fix the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
+3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
-4. Replace the attention operation with either :code:`GPTCacheFlowAttention` or :code:`GPTNeoXCacheFlowAttention` depending on the model's architecture.
+4. Replace the attention operation with either :code:`GPTCacheFlowAttention` or :code:`GPTNeoXCacheFlowAttention`, depending on the model's architecture.
 
 .. note::
-    Currently, CacheFlow supports the vanilla multi-head attention mechanism and its variant with rotary positional embeddings.
-    If your model uses a different attention mechanism, you need to implement a new attention layer in CacheFlow.
+    Currently, CacheFlow supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
+    If your model employs a different attention mechanism, you will need to implement a new attention layer in CacheFlow.
 
 
-3. (Optional) Add tensor parallelism support
---------------------------------------------
+3. (Optional) Implement tensor parallelism support
+--------------------------------------------------
 
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
-To do so, you need to substitute your model's linear and embedding layers with their tensor-parallel versions.
+To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
 For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`.
-When it comes to the linear layers, you need to replace them with either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`.
+When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`.
 Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks.
 For the remaining linear layers, :code:`RowParallelLinear` is used.
 
 
 4. Implement the weight loading logic
 -------------------------------------
 
-The next step is to implement :code:`load_weights` method in your :code:`*ForCausalLM` class.
-This method should load the weights from the HuggingFace's checkpoint file and set them to the corresponding layers in your model.
-While the process is straightforward for most layers, the tensor-parallel layers require some additional care as you need to split the weights across multiple GPUs.
+You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class.
+This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model.
+While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs.
 
 
 5. Register your model
 ----------------------
 
-Finally, add your :code:`*ForCausalLM` class to `cacheflow/model_executor/models/__init__.py `_ and register it in :code:`_MODEL_REGISTRY` in the `cacheflow/model_executor/model_loader.py `_ file.
+Finally, include your :code:`*ForCausalLM` class in `cacheflow/model_executor/models/__init__.py `_ and register it to the :code:`_MODEL_REGISTRY` in `cacheflow/model_executor/model_loader.py `_.
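To make section 3 of the polished guide concrete, here is a sketch of the substitution it describes, applied to a simple MLP block. Only the class names come from the patch; the import path and constructor signatures are assumptions and may differ from CacheFlow's actual layers.

```python
import torch
import torch.nn as nn

# Hypothetical import path; the patch names the classes but not their module.
from cacheflow.model_executor.parallel_utils.layers import (
    ColumnParallelLinear, RowParallelLinear)

class ParallelMLP(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # First linear layer of the MLP block: column-parallel, so each GPU
        # holds a slice of the intermediate dimension.
        self.fc1 = ColumnParallelLinear(hidden_size, intermediate_size)
        self.act = nn.GELU()
        # Second linear layer: row-parallel, so the partial outputs from all
        # GPUs are reduced back into the full hidden size.
        self.fc2 = RowParallelLinear(intermediate_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.act(self.fc1(x)))
```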
From 7bdfd97d5c78100d0b0e037ab6844122db756887 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 6 Jun 2023 02:57:28 +0000
Subject: [PATCH 12/12] Address comment

---
 docs/source/models/adding_model.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index f0a0530884c4..35a107707b3b 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -26,7 +26,7 @@ This gives you the ability to modify the codebase and test your model.
 ------------------------
 
 Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `cacheflow/model_executor/models `_ directory.
-For instance, you can use the code from the HuggingFace's `modeling_llama.py `_ file for the LLaMA models.
+For instance, CacheFlow's `OPT model `_ was adapted from the HuggingFace's `modeling_opt.py `_ file.
 
 .. warning::
     When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
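As a closing illustration of steps 4 and 5 of the finished guide, here is a condensed sketch of the weight-loading logic it describes. The function below follows the standard Megatron-style convention (shard dimension 0 of column-parallel weights across tensor-parallel ranks); it is an illustration under that assumption, not CacheFlow's actual `load_weights` helpers.

```python
from typing import Dict

import torch
import torch.nn as nn

def load_weights(model: nn.Module,
                 hf_state_dict: Dict[str, torch.Tensor],
                 tp_rank: int, tp_size: int) -> None:
    """Sketch: copy HuggingFace checkpoint weights into the model, keeping
    only this rank's shard for tensor-parallel parameters."""
    params = dict(model.named_parameters())
    for name, loaded in hf_state_dict.items():
        param = params[name]
        if param.shape != loaded.shape:
            # Tensor-parallel parameter: slice out this rank's shard.
            # (Column-parallel layers shard dim 0; row-parallel weights
            # would shard dim 1, omitted here for brevity.)
            shard_size = loaded.shape[0] // tp_size
            loaded = loaded[tp_rank * shard_size:(tp_rank + 1) * shard_size]
        with torch.no_grad():
            param.copy_(loaded)
```

Registration itself (step 5) is then a one-line entry in :code:`_MODEL_REGISTRY` mapping the architecture name, e.g. ``"OPTForCausalLM"``, to the model class.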