[Core][Hash][Automatic Prefix caching] Accelerating the hashing function by avoiding deep copies #4696
Changes from 4 commits
The PR adds a new profiling script (63 lines) that measures how much of the end-to-end generation time is spent in `hash_of_block`:

```python
import argparse
import cProfile
import pstats

from vllm import LLM, SamplingParams

# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
               ] * 1000
LONG_PROMPT = ' '.join(LONG_PROMPT)


def main(args):
    llm = LLM(
        model=args.model,
        enforce_eager=True,
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
        use_v2_block_manager=args.use_v2_block_manager,
    )

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
    profiler = cProfile.Profile()

    print("------warm up------")
    for i in range(3):
        output = llm.generate(LONG_PROMPT, sampling_params)
        print(output[0].outputs[0].text)

    print("------start generating------")
    for i in range(3):
        profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
                        globals(), locals())

    # Analyze the runtime of the hashing function.
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative')
    total_time = 0
    total_calls = 0
    for func in stats.stats:
        if 'hash_of_block' in func[2]:
            total_time = stats.stats[func][3]
            total_calls = stats.stats[func][0]
    percentage = (total_time / stats.total_tt) * 100
    print(f"Hashing took {total_time:.2f} seconds, "
          f"{percentage:.2f}% of the total runtime.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Benchmark the performance of the hashing function in '
        'automatic prefix caching.')
    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceManagerV2')
    args = parser.parse_args()
    main(args)
```
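The script reads the raw `pstats.Stats.stats` dictionary rather than the printed report. As a reminder of that layout, here is a minimal, self-contained sketch (the `hash_of_block` below is just a stand-in workload, not the vLLM method): each key is a `(filename, lineno, function_name)` tuple and each value is `(call_count, num_calls, total_time, cumulative_time, callers)`, which is why the benchmark indexes `func[2]` and `stats.stats[func][3]`.

```python
import cProfile
import pstats


def hash_of_block(i: int) -> int:
    # Stand-in workload so the profile contains an entry with this name.
    return hash(tuple(range(i)))


profiler = cProfile.Profile()
profiler.runctx('sum(hash_of_block(i) for i in range(2000))',
                globals(), locals())

stats = pstats.Stats(profiler)
# stats.stats: {(filename, lineno, func_name): (cc, nc, tt, ct, callers), ...}
for func, (cc, nc, tt, ct, callers) in stats.stats.items():
    if 'hash_of_block' in func[2]:
        share = ct / stats.total_tt * 100
        print(f"{func[2]}: {nc} calls, {ct:.4f}s cumulative ({share:.1f}%)")
```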
The second changed file is vLLM's sequence module. The sequence data object now also stores the prompt token IDs as a tuple at construction time:

```diff
@@ -2,7 +2,7 @@
 import copy
 import enum
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from vllm.block import LogicalTokenBlock
 from vllm.lora.request import LoRARequest
@@ -119,6 +119,7 @@ def __init__(
             output_token_ids = []
 
         self.prompt_token_ids = prompt_token_ids
+        self.prompt_token_ids_tuple = tuple(prompt_token_ids)
         self.output_token_ids = output_token_ids
         self.cumulative_logprob = 0.0
         # The number of tokens that are computed (that run against the model).
```
A new `get_prefix_token_ids` helper returns a hashable (tuple-based) view of the first `num_tokens` tokens:

```diff
@@ -141,6 +142,17 @@ def get_output_len(self) -> int:
     def get_token_ids(self) -> List[int]:
         return self.prompt_token_ids + self.output_token_ids
 
+    def get_prefix_token_ids(
+            self, num_tokens: int
+    ) -> Tuple[Tuple[int, ...], Optional[Tuple[int, ...]]]:
+        """Get prefix tokens, and make the return value hashable."""
+        prompt_length = len(self.prompt_token_ids_tuple)
+        if num_tokens > prompt_length:
+            return (self.prompt_token_ids_tuple,
+                    tuple(self.output_token_ids[:num_tokens - prompt_length]))
+        else:
+            return (self.prompt_token_ids_tuple[:num_tokens], None)
+
     def get_num_computed_tokens(self) -> int:
         """Return the number of prefill tokens that are already computed."""
         return self._num_computed_tokens
```

A review thread on the `num_tokens > prompt_length` branch:

**Reviewer:** When does this happen?

**Author:** This happens when calculating hashes that cover both the user input (i.e. the prompt tokens) and the LLM-generated output (i.e. the output tokens).

**Reviewer:** Can it happen under normal circumstances, or only in the recomputation case?

**Author:** Yes, it will happen in normal cases (inside function …).
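To make the question above concrete, here is a small, purely illustrative example (the block size and lengths are assumptions, not values from the PR) of a block whose hashed token count reaches past the prompt into the generated output, using the same formula as `num_hashed_tokens_of_block` in the next hunk:

```python
block_size = 16     # assumed block size, for illustration only
prompt_length = 20  # assumed prompt length


def num_hashed_tokens_of_block(logical_idx: int) -> int:
    # Same formula as num_hashed_tokens_of_block in the hunk below.
    return logical_idx * block_size + block_size


# Block 0 lies entirely inside the prompt; block 1 ends at token 32, so once
# at least 12 output tokens have been generated its hash covers output tokens
# as well, and get_prefix_token_ids() takes the num_tokens > prompt_length
# branch.
for logical_idx in (0, 1):
    num_tokens = num_hashed_tokens_of_block(logical_idx)
    print(logical_idx, num_tokens, num_tokens > prompt_length)
# 0 16 False
# 1 32 True
```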
`hash_of_block` then hashes that structure together with the LoRA ID, instead of concatenating and copying the full token list on every call:

```diff
@@ -245,14 +257,9 @@ def get_output_text_to_return(self, buffer_length: int):
                 self.output_text)
 
     def hash_of_block(self, logical_idx: int) -> int:
         # TODO This can produce incorrect hash when block size > prompt size
 
         # Compute the number of tokens in the sequence
-        # TODO: The current hashing function is O(L^2). We should optimize
-        # this in the future.
         num_tokens = self.num_hashed_tokens_of_block(logical_idx)
-        return hash(
-            (tuple(self.data.get_token_ids()[0:num_tokens]), self.lora_int_id))
+        hashed_tokens = self.data.get_prefix_token_ids(num_tokens)
+        return hash((hashed_tokens, self.lora_int_id))
 
     def num_hashed_tokens_of_block(self, logical_idx: int):
         return logical_idx * self.block_size + self.block_size
```
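For intuition about what the change buys, here is a rough, self-contained sketch (not the vLLM classes; plain module-level variables stand in for the sequence data state) contrasting the old and new ways of building the hashable prefix. The old path rebuilds `prompt + output` as a fresh list and copies a prefix of it on every call; the new path reuses the tuple created once in `__init__` and only materializes the short output suffix:

```python
from typing import List, Tuple

# Stand-ins for the sequence data state (illustration only).
prompt_token_ids: List[int] = list(range(15_000))
prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids)
output_token_ids: List[int] = [7, 8, 9]


def old_hash_of_prefix(num_tokens: int, lora_int_id: int = 0) -> int:
    # Old approach: concatenate prompt + output into a brand-new list, slice
    # it, and convert the slice to a tuple on every call.
    token_ids = prompt_token_ids + output_token_ids
    return hash((tuple(token_ids[:num_tokens]), lora_int_id))


def new_hash_of_prefix(num_tokens: int, lora_int_id: int = 0) -> int:
    # New approach: reuse the tuple built once at construction time; only the
    # (short) output suffix is turned into a tuple when needed.
    prompt_length = len(prompt_token_ids_tuple)
    if num_tokens > prompt_length:
        prefix = (prompt_token_ids_tuple,
                  tuple(output_token_ids[:num_tokens - prompt_length]))
    else:
        prefix = (prompt_token_ids_tuple[:num_tokens], None)
    return hash((prefix, lora_int_id))
```

The two functions produce different hash values, which is fine for prefix caching: the hashes only need to be consistent within one scheme.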
A second thread discusses whether the prompt token IDs need to be stored as both a list and a tuple:

**Reviewer:** Do we need to store it as both a list and a tuple?

**Author:** Technically we do not have to, as long as there is no code that directly changes the value of the `prompt_token_ids` attribute inside the class `SequenceGroup`. I did a code search that confirms there is no such code inside the vLLM repo now (the search keyword is `.prompt_token_ids`), and nothing changes `prompt_token_ids` besides the initialization function.

**Reviewer:** We should make it a tuple then, IMO. This also signifies prompt tokens are immutable.

**Author:** However, most of the existing code assumes the attribute `prompt_token_ids` to be of type `List[int]`, and only the hashing function requires it to be `Tuple[int, ...]` so that it is immutable and thus hashable. From this perspective, I think it is worthwhile to have an extra copy of `prompt_token_ids` so that hashing is fast and developers can still treat `prompt_token_ids` as a list.

**Reviewer:** How about we at least make the list attribute private and only allow access (and not setting) through a property? Not perfect, since it can still be modified in place, but better than nothing. Another idea is to subclass `list` to create a `FrozenList` with the mutating methods set to raise an exception, but that is a lot more complex.
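As a rough illustration of the two alternatives floated above (purely a sketch with made-up names, not code from the PR; the thread below ends up going with a plain tuple instead):

```python
from typing import Any, List


class SequenceDataWithProperty:
    """Option 1: keep the list private and expose it read-only."""

    def __init__(self, prompt_token_ids: List[int]) -> None:
        self._prompt_token_ids = list(prompt_token_ids)

    @property
    def prompt_token_ids(self) -> List[int]:
        # Callers can still mutate the returned list in place, which is the
        # caveat raised in the comment above.
        return self._prompt_token_ids


class FrozenList(List[int]):
    """Option 2: a list subclass whose mutating methods raise."""

    def _frozen(self, *args: Any, **kwargs: Any) -> None:
        raise TypeError("FrozenList may not be modified")

    append = extend = insert = remove = pop = clear = sort = reverse = _frozen
    __setitem__ = __delitem__ = __iadd__ = __imul__ = _frozen


tokens = FrozenList([1, 2, 3])
print(tokens[:2])   # reads still work: [1, 2]
try:
    tokens.append(4)
except TypeError as exc:
    print(exc)      # FrozenList may not be modified
```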
**Reviewer:** Since immutable prompt token IDs is the most important assumption, I'd also suggest changing it to a tuple directly. It does not seem to be an issue to change all `List[int]` to `Tuple[int, ...]`; at least the type annotations should not be a blocker.

**Author:** I have changed the data type of `prompt_token_ids` to `Tuple[int, ...]` and fixed the typing conflicts caused by this change.
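A minimal sketch of the direction described in this last comment (assumed shape only, not the exact code the PR ended up with): the constructor converts the incoming token IDs to a tuple, so the attribute is immutable and directly hashable.

```python
from typing import List, Optional, Tuple


class SequenceData:
    """Illustrative only; field names follow the diff above."""

    def __init__(self,
                 prompt_token_ids: List[int],
                 output_token_ids: Optional[List[int]] = None) -> None:
        if output_token_ids is None:
            output_token_ids = []
        # Stored as a tuple: immutable, hence safe to hash and share.
        self.prompt_token_ids: Tuple[int, ...] = tuple(prompt_token_ids)
        self.output_token_ids: List[int] = output_token_ids
```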