Support FP8-E5M2 KV Cache #2279

Merged Jan 29, 2024 (43 commits; changes shown from 28 commits).

Commits
0ac4ba1  test_cache.py passed (Dec 22, 2023)
63ec85b  format (Dec 27, 2023)
f98c816  fix compiling warning (Dec 27, 2023)
9881221  fix fp8x4 -> float4 (Dec 28, 2023)
850137b  test_attention pass (Dec 29, 2023)
a852f54  fix typo (Dec 29, 2023)
7be2ed4  fix typo (Dec 29, 2023)
4bcf15c  add latency & throughput benchmark (Dec 29, 2023)
1a13c5a  fix benchmark_latency.py (Dec 29, 2023)
5cdb619  Merge branch 'main' into fp8_cache (Jan 3, 2024)
82516df  fix copy_blocks (Jan 3, 2024)
556e5b2  use VLLM_DISPATCH_CASE_FLOATING_BYTE_TYPES (Jan 3, 2024)
c67277b  add default behavior in description (Jan 12, 2024)
c3760f8  grace code regarding to comments (Jan 13, 2024)
7bae850  add namespace fp8_e5m2_unscaled (Jan 13, 2024)
525003b  change interface (Jan 15, 2024)
537b5a7  solve conflict (Jan 15, 2024)
7e837dd  fix tp (Jan 15, 2024)
fe5f053  print log.info (Jan 15, 2024)
58d9817  add log and raise error on amd gpu (Jan 16, 2024)
dddd6eb  fix none error (Jan 18, 2024)
a61d828  do not use VLLM_LDG (Jan 18, 2024)
1cb7af6  fix mirror typo (Jan 19, 2024)
589297a  fix compiler error on lower cc (Jan 19, 2024)
6223984  fix unittest error (Jan 19, 2024)
4f85f9b  fix utest (Jan 19, 2024)
0ff1d14  Merge branch 'main' into fp8_cache (Jan 20, 2024)
b4db831  fix tp error (Jan 21, 2024)
3072560  more clear, tp error (Jan 24, 2024)
d837bbb  fix conflict (Jan 24, 2024)
b493300  avoid compile fp8 when cuda version is lower than 11.8 (Jan 24, 2024)
5461bd6  fix ut (Jan 25, 2024)
f66fb4e  fix typo (Jan 25, 2024)
7e5d61b  loose NUM_BLOCKS in test_attention (Jan 25, 2024)
b4aedf5  update test_cache (Jan 25, 2024)
eac2720  fix yapf (Jan 25, 2024)
bb6cc13  loose num_blocks (Jan 26, 2024)
455d0b5  fix stuck when tp>1 (Jan 26, 2024)
dbd464c  update (Jan 27, 2024)
11411e1  rename: create_kv_caches -> create_kv_caches_with_random (Jan 27, 2024)
4945577  remove cache_dtype_str (Jan 28, 2024)
b52e702  add co-author (Jan 28, 2024)
fee9a13  Merge branch 'main' into fp8_cache (zhuohan123, Jan 28, 2024)
6 changes: 6 additions & 0 deletions benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
)

sampling_params = SamplingParams(
@@ -115,6 +116,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--enforce-eager',
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument('--kv-cache-dtype',
type=str,
choices=['fp8', None],
Collaborator:
In general, please be explicit in this PR that fp8 means fp8_e5m2. Given that there are multiple ways to implement fp8, this will make things clearer.

Suggested change:
- choices=['fp8', None],
+ choices=['fp8_e5m2', None],

Contributor Author:
done
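For context on the naming request above: "fp8" alone is ambiguous, since there are at least two common 8-bit float layouts (e5m2 and e4m3) that trade exponent range for mantissa precision. A minimal standalone comparison, assuming a PyTorch build (2.1+) that ships the float8 dtypes; the PR itself handles e5m2 in its own CUDA kernels rather than relying on these dtypes:

import torch

# "fp8" does not identify a single format; the two variants differ in
# dynamic range and precision.
for dt in (torch.float8_e5m2, torch.float8_e4m3fn):
    info = torch.finfo(dt)
    print(dt, "max:", info.max, "smallest normal:", info.tiny)

# e5m2 keeps a wider exponent range (the same as fp16's), while e4m3 keeps an
# extra mantissa bit; hence the request to spell out fp8_e5m2 explicitly.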

default=None,
help='Data type for kv cache storage.')
Collaborator:
Please specify the default behavior here. And one question: why is 'fp16' not a valid option here?

Collaborator (@zhuohan123, Jan 13, 2024):
Additionally, can we call fp8 fp8_e5m2 across this PR? Because there are different ways to implement fp8.

Contributor Author:
The default kv cache data type is the same as the model dtype, so it may not be suitable to make fp16 the default: the model dtype can be float/bfloat16/half/...
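A minimal sketch of the default behavior described above, using a hypothetical helper name (the real option handling lives in vLLM's engine and cache configuration, not in this benchmark file):

import torch

# Hypothetical illustration: when --kv-cache-dtype is left unset (None),
# the KV cache inherits the model dtype; 'fp8_e5m2' switches the cache to a
# byte-sized layout that the attention kernels reinterpret as fp8.
def resolve_kv_cache_dtype(kv_cache_dtype, model_dtype):
    if kv_cache_dtype is None:
        return model_dtype                     # float32 / float16 / bfloat16
    if kv_cache_dtype == "fp8_e5m2":
        return torch.uint8                     # raw bytes holding fp8_e5m2 values
    raise ValueError(f"unsupported kv cache dtype: {kv_cache_dtype}")

print(resolve_kv_cache_dtype(None, torch.bfloat16))       # torch.bfloat16
print(resolve_kv_cache_dtype("fp8_e5m2", torch.float16))  # torch.uint8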

parser.add_argument(
'--profile',
action='store_true',
12 changes: 11 additions & 1 deletion benchmarks/benchmark_throughput.py
@@ -71,6 +71,7 @@ def run_vllm(
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@@ -83,6 +84,7 @@
dtype=dtype,
max_model_len=max_model_len,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
)

# Add the requests to the engine.
@@ -206,7 +208,8 @@ def main(args: argparse.Namespace):
args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype,
args.max_model_len, args.enforce_eager)
args.max_model_len, args.enforce_eager,
args.kv_cache_dtype)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -284,6 +287,13 @@ def main(args: argparse.Namespace):
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['fp8', None],
Collaborator:
Suggested change:
- choices=['fp8', None],
+ choices=['fp8_e5m2', None],

default=None,
help=
'Data type for kv cache storage. If None, will use model data type.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
38 changes: 32 additions & 6 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -4,7 +4,7 @@

import torch

from vllm._C import ops
from vllm._C import ops, cache_ops

NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -21,6 +21,7 @@ def main(
use_alibi: bool,
block_size: int,
dtype: torch.dtype,
use_fp8_kv_cache: bool,
Collaborator:
Please use a string option instead of bool.

Suggested change:
- use_fp8_kv_cache: bool,
+ kv_cache_dtype: Optional[str] = None,

seed: int,
do_profile: bool,
) -> None:
@@ -59,15 +60,36 @@ def main(
block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")

# Create the KV cache.
x = 16 // torch.tensor([], dtype=dtype).element_size()
cache_dtype = dtype if not use_fp8_kv_cache else torch.uint8
x = 16 // torch.tensor([], dtype=cache_dtype).element_size()
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device="cuda")
key_cache.uniform_(-scale, scale)
key_cache = torch.empty(size=key_cache_shape,
dtype=cache_dtype,
device="cuda")
if not use_fp8_kv_cache:
Collaborator:
Suggested change:
- if not use_fp8_kv_cache:
+ if kv_cache_dtype == None:
key_cache.uniform_(-scale, scale)
else:
Collaborator:
Suggested change:
- else:
+ elif kv_cache_dtype == 'fp8_e5m2':

# NOTE(zhaoyang): Due to the NaN and Inf representations of the fp8 data type,
# Inf or NaN values may occur if we directly use torch.randint
# to generate random data for the fp8 cache.
# For example, s.11111.00 in fp8_e5m2 format represents Inf.
#      | E4M3        | E5M2
# -----|-------------|-------------------
#  Inf | N/A         | s.11111.00
#  NaN | s.1111.111  | s.11111.{01,10,11}
key_cache_tmp = torch.empty_like(key_cache, dtype=dtype)
key_cache_tmp.uniform_(-scale, scale)
cache_ops.convert_fp8(key_cache_tmp, key_cache)
Collaborator:
Suggested change:
- cache_ops.convert_fp8(key_cache_tmp, key_cache)
+ cache_ops.convert_fp8_e5m2(key_cache_tmp, key_cache)

value_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size, block_size)
value_cache = torch.empty(size=value_cache_shape,
dtype=dtype,
dtype=cache_dtype,
device="cuda")
value_cache.uniform_(-scale, scale)
if not use_fp8_kv_cache:
Collaborator:
ditto
value_cache.uniform_(-scale, scale)
else:
value_cache_tmp = torch.empty_like(value_cache, dtype=dtype)
value_cache_tmp.uniform_(-scale, scale)
cache_ops.convert_fp8(value_cache_tmp, value_cache)

# Prepare for the paged attention kernel.
output = torch.empty_like(query)
@@ -106,6 +128,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float:
block_size,
max_context_len,
alibi_slopes,
use_fp8_kv_cache,
)
elif version == "v2":
ops.paged_attention_v2(
@@ -123,6 +146,7 @@
block_size,
max_context_len,
alibi_slopes,
use_fp8_kv_cache,
)
else:
raise ValueError(f"Invalid version: {version}")
@@ -166,6 +190,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float:
type=str,
choices=["half", "bfloat16", "float"],
default="half")
parser.add_argument("--use-fp8-kv-cache", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
args = parser.parse_args()
@@ -188,6 +213,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float:
block_size=args.block_size,
use_alibi=args.use_alibi,
dtype=dtype_to_torch_dtype[args.dtype],
use_fp8_kv_cache=args.use_fp8_kv_cache,
seed=args.seed,
do_profile=args.profile,
)
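As a side note on the NaN/Inf comment in the benchmark diff above: any byte whose e5m2 exponent field is all ones decodes to Inf or NaN, which is exactly what uniform random bytes (e.g. from torch.randint) can produce. Filling a regular float tensor and converting it, as the benchmark does, keeps every value finite. A small standalone check, assuming PyTorch 2.1+ with torch.float8_e5m2 (the PR's own conversion goes through its cache_ops kernel instead):

import torch

# Bytes with an all-ones exponent field decode to Inf/NaN in fp8_e5m2:
# 0x7C -> +Inf, 0x7D/0x7E/0x7F -> NaN, 0xFC -> -Inf.
raw = torch.tensor([0x7C, 0x7D, 0x7E, 0x7F, 0xFC], dtype=torch.uint8)
print(raw.view(torch.float8_e5m2).float())   # tensor([inf, nan, nan, nan, -inf])

# Converting bounded uniform floats instead keeps every cache entry finite.
scale = 1.0
tmp = torch.empty(8, dtype=torch.float16).uniform_(-scale, scale)
fp8 = tmp.to(torch.float8_e5m2)
print(torch.isfinite(fp8.float()).all())     # tensor(True)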
1 change: 1 addition & 0 deletions csrc/attention/attention_dtypes.h
@@ -4,3 +4,4 @@
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
#include "dtype_fp8.cuh"
Collaborator:
Suggested change:
- #include "dtype_fp8.cuh"
+ #include "dtype_fp8_e5m2.cuh"
