From 2dbe28c756edd34d7075734f91370c7251a3a7a9 Mon Sep 17 00:00:00 2001
From: JasonZhu1313 <jasonchu13@outlook.com>
Date: Mon, 22 Jan 2024 11:51:07 -0800
Subject: [PATCH 1/2] Remove duplicated call of context_attention_fwd in
 test_prefix_prefill.py

---
 tests/kernels/test_prefix_prefill.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index 8fa6358d3ec..ab76c23473d 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -125,9 +125,6 @@ def test_contexted_kv_attention(
     v_cache = v_cache.view(-1, block_size, num_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
 
-    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
-                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
-    torch.cuda.synchronize()
     start_time = time.time()
     context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                           b_start_loc, b_seq_len, b_ctx_len, max_input_len)

From 12ca94e9c29722b807722ed5c0b2969b151aece5 Mon Sep 17 00:00:00 2001
From: JasonZhu1313 <jasonchu13@outlook.com>
Date: Mon, 22 Jan 2024 11:56:21 -0800
Subject: [PATCH 2/2] Restore the kernel warm-up call and add an explanatory comment

---
 tests/kernels/test_prefix_prefill.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index ab76c23473d..0531b05135f 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -125,6 +125,10 @@ def test_contexted_kv_attention(
     v_cache = v_cache.view(-1, block_size, num_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
 
+    # Warm up the Triton kernel with one untimed call so first-call overhead is not timed.
+    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
+                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
+    torch.cuda.synchronize()
     start_time = time.time()
     context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                           b_start_loc, b_seq_len, b_ctx_len, max_input_len)