diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index 8fa6358d3ec..0531b05135f 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -125,6 +125,7 @@ def test_contexted_kv_attention(
     v_cache = v_cache.view(-1, block_size, num_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
 
+    # Warm up the Triton kernel with one untimed call before measuring.
     context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                           b_start_loc, b_seq_len, b_ctx_len, max_input_len)
     torch.cuda.synchronize()