From 9a6d1fccd4d05f324c3a32e5ee7ebcad7d3e924e Mon Sep 17 00:00:00 2001 From: Lehua Ding Date: Mon, 29 Sep 2025 20:30:06 +0800 Subject: [PATCH 1/2] [perf] Use CPU tensor to reduce GPU->CPU sync Signed-off-by: Lehua Ding --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0960fe3a25fb..d70471e73a37 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2467,7 +2467,7 @@ def propose_draft_token_ids(sampled_token_ids): effective_drafter_max_model_len = ( self.speculative_config.draft_model_config.max_model_len) input_fits_in_drafter = spec_decode_common_attn_metadata and ( - spec_decode_common_attn_metadata.seq_lens.max() + + spec_decode_common_attn_metadata.seq_lens_cpu.max() + self.speculative_config.num_speculative_tokens <= effective_drafter_max_model_len) if use_padded_batch_for_eagle and input_fits_in_drafter: From 0617ab632659fb300c247c8c09b1836b9a4f95d4 Mon Sep 17 00:00:00 2001 From: Lehua Ding Date: Tue, 30 Sep 2025 00:05:48 +0800 Subject: [PATCH 2/2] [perf] Use .max_seq_len to replace .seq_lens_cpu.max() Signed-off-by: Lehua Ding --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d70471e73a37..74543b0ec358 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2467,7 +2467,7 @@ def propose_draft_token_ids(sampled_token_ids): effective_drafter_max_model_len = ( self.speculative_config.draft_model_config.max_model_len) input_fits_in_drafter = spec_decode_common_attn_metadata and ( - spec_decode_common_attn_metadata.seq_lens_cpu.max() + + spec_decode_common_attn_metadata.max_seq_len + self.speculative_config.num_speculative_tokens <= effective_drafter_max_model_len) if use_padded_batch_for_eagle and input_fits_in_drafter: