From 4d9f63d3d32b9346fb806c3e9639fe795b20206c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 29 Nov 2025 04:17:47 +0000
Subject: [PATCH 1/2] [Model Runner V2] Don't use UVA buffer for prefill_len

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu/model_runner.py | 2 ++
 vllm/v1/worker/gpu/states.py       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 1b512b2ee3e5..8414ca53b874 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -410,6 +410,8 @@ def update_states(self, scheduler_output: SchedulerOutput) -> None:
             cu_num_new_blocks[i].append(x + len(block_ids))
             new_block_ids[i].extend(block_ids)
             overwrite.append(True)
+        if scheduler_output.scheduled_new_reqs:
+            self.req_states.prefill_len.copy_to_gpu()
 
         # Add new blocks for the existing requests.
         cached_reqs = scheduler_output.scheduled_cached_reqs
diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 4ddd2dfdd731..62dcb38ac080 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -117,7 +117,7 @@ def __init__(
         self.prefill_token_ids = UvaBuffer(
             self.max_num_reqs, self.max_model_len, dtype=torch.int32
         )
-        self.prefill_len = UvaBuffer(self.max_num_reqs, dtype=torch.int32)
+        self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
         # Number of computed tokens.
         self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
         self.num_computed_tokens = torch.zeros(

From 8def2f87d76f7c8861443da341376055e43c9d4e Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 29 Nov 2025 04:21:51 +0000
Subject: [PATCH 2/2] fix

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu/states.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 62dcb38ac080..44b076fa4c2a 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -117,6 +117,9 @@ def __init__(
         self.prefill_token_ids = UvaBuffer(
             self.max_num_reqs, self.max_model_len, dtype=torch.int32
         )
+        # NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
+        # can be used outside of update_states and prepare_inputs.
+        # Without async barrier, using UVA can cause race conditions.
         self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
         # Number of computed tokens.
         self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
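
Background on the pattern the series switches to: a UvaBuffer appears to be a
host-mapped (zero-copy) allocation whose GPU view always reflects the latest
CPU writes, while the buffer returned by _make_buffer presumably stages data
on the CPU and publishes it to the GPU only at an explicit copy_to_gpu() call,
as the first hunk does when new requests arrive. Below is a minimal sketch of
that staging pattern; the class name StagedBuffer and all of its internals are
illustrative assumptions, not vLLM's actual _make_buffer implementation.

import torch

# Hypothetical sketch of a staged CPU->GPU buffer; not vLLM's real code.
class StagedBuffer:
    def __init__(self, *size: int, dtype: torch.dtype, device: str = "cuda"):
        # Pinned host memory makes the H2D copy eligible for async DMA.
        self.cpu = torch.zeros(size, dtype=dtype, pin_memory=True)
        self.np = self.cpu.numpy()  # writable NumPy view for cheap CPU updates
        # A distinct device allocation: GPU kernels never observe CPU writes
        # until copy_to_gpu() is enqueued.
        self.gpu = torch.zeros(size, dtype=dtype, device=device)

    def copy_to_gpu(self) -> None:
        # Stream-ordered H2D copy: kernels launched after this call see the
        # new values; kernels already in flight keep reading the old ones.
        self.gpu.copy_(self.cpu, non_blocking=True)

# Usage sketch:
#   buf = StagedBuffer(8, dtype=torch.int32)
#   buf.np[:3] = [10, 20, 30]  # CPU-side write, invisible to the GPU so far
#   buf.copy_to_gpu()          # publish on the current stream

By contrast, a UVA buffer's GPU view aliases the host memory directly, so a
later CPU write (e.g., from the next update_states call) could be observed
mid-kernel by whatever is still reading prefill_len's GPU view; that is the
race condition the NOTE in the second patch describes.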