diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 1b512b2ee3e5..8414ca53b874 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -410,6 +410,8 @@ def update_states(self, scheduler_output: SchedulerOutput) -> None:
                 cu_num_new_blocks[i].append(x + len(block_ids))
                 new_block_ids[i].extend(block_ids)
             overwrite.append(True)
+        if scheduler_output.scheduled_new_reqs:
+            self.req_states.prefill_len.copy_to_gpu()
 
         # Add new blocks for the existing requests.
         cached_reqs = scheduler_output.scheduled_cached_reqs
diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 4ddd2dfdd731..44b076fa4c2a 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -117,7 +117,10 @@ def __init__(
         self.prefill_token_ids = UvaBuffer(
             self.max_num_reqs, self.max_model_len, dtype=torch.int32
         )
-        self.prefill_len = UvaBuffer(self.max_num_reqs, dtype=torch.int32)
+        # NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
+        # can be used outside of update_states and prepare_inputs.
+        # Without an async barrier, using UVA can cause race conditions.
+        self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
         # Number of computed tokens.
         self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
         self.num_computed_tokens = torch.zeros(
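
For context, a minimal sketch of the distinction this patch relies on. The class and names below are hypothetical (modeled loosely on the CPU/GPU buffer returned by _make_buffer, not vLLM's actual implementation); it only illustrates why an explicit, stream-ordered copy_to_gpu() avoids the race that a UVA (zero-copy) view is exposed to.

import torch

class CpuGpuBufferSketch:
    """Hypothetical buffer with a pinned CPU staging tensor and a separate GPU tensor."""

    def __init__(self, *size: int, dtype: torch.dtype, device: torch.device):
        # Pinned host memory enables asynchronous (non_blocking) H2D copies.
        self.cpu = torch.zeros(*size, dtype=dtype, pin_memory=True)
        self.np = self.cpu.numpy()  # cheap CPU-side view for indexing/updates
        self.gpu = torch.zeros(*size, dtype=dtype, device=device)

    def copy_to_gpu(self) -> torch.Tensor:
        # Stream-ordered H2D copy: kernels enqueued after this call on the same
        # CUDA stream observe the snapshot taken here. A UVA buffer has no such
        # snapshot; the GPU reads host memory at kernel execution time, so a
        # later CPU write can race with an in-flight kernel.
        self.gpu.copy_(self.cpu, non_blocking=True)
        return self.gpu

Under that assumption, the model_runner.py change follows naturally: prefill_len is filled in on the CPU side while new requests are added, and copy_to_gpu() is issued once afterwards (only when scheduled_new_reqs is non-empty), so the GPU view stays consistent for any kernel that reads it outside of update_states and prepare_inputs.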