Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions vllm/v1/worker/gpu_input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def __init__(
self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids

# Store last speculative tokens for sampler.
self.spec_token_ids: list[list[int] | None] = []
self.spec_token_ids: list[list[int]] = [[] for _ in range(max_num_reqs)]

# This is updated each time the batch constituents change.
self.sampling_metadata = self._make_sampling_metadata()
Expand Down Expand Up @@ -313,7 +313,7 @@ def add_request(
else:
self._req_ids[req_index] = req_id
self.req_output_token_ids[req_index] = request.output_token_ids
self.spec_token_ids[req_index] = []
self.spec_token_ids[req_index].clear()

self.req_id_to_index[req_id] = req_index

Expand Down Expand Up @@ -462,7 +462,7 @@ def remove_request(self, req_id: str) -> int | None:
self.batch_update_builder.removed_append(req_index)
self._req_ids[req_index] = None
self.req_output_token_ids[req_index] = None
self.spec_token_ids[req_index] = None
self.spec_token_ids[req_index].clear()

# LoRA
lora_id = self.request_lora_mapping[req_index]
Expand Down Expand Up @@ -654,9 +654,15 @@ def condense(self) -> None:
self.req_output_token_ids[last_req_index] = None
self.req_id_to_index[req_id] = empty_index

spec_token_ids = self.spec_token_ids[last_req_index]
self.spec_token_ids[empty_index] = spec_token_ids
self.spec_token_ids[last_req_index] = None
if last_req_index != empty_index:
(
self.spec_token_ids[last_req_index],
self.spec_token_ids[empty_index],
) = (
self.spec_token_ids[empty_index],
self.spec_token_ids[last_req_index],
)
self.spec_token_ids[last_req_index].clear()

num_tokens = self.num_tokens[last_req_index]
self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
# conform to the schema. This can result in
# scheduler_output.scheduled_spec_decode_tokens being empty,
# even when speculative decoding is enabled.
self.input_batch.spec_token_ids[req_index] = spec_token_ids
self.input_batch.spec_token_ids[req_index].clear()
self.input_batch.spec_token_ids[req_index].extend(spec_token_ids)
Comment on lines +895 to +896
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this one, how is it better to repopulate an existing list and then discard the other one. Surely better to avoid that extra work and just use the other one? Either way a list is going to get garbage collected..

Copy link
Collaborator Author

@Jialin Jialin Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great question! The magic is hidden behind gc.freeze.

If we preallocate the spec_token_ids lists and then run gc.freeze, the preallocated lists will NOT be GC-tracked. The spec_token_ids list passed into this function is short-lived; in the majority of cases it is freed by reference counting alone and never needs to be handled by gc.collect.

Let me know if you want more clarification :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @Jialin, my general concern about these changes is that they sometimes come at the cost of added complexity and can negatively affect the readability/maintainability.

There is a balance where it's reasonable I think to give up minor perf benefits to keep the code simpler (of course case by case weighing magnitude of complexity vs magnitude of perf benefit).

This particular change, for example, I think is quite fragile, especially without comments explaining why we are expending extra cycles here to optimize the object lifecycle in relation to GC. It's very likely someone could change this in the future without being aware. I'm not sure whether it's realistic to enforce this (actually, these lists will go away soon anyhow with MRV2).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@njhill Totally agree with you on the tradeoff. @zhuohan123 actually raised similar concerns on my other PRs as well, and I fully align with them.

We had internal TPGS runs to justify the impact. Moving forward, I think I should provide more data that is openly accessible, to avoid such confusion. I'll hold off on landing other GC changes on the OSS side until I come up with an e2e benchmark showing end-to-end wins (both latency and GC costs) to justify such changes.


# there are no draft tokens with async scheduling,
# we clear the spec_decoding info in scheduler_output and
Expand Down