5 changes: 2 additions & 3 deletions vllm/v1/core/sched/scheduler.py
@@ -465,9 +465,8 @@ def schedule(self) -> SchedulerOutput:
                        in self.vllm_config.model_config.model.lower()), (
                            "Whisper is the only supported "
                            "encoder-decoder model.")
-                num_encoder_tokens = MULTIMODAL_REGISTRY.\
-                    get_encdec_max_encoder_len(
-                        self.vllm_config.model_config)
+                num_encoder_tokens =\
+                    self.scheduler_config.max_num_encoder_input_tokens
Comment on lines +468 to +469
Contributor

critical

This change replaces the call to get_encdec_max_encoder_len with self.scheduler_config.max_num_encoder_input_tokens. However, max_num_encoder_input_tokens is initialized to max_num_batched_tokens in SchedulerConfig, which is not the correct value for the model's maximum encoder length (e.g., the fixed value of 1500 for Whisper). Using a potentially smaller value from max_num_batched_tokens will lead to under-allocation of the cross-attention KV cache, which can cause out-of-bounds memory access. This is a critical issue. The original call was functionally correct, though slow. I recommend reverting this change until the value is cached correctly in the configuration.
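To make the concern concrete, here is a minimal, self-contained sketch (an editorial illustration, not vLLM's real SchedulerConfig; the field names mirror vLLM, but the class and the numbers are made up):

from dataclasses import dataclass
from math import ceil

@dataclass
class TinySchedulerConfig:
    # In vLLM, max_num_encoder_input_tokens starts out equal to
    # max_num_batched_tokens unless something overrides it later.
    max_num_batched_tokens: int = 1024
    max_num_encoder_input_tokens: int = -1

    def __post_init__(self) -> None:
        if self.max_num_encoder_input_tokens < 0:
            self.max_num_encoder_input_tokens = self.max_num_batched_tokens

cfg = TinySchedulerConfig(max_num_batched_tokens=1024)
block_size = 16
whisper_encoder_len = 1500  # the model's true encoder length

# Blocks sized from the (possibly smaller) config value vs. what the
# cross-attention KV cache actually needs.
allocated_blocks = ceil(cfg.max_num_encoder_input_tokens / block_size)  # 64
needed_blocks = ceil(whisper_encoder_len / block_size)                  # 94
print(allocated_blocks, needed_blocks)  # under-allocation, possible OOB access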

Suggested change
-                num_encoder_tokens =\
-                    self.scheduler_config.max_num_encoder_input_tokens
+                num_encoder_tokens = MULTIMODAL_REGISTRY.\
+                    get_encdec_max_encoder_len(
+                        self.vllm_config.model_config)

Member Author

Here is where it gets set to the correct value that we want:

vllm/vllm/config/__init__.py, lines 2788 to 2790 in 218454b:

elif self.model_config.is_encoder_decoder:
    self.scheduler_config.max_num_encoder_input_tokens = \
        MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
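Assuming that post-init step always runs before the scheduler and the KV-cache specs read the field (which is the claim above), the order of operations reduces to roughly the following sketch; the function name and the 8192 default are illustrative, not actual vLLM call sites:

def build_scheduler_config(is_encoder_decoder: bool,
                           encdec_max_encoder_len: int) -> dict:
    # Start from the batched-token default, as SchedulerConfig does.
    cfg = {"max_num_encoder_input_tokens": 8192}
    if is_encoder_decoder:
        # Mirrors the config/__init__.py snippet above: the registry value
        # (1500 for Whisper) overwrites the batched-token default.
        cfg["max_num_encoder_input_tokens"] = encdec_max_encoder_len
    return cfg

cfg = build_scheduler_config(is_encoder_decoder=True, encdec_max_encoder_len=1500)
# Later reads in scheduler.py, kv_cache_interface.py, and gpu_model_runner.py
# then see the model's true encoder length rather than the scheduler default.
assert cfg["max_num_encoder_input_tokens"] == 1500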

            else:
                num_encoder_tokens = 0

5 changes: 2 additions & 3 deletions vllm/v1/kv_cache_interface.py
@@ -11,7 +11,6 @@

from vllm.config import VllmConfig
from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils import cdiv, get_dtype_size

logger = init_logger(__name__)
@@ -230,8 +229,8 @@ class CrossAttentionSpec(AttentionSpec):
    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        # For cross-attention, we need to cache encoder states
        # Get encoder length (e.g., 1500 for Whisper).
-        max_encoder_len = MULTIMODAL_REGISTRY.\
-            get_encdec_max_encoder_len(vllm_config.model_config)
+        max_encoder_len = vllm_config.scheduler_config.\
+            max_num_encoder_input_tokens
        return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes
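For intuition, a rough back-of-the-envelope version of the formula above; block_size, the attention shape, and the dtype size are illustrative assumptions, not Whisper's actual configuration:

def cdiv(a: int, b: int) -> int:
    # Ceiling division, as in vllm.utils.cdiv.
    return -(-a // b)

max_encoder_len = 1500            # encoder length from the comment above
block_size = 16                   # assumed KV-cache block size
num_kv_heads, head_size = 20, 64  # hypothetical attention shape
dtype_size = 2                    # fp16 bytes per element

# Bytes for one block holding both K and V for a single layer.
page_size_bytes = 2 * block_size * num_kv_heads * head_size * dtype_size

num_blocks = cdiv(max_encoder_len, block_size)  # 94 blocks
total_bytes = num_blocks * page_size_bytes      # ~7.3 MiB per layer in this sketch
print(num_blocks, total_bytes)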


4 changes: 2 additions & 2 deletions vllm/v1/worker/gpu_model_runner.py
@@ -234,8 +234,8 @@ def __init__(
        if self.model_config.is_encoder_decoder:
            # Maximum length of the encoder input, only for encoder-decoder
            # models.
-            self.max_encoder_len = self.mm_registry.\
-                get_encdec_max_encoder_len(model_config)
+            self.max_encoder_len = scheduler_config.\
+                max_num_encoder_input_tokens
        else:
            self.max_encoder_len = 0
