diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index c1e59423e9a1..85ca858ad7bd 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -465,9 +465,8 @@ def schedule(self) -> SchedulerOutput:
                         in self.vllm_config.model_config.model.lower()), (
                             "Whisper is the only supported "
                             "encoder-decoder model.")
-                num_encoder_tokens = MULTIMODAL_REGISTRY.\
-                    get_encdec_max_encoder_len(
-                        self.vllm_config.model_config)
+                num_encoder_tokens =\
+                    self.scheduler_config.max_num_encoder_input_tokens
             else:
                 num_encoder_tokens = 0
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 6e8f569fff0e..0cf92a680a68 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -11,7 +11,6 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.utils import cdiv, get_dtype_size
 
 logger = init_logger(__name__)
@@ -230,8 +229,8 @@ class CrossAttentionSpec(AttentionSpec):
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         # For cross-attention, we need to cache encoder states
         # Get encoder length (e.g., 1500 for Whisper).
-        max_encoder_len = MULTIMODAL_REGISTRY.\
-            get_encdec_max_encoder_len(vllm_config.model_config)
+        max_encoder_len = vllm_config.scheduler_config.\
+            max_num_encoder_input_tokens
         return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2ae748dee43c..2e00660cccc4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -234,8 +234,8 @@ def __init__(
         if self.model_config.is_encoder_decoder:
             # Maximum length of the encoder input, only for encoder-decoder
             # models.
-            self.max_encoder_len = self.mm_registry.\
-                get_encdec_max_encoder_len(model_config)
+            self.max_encoder_len = scheduler_config.\
+                max_num_encoder_input_tokens
         else:
             self.max_encoder_len = 0