From fca0edbde75732ea593278ef87126f4237cbb5b2 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 24 Sep 2025 01:09:00 +0000 Subject: [PATCH 01/27] calculate profiling size Signed-off-by: wwl2755 --- vllm/envs.py | 8 ++ vllm/utils/__init__.py | 48 ++++++++ vllm/v1/worker/gpu_model_runner.py | 183 ++++++++++++++++++----------- 3 files changed, 171 insertions(+), 68 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 832d031f998e..7ab40f5b28b9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -43,6 +43,7 @@ VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None VLLM_LOG_STATS_INTERVAL: float = 10. VLLM_TRACE_FUNCTION: int = 0 + VLLM_TRACE_MEMORY_PHASES: bool = False VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None VLLM_PP_LAYER_PARTITION: Optional[str] = None @@ -597,6 +598,13 @@ def get_vllm_port() -> Optional[int]: "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), + # Trace memory usage during different execution phases + # If set to 1 or true, vllm will log detailed memory usage + # during MM processing, merging, and backbone forward pass + # Useful for analyzing memory bottlenecks + "VLLM_TRACE_MEMORY_PHASES": + lambda: os.environ.get("VLLM_TRACE_MEMORY_PHASES", "0").lower() in ("1", "true"), + # Backend for attention computation # Example options: # - "TORCH_SDPA": use torch.nn.MultiheadAttention diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index c502a69ea500..915c345bd32e 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2857,6 +2857,54 @@ def memory_profiling( result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory # noqa +@contextlib.contextmanager +def phase_memory_profiling( + phase_name: str, + logger_instance=None, + enabled: bool = True +) -> Generator[None, None, None]: + """Memory profiling context manager for specific execution phases. 
+ + Args: + phase_name: Name of the phase being profiled + logger_instance: Logger to use for output (optional) + enabled: Whether profiling is enabled + """ + if not enabled: + yield + return + + if logger_instance is None: + logger_instance = init_logger(__name__) + + # Take snapshot before phase + before_snapshot = MemorySnapshot() + torch.cuda.reset_peak_memory_stats() + + start_time = time.time() + + try: + yield + finally: + # Take snapshot after phase + torch.cuda.synchronize() + after_snapshot = MemorySnapshot() + end_time = time.time() + + # Calculate phase-specific metrics + peak_increase = after_snapshot.torch_peak - before_snapshot.torch_peak + memory_diff = after_snapshot.cuda_memory - before_snapshot.cuda_memory + duration = end_time - start_time + + logger_instance.info( + f"[MEMORY-PHASE] {phase_name}: " + f"peak_increase={peak_increase / (1024**3):.3f}GiB, " + f"memory_diff={memory_diff / (1024**3):.3f}GiB, " + f"duration={duration:.3f}s, " + f"current_allocated={after_snapshot.cuda_memory / (1024**3):.3f}GiB" + ) + + # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): if sys.platform.startswith('win'): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cbf439aa697b..fce2eb7d3854 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -36,6 +36,7 @@ from vllm.forward_context import (BatchDescriptor, DPMetadata, set_forward_context) from vllm.logger import init_logger +from vllm.utils import phase_memory_profiling from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -1969,10 +1970,20 @@ def _preprocess( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - inputs_embeds_scheduled = self.model.get_input_embeddings( - input_ids=self.input_ids.gpu[:num_scheduled_tokens], - multimodal_embeddings=mm_embeds or None, - ) + + # Log MM→Text merge details for memory analysis + if envs.VLLM_TRACE_MEMORY_PHASES: + num_mm_embeddings = len(mm_embeds) if mm_embeds else 0 + num_text_tokens = num_scheduled_tokens + logger.info(f"[MM-MERGE] Merging {num_mm_embeddings} MM embeddings " + f"with {num_text_tokens} text tokens") + + with phase_memory_profiling("MM_TEXT_MERGE", logger, + enabled=envs.VLLM_TRACE_MEMORY_PHASES): + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids.gpu[:num_scheduled_tokens], + multimodal_embeddings=mm_embeds or None, + ) # TODO(woosuk): Avoid the copy. Optimize. 
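For reference, a minimal usage sketch of the `phase_memory_profiling` helper added above (the `run_vision_encoder` call is a hypothetical stand-in for whatever GPU work is being measured; later commits in this series remove the helper again):

```python
# Hedged sketch: mirrors the phase_memory_profiling API added in patch 01.
# `run_vision_encoder` is a hypothetical stand-in for real GPU work.
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.utils import phase_memory_profiling

logger = init_logger(__name__)

def profiled_encode(model, pixel_values):
    # On exit this logs one "[MEMORY-PHASE] VISION_ENCODER: ..." line with
    # peak_increase, memory_diff, duration and current_allocated; it is a
    # no-op when enabled=False.
    with phase_memory_profiling("VISION_ENCODER", logger,
                                enabled=envs.VLLM_TRACE_MEMORY_PHASES):
        return model.run_vision_encoder(pixel_values)
```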
self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( @@ -2289,10 +2300,20 @@ def execute_model( ), record_function_or_nullcontext("Forward"), self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output): - model_output = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, + + # Log backbone forward pass details for memory analysis + if envs.VLLM_TRACE_MEMORY_PHASES: + if input_ids is not None: + logger.info(f"[BACKBONE] Processing {input_ids.numel()} input tokens") + elif inputs_embeds is not None: + logger.info(f"[BACKBONE] Processing embeddings with shape {inputs_embeds.shape}") + + with phase_memory_profiling("BACKBONE_FORWARD", logger, + enabled=envs.VLLM_TRACE_MEMORY_PHASES): + model_output = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, **model_kwargs, ) @@ -3129,13 +3150,23 @@ def _dummy_run( cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, ubatch_slices=ubatch_slices): - outputs = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - **model_kwargs, - ) + + # Log backbone forward pass details for memory analysis + if envs.VLLM_TRACE_MEMORY_PHASES: + if input_ids is not None: + logger.info(f"[BACKBONE] Processing {input_ids.numel()} input tokens") + elif inputs_embeds is not None: + logger.info(f"[BACKBONE] Processing embeddings with shape {inputs_embeds.shape}") + + with phase_memory_profiling("BACKBONE_FORWARD", logger, + enabled=envs.VLLM_TRACE_MEMORY_PHASES): + outputs = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs @@ -3303,67 +3334,83 @@ def _dummy_pooler_run( return self._dummy_pooler_run_task(hidden_states, max_task) def profile_run(self) -> None: - # Profile with multimodal encoder & encoder cache. - if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: - logger.info( - "Skipping memory profiling for multimodal encoder and " - "encoder cache.") - else: - mm_budget = self.mm_budget - assert mm_budget is not None + # Enable phase memory tracing for profiling + original_trace_setting = envs.VLLM_TRACE_MEMORY_PHASES + envs.VLLM_TRACE_MEMORY_PHASES = True - if (encoder_budget := mm_budget.get_encoder_budget()) > 0: - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - dummy_modality = mm_budget.get_modality_with_max_tokens() - max_mm_items_per_batch = mm_budget \ - .max_items_per_batch_by_modality[dummy_modality] + try: + logger.info("[PROFILING-START] Beginning memory profiling with phase tracking") + # Profile with multimodal encoder & encoder cache. + if self.supports_mm_inputs: + if self.model_config.multimodal_config.skip_mm_profiling: logger.info( - "Encoder cache will be initialized with a budget of " - "%s tokens, and profiled with %s %s items of the " - "maximum feature size.", - encoder_budget, - max_mm_items_per_batch, - dummy_modality, - ) - - # Create dummy batch of multimodal inputs. 
- batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_modality, - max_mm_items_per_batch, - ) + "Skipping memory profiling for multimodal encoder and " + "encoder cache.") + else: + mm_budget = self.mm_budget + assert mm_budget is not None + + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + dummy_modality = mm_budget.get_modality_with_max_tokens() + max_mm_items_per_batch = mm_budget \ + .max_items_per_batch_by_modality[dummy_modality] + + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the " + "maximum feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) - # Run multimodal encoder. - dummy_encoder_outputs = \ - self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) + # Create dummy batch of multimodal inputs. + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_mm_items_per_batch, - ) + # Run multimodal encoder. + if envs.VLLM_TRACE_MEMORY_PHASES: + logger.info(f"[MM-INPUT] Processing {max_mm_items_per_batch} multimodal items") + with phase_memory_profiling("MM_MULTIMODAL_PROCESSING", logger, + enabled=envs.VLLM_TRACE_MEMORY_PHASES): + dummy_encoder_outputs = \ + self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) + + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict( - enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. 
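A short sketch of how the `VLLM_TRACE_MEMORY_PHASES` flag added in `envs.py` above is expected to be toggled; this assumes vLLM's lazy `envs` attribute lookup, so the value is read at access time:

```python
import os

# Any of these values enable phase tracing, per the parser added in envs.py
# above ("1" or "true", case-insensitive; anything else is treated as off).
os.environ["VLLM_TRACE_MEMORY_PHASES"] = "1"
# os.environ["VLLM_TRACE_MEMORY_PHASES"] = "true"  # also enables tracing
# os.environ["VLLM_TRACE_MEMORY_PHASES"] = "0"     # default: disabled

import vllm.envs as envs
assert envs.VLLM_TRACE_MEMORY_PHASES is True
```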
+ self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) - # Add `is_profile` here to pre-allocate communication buffers - hidden_states, last_hidden_states \ - = self._dummy_run(self.max_num_tokens, is_profile=True) - if get_pp_group().is_last_rank: - if self.is_pooling_model: - output = self._dummy_pooler_run(hidden_states) + # Add `is_profile` here to pre-allocate communication buffers + hidden_states, last_hidden_states \ + = self._dummy_run(self.max_num_tokens, is_profile=True) + if get_pp_group().is_last_rank: + if self.is_pooling_model: + output = self._dummy_pooler_run(hidden_states) + else: + output = self._dummy_sampler_run(last_hidden_states) else: - output = self._dummy_sampler_run(last_hidden_states) - else: - output = None - self._sync_device() - del hidden_states, output - self.encoder_cache.clear() - gc.collect() + output = None + self._sync_device() + del hidden_states, output + self.encoder_cache.clear() + gc.collect() + + logger.info("[PROFILING-END] Memory profiling completed") + finally: + # Restore original trace setting + envs.VLLM_TRACE_MEMORY_PHASES = original_trace_setting def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: From f027a29c8408978c9dea291f66c6609ad7f7c0f9 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 00:51:41 +0000 Subject: [PATCH 02/27] init Signed-off-by: wwl2755 --- vllm/config/model.py | 18 ++- vllm/config/multimodal.py | 126 ++++++++++++++++++-- vllm/engine/arg_utils.py | 50 ++++++++ vllm/multimodal/profiling.py | 113 ++++++++++++++++-- vllm/multimodal/registry.py | 26 ++++- vllm/v1/worker/gpu_model_runner.py | 181 +++++++++++------------------ 6 files changed, 383 insertions(+), 131 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index da01d6d4480c..ef102da90887 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -17,7 +17,7 @@ import vllm.envs as envs from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, - MultiModalConfig) + MultiModalConfig, LimitPerPromptType) from vllm.config.pooler import PoolerConfig from vllm.config.utils import assert_hashable, config from vllm.logger import init_logger @@ -274,7 +274,7 @@ class ModelConfig: multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. 
If `None`, this will be inferred from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + limit_mm_per_prompt: InitVar[Optional[LimitPerPromptType]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None mm_processor_cache_gb: InitVar[Optional[float]] = None @@ -340,6 +340,7 @@ def compute_hash(self) -> str: return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__( +<<<<<<< HEAD self, # Multimodal config init vars limit_mm_per_prompt: Optional[dict[str, int]], @@ -353,6 +354,19 @@ def __post_init__( skip_mm_profiling: Optional[bool], video_pruning_rate: Optional[float], ) -> None: +======= + self, + # Multimodal config init vars + limit_mm_per_prompt: Optional[LimitPerPromptType], + media_io_kwargs: Optional[dict[str, dict[str, Any]]], + mm_processor_kwargs: Optional[dict[str, Any]], + mm_processor_cache_gb: Optional[float], + mm_processor_cache_type: Optional[MMCacheType], + mm_shm_cache_max_object_size_mb: Optional[int], + mm_encoder_tp_mode: Optional[MMEncoderTPMode], + interleave_mm_strings: Optional[bool], + skip_mm_profiling: Optional[bool]) -> None: +>>>>>>> init # Set the default seed to 0 in V1. # NOTE(woosuk): In V0, we set the default seed to None because the # driver worker shares the same process as the user process, and thus diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 569de9579900..324d5a30d5b9 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -4,13 +4,90 @@ import hashlib from collections.abc import Mapping from dataclasses import field -from typing import Any, Literal, Optional +from dataclasses import dataclass as standard_dataclass +from typing import Any, Dict, Literal, Optional, Union from pydantic.dataclasses import dataclass import vllm.envs as envs from vllm.config.utils import config + +@standard_dataclass +class BaseModalityOptions: + """Base class for modality-specific dummy data options.""" + count: int = 1 + + +@standard_dataclass +class VideoDummyOptions(BaseModalityOptions): + """Options for generating dummy video data during profiling.""" + count: int = 1 + num_frames: Optional[int] = None + width: Optional[int] = None + height: Optional[int] = None + + def __post_init__(self): + if self.count < 0: + raise ValueError("count must be non-negative") + if self.num_frames is not None and self.num_frames <= 0: + raise ValueError("num_frames must be positive") + if self.width is not None and self.width <= 0: + raise ValueError("width must be positive") + if self.height is not None and self.height <= 0: + raise ValueError("height must be positive") + + +@standard_dataclass +class ImageDummyOptions(BaseModalityOptions): + """Options for generating dummy image data during profiling.""" + count: int = 1 + width: Optional[int] = None + height: Optional[int] = None + max_size: Optional[tuple[int, int]] = None + + def __post_init__(self): + if self.count < 0: + raise ValueError("count must be non-negative") + if self.width is not None and self.width <= 0: + raise ValueError("width must be positive") + if self.height is not None and self.height <= 0: + raise ValueError("height must be positive") + + +@standard_dataclass +class AudioDummyOptions(BaseModalityOptions): + """Options for generating dummy audio data during profiling.""" + count: int = 1 + duration: Optional[float] = None + sample_rate: Optional[int] = None + channels: Optional[int] = None + + def 
__post_init__(self): + if self.count < 0: + raise ValueError("count must be non-negative") + if self.duration is not None and self.duration <= 0: + raise ValueError("duration must be positive") + if self.sample_rate is not None and self.sample_rate <= 0: + raise ValueError("sample_rate must be positive") + if self.channels is not None and self.channels <= 0: + raise ValueError("channels must be positive") + + +# Union type for all supported option types +ModalityDummyOptions = Union[ + BaseModalityOptions, + VideoDummyOptions, + ImageDummyOptions, + AudioDummyOptions +] + +# Main configuration type supporting both legacy and enhanced formats +LimitPerPromptType = Union[ + Dict[str, int], # Legacy: {"video": 1, "image": 5} + Dict[str, Union[int, ModalityDummyOptions]] # Enhanced format +] + MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] @@ -20,12 +97,19 @@ class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: dict[str, int] = field(default_factory=dict) - """The maximum number of input items allowed per prompt for each modality. + limit_per_prompt: LimitPerPromptType = field(default_factory=dict) + """The maximum number of input items and options allowed per prompt for each modality. Defaults to 1 (V0) or 999 (V1) for each modality. - For example, to allow up to 16 images and 2 videos per prompt: - `{"image": 16, "video": 2}`""" + Legacy format (count only): + {"image": 16, "video": 2} + + Enhanced format (with options): + {"video": {"count": 1, "num_frames": 32}, "image": {"count": 5, "max_size": [512, 512]}} + + Mixed format (combining both): + {"image": 16, "video": {"count": 1, "num_frames": 32}} + """ media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) """Additional args passed to process media inputs, keyed by modalities. For example, to set num_frames for video, set @@ -106,12 +190,34 @@ def compute_hash(self) -> str: def get_limit_per_prompt(self, modality: str) -> int: """ Get the maximum number of input items allowed per prompt - for the given modality. + for the given modality (backward compatible). + """ + limit_data = self.limit_per_prompt.get(modality) + + if limit_data is None: + return 999 if envs.VLLM_USE_V1 else 1 + elif isinstance(limit_data, int): + return limit_data + elif isinstance(limit_data, BaseModalityOptions): + return limit_data.count + else: + raise ValueError(f"Invalid limit data type for {modality}: {type(limit_data)}") + + def get_dummy_options(self, modality: str) -> Optional[ModalityDummyOptions]: + """ + Get the enhanced dummy data options for a modality. + Returns None if no enhanced options are configured. 
""" - return self.limit_per_prompt.get( - modality, - 999 if envs.VLLM_USE_V1 else 1, - ) + limit_data = self.limit_per_prompt.get(modality) + + if isinstance(limit_data, (BaseModalityOptions, VideoDummyOptions, + ImageDummyOptions, AudioDummyOptions)): + return limit_data + elif isinstance(limit_data, int): + # Convert legacy format to base options + return BaseModalityOptions(count=limit_data) + else: + return None def merge_mm_processor_kwargs( self, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7b5ed67d0adb..8d8bb9e8cada 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -98,6 +98,48 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]: return optional_type(json.loads)(val) +def parse_limit_mm_per_prompt(val: str): + """Parse limit-mm-per-prompt with support for configurable options.""" + import json + from vllm.config.multimodal import ( + VideoDummyOptions, ImageDummyOptions, AudioDummyOptions, + BaseModalityOptions, LimitPerPromptType + ) + + try: + parsed = json.loads(val) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON format for --limit-mm-per-prompt: {e}") + + if not isinstance(parsed, dict): + raise ValueError("--limit-mm-per-prompt must be a JSON object") + + result = {} + for modality, options in parsed.items(): + if isinstance(options, int): + # Legacy format + result[modality] = options + elif isinstance(options, dict): + # Enhanced format - convert to appropriate dataclass + try: + if modality == "video": + result[modality] = VideoDummyOptions(**options) + elif modality == "image": + result[modality] = ImageDummyOptions(**options) + elif modality == "audio": + result[modality] = AudioDummyOptions(**options) + else: + # Unknown modality, use base options + result[modality] = BaseModalityOptions(**options) + except TypeError as e: + raise ValueError(f"Invalid options for {modality}: {e}") + else: + raise ValueError(f"Invalid options type for {modality}: {type(options)}. 
" + f"Must be int or dict.") + + return result + + def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]: """Check if the type hint is a specific type.""" return type_hint is type or get_origin(type_hint) is type @@ -783,6 +825,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # Multimodal related configs multimodal_kwargs = get_kwargs(MultiModalConfig) + # Override the parser for limit_per_prompt to support configurable options + multimodal_kwargs["limit_per_prompt"]["type"] = parse_limit_mm_per_prompt + multimodal_kwargs["limit_per_prompt"]["help"] += ( + "\n\nSupports both legacy count-only format and configurable options format:" + "\n Legacy: '{\"image\": 5, \"video\": 1}'" + "\n Configurable: '{\"video\": {\"count\": 1, \"num_frames\": 32}}'" + "\n Mixed: '{\"image\": 5, \"video\": {\"count\": 1, \"num_frames\": 32}}'" + ) multimodal_group = parser.add_argument_group( title="MultiModalConfig", description=MultiModalConfig.__doc__, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 26c5d188964c..b1befa4c3515 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import dataclasses from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field @@ -10,6 +11,8 @@ from PIL import Image import vllm.envs as envs +from vllm.config.multimodal import (ModalityDummyOptions, VideoDummyOptions, + ImageDummyOptions, AudioDummyOptions) from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, @@ -84,19 +87,109 @@ def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> ProcessorInputs: """ Build the input which, after processing, results in the maximum possible number of placeholder tokens. + + Args: + seq_len: Sequence length + mm_counts: Count of items per modality + mm_options: Configurable options per modality (optional) """ dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + + # Use configurable options to guide dummy data generation if provided + if mm_options: + dummy_mm_data = self._get_configurable_dummy_data(seq_len, mm_counts, mm_options) + else: + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + tokenization_kwargs = {"truncation": False} return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data, tokenization_kwargs=tokenization_kwargs) + def _get_configurable_dummy_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, ModalityDummyOptions], + ) -> MultiModalDataDict: + """ + Generate dummy data with configurable options using parameter interception. 
+ """ + dummy_data = {} + + # Handle images + if "image" in mm_counts and mm_counts["image"] > 0: + # Get model defaults + try: + default_width, default_height = self.info.get_image_size_with_most_features() + except (AttributeError, Exception): + default_width, default_height = 224, 224 # Fallback + + # Override with configurable options if provided + target_width, target_height = default_width, default_height + if "image" in mm_options: + image_opts = mm_options["image"] + if hasattr(image_opts, 'max_size') and image_opts.max_size: + target_width, target_height = image_opts.max_size + elif hasattr(image_opts, 'width') and hasattr(image_opts, 'height'): + if image_opts.width: target_width = image_opts.width + if image_opts.height: target_height = image_opts.height + + # Simple bounds checking + target_width = min(max(target_width, 1), default_width) + target_height = min(max(target_height, 1), default_height) + + dummy_data["image"] = self._get_dummy_images( + width=target_width, height=target_height, num_images=mm_counts["image"]) + + # Handle videos + if "video" in mm_counts and mm_counts["video"] > 0: + # Get model defaults + try: + default_width, default_height = self.info.get_image_size_with_most_features() + default_frames = self.info.get_num_frames_with_most_features(seq_len, mm_counts) + except (AttributeError, Exception): + default_width, default_height, default_frames = 224, 224, 16 # Fallback + + # Override with configurable options if provided + target_width, target_height, target_frames = default_width, default_height, default_frames + if "video" in mm_options: + video_opts = mm_options["video"] + if hasattr(video_opts, 'num_frames') and video_opts.num_frames: + target_frames = video_opts.num_frames + if hasattr(video_opts, 'width') and video_opts.width: + target_width = video_opts.width + if hasattr(video_opts, 'height') and video_opts.height: + target_height = video_opts.height + + # Simple bounds checking + target_width = min(max(target_width, 1), default_width) + target_height = min(max(target_height, 1), default_height) + target_frames = min(max(target_frames, 1), default_frames) + + dummy_data["video"] = self._get_dummy_videos( + width=target_width, height=target_height, + num_frames=target_frames, num_videos=mm_counts["video"]) + + # Handle audio (if needed) + if "audio" in mm_counts and mm_counts["audio"] > 0: + # Use existing audio generation logic - configurable options for audio not yet implemented + try: + dummy_audio_data = self.get_dummy_mm_data(seq_len, {"audio": mm_counts["audio"]}) + if "audio" in dummy_audio_data: + dummy_data["audio"] = dummy_audio_data["audio"] + except Exception: + # Fallback audio generation + dummy_data["audio"] = self._get_dummy_audios(length=16000, num_audios=mm_counts["audio"]) + + return dummy_data + def _get_dummy_audios( self, *, @@ -162,13 +255,14 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> MultiModalInputs: if mm_counts is None: mm_counts = self.get_mm_limits() factory = self.dummy_inputs processor_inputs = factory.get_dummy_processor_inputs( - seq_len, mm_counts) + seq_len, mm_counts, mm_options) return self.processor.apply( prompt=processor_inputs.prompt, @@ -195,8 +289,9 @@ def get_encoder_dummy_data( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> DummyEncoderData: - mm_inputs = self._get_dummy_mm_inputs(seq_len, 
mm_counts) + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) mm_inputs = cast(MultiModalEncDecInputs, mm_inputs) # For encoder-decoder models, use encoder prompt token ids instead of @@ -228,8 +323,9 @@ def get_decoder_dummy_data( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> DummyDecoderData: - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) prompt_token_ids = mm_inputs["prompt_token_ids"] total_len = len(prompt_token_ids) @@ -248,6 +344,7 @@ def _get_mm_max_tokens( seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, mm_embeddings_only: bool = True, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> Mapping[str, int]: if mm_counts is None: mm_counts = self.get_mm_limits() @@ -259,7 +356,7 @@ def _get_mm_max_tokens( if max_tokens_per_item is not None: return max_tokens_per_item - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) return self._get_mm_num_tokens(mm_inputs, mm_embeddings_only=mm_embeddings_only) @@ -267,6 +364,7 @@ def get_mm_max_contiguous_tokens( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ): """ Returns the maximum length of the multimodal (image placeholders+text) @@ -274,11 +372,12 @@ def get_mm_max_contiguous_tokens( ` [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] ` Returns 9, even when the number of image embeddings is 6. - + This is important to take into account when profiling and initializing the encoder cache size. """ return self._get_mm_max_tokens(seq_len, mm_counts, - mm_embeddings_only=False) + mm_embeddings_only=False, + mm_options=mm_options) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 2bbc0078ad13..fd08258bd032 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -286,7 +286,18 @@ def get_decoder_dummy_data( """ processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) - dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) + + # Extract configurable options from multimodal config + mm_options = None + if model_config.multimodal_config: + mm_options = {} + for modality in ["image", "video", "audio"]: + if hasattr(model_config.multimodal_config, 'get_dummy_options'): + options = model_config.multimodal_config.get_dummy_options(modality) + if options is not None: + mm_options[modality] = options + + dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) # Having more tokens is over-conservative but otherwise fine token_ids = dummy_data.prompt_token_ids @@ -312,7 +323,18 @@ def get_encoder_dummy_data( """ processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) - dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) + + # Extract configurable options from multimodal config + mm_options = None + if model_config.multimodal_config: + mm_options = {} + for modality in ["image", "video", "audio"]: + if hasattr(model_config.multimodal_config, 'get_dummy_options'): + options = model_config.multimodal_config.get_dummy_options(modality) + if options is not None: + mm_options[modality] = options + + dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) # Having more tokens is 
over-conservative but otherwise fine token_ids = dummy_data.prompt_token_ids diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fce2eb7d3854..c94b082426a5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1971,19 +1971,10 @@ def _preprocess( # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - # Log MM→Text merge details for memory analysis - if envs.VLLM_TRACE_MEMORY_PHASES: - num_mm_embeddings = len(mm_embeds) if mm_embeds else 0 - num_text_tokens = num_scheduled_tokens - logger.info(f"[MM-MERGE] Merging {num_mm_embeddings} MM embeddings " - f"with {num_text_tokens} text tokens") - - with phase_memory_profiling("MM_TEXT_MERGE", logger, - enabled=envs.VLLM_TRACE_MEMORY_PHASES): - inputs_embeds_scheduled = self.model.get_input_embeddings( - input_ids=self.input_ids.gpu[:num_scheduled_tokens], - multimodal_embeddings=mm_embeds or None, - ) + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids.gpu[:num_scheduled_tokens], + multimodal_embeddings=mm_embeds or None, + ) # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( @@ -2301,19 +2292,10 @@ def execute_model( self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output): - # Log backbone forward pass details for memory analysis - if envs.VLLM_TRACE_MEMORY_PHASES: - if input_ids is not None: - logger.info(f"[BACKBONE] Processing {input_ids.numel()} input tokens") - elif inputs_embeds is not None: - logger.info(f"[BACKBONE] Processing embeddings with shape {inputs_embeds.shape}") - - with phase_memory_profiling("BACKBONE_FORWARD", logger, - enabled=envs.VLLM_TRACE_MEMORY_PHASES): - model_output = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, + model_output = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, **model_kwargs, ) @@ -3151,22 +3133,13 @@ def _dummy_run( batch_descriptor=batch_descriptor, ubatch_slices=ubatch_slices): - # Log backbone forward pass details for memory analysis - if envs.VLLM_TRACE_MEMORY_PHASES: - if input_ids is not None: - logger.info(f"[BACKBONE] Processing {input_ids.numel()} input tokens") - elif inputs_embeds is not None: - logger.info(f"[BACKBONE] Processing embeddings with shape {inputs_embeds.shape}") - - with phase_memory_profiling("BACKBONE_FORWARD", logger, - enabled=envs.VLLM_TRACE_MEMORY_PHASES): - outputs = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - **model_kwargs, - ) + outputs = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs @@ -3334,83 +3307,71 @@ def _dummy_pooler_run( return self._dummy_pooler_run_task(hidden_states, max_task) def profile_run(self) -> None: - # Enable phase memory tracing for profiling - original_trace_setting = envs.VLLM_TRACE_MEMORY_PHASES - envs.VLLM_TRACE_MEMORY_PHASES = True + logger.info("Beginning memory profiling") + + # Profile with multimodal encoder & encoder cache. 
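To make the CLI behaviour concrete, a sketch of what the `parse_limit_mm_per_prompt` helper added in `arg_utils.py` above should return for the three supported formats (expected values shown as comments):

```python
# Sketch of the parsing behaviour added in arg_utils.py above.
from vllm.engine.arg_utils import parse_limit_mm_per_prompt

# Legacy count-only format: values stay plain ints.
parse_limit_mm_per_prompt('{"image": 5, "video": 1}')
# -> {"image": 5, "video": 1}

# Configurable format: per-modality dicts become option dataclasses.
parse_limit_mm_per_prompt('{"video": {"count": 1, "num_frames": 32}}')
# -> {"video": VideoDummyOptions(count=1, num_frames=32, width=None, height=None)}

# Mixed format: both styles can be combined in one value.
parse_limit_mm_per_prompt(
    '{"image": 5, "video": {"count": 1, "num_frames": 32}}')
# -> {"image": 5, "video": VideoDummyOptions(count=1, num_frames=32, ...)}
```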
+ if self.supports_mm_inputs: + if self.model_config.multimodal_config.skip_mm_profiling: + logger.info( + "Skipping memory profiling for multimodal encoder and " + "encoder cache.") + else: + mm_budget = self.mm_budget + assert mm_budget is not None - try: - logger.info("[PROFILING-START] Beginning memory profiling with phase tracking") + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + dummy_modality = mm_budget.get_modality_with_max_tokens() + max_mm_items_per_batch = mm_budget \ + .max_items_per_batch_by_modality[dummy_modality] - # Profile with multimodal encoder & encoder cache. - if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: logger.info( - "Skipping memory profiling for multimodal encoder and " - "encoder cache.") - else: - mm_budget = self.mm_budget - assert mm_budget is not None - - if (encoder_budget := mm_budget.get_encoder_budget()) > 0: - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - dummy_modality = mm_budget.get_modality_with_max_tokens() - max_mm_items_per_batch = mm_budget \ - .max_items_per_batch_by_modality[dummy_modality] - - logger.info( - "Encoder cache will be initialized with a budget of " - "%s tokens, and profiled with %s %s items of the " - "maximum feature size.", - encoder_budget, - max_mm_items_per_batch, - dummy_modality, - ) + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the " + "maximum feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) - # Create dummy batch of multimodal inputs. - batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_modality, - max_mm_items_per_batch, - ) + # Create dummy batch of multimodal inputs. + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) - # Run multimodal encoder. - if envs.VLLM_TRACE_MEMORY_PHASES: - logger.info(f"[MM-INPUT] Processing {max_mm_items_per_batch} multimodal items") - with phase_memory_profiling("MM_MULTIMODAL_PROCESSING", logger, - enabled=envs.VLLM_TRACE_MEMORY_PHASES): - dummy_encoder_outputs = \ - self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) - - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_mm_items_per_batch, - ) + # Run multimodal encoder. + dummy_encoder_outputs = \ + self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict( - enumerate(dummy_encoder_outputs)) + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) - # Add `is_profile` here to pre-allocate communication buffers - hidden_states, last_hidden_states \ - = self._dummy_run(self.max_num_tokens, is_profile=True) - if get_pp_group().is_last_rank: - if self.is_pooling_model: - output = self._dummy_pooler_run(hidden_states) - else: - output = self._dummy_sampler_run(last_hidden_states) + # Cache the dummy encoder outputs. 
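The per-modality option dataclasses added in `vllm/config/multimodal.py` above validate their fields in `__post_init__`; a small sketch of that behaviour:

```python
# Sketch of the eager validation in the __post_init__ hooks above.
from vllm.config.multimodal import ImageDummyOptions, VideoDummyOptions

VideoDummyOptions(count=1, num_frames=32)          # ok
ImageDummyOptions(count=2, width=512, height=512)  # ok

try:
    VideoDummyOptions(count=1, num_frames=0)
except ValueError as e:
    print(e)  # "num_frames must be positive"

try:
    ImageDummyOptions(count=-1)
except ValueError as e:
    print(e)  # "count must be non-negative"
```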
+ self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) + + # Add `is_profile` here to pre-allocate communication buffers + hidden_states, last_hidden_states \ + = self._dummy_run(self.max_num_tokens, is_profile=True) + if get_pp_group().is_last_rank: + if self.is_pooling_model: + output = self._dummy_pooler_run(hidden_states) else: - output = None - self._sync_device() - del hidden_states, output - self.encoder_cache.clear() - gc.collect() + output = self._dummy_sampler_run(last_hidden_states) + else: + output = None + self._sync_device() + del hidden_states, output + self.encoder_cache.clear() + gc.collect() - logger.info("[PROFILING-END] Memory profiling completed") - finally: - # Restore original trace setting - envs.VLLM_TRACE_MEMORY_PHASES = original_trace_setting + logger.info("Memory profiling completed") def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: From 8b5f8c048490c99eba7ed0070cf3e16e76707500 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 00:54:22 +0000 Subject: [PATCH 03/27] remove unused code Signed-off-by: wwl2755 --- vllm/utils/__init__.py | 46 ------------------------------ vllm/v1/worker/gpu_model_runner.py | 3 -- 2 files changed, 49 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 915c345bd32e..2fee662f86e6 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2857,52 +2857,6 @@ def memory_profiling( result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory # noqa -@contextlib.contextmanager -def phase_memory_profiling( - phase_name: str, - logger_instance=None, - enabled: bool = True -) -> Generator[None, None, None]: - """Memory profiling context manager for specific execution phases. 
- - Args: - phase_name: Name of the phase being profiled - logger_instance: Logger to use for output (optional) - enabled: Whether profiling is enabled - """ - if not enabled: - yield - return - - if logger_instance is None: - logger_instance = init_logger(__name__) - - # Take snapshot before phase - before_snapshot = MemorySnapshot() - torch.cuda.reset_peak_memory_stats() - - start_time = time.time() - - try: - yield - finally: - # Take snapshot after phase - torch.cuda.synchronize() - after_snapshot = MemorySnapshot() - end_time = time.time() - - # Calculate phase-specific metrics - peak_increase = after_snapshot.torch_peak - before_snapshot.torch_peak - memory_diff = after_snapshot.cuda_memory - before_snapshot.cuda_memory - duration = end_time - start_time - - logger_instance.info( - f"[MEMORY-PHASE] {phase_name}: " - f"peak_increase={peak_increase / (1024**3):.3f}GiB, " - f"memory_diff={memory_diff / (1024**3):.3f}GiB, " - f"duration={duration:.3f}s, " - f"current_allocated={after_snapshot.cuda_memory / (1024**3):.3f}GiB" - ) # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c94b082426a5..7dba69056340 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -36,7 +36,6 @@ from vllm.forward_context import (BatchDescriptor, DPMetadata, set_forward_context) from vllm.logger import init_logger -from vllm.utils import phase_memory_profiling from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -3307,7 +3306,6 @@ def _dummy_pooler_run( return self._dummy_pooler_run_task(hidden_states, max_task) def profile_run(self) -> None: - logger.info("Beginning memory profiling") # Profile with multimodal encoder & encoder cache. 
if self.supports_mm_inputs: @@ -3371,7 +3369,6 @@ def profile_run(self) -> None: self.encoder_cache.clear() gc.collect() - logger.info("Memory profiling completed") def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: From 43472b9e0632c6d799644f4d05b8ea611aad6b52 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 01:04:32 +0000 Subject: [PATCH 04/27] remove unused code Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 10 +++++----- vllm/engine/arg_utils.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 324d5a30d5b9..2eddd59837b8 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -82,10 +82,10 @@ def __post_init__(self): AudioDummyOptions ] -# Main configuration type supporting both legacy and enhanced formats +# Main configuration type supporting both legacy and configurable formats LimitPerPromptType = Union[ Dict[str, int], # Legacy: {"video": 1, "image": 5} - Dict[str, Union[int, ModalityDummyOptions]] # Enhanced format + Dict[str, Union[int, ModalityDummyOptions]] # Configurable format ] MMEncoderTPMode = Literal["weights", "data"] @@ -104,7 +104,7 @@ class MultiModalConfig: Legacy format (count only): {"image": 16, "video": 2} - Enhanced format (with options): + Configurable format (with options): {"video": {"count": 1, "num_frames": 32}, "image": {"count": 5, "max_size": [512, 512]}} Mixed format (combining both): @@ -205,8 +205,8 @@ def get_limit_per_prompt(self, modality: str) -> int: def get_dummy_options(self, modality: str) -> Optional[ModalityDummyOptions]: """ - Get the enhanced dummy data options for a modality. - Returns None if no enhanced options are configured. + Get the configurable dummy data options for a modality. + Returns None if no configurable options are configured. """ limit_data = self.limit_per_prompt.get(modality) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8d8bb9e8cada..4610f52cb9f1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -120,7 +120,7 @@ def parse_limit_mm_per_prompt(val: str): # Legacy format result[modality] = options elif isinstance(options, dict): - # Enhanced format - convert to appropriate dataclass + # Configurable format - convert to appropriate dataclass try: if modality == "video": result[modality] = VideoDummyOptions(**options) From 0ecd2732cc79b2ccf952376c81febb81df3a0d85 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 01:27:25 +0000 Subject: [PATCH 05/27] remove unused code Signed-off-by: wwl2755 --- vllm/envs.py | 8 -------- vllm/utils/__init__.py | 2 -- vllm/v1/worker/gpu_model_runner.py | 5 ----- 3 files changed, 15 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 7ab40f5b28b9..832d031f998e 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -43,7 +43,6 @@ VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None VLLM_LOG_STATS_INTERVAL: float = 10. 
VLLM_TRACE_FUNCTION: int = 0 - VLLM_TRACE_MEMORY_PHASES: bool = False VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None VLLM_PP_LAYER_PARTITION: Optional[str] = None @@ -598,13 +597,6 @@ def get_vllm_port() -> Optional[int]: "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), - # Trace memory usage during different execution phases - # If set to 1 or true, vllm will log detailed memory usage - # during MM processing, merging, and backbone forward pass - # Useful for analyzing memory bottlenecks - "VLLM_TRACE_MEMORY_PHASES": - lambda: os.environ.get("VLLM_TRACE_MEMORY_PHASES", "0").lower() in ("1", "true"), - # Backend for attention computation # Example options: # - "TORCH_SDPA": use torch.nn.MultiheadAttention diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 2fee662f86e6..c502a69ea500 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2857,8 +2857,6 @@ def memory_profiling( result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory # noqa - - # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): if sys.platform.startswith('win'): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7dba69056340..cbf439aa697b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1969,7 +1969,6 @@ def _preprocess( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - inputs_embeds_scheduled = self.model.get_input_embeddings( input_ids=self.input_ids.gpu[:num_scheduled_tokens], multimodal_embeddings=mm_embeds or None, @@ -2290,7 +2289,6 @@ def execute_model( ), record_function_or_nullcontext("Forward"), self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output): - model_output = self.model( input_ids=input_ids, positions=positions, @@ -3131,7 +3129,6 @@ def _dummy_run( cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, ubatch_slices=ubatch_slices): - outputs = self.model( input_ids=input_ids, positions=positions, @@ -3306,7 +3303,6 @@ def _dummy_pooler_run( return self._dummy_pooler_run_task(hidden_states, max_task) def profile_run(self) -> None: - # Profile with multimodal encoder & encoder cache. 
if self.supports_mm_inputs: if self.model_config.multimodal_config.skip_mm_profiling: @@ -3369,7 +3365,6 @@ def profile_run(self) -> None: self.encoder_cache.clear() gc.collect() - def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: logger.warning( From f9edb4f29992448fa46de93666bb2374cd35fb9e Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 03:59:22 +0000 Subject: [PATCH 06/27] update logic Signed-off-by: wwl2755 --- vllm/multimodal/profiling.py | 78 ++++++++++++++++++++++++++++-------- vllm/multimodal/registry.py | 23 +++++++++-- 2 files changed, 80 insertions(+), 21 deletions(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index b1befa4c3515..79c292903268 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -126,10 +126,13 @@ def _get_configurable_dummy_data( # Handle images if "image" in mm_counts and mm_counts["image"] > 0: # Get model defaults + vid_dims_known = True try: - default_width, default_height = self.info.get_image_size_with_most_features() + default_width, default_height = \ + self.info.get_image_size_with_most_features() except (AttributeError, Exception): - default_width, default_height = 224, 224 # Fallback + default_width, default_height = 224, 224 + vid_dims_known = False # Override with configurable options if provided target_width, target_height = default_width, default_height @@ -142,8 +145,11 @@ def _get_configurable_dummy_data( if image_opts.height: target_height = image_opts.height # Simple bounds checking - target_width = min(max(target_width, 1), default_width) - target_height = min(max(target_height, 1), default_height) + target_width = max(target_width, 1) + target_height = max(target_height, 1) + if vid_dims_known: + target_width = min(target_width, default_width) + target_height = min(target_height, default_height) dummy_data["image"] = self._get_dummy_images( width=target_width, height=target_height, num_images=mm_counts["image"]) @@ -151,11 +157,20 @@ def _get_configurable_dummy_data( # Handle videos if "video" in mm_counts and mm_counts["video"] > 0: # Get model defaults + vid_dims_known = True + try: + default_width, default_height = \ + self.info.get_image_size_with_most_features() + except (AttributeError, Exception): + default_width, default_height = 224, 224 + vid_dims_known = False + + vid_frames_known = True try: - default_width, default_height = self.info.get_image_size_with_most_features() default_frames = self.info.get_num_frames_with_most_features(seq_len, mm_counts) except (AttributeError, Exception): - default_width, default_height, default_frames = 224, 224, 16 # Fallback + default_frames = 16 + vid_frames_known = False # Override with configurable options if provided target_width, target_height, target_frames = default_width, default_height, default_frames @@ -169,24 +184,53 @@ def _get_configurable_dummy_data( target_height = video_opts.height # Simple bounds checking - target_width = min(max(target_width, 1), default_width) - target_height = min(max(target_height, 1), default_height) - target_frames = min(max(target_frames, 1), default_frames) + target_width = max(target_width, 1) + target_height = max(target_height, 1) + if vid_dims_known: + target_width = min(target_width, default_width) + target_height = min(target_height, default_height) + target_frames = max(target_frames, 1) + if vid_frames_known: + target_frames = min(target_frames, default_frames) dummy_data["video"] = self._get_dummy_videos( width=target_width, 
height=target_height, num_frames=target_frames, num_videos=mm_counts["video"]) - # Handle audio (if needed) + # Handle audio if "audio" in mm_counts and mm_counts["audio"] > 0: - # Use existing audio generation logic - configurable options for audio not yet implemented + # Get model defaults try: - dummy_audio_data = self.get_dummy_mm_data(seq_len, {"audio": mm_counts["audio"]}) - if "audio" in dummy_audio_data: - dummy_data["audio"] = dummy_audio_data["audio"] - except Exception: - # Fallback audio generation - dummy_data["audio"] = self._get_dummy_audios(length=16000, num_audios=mm_counts["audio"]) + feature_extractor = self.info.get_feature_extractor() + default_sample_rate = int(feature_extractor.sampling_rate) + # Most audio processors expose chunk_length in seconds + default_duration_s = float(getattr(feature_extractor, + "chunk_length", 1)) + except (AttributeError, Exception): + default_sample_rate = 16000 + default_duration_s = 1.0 + + # Override with configurable options if provided + target_sample_rate = default_sample_rate + target_duration_s = default_duration_s + target_channels = 1 + if mm_options and "audio" in mm_options: + audio_opts = mm_options["audio"] + if hasattr(audio_opts, 'sample_rate') and audio_opts.sample_rate: + target_sample_rate = int(audio_opts.sample_rate) + if hasattr(audio_opts, 'duration') and audio_opts.duration: + target_duration_s = float(audio_opts.duration) + if hasattr(audio_opts, 'channels') and audio_opts.channels: + target_channels = int(audio_opts.channels) + + # Compute effective profiling length + length_per_channel = max(1, int(target_sample_rate * target_duration_s)) + effective_length = max(1, length_per_channel * max(1, target_channels)) + + # Return arrays (consistent with legacy builders) + audios = self._get_dummy_audios(length=effective_length, + num_audios=mm_counts["audio"]) + dummy_data["audio"] = audios return dummy_data diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index fd08258bd032..3c0a47cc0ad2 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,6 +6,9 @@ import torch.nn as nn +from vllm.inputs import InputProcessingContext +from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, + VideoDummyOptions) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) @@ -287,15 +290,21 @@ def get_decoder_dummy_data( processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) - # Extract configurable options from multimodal config + # Extract configurable options from multimodal config. + # Only include modalities that use advanced option types so legacy + # count-only behavior remains unchanged. 
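To illustrate why the registry code above forwards only the concrete `ImageDummyOptions`/`VideoDummyOptions`/`AudioDummyOptions` types, a sketch of how `MultiModalConfig` resolves a mixed `limit_per_prompt` value (constructing the config directly like this is an assumption made for illustration):

```python
# Sketch: accessor behaviour for a mixed limit_per_prompt value.
from vllm.config.multimodal import MultiModalConfig, VideoDummyOptions

mm_config = MultiModalConfig(limit_per_prompt={
    "image": 16,                                          # legacy count
    "video": VideoDummyOptions(count=1, num_frames=32),   # configurable
})

mm_config.get_limit_per_prompt("image")  # -> 16
mm_config.get_limit_per_prompt("video")  # -> 1 (the dataclass's count)

# Legacy ints are wrapped in BaseModalityOptions, which is why the registry
# only treats Image/Video/AudioDummyOptions instances as "advanced" options.
mm_config.get_dummy_options("image")  # -> BaseModalityOptions(count=16)
mm_config.get_dummy_options("video")  # -> VideoDummyOptions(count=1, num_frames=32, ...)
```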
mm_options = None if model_config.multimodal_config: mm_options = {} for modality in ["image", "video", "audio"]: if hasattr(model_config.multimodal_config, 'get_dummy_options'): options = model_config.multimodal_config.get_dummy_options(modality) - if options is not None: + if isinstance(options, + (ImageDummyOptions, VideoDummyOptions, + AudioDummyOptions)): mm_options[modality] = options + if not mm_options: + mm_options = None dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) @@ -324,15 +333,21 @@ def get_encoder_dummy_data( processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) - # Extract configurable options from multimodal config + # Extract configurable options from multimodal config. + # Only include modalities that use advanced option types so legacy + # count-only behavior remains unchanged. mm_options = None if model_config.multimodal_config: mm_options = {} for modality in ["image", "video", "audio"]: if hasattr(model_config.multimodal_config, 'get_dummy_options'): options = model_config.multimodal_config.get_dummy_options(modality) - if options is not None: + if isinstance(options, + (ImageDummyOptions, VideoDummyOptions, + AudioDummyOptions)): mm_options[modality] = options + if not mm_options: + mm_options = None dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) From 85acac51200ae99ada19ff592a30083394270e70 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 21:16:38 +0000 Subject: [PATCH 07/27] fix lint Signed-off-by: wwl2755 --- vllm/config/model.py | 4 +-- vllm/config/multimodal.py | 30 ++++++++++----------- vllm/engine/arg_utils.py | 33 +++++++++++++---------- vllm/multimodal/profiling.py | 51 ++++++++++++++++++++++-------------- vllm/multimodal/registry.py | 37 +++++++++++++++----------- 5 files changed, 89 insertions(+), 66 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index ef102da90887..f4b60f2baa2e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -16,8 +16,8 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE import vllm.envs as envs -from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, - MultiModalConfig, LimitPerPromptType) +from vllm.config.multimodal import (LimitPerPromptType, MMCacheType, + MMEncoderTPMode, MultiModalConfig) from vllm.config.pooler import PoolerConfig from vllm.config.utils import assert_hashable, config from vllm.logger import init_logger diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 2eddd59837b8..6023573a9e0f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -3,9 +3,9 @@ import hashlib from collections.abc import Mapping -from dataclasses import field from dataclasses import dataclass as standard_dataclass -from typing import Any, Dict, Literal, Optional, Union +from dataclasses import field +from typing import Any, Literal, Optional, Union from pydantic.dataclasses import dataclass @@ -75,17 +75,13 @@ def __post_init__(self): # Union type for all supported option types -ModalityDummyOptions = Union[ - BaseModalityOptions, - VideoDummyOptions, - ImageDummyOptions, - AudioDummyOptions -] +ModalityDummyOptions = Union[BaseModalityOptions, VideoDummyOptions, + ImageDummyOptions, AudioDummyOptions] # Main configuration type supporting both legacy and configurable formats LimitPerPromptType = Union[ - Dict[str, int], # Legacy: {"video": 1, "image": 5} - Dict[str, Union[int, ModalityDummyOptions]] # Configurable 
format + dict[str, int], # Legacy: {"video": 1, "image": 5} + dict[str, Union[int, ModalityDummyOptions]] # Configurable format ] MMEncoderTPMode = Literal["weights", "data"] @@ -98,14 +94,16 @@ class MultiModalConfig: """Controls the behavior of multimodal models.""" limit_per_prompt: LimitPerPromptType = field(default_factory=dict) - """The maximum number of input items and options allowed per prompt for each modality. + """The maximum number of input items and options allowed per + prompt for each modality. Defaults to 1 (V0) or 999 (V1) for each modality. Legacy format (count only): {"image": 16, "video": 2} Configurable format (with options): - {"video": {"count": 1, "num_frames": 32}, "image": {"count": 5, "max_size": [512, 512]}} + {"video": {"count": 1, "num_frames": 32}, + "image": {"count": 5, "max_size": [512, 512]}} Mixed format (combining both): {"image": 16, "video": {"count": 1, "num_frames": 32}} @@ -201,9 +199,11 @@ def get_limit_per_prompt(self, modality: str) -> int: elif isinstance(limit_data, BaseModalityOptions): return limit_data.count else: - raise ValueError(f"Invalid limit data type for {modality}: {type(limit_data)}") + raise ValueError( + f"Invalid limit data type for {modality}: {type(limit_data)}") - def get_dummy_options(self, modality: str) -> Optional[ModalityDummyOptions]: + def get_dummy_options(self, + modality: str) -> Optional[ModalityDummyOptions]: """ Get the configurable dummy data options for a modality. Returns None if no configurable options are configured. @@ -211,7 +211,7 @@ def get_dummy_options(self, modality: str) -> Optional[ModalityDummyOptions]: limit_data = self.limit_per_prompt.get(modality) if isinstance(limit_data, (BaseModalityOptions, VideoDummyOptions, - ImageDummyOptions, AudioDummyOptions)): + ImageDummyOptions, AudioDummyOptions)): return limit_data elif isinstance(limit_data, int): # Convert legacy format to base options diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4610f52cb9f1..ab0b84c416e2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -101,15 +101,15 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]: def parse_limit_mm_per_prompt(val: str): """Parse limit-mm-per-prompt with support for configurable options.""" import json - from vllm.config.multimodal import ( - VideoDummyOptions, ImageDummyOptions, AudioDummyOptions, - BaseModalityOptions, LimitPerPromptType - ) + + from vllm.config.multimodal import (AudioDummyOptions, BaseModalityOptions, + ImageDummyOptions, VideoDummyOptions) try: parsed = json.loads(val) except json.JSONDecodeError as e: - raise ValueError(f"Invalid JSON format for --limit-mm-per-prompt: {e}") + raise ValueError( + f"Invalid JSON format for --limit-mm-per-prompt: {e}") from e if not isinstance(parsed, dict): raise ValueError("--limit-mm-per-prompt must be a JSON object") @@ -132,10 +132,11 @@ def parse_limit_mm_per_prompt(val: str): # Unknown modality, use base options result[modality] = BaseModalityOptions(**options) except TypeError as e: - raise ValueError(f"Invalid options for {modality}: {e}") + raise ValueError(f"Invalid options for {modality}: {e}") from e else: - raise ValueError(f"Invalid options type for {modality}: {type(options)}. " - f"Must be int or dict.") + raise ValueError( + f"Invalid options type for {modality}: {type(options)}. 
" + f"Must be int or dict.") return result @@ -825,14 +826,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # Multimodal related configs multimodal_kwargs = get_kwargs(MultiModalConfig) - # Override the parser for limit_per_prompt to support configurable options - multimodal_kwargs["limit_per_prompt"]["type"] = parse_limit_mm_per_prompt + # Override the parser for limit_per_prompt to support configurable + # options + multimodal_kwargs["limit_per_prompt"][ + "type"] = parse_limit_mm_per_prompt multimodal_kwargs["limit_per_prompt"]["help"] += ( - "\n\nSupports both legacy count-only format and configurable options format:" + "\n\nSupports both legacy count-only format and " + "configurable options format:" "\n Legacy: '{\"image\": 5, \"video\": 1}'" - "\n Configurable: '{\"video\": {\"count\": 1, \"num_frames\": 32}}'" - "\n Mixed: '{\"image\": 5, \"video\": {\"count\": 1, \"num_frames\": 32}}'" - ) + "\n Configurable: '{\"video\": {\"count\": 1, " + "\"num_frames\": 32}}'" + "\n Mixed: '{\"image\": 5, \"video\": {\"count\": 1, " + "\"num_frames\": 32}}'") multimodal_group = parser.add_argument_group( title="MultiModalConfig", description=MultiModalConfig.__doc__, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 79c292903268..e732f36f0fb5 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field @@ -11,8 +10,7 @@ from PIL import Image import vllm.envs as envs -from vllm.config.multimodal import (ModalityDummyOptions, VideoDummyOptions, - ImageDummyOptions, AudioDummyOptions) +from vllm.config.multimodal import ModalityDummyOptions from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, @@ -102,7 +100,8 @@ def get_dummy_processor_inputs( # Use configurable options to guide dummy data generation if provided if mm_options: - dummy_mm_data = self._get_configurable_dummy_data(seq_len, mm_counts, mm_options) + dummy_mm_data = self._get_configurable_dummy_data( + seq_len, mm_counts, mm_options) else: dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) @@ -119,7 +118,8 @@ def _get_configurable_dummy_data( mm_options: Mapping[str, ModalityDummyOptions], ) -> MultiModalDataDict: """ - Generate dummy data with configurable options using parameter interception. + Generate dummy data with configurable options using parameter + interception. 
""" dummy_data = {} @@ -140,9 +140,12 @@ def _get_configurable_dummy_data( image_opts = mm_options["image"] if hasattr(image_opts, 'max_size') and image_opts.max_size: target_width, target_height = image_opts.max_size - elif hasattr(image_opts, 'width') and hasattr(image_opts, 'height'): - if image_opts.width: target_width = image_opts.width - if image_opts.height: target_height = image_opts.height + elif hasattr(image_opts, 'width') and hasattr( + image_opts, 'height'): + if image_opts.width: + target_width = image_opts.width + if image_opts.height: + target_height = image_opts.height # Simple bounds checking target_width = max(target_width, 1) @@ -152,7 +155,9 @@ def _get_configurable_dummy_data( target_height = min(target_height, default_height) dummy_data["image"] = self._get_dummy_images( - width=target_width, height=target_height, num_images=mm_counts["image"]) + width=target_width, + height=target_height, + num_images=mm_counts["image"]) # Handle videos if "video" in mm_counts and mm_counts["video"] > 0: @@ -164,16 +169,19 @@ def _get_configurable_dummy_data( except (AttributeError, Exception): default_width, default_height = 224, 224 vid_dims_known = False - + vid_frames_known = True try: - default_frames = self.info.get_num_frames_with_most_features(seq_len, mm_counts) + default_frames = self.info.get_num_frames_with_most_features( + seq_len, mm_counts) except (AttributeError, Exception): default_frames = 16 vid_frames_known = False # Override with configurable options if provided - target_width, target_height, target_frames = default_width, default_height, default_frames + target_width, target_height, target_frames = (default_width, + default_height, + default_frames) if "video" in mm_options: video_opts = mm_options["video"] if hasattr(video_opts, 'num_frames') and video_opts.num_frames: @@ -194,8 +202,10 @@ def _get_configurable_dummy_data( target_frames = min(target_frames, default_frames) dummy_data["video"] = self._get_dummy_videos( - width=target_width, height=target_height, - num_frames=target_frames, num_videos=mm_counts["video"]) + width=target_width, + height=target_height, + num_frames=target_frames, + num_videos=mm_counts["video"]) # Handle audio if "audio" in mm_counts and mm_counts["audio"] > 0: @@ -204,8 +214,8 @@ def _get_configurable_dummy_data( feature_extractor = self.info.get_feature_extractor() default_sample_rate = int(feature_extractor.sampling_rate) # Most audio processors expose chunk_length in seconds - default_duration_s = float(getattr(feature_extractor, - "chunk_length", 1)) + default_duration_s = float( + getattr(feature_extractor, "chunk_length", 1)) except (AttributeError, Exception): default_sample_rate = 16000 default_duration_s = 1.0 @@ -216,7 +226,8 @@ def _get_configurable_dummy_data( target_channels = 1 if mm_options and "audio" in mm_options: audio_opts = mm_options["audio"] - if hasattr(audio_opts, 'sample_rate') and audio_opts.sample_rate: + if hasattr(audio_opts, + 'sample_rate') and audio_opts.sample_rate: target_sample_rate = int(audio_opts.sample_rate) if hasattr(audio_opts, 'duration') and audio_opts.duration: target_duration_s = float(audio_opts.duration) @@ -224,8 +235,10 @@ def _get_configurable_dummy_data( target_channels = int(audio_opts.channels) # Compute effective profiling length - length_per_channel = max(1, int(target_sample_rate * target_duration_s)) - effective_length = max(1, length_per_channel * max(1, target_channels)) + length_per_channel = max( + 1, int(target_sample_rate * target_duration_s)) + effective_length = 
max( + 1, length_per_channel * max(1, target_channels)) # Return arrays (consistent with legacy builders) audios = self._get_dummy_audios(length=effective_length, diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 3c0a47cc0ad2..15d0b4d4415a 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,9 +6,8 @@ import torch.nn as nn -from vllm.inputs import InputProcessingContext from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, - VideoDummyOptions) + ModalityDummyOptions, VideoDummyOptions) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) @@ -138,7 +137,7 @@ def get_max_tokens_per_item_by_modality( return {} processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) + profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) @@ -192,7 +191,7 @@ def get_mm_limits_per_prompt( return {} processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) + profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() def register_processor( @@ -288,25 +287,28 @@ def get_decoder_dummy_data( The model is identified by ``model_config``. """ processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) + profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = None + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None if model_config.multimodal_config: mm_options = {} for modality in ["image", "video", "audio"]: - if hasattr(model_config.multimodal_config, 'get_dummy_options'): - options = model_config.multimodal_config.get_dummy_options(modality) + if hasattr(model_config.multimodal_config, + 'get_dummy_options'): + options = model_config.multimodal_config.get_dummy_options( + modality) if isinstance(options, (ImageDummyOptions, VideoDummyOptions, AudioDummyOptions)): mm_options[modality] = options - if not mm_options: + if len(mm_options) == 0: mm_options = None - dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) + dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, + mm_options) # Having more tokens is over-conservative but otherwise fine token_ids = dummy_data.prompt_token_ids @@ -331,25 +333,28 @@ def get_encoder_dummy_data( The model is identified by ``model_config``. """ processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) + profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. 
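The same extraction is repeated below for the encoder path, so a stand-alone sketch of the filtering rule may make the intent clearer. MiniBaseOpts and MiniVideoOpts are stand-ins for BaseModalityOptions and VideoDummyOptions, invented here so the snippet runs without vLLM:

from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class MiniBaseOpts:                 # stand-in for BaseModalityOptions (count only)
    count: int = 1

@dataclass
class MiniVideoOpts(MiniBaseOpts):  # stand-in for VideoDummyOptions
    num_frames: Optional[int] = None

def extract_mm_options(limits: dict[str, Union[int, MiniBaseOpts]]):
    # Only "advanced" option types are forwarded to the profiler; plain ints
    # (and the count-only base options they get wrapped into) are dropped, so
    # legacy count-only limits keep the old profiling behavior.
    mm_options = {m: v for m, v in limits.items() if isinstance(v, MiniVideoOpts)}
    return mm_options or None

limits = {"image": 5, "video": MiniVideoOpts(count=1, num_frames=32)}
print(extract_mm_options(limits))
# -> {'video': MiniVideoOpts(count=1, num_frames=32)}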
- mm_options = None + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None if model_config.multimodal_config: mm_options = {} for modality in ["image", "video", "audio"]: - if hasattr(model_config.multimodal_config, 'get_dummy_options'): - options = model_config.multimodal_config.get_dummy_options(modality) + if hasattr(model_config.multimodal_config, + 'get_dummy_options'): + options = model_config.multimodal_config.get_dummy_options( + modality) if isinstance(options, (ImageDummyOptions, VideoDummyOptions, AudioDummyOptions)): mm_options[modality] = options - if not mm_options: + if len(mm_options) == 0: mm_options = None - dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) + dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, + mm_options) # Having more tokens is over-conservative but otherwise fine token_ids = dummy_data.prompt_token_ids From 122efc9bd9ab3165ea3e576f00b939cd2d02d04e Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 25 Sep 2025 22:59:48 +0000 Subject: [PATCH 08/27] fix lint Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 6023573a9e0f..5eed1681933f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -16,13 +16,13 @@ @standard_dataclass class BaseModalityOptions: """Base class for modality-specific dummy data options.""" - count: int = 1 + count: int = 999 @standard_dataclass class VideoDummyOptions(BaseModalityOptions): """Options for generating dummy video data during profiling.""" - count: int = 1 + count: int = 999 num_frames: Optional[int] = None width: Optional[int] = None height: Optional[int] = None @@ -41,7 +41,7 @@ def __post_init__(self): @standard_dataclass class ImageDummyOptions(BaseModalityOptions): """Options for generating dummy image data during profiling.""" - count: int = 1 + count: int = 999 width: Optional[int] = None height: Optional[int] = None max_size: Optional[tuple[int, int]] = None @@ -58,7 +58,7 @@ def __post_init__(self): @standard_dataclass class AudioDummyOptions(BaseModalityOptions): """Options for generating dummy audio data during profiling.""" - count: int = 1 + count: int = 999 duration: Optional[float] = None sample_rate: Optional[int] = None channels: Optional[int] = None @@ -96,7 +96,7 @@ class MultiModalConfig: limit_per_prompt: LimitPerPromptType = field(default_factory=dict) """The maximum number of input items and options allowed per prompt for each modality. - Defaults to 1 (V0) or 999 (V1) for each modality. + Defaults to 999 for each modality. 
Legacy format (count only): {"image": 16, "video": 2} From c2913ee8923642a6483de2670d036a2ae0f438d8 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 26 Sep 2025 05:40:12 +0000 Subject: [PATCH 09/27] refactor Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 16 ++- vllm/model_executor/models/qwen2_vl.py | 27 +++++ vllm/multimodal/profiling.py | 153 ++----------------------- 3 files changed, 45 insertions(+), 151 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 5eed1681933f..b614ba922143 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -9,7 +9,6 @@ from pydantic.dataclasses import dataclass -import vllm.envs as envs from vllm.config.utils import config @@ -44,7 +43,6 @@ class ImageDummyOptions(BaseModalityOptions): count: int = 999 width: Optional[int] = None height: Optional[int] = None - max_size: Optional[tuple[int, int]] = None def __post_init__(self): if self.count < 0: @@ -60,18 +58,12 @@ class AudioDummyOptions(BaseModalityOptions): """Options for generating dummy audio data during profiling.""" count: int = 999 duration: Optional[float] = None - sample_rate: Optional[int] = None - channels: Optional[int] = None def __post_init__(self): if self.count < 0: raise ValueError("count must be non-negative") if self.duration is not None and self.duration <= 0: raise ValueError("duration must be positive") - if self.sample_rate is not None and self.sample_rate <= 0: - raise ValueError("sample_rate must be positive") - if self.channels is not None and self.channels <= 0: - raise ValueError("channels must be positive") # Union type for all supported option types @@ -193,7 +185,13 @@ def get_limit_per_prompt(self, modality: str) -> int: limit_data = self.limit_per_prompt.get(modality) if limit_data is None: - return 999 if envs.VLLM_USE_V1 else 1 + # If no configuration at all, default to 999 + # If partial configuration exists, + # unspecified modalities should be 0 + if not self.limit_per_prompt: + return 999 + else: + return 0 elif isinstance(limit_data, int): return limit_data elif isinstance(limit_data, BaseModalityOptions): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 14ea03444484..ae1116e0fddd 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -43,6 +43,8 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import (ImageDummyOptions, ModalityDummyOptions, + VideoDummyOptions) from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger @@ -1038,15 +1040,40 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) + # Get model defaults target_width, target_height = \ self.info.get_image_size_with_most_features() target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + # Apply configurable options if provided + if mm_options: + # Handle image configuration + if "image" in mm_options: + image_opts = mm_options["image"] + if isinstance(image_opts, ImageDummyOptions): + if image_opts.width: + target_width = min(target_width, image_opts.width) + if image_opts.height: + target_height = min(target_height, image_opts.height) + + # Handle 
video configuration + if "video" in mm_options: + video_opts = mm_options["video"] + if isinstance(video_opts, VideoDummyOptions): + if video_opts.num_frames: + target_num_frames = min(target_num_frames, + video_opts.num_frames) + if video_opts.width: + target_width = min(target_width, video_opts.width) + if video_opts.height: + target_height = min(target_height, video_opts.height) + return { "image": self._get_dummy_images(width=target_width, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index e732f36f0fb5..18aa5883c406 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -74,10 +74,19 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> MultiModalDataDict: """ Build the multimodal input which, after processing, results in the maximum possible number of placeholder tokens. + + Args: + seq_len: Sequence length + mm_counts: Count of items per modality + mm_options: Configurable options per modality (optional). + If None, use model defaults for backward compatibility. + If provided, models can use these to customize dummy + data generation. """ raise NotImplementedError @@ -98,12 +107,8 @@ def get_dummy_processor_inputs( """ dummy_text = self.get_dummy_text(mm_counts) - # Use configurable options to guide dummy data generation if provided - if mm_options: - dummy_mm_data = self._get_configurable_dummy_data( - seq_len, mm_counts, mm_options) - else: - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + # Use the unified function for both legacy and configurable cases + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) tokenization_kwargs = {"truncation": False} @@ -111,142 +116,6 @@ def get_dummy_processor_inputs( mm_data=dummy_mm_data, tokenization_kwargs=tokenization_kwargs) - def _get_configurable_dummy_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - mm_options: Mapping[str, ModalityDummyOptions], - ) -> MultiModalDataDict: - """ - Generate dummy data with configurable options using parameter - interception. 
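With this refactor a model's dummy-inputs builder receives mm_options directly, and the Qwen2-VL pattern above (clamp the model defaults with min(), fall back to them when no options are given) is what other builders are expected to follow. A rough free-standing sketch of that pattern as of this patch; the function name, the 1024x1024 defaults, and the direct PIL construction are illustrative and not the actual qwen2_vl code:

from collections.abc import Mapping
from typing import Optional

from PIL import Image

from vllm.config.multimodal import ImageDummyOptions, ModalityDummyOptions

def build_dummy_mm_data(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None,
):
    # Assumed model defaults for the sketch.
    target_width, target_height = 1024, 1024
    image_opts = (mm_options or {}).get("image")
    if isinstance(image_opts, ImageDummyOptions):
        if image_opts.width:
            target_width = min(target_width, image_opts.width)
        if image_opts.height:
            target_height = min(target_height, image_opts.height)
    num_images = mm_counts.get("image", 0)
    return {
        "image": [Image.new("RGB", (target_width, target_height), color=255)]
        * num_images
    }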
- """ - dummy_data = {} - - # Handle images - if "image" in mm_counts and mm_counts["image"] > 0: - # Get model defaults - vid_dims_known = True - try: - default_width, default_height = \ - self.info.get_image_size_with_most_features() - except (AttributeError, Exception): - default_width, default_height = 224, 224 - vid_dims_known = False - - # Override with configurable options if provided - target_width, target_height = default_width, default_height - if "image" in mm_options: - image_opts = mm_options["image"] - if hasattr(image_opts, 'max_size') and image_opts.max_size: - target_width, target_height = image_opts.max_size - elif hasattr(image_opts, 'width') and hasattr( - image_opts, 'height'): - if image_opts.width: - target_width = image_opts.width - if image_opts.height: - target_height = image_opts.height - - # Simple bounds checking - target_width = max(target_width, 1) - target_height = max(target_height, 1) - if vid_dims_known: - target_width = min(target_width, default_width) - target_height = min(target_height, default_height) - - dummy_data["image"] = self._get_dummy_images( - width=target_width, - height=target_height, - num_images=mm_counts["image"]) - - # Handle videos - if "video" in mm_counts and mm_counts["video"] > 0: - # Get model defaults - vid_dims_known = True - try: - default_width, default_height = \ - self.info.get_image_size_with_most_features() - except (AttributeError, Exception): - default_width, default_height = 224, 224 - vid_dims_known = False - - vid_frames_known = True - try: - default_frames = self.info.get_num_frames_with_most_features( - seq_len, mm_counts) - except (AttributeError, Exception): - default_frames = 16 - vid_frames_known = False - - # Override with configurable options if provided - target_width, target_height, target_frames = (default_width, - default_height, - default_frames) - if "video" in mm_options: - video_opts = mm_options["video"] - if hasattr(video_opts, 'num_frames') and video_opts.num_frames: - target_frames = video_opts.num_frames - if hasattr(video_opts, 'width') and video_opts.width: - target_width = video_opts.width - if hasattr(video_opts, 'height') and video_opts.height: - target_height = video_opts.height - - # Simple bounds checking - target_width = max(target_width, 1) - target_height = max(target_height, 1) - if vid_dims_known: - target_width = min(target_width, default_width) - target_height = min(target_height, default_height) - target_frames = max(target_frames, 1) - if vid_frames_known: - target_frames = min(target_frames, default_frames) - - dummy_data["video"] = self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=target_frames, - num_videos=mm_counts["video"]) - - # Handle audio - if "audio" in mm_counts and mm_counts["audio"] > 0: - # Get model defaults - try: - feature_extractor = self.info.get_feature_extractor() - default_sample_rate = int(feature_extractor.sampling_rate) - # Most audio processors expose chunk_length in seconds - default_duration_s = float( - getattr(feature_extractor, "chunk_length", 1)) - except (AttributeError, Exception): - default_sample_rate = 16000 - default_duration_s = 1.0 - - # Override with configurable options if provided - target_sample_rate = default_sample_rate - target_duration_s = default_duration_s - target_channels = 1 - if mm_options and "audio" in mm_options: - audio_opts = mm_options["audio"] - if hasattr(audio_opts, - 'sample_rate') and audio_opts.sample_rate: - target_sample_rate = int(audio_opts.sample_rate) - if hasattr(audio_opts, 
'duration') and audio_opts.duration: - target_duration_s = float(audio_opts.duration) - if hasattr(audio_opts, 'channels') and audio_opts.channels: - target_channels = int(audio_opts.channels) - - # Compute effective profiling length - length_per_channel = max( - 1, int(target_sample_rate * target_duration_s)) - effective_length = max( - 1, length_per_channel * max(1, target_channels)) - - # Return arrays (consistent with legacy builders) - audios = self._get_dummy_audios(length=effective_length, - num_audios=mm_counts["audio"]) - dummy_data["audio"] = audios - - return dummy_data - def _get_dummy_audios( self, *, From 57f30c6dbe5c034f5e61a225d13b83c8eb484fdf Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 26 Sep 2025 15:21:44 +0000 Subject: [PATCH 10/27] change default logic Signed-off-by: wwl2755 --- vllm/config/model.py | 14 -------------- vllm/config/multimodal.py | 9 ++------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index f4b60f2baa2e..a2bf5f52af84 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -340,7 +340,6 @@ def compute_hash(self) -> str: return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__( -<<<<<<< HEAD self, # Multimodal config init vars limit_mm_per_prompt: Optional[dict[str, int]], @@ -354,19 +353,6 @@ def __post_init__( skip_mm_profiling: Optional[bool], video_pruning_rate: Optional[float], ) -> None: -======= - self, - # Multimodal config init vars - limit_mm_per_prompt: Optional[LimitPerPromptType], - media_io_kwargs: Optional[dict[str, dict[str, Any]]], - mm_processor_kwargs: Optional[dict[str, Any]], - mm_processor_cache_gb: Optional[float], - mm_processor_cache_type: Optional[MMCacheType], - mm_shm_cache_max_object_size_mb: Optional[int], - mm_encoder_tp_mode: Optional[MMEncoderTPMode], - interleave_mm_strings: Optional[bool], - skip_mm_profiling: Optional[bool]) -> None: ->>>>>>> init # Set the default seed to 0 in V1. 
# NOTE(woosuk): In V0, we set the default seed to None because the # driver worker shares the same process as the user process, and thus diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index b614ba922143..3fd033afd700 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -185,13 +185,8 @@ def get_limit_per_prompt(self, modality: str) -> int: limit_data = self.limit_per_prompt.get(modality) if limit_data is None: - # If no configuration at all, default to 999 - # If partial configuration exists, - # unspecified modalities should be 0 - if not self.limit_per_prompt: - return 999 - else: - return 0 + # Unspecified modality is set to 999 by default + return 999 elif isinstance(limit_data, int): return limit_data elif isinstance(limit_data, BaseModalityOptions): From d85c8e8a976390808d11f6c8c48cff428ceb63fb Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 26 Sep 2025 15:40:53 +0000 Subject: [PATCH 11/27] move mm_option to get_dummy_images Signed-off-by: wwl2755 --- vllm/model_executor/models/qwen2_vl.py | 30 ++++---------------------- vllm/multimodal/profiling.py | 22 ++++++++++++++++++- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ae1116e0fddd..880ebc68a518 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -43,8 +43,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig -from vllm.config.multimodal import (ImageDummyOptions, ModalityDummyOptions, - VideoDummyOptions) +from vllm.config.multimodal import ModalityDummyOptions from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger @@ -1051,40 +1050,19 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) - # Apply configurable options if provided - if mm_options: - # Handle image configuration - if "image" in mm_options: - image_opts = mm_options["image"] - if isinstance(image_opts, ImageDummyOptions): - if image_opts.width: - target_width = min(target_width, image_opts.width) - if image_opts.height: - target_height = min(target_height, image_opts.height) - - # Handle video configuration - if "video" in mm_options: - video_opts = mm_options["video"] - if isinstance(video_opts, VideoDummyOptions): - if video_opts.num_frames: - target_num_frames = min(target_num_frames, - video_opts.num_frames) - if video_opts.width: - target_width = min(target_width, video_opts.width) - if video_opts.height: - target_height = min(target_height, video_opts.height) - return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + mm_options=mm_options), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + mm_options=mm_options, ) } diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 18aa5883c406..fc342751df24 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -10,7 +10,8 @@ from PIL import Image import vllm.envs as envs -from vllm.config.multimodal import ModalityDummyOptions +from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, + ModalityDummyOptions, VideoDummyOptions) from vllm.logger import init_logger from .inputs import 
(MultiModalDataDict, MultiModalEncDecInputs, @@ -121,9 +122,14 @@ def _get_dummy_audios( *, length: int, num_audios: int, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> list[npt.NDArray]: if num_audios == 0: return [] + if mm_options and isinstance( + mm_options["audio"], + AudioDummyOptions) and mm_options["audio"].length: + length = min(length, mm_options["audio"].length) audio = np.zeros((length, )) return [audio] * num_audios @@ -133,9 +139,15 @@ def _get_dummy_images( width: int, height: int, num_images: int, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> list[Image.Image]: if num_images == 0: return [] + if mm_options and isinstance(mm_options["image"], ImageDummyOptions): + if mm_options["image"].width: + width = min(width, mm_options["image"].width) + if mm_options["image"].height: + height = min(height, mm_options["image"].height) image = Image.new("RGB", (width, height), color=255) return [image] * num_images @@ -146,9 +158,17 @@ def _get_dummy_videos( height: int, num_frames: int, num_videos: int, + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> list[npt.NDArray]: if num_videos == 0: return [] + if mm_options and isinstance(mm_options["video"], VideoDummyOptions): + if mm_options["video"].num_frames: + num_frames = min(num_frames, mm_options["video"].num_frames) + if mm_options["video"].width: + width = min(width, mm_options["video"].width) + if mm_options["video"].height: + height = min(height, mm_options["video"].height) video = np.full((num_frames, width, height, 3), 255) return [video] * num_videos From 1462478cd6f1a14f72698a4831cf748f5866d106 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 26 Sep 2025 15:43:38 +0000 Subject: [PATCH 12/27] fix docstring Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 3fd033afd700..a36e149fd05b 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -94,11 +94,12 @@ class MultiModalConfig: {"image": 16, "video": 2} Configurable format (with options): - {"video": {"count": 1, "num_frames": 32}, - "image": {"count": 5, "max_size": [512, 512]}} + {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, + "image": {"count": 5, "width": 512, "height": 512}} Mixed format (combining both): - {"image": 16, "video": {"count": 1, "num_frames": 32}} + {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, + "height": 512}} """ media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) """Additional args passed to process media inputs, keyed by modalities. 
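The docstring fixed above is the user-facing contract for --limit-mm-per-prompt; a small sketch of how such a value flows through the CLI parser introduced earlier in the series. The concrete JSON is just an example, and the expected results reflect behavior at this point of the series (legacy ints pass through unchanged; the normalization to option objects comes in a later patch):

from vllm.config.multimodal import VideoDummyOptions
from vllm.engine.arg_utils import parse_limit_mm_per_prompt

# Mixed format: a legacy count for images plus configurable video options.
limits = parse_limit_mm_per_prompt(
    '{"image": 5, '
    '"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}}')

assert limits["image"] == 5
assert limits["video"] == VideoDummyOptions(
    count=1, num_frames=32, width=512, height=512)

The same JSON string is what the --limit-mm-per-prompt flag receives on the command line, since add_cli_args wires this parser in as the option's type.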
From f2c4e9ae51d96aa8684926d851c70828fae415c7 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 26 Sep 2025 15:51:19 +0000 Subject: [PATCH 13/27] fix pre-commit Signed-off-by: wwl2755 --- vllm/multimodal/profiling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index fc342751df24..417125130f6d 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -128,8 +128,8 @@ def _get_dummy_audios( return [] if mm_options and isinstance( mm_options["audio"], - AudioDummyOptions) and mm_options["audio"].length: - length = min(length, mm_options["audio"].length) + AudioDummyOptions) and mm_options["audio"].duration: + length = min(length, mm_options["audio"].duration) audio = np.zeros((length, )) return [audio] * num_audios From 9c902b321437aac4b40f33f2d90ec256535d7a93 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Fri, 26 Sep 2025 16:00:30 +0000 Subject: [PATCH 14/27] fix pre-commit Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 6 +++--- vllm/multimodal/profiling.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index a36e149fd05b..9a0e439869ba 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -57,13 +57,13 @@ def __post_init__(self): class AudioDummyOptions(BaseModalityOptions): """Options for generating dummy audio data during profiling.""" count: int = 999 - duration: Optional[float] = None + length: Optional[int] = None def __post_init__(self): if self.count < 0: raise ValueError("count must be non-negative") - if self.duration is not None and self.duration <= 0: - raise ValueError("duration must be positive") + if self.length is not None and self.length <= 0: + raise ValueError("length must be positive") # Union type for all supported option types diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 417125130f6d..fc342751df24 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -128,8 +128,8 @@ def _get_dummy_audios( return [] if mm_options and isinstance( mm_options["audio"], - AudioDummyOptions) and mm_options["audio"].duration: - length = min(length, mm_options["audio"].duration) + AudioDummyOptions) and mm_options["audio"].length: + length = min(length, mm_options["audio"].length) audio = np.zeros((length, )) return [audio] * num_audios From c93672c04b15317205b7a946641e9e6a48e63fe7 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Sat, 27 Sep 2025 00:29:37 +0000 Subject: [PATCH 15/27] add qwen2_audio and refactor limit_mm_per_prompt Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 44 ++++++------------ vllm/engine/arg_utils.py | 40 +++++++++-------- vllm/model_executor/models/qwen2_audio.py | 6 ++- vllm/multimodal/profiling.py | 7 +-- vllm/multimodal/registry.py | 55 +++++++++++------------ 5 files changed, 69 insertions(+), 83 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 9a0e439869ba..ecc2555f8472 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -13,13 +13,7 @@ @standard_dataclass -class BaseModalityOptions: - """Base class for modality-specific dummy data options.""" - count: int = 999 - - -@standard_dataclass -class VideoDummyOptions(BaseModalityOptions): +class VideoDummyOptions: """Options for generating dummy video data during profiling.""" count: int = 999 num_frames: Optional[int] = None @@ -38,7 +32,7 @@ def __post_init__(self): @standard_dataclass 
-class ImageDummyOptions(BaseModalityOptions): +class ImageDummyOptions: """Options for generating dummy image data during profiling.""" count: int = 999 width: Optional[int] = None @@ -54,7 +48,7 @@ def __post_init__(self): @standard_dataclass -class AudioDummyOptions(BaseModalityOptions): +class AudioDummyOptions: """Options for generating dummy audio data during profiling.""" count: int = 999 length: Optional[int] = None @@ -67,14 +61,12 @@ def __post_init__(self): # Union type for all supported option types -ModalityDummyOptions = Union[BaseModalityOptions, VideoDummyOptions, - ImageDummyOptions, AudioDummyOptions] +ModalityDummyOptions = Union[VideoDummyOptions, ImageDummyOptions, + AudioDummyOptions] -# Main configuration type supporting both legacy and configurable formats -LimitPerPromptType = Union[ - dict[str, int], # Legacy: {"video": 1, "image": 5} - dict[str, Union[int, ModalityDummyOptions]] # Configurable format -] +# Configuration type - all values normalized to ModalityDummyOptions +# at initialization +LimitPerPromptType = dict[str, ModalityDummyOptions] MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] @@ -188,9 +180,9 @@ def get_limit_per_prompt(self, modality: str) -> int: if limit_data is None: # Unspecified modality is set to 999 by default return 999 - elif isinstance(limit_data, int): - return limit_data - elif isinstance(limit_data, BaseModalityOptions): + elif isinstance( + limit_data, + (VideoDummyOptions, ImageDummyOptions, AudioDummyOptions)): return limit_data.count else: raise ValueError( @@ -200,18 +192,10 @@ def get_dummy_options(self, modality: str) -> Optional[ModalityDummyOptions]: """ Get the configurable dummy data options for a modality. - Returns None if no configurable options are configured. + Returns None if no options are configured for this modality. 
""" - limit_data = self.limit_per_prompt.get(modality) - - if isinstance(limit_data, (BaseModalityOptions, VideoDummyOptions, - ImageDummyOptions, AudioDummyOptions)): - return limit_data - elif isinstance(limit_data, int): - # Convert legacy format to base options - return BaseModalityOptions(count=limit_data) - else: - return None + # All values are now ModalityDummyOptions after normalization + return self.limit_per_prompt.get(modality) def merge_mm_processor_kwargs( self, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ab0b84c416e2..cf60f1e8c3da 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -102,8 +102,8 @@ def parse_limit_mm_per_prompt(val: str): """Parse limit-mm-per-prompt with support for configurable options.""" import json - from vllm.config.multimodal import (AudioDummyOptions, BaseModalityOptions, - ImageDummyOptions, VideoDummyOptions) + from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, + VideoDummyOptions) try: parsed = json.loads(val) @@ -116,28 +116,30 @@ def parse_limit_mm_per_prompt(val: str): result = {} for modality, options in parsed.items(): + # Normalize int to dict first if isinstance(options, int): - # Legacy format - result[modality] = options - elif isinstance(options, dict): - # Configurable format - convert to appropriate dataclass - try: - if modality == "video": - result[modality] = VideoDummyOptions(**options) - elif modality == "image": - result[modality] = ImageDummyOptions(**options) - elif modality == "audio": - result[modality] = AudioDummyOptions(**options) - else: - # Unknown modality, use base options - result[modality] = BaseModalityOptions(**options) - except TypeError as e: - raise ValueError(f"Invalid options for {modality}: {e}") from e - else: + options = {"count": options} + elif not isinstance(options, dict): raise ValueError( f"Invalid options type for {modality}: {type(options)}. " f"Must be int or dict.") + # Single path: create ModalityDummyOptions from dict + try: + if modality == "video": + result[modality] = VideoDummyOptions(**options) + elif modality == "image": + result[modality] = ImageDummyOptions(**options) + elif modality == "audio": + result[modality] = AudioDummyOptions(**options) + else: + # Unknown modality, raise error + raise ValueError( + f"Unknown modality '{modality}'. 
" + "Supported modalities are: image, video, audio") + except TypeError as e: + raise ValueError(f"Invalid options for {modality}: {e}") from e + return result diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 762ab42e5929..4b5aa61cfbf5 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -34,6 +34,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig +from vllm.config.multimodal import ModalityDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (AudioItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -145,6 +146,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() @@ -154,7 +156,9 @@ def get_dummy_mm_data( return { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + mm_options=mm_options) } diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index fc342751df24..5b3d7541142a 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -290,7 +290,6 @@ def _get_mm_max_tokens( seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, mm_embeddings_only: bool = True, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ) -> Mapping[str, int]: if mm_counts is None: mm_counts = self.get_mm_limits() @@ -302,7 +301,7 @@ def _get_mm_max_tokens( if max_tokens_per_item is not None: return max_tokens_per_item - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) return self._get_mm_num_tokens(mm_inputs, mm_embeddings_only=mm_embeddings_only) @@ -310,7 +309,6 @@ def get_mm_max_contiguous_tokens( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, ): """ Returns the maximum length of the multimodal (image placeholders+text) @@ -325,5 +323,4 @@ def get_mm_max_contiguous_tokens( return self._get_mm_max_tokens(seq_len, mm_counts, - mm_embeddings_only=False, - mm_options=mm_options) + mm_embeddings_only=False) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 15d0b4d4415a..2620f63908da 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -97,6 +97,31 @@ def __init__(self) -> None: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() + def _extract_mm_options( + self, + model_config: "ModelConfig", + ) -> Optional[Mapping[str, ModalityDummyOptions]]: + """ + Extract multimodal dummy options from model config. + + Returns None if no configurable options are found, otherwise returns + a mapping of modality names to their dummy options. 
+ """ + if not model_config.multimodal_config: + return None + + mm_options = {} + for modality in ["image", "video", "audio"]: + if hasattr(model_config.multimodal_config, 'get_dummy_options'): + options = model_config.multimodal_config.get_dummy_options( + modality) + if isinstance( + options, + (ImageDummyOptions, VideoDummyOptions, AudioDummyOptions)): + mm_options[modality] = options + + return mm_options if len(mm_options) > 0 else None + def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. @@ -292,20 +317,7 @@ def get_decoder_dummy_data( # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None - if model_config.multimodal_config: - mm_options = {} - for modality in ["image", "video", "audio"]: - if hasattr(model_config.multimodal_config, - 'get_dummy_options'): - options = model_config.multimodal_config.get_dummy_options( - modality) - if isinstance(options, - (ImageDummyOptions, VideoDummyOptions, - AudioDummyOptions)): - mm_options[modality] = options - if len(mm_options) == 0: - mm_options = None + mm_options = self._extract_mm_options(model_config) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) @@ -338,20 +350,7 @@ def get_encoder_dummy_data( # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None - if model_config.multimodal_config: - mm_options = {} - for modality in ["image", "video", "audio"]: - if hasattr(model_config.multimodal_config, - 'get_dummy_options'): - options = model_config.multimodal_config.get_dummy_options( - modality) - if isinstance(options, - (ImageDummyOptions, VideoDummyOptions, - AudioDummyOptions)): - mm_options[modality] = options - if len(mm_options) == 0: - mm_options = None + mm_options = self._extract_mm_options(model_config) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) From 7ca9c7fe559f268c6f31068c1b250b77b35aefb8 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Sat, 27 Sep 2025 00:40:36 +0000 Subject: [PATCH 16/27] pass only one modality each time and rename Signed-off-by: wwl2755 --- vllm/model_executor/models/qwen2_audio.py | 4 ++- vllm/model_executor/models/qwen2_vl.py | 7 +++-- vllm/multimodal/profiling.py | 36 +++++++++++------------ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 4b5aa61cfbf5..0ad0e1afd55d 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -154,11 +154,13 @@ def get_dummy_mm_data( audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": self._get_dummy_audios(length=audio_len, num_audios=num_audios, - mm_options=mm_options) + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 880ebc68a518..0e421eed3a57 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1050,19 +1050,22 @@ def get_dummy_mm_data( target_num_frames = \ 
self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, num_images=num_images, - mm_options=mm_options), + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, - mm_options=mm_options, + overrides=video_overrides, ) } diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 5b3d7541142a..c69205286732 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -122,14 +122,12 @@ def _get_dummy_audios( *, length: int, num_audios: int, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + overrides: Optional[AudioDummyOptions] = None, ) -> list[npt.NDArray]: if num_audios == 0: return [] - if mm_options and isinstance( - mm_options["audio"], - AudioDummyOptions) and mm_options["audio"].length: - length = min(length, mm_options["audio"].length) + if overrides and overrides.length: + length = min(length, overrides.length) audio = np.zeros((length, )) return [audio] * num_audios @@ -139,15 +137,15 @@ def _get_dummy_images( width: int, height: int, num_images: int, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + overrides: Optional[ImageDummyOptions] = None, ) -> list[Image.Image]: if num_images == 0: return [] - if mm_options and isinstance(mm_options["image"], ImageDummyOptions): - if mm_options["image"].width: - width = min(width, mm_options["image"].width) - if mm_options["image"].height: - height = min(height, mm_options["image"].height) + if overrides: + if overrides.width: + width = min(width, overrides.width) + if overrides.height: + height = min(height, overrides.height) image = Image.new("RGB", (width, height), color=255) return [image] * num_images @@ -158,17 +156,17 @@ def _get_dummy_videos( height: int, num_frames: int, num_videos: int, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + overrides: Optional[VideoDummyOptions] = None, ) -> list[npt.NDArray]: if num_videos == 0: return [] - if mm_options and isinstance(mm_options["video"], VideoDummyOptions): - if mm_options["video"].num_frames: - num_frames = min(num_frames, mm_options["video"].num_frames) - if mm_options["video"].width: - width = min(width, mm_options["video"].width) - if mm_options["video"].height: - height = min(height, mm_options["video"].height) + if overrides: + if overrides.num_frames: + num_frames = min(num_frames, overrides.num_frames) + if overrides.width: + width = min(width, overrides.width) + if overrides.height: + height = min(height, overrides.height) video = np.full((num_frames, width, height, 3), 255) return [video] * num_videos From 9142c3650c1f5bfe61d54a909c3025a2413fa761 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Sat, 27 Sep 2025 05:48:20 +0000 Subject: [PATCH 17/27] preserve compatibility for OOT models Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 14 ++++++++++++-- vllm/engine/arg_utils.py | 10 ++++------ vllm/multimodal/registry.py | 24 +++++++++++++----------- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index ecc2555f8472..8a3df44a5665 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -12,6 +12,16 @@ from vllm.config.utils import config +@standard_dataclass +class 
BaseDummyOptions: + """Base options for generating dummy data during profiling.""" + count: int = 999 + + def __post_init__(self): + if self.count < 0: + raise ValueError("count must be non-negative") + + @standard_dataclass class VideoDummyOptions: """Options for generating dummy video data during profiling.""" @@ -61,8 +71,8 @@ def __post_init__(self): # Union type for all supported option types -ModalityDummyOptions = Union[VideoDummyOptions, ImageDummyOptions, - AudioDummyOptions] +ModalityDummyOptions = Union[BaseDummyOptions, VideoDummyOptions, + ImageDummyOptions, AudioDummyOptions] # Configuration type - all values normalized to ModalityDummyOptions # at initialization diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cf60f1e8c3da..a8977f062969 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -102,8 +102,8 @@ def parse_limit_mm_per_prompt(val: str): """Parse limit-mm-per-prompt with support for configurable options.""" import json - from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, - VideoDummyOptions) + from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) try: parsed = json.loads(val) @@ -133,10 +133,8 @@ def parse_limit_mm_per_prompt(val: str): elif modality == "audio": result[modality] = AudioDummyOptions(**options) else: - # Unknown modality, raise error - raise ValueError( - f"Unknown modality '{modality}'. " - "Supported modalities are: image, video, audio") + # Unknown modality - use BaseDummyOptions for OOT models + result[modality] = BaseDummyOptions(**options) except TypeError as e: raise ValueError(f"Invalid options for {modality}: {e}") from e diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 2620f63908da..321a97530bb3 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,8 +6,9 @@ import torch.nn as nn -from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, - ModalityDummyOptions, VideoDummyOptions) +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, ModalityDummyOptions, + VideoDummyOptions) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) @@ -54,7 +55,7 @@ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: ... 
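The fallback to BaseDummyOptions above is what keeps out-of-tree (OOT) models with custom modalities working instead of being rejected at parse time. A short sketch of the expected parsing behavior as of this patch; "point_cloud" is a hypothetical OOT modality used only for illustration:

from vllm.config.multimodal import BaseDummyOptions, ImageDummyOptions
from vllm.engine.arg_utils import parse_limit_mm_per_prompt

limits = parse_limit_mm_per_prompt('{"image": 4, "point_cloud": {"count": 2}}')

# Known modalities get their specific option class; unknown ones fall back to
# the count-only base options.
assert isinstance(limits["image"], ImageDummyOptions)
assert limits["image"].count == 4
assert isinstance(limits["point_cloud"], BaseDummyOptions)
assert limits["point_cloud"].count == 2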
-class MultiModalProcessorFactory(Protocol[_I]): +class MultiModalProcessorFactory(Protocol[_I]): # type: ignore[misc] """ Constructs a [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor] @@ -111,14 +112,15 @@ def _extract_mm_options( return None mm_options = {} - for modality in ["image", "video", "audio"]: - if hasattr(model_config.multimodal_config, 'get_dummy_options'): - options = model_config.multimodal_config.get_dummy_options( - modality) - if isinstance( - options, - (ImageDummyOptions, VideoDummyOptions, AudioDummyOptions)): - mm_options[modality] = options + # Extract options for ALL modalities in the config, + # not just hardcoded ones + # This supports OOT models with custom modalities + for modality in model_config.multimodal_config.limit_per_prompt: + options = model_config.multimodal_config.get_dummy_options( + modality) + if isinstance(options, (BaseDummyOptions, ImageDummyOptions, + VideoDummyOptions, AudioDummyOptions)): + mm_options[modality] = options return mm_options if len(mm_options) > 0 else None From 2e0ee53ae23159463eef3d596c9a027d65c5e81f Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 1 Oct 2025 03:34:21 +0000 Subject: [PATCH 18/27] fix comments Signed-off-by: wwl2755 --- vllm/config/multimodal.py | 45 +++++++------------------- vllm/engine/arg_utils.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 1 - vllm/multimodal/profiling.py | 30 +++++++++++++++++ vllm/multimodal/registry.py | 18 +++-------- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 8a3df44a5665..868aee318c1f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from dataclasses import dataclass as standard_dataclass from dataclasses import field -from typing import Any, Literal, Optional, Union +from typing import Any, Literal, Optional from pydantic.dataclasses import dataclass @@ -23,16 +23,14 @@ def __post_init__(self): @standard_dataclass -class VideoDummyOptions: +class VideoDummyOptions(BaseDummyOptions): """Options for generating dummy video data during profiling.""" - count: int = 999 num_frames: Optional[int] = None width: Optional[int] = None height: Optional[int] = None def __post_init__(self): - if self.count < 0: - raise ValueError("count must be non-negative") + super().__post_init__() if self.num_frames is not None and self.num_frames <= 0: raise ValueError("num_frames must be positive") if self.width is not None and self.width <= 0: @@ -42,15 +40,13 @@ def __post_init__(self): @standard_dataclass -class ImageDummyOptions: +class ImageDummyOptions(BaseDummyOptions): """Options for generating dummy image data during profiling.""" - count: int = 999 width: Optional[int] = None height: Optional[int] = None def __post_init__(self): - if self.count < 0: - raise ValueError("count must be non-negative") + super().__post_init__() if self.width is not None and self.width <= 0: raise ValueError("width must be positive") if self.height is not None and self.height <= 0: @@ -58,26 +54,16 @@ def __post_init__(self): @standard_dataclass -class AudioDummyOptions: +class AudioDummyOptions(BaseDummyOptions): """Options for generating dummy audio data during profiling.""" - count: int = 999 length: Optional[int] = None def __post_init__(self): - if self.count < 0: - raise ValueError("count must be non-negative") + super().__post_init__() if self.length is not None and self.length <= 0: raise ValueError("length must be 
positive") -# Union type for all supported option types -ModalityDummyOptions = Union[BaseDummyOptions, VideoDummyOptions, - ImageDummyOptions, AudioDummyOptions] - -# Configuration type - all values normalized to ModalityDummyOptions -# at initialization -LimitPerPromptType = dict[str, ModalityDummyOptions] - MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] @@ -87,7 +73,7 @@ def __post_init__(self): class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: LimitPerPromptType = field(default_factory=dict) + limit_per_prompt: dict[str, BaseDummyOptions] = field(default_factory=dict) """The maximum number of input items and options allowed per prompt for each modality. Defaults to 999 for each modality. @@ -190,21 +176,14 @@ def get_limit_per_prompt(self, modality: str) -> int: if limit_data is None: # Unspecified modality is set to 999 by default return 999 - elif isinstance( - limit_data, - (VideoDummyOptions, ImageDummyOptions, AudioDummyOptions)): - return limit_data.count - else: - raise ValueError( - f"Invalid limit data type for {modality}: {type(limit_data)}") - - def get_dummy_options(self, - modality: str) -> Optional[ModalityDummyOptions]: + return limit_data.count + + def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]: """ Get the configurable dummy data options for a modality. Returns None if no options are configured for this modality. """ - # All values are now ModalityDummyOptions after normalization + # All values are now DummyOptions after normalization return self.limit_per_prompt.get(modality) def merge_mm_processor_kwargs( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a8977f062969..ea7f911b5f69 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -136,7 +136,7 @@ def parse_limit_mm_per_prompt(val: str): # Unknown modality - use BaseDummyOptions for OOT models result[modality] = BaseDummyOptions(**options) except TypeError as e: - raise ValueError(f"Invalid options for {modality}: {e}") from e + raise ValueError(f"Invalid options for {modality}") from e return result diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0e421eed3a57..c365accea0d9 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1044,7 +1044,6 @@ def get_dummy_mm_data( num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - # Get model defaults target_width, target_height = \ self.info.get_image_size_with_most_features() target_num_frames = \ diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index c69205286732..307c27dd990e 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -127,6 +127,11 @@ def _get_dummy_audios( if num_audios == 0: return [] if overrides and overrides.length: + if overrides.length > length: + logger.warning( + "audio.length override (%d) exceeds model's " + "maximum length (%d), will be ignored", overrides.length, + length) length = min(length, overrides.length) audio = np.zeros((length, )) return [audio] * num_audios @@ -143,8 +148,18 @@ def _get_dummy_images( return [] if overrides: if overrides.width: + if overrides.width > width: + logger.warning( + "image.width override (%d) exceeds model's " + "maximum width (%d), will be ignored", overrides.width, + width) width = min(width, overrides.width) if overrides.height: + if overrides.height > height: + logger.warning( + "image.height 
override (%d) exceeds model's " + "maximum height (%d), will be ignored", + overrides.height, height) height = min(height, overrides.height) image = Image.new("RGB", (width, height), color=255) return [image] * num_images @@ -162,10 +177,25 @@ def _get_dummy_videos( return [] if overrides: if overrides.num_frames: + if overrides.num_frames > num_frames: + logger.warning( + "video.num_frames override (%d) exceeds model's " + "maximum number of frames (%d), will be ignored", + overrides.num_frames, num_frames) num_frames = min(num_frames, overrides.num_frames) if overrides.width: + if overrides.width > width: + logger.warning( + "video.width override (%d) exceeds model's " + "maximum width (%d), will be ignored", overrides.width, + width) width = min(width, overrides.width) if overrides.height: + if overrides.height > height: + logger.warning( + "video.height override (%d) exceeds model's " + "maximum height (%d), will be ignored", + overrides.height, height) height = min(height, overrides.height) video = np.full((num_frames, width, height, 3), 255) return [video] * num_videos diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 321a97530bb3..ef83e82da651 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,9 +6,7 @@ import torch.nn as nn -from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, - ImageDummyOptions, ModalityDummyOptions, - VideoDummyOptions) +from vllm.config.multimodal import ModalityDummyOptions from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) @@ -111,16 +109,10 @@ def _extract_mm_options( if not model_config.multimodal_config: return None - mm_options = {} - # Extract options for ALL modalities in the config, - # not just hardcoded ones - # This supports OOT models with custom modalities - for modality in model_config.multimodal_config.limit_per_prompt: - options = model_config.multimodal_config.get_dummy_options( - modality) - if isinstance(options, (BaseDummyOptions, ImageDummyOptions, - VideoDummyOptions, AudioDummyOptions)): - mm_options[modality] = options + mm_options = { + m: model_config.multimodal_config.get_dummy_options(m) + for m in model_config.multimodal_config.limit_per_prompt + } return mm_options if len(mm_options) > 0 else None From 37c1c491820d61afd34d0577569edb7c70a74a39 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 1 Oct 2025 03:53:50 +0000 Subject: [PATCH 19/27] fix mypy Signed-off-by: wwl2755 --- vllm/engine/arg_utils.py | 2 +- vllm/model_executor/models/qwen2_audio.py | 4 ++-- vllm/model_executor/models/qwen2_vl.py | 4 ++-- vllm/multimodal/profiling.py | 14 +++++++------- vllm/multimodal/registry.py | 8 +++++--- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ea7f911b5f69..0077d8401f31 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -124,7 +124,7 @@ def parse_limit_mm_per_prompt(val: str): f"Invalid options type for {modality}: {type(options)}. 
" f"Must be int or dict.") - # Single path: create ModalityDummyOptions from dict + # Single path: create BaseDummyOptions from dict try: if modality == "video": result[modality] = VideoDummyOptions(**options) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0ad0e1afd55d..74f8ec9b2de4 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -34,7 +34,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig -from vllm.config.multimodal import ModalityDummyOptions +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (AudioItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -146,7 +146,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index c365accea0d9..fe8033446517 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -43,7 +43,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig -from vllm.config.multimodal import ModalityDummyOptions +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger @@ -1039,7 +1039,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 307c27dd990e..74dc2314d2eb 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -10,8 +10,8 @@ from PIL import Image import vllm.envs as envs -from vllm.config.multimodal import (AudioDummyOptions, ImageDummyOptions, - ModalityDummyOptions, VideoDummyOptions) +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, @@ -75,7 +75,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: """ Build the multimodal input which, after processing, results in @@ -95,7 +95,7 @@ def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> ProcessorInputs: """ Build the input which, after processing, results in @@ -229,7 +229,7 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalInputs: if mm_counts is 
None: mm_counts = self.get_mm_limits() @@ -263,7 +263,7 @@ def get_encoder_dummy_data( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> DummyEncoderData: mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) mm_inputs = cast(MultiModalEncDecInputs, mm_inputs) @@ -297,7 +297,7 @@ def get_decoder_dummy_data( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, - mm_options: Optional[Mapping[str, ModalityDummyOptions]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> DummyDecoderData: mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index ef83e82da651..24d3baa9b4e7 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,7 @@ import torch.nn as nn -from vllm.config.multimodal import ModalityDummyOptions +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) @@ -99,7 +99,7 @@ def __init__(self) -> None: def _extract_mm_options( self, model_config: "ModelConfig", - ) -> Optional[Mapping[str, ModalityDummyOptions]]: + ) -> Optional[Mapping[str, BaseDummyOptions]]: """ Extract multimodal dummy options from model config. @@ -110,8 +110,10 @@ def _extract_mm_options( return None mm_options = { - m: model_config.multimodal_config.get_dummy_options(m) + m: opt for m in model_config.multimodal_config.limit_per_prompt + if (opt := model_config.multimodal_config.get_dummy_options(m) + ) is not None } return mm_options if len(mm_options) > 0 else None From 5347ed9f639b7568ce0c10f4db0c7d4f9be11000 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 1 Oct 2025 03:58:23 +0000 Subject: [PATCH 20/27] fix doc-build Signed-off-by: wwl2755 --- vllm/config/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index a2bf5f52af84..da01d6d4480c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -16,8 +16,8 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE import vllm.envs as envs -from vllm.config.multimodal import (LimitPerPromptType, MMCacheType, - MMEncoderTPMode, MultiModalConfig) +from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, + MultiModalConfig) from vllm.config.pooler import PoolerConfig from vllm.config.utils import assert_hashable, config from vllm.logger import init_logger @@ -274,7 +274,7 @@ class ModelConfig: multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. 
If `None`, this will be inferred from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[LimitPerPromptType]] = None + limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None mm_processor_cache_gb: InitVar[Optional[float]] = None From 79a906850a55a98880ed39769e2ff8fea23b4b39 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 1 Oct 2025 04:21:46 +0000 Subject: [PATCH 21/27] fix type mistake Signed-off-by: wwl2755 --- vllm/config/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index da01d6d4480c..51d24ebc7942 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -16,8 +16,8 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE import vllm.envs as envs -from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, - MultiModalConfig) +from vllm.config.multimodal import (BaseDummyOptions, MMCacheType, + MMEncoderTPMode, MultiModalConfig) from vllm.config.pooler import PoolerConfig from vllm.config.utils import assert_hashable, config from vllm.logger import init_logger @@ -274,7 +274,7 @@ class ModelConfig: multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. If `None`, this will be inferred from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + limit_mm_per_prompt: InitVar[Optional[dict[str, BaseDummyOptions]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None mm_processor_cache_gb: InitVar[Optional[float]] = None From 6a6f0ab9e3e973256032ca09e23d67bba5536515 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:22:59 +0200 Subject: [PATCH 22/27] Use pydantic to validate the new classes Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/multimodal.py | 55 ++++++++++++--------------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 868aee318c1f..807e11140a2c 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -3,69 +3,46 @@ import hashlib from collections.abc import Mapping -from dataclasses import dataclass as standard_dataclass from dataclasses import field -from typing import Any, Literal, Optional +from typing import Any, Literal, Optional, Union +from pydantic import ConfigDict, Field from pydantic.dataclasses import dataclass from vllm.config.utils import config -@standard_dataclass +@dataclass class BaseDummyOptions: """Base options for generating dummy data during profiling.""" - count: int = 999 - - def __post_init__(self): - if self.count < 0: - raise ValueError("count must be non-negative") + count: int = Field(999, ge=0) -@standard_dataclass +@dataclass(config=ConfigDict(extra="forbid")) class VideoDummyOptions(BaseDummyOptions): """Options for generating dummy video data during profiling.""" - num_frames: Optional[int] = None - width: Optional[int] = None - height: Optional[int] = None - - def __post_init__(self): - super().__post_init__() - if self.num_frames is not None and self.num_frames <= 0: - raise ValueError("num_frames must be positive") - if self.width is not None and self.width <= 0: - raise ValueError("width must be positive") 
- if self.height is not None and self.height <= 0: - raise ValueError("height must be positive") + num_frames: Optional[int] = Field(None, gt=0) + width: Optional[int] = Field(None, gt=0) + height: Optional[int] = Field(None, gt=0) -@standard_dataclass +@dataclass(config=ConfigDict(extra="forbid")) class ImageDummyOptions(BaseDummyOptions): """Options for generating dummy image data during profiling.""" - width: Optional[int] = None - height: Optional[int] = None + width: Optional[int] = Field(None, gt=0) + height: Optional[int] = Field(None, gt=0) - def __post_init__(self): - super().__post_init__() - if self.width is not None and self.width <= 0: - raise ValueError("width must be positive") - if self.height is not None and self.height <= 0: - raise ValueError("height must be positive") - -@standard_dataclass +@dataclass(config=ConfigDict(extra="forbid")) class AudioDummyOptions(BaseDummyOptions): """Options for generating dummy audio data during profiling.""" - length: Optional[int] = None - - def __post_init__(self): - super().__post_init__() - if self.length is not None and self.length <= 0: - raise ValueError("length must be positive") + length: Optional[int] = Field(None, gt=0) MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] +DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions, + AudioDummyOptions] @config @@ -73,7 +50,7 @@ def __post_init__(self): class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: dict[str, BaseDummyOptions] = field(default_factory=dict) + limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict) """The maximum number of input items and options allowed per prompt for each modality. Defaults to 999 for each modality. From d8e3872d13ce7900c5d44ace0cbd684a2d631de8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:26:35 +0200 Subject: [PATCH 23/27] Validate `limit_per_prompt` inside `MultiModalConfig` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 8 +++--- vllm/config/multimodal.py | 23 +++++++++++++++- vllm/engine/arg_utils.py | 57 +-------------------------------------- 3 files changed, 28 insertions(+), 60 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index ee9cc026c026..f367c9ae0e40 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -16,8 +16,8 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE import vllm.envs as envs -from vllm.config.multimodal import (BaseDummyOptions, MMCacheType, - MMEncoderTPMode, MultiModalConfig) +from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, + MultiModalConfig) from vllm.config.pooler import PoolerConfig from vllm.config.utils import assert_hashable, config from vllm.logger import init_logger @@ -277,7 +277,9 @@ class ModelConfig: multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. 
If `None`, this will be inferred from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[dict[str, BaseDummyOptions]]] = None + limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int, + dict[str, + int]]]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None mm_processor_cache_gb: InitVar[Optional[float]] = None diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 807e11140a2c..fd62d2411ade 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -6,7 +6,7 @@ from dataclasses import field from typing import Any, Literal, Optional, Union -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, field_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -124,6 +124,27 @@ class MultiModalConfig: from each video to be pruned. """ + @field_validator("limit_per_prompt", mode="before") + @classmethod + def _validate_limit_per_prompt( + cls, value: dict[str, Union[int, + dict[str, + int]]]) -> dict[str, DummyOptions]: + for k, v in value.items(): + # Handle legacy format where only count is specified + if isinstance(v, int): + v = {"count": v} + # Convert to the appropriate DummyOptions subclass + if k == "video": + value[k] = VideoDummyOptions(**v) + elif k == "image": + value[k] = ImageDummyOptions(**v) + elif k == "audio": + value[k] = AudioDummyOptions(**v) + else: + value[k] = BaseDummyOptions(**v) + return value + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0d10d424879a..678f06f2502f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -98,49 +98,6 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]: return optional_type(json.loads)(val) -def parse_limit_mm_per_prompt(val: str): - """Parse limit-mm-per-prompt with support for configurable options.""" - import json - - from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, - ImageDummyOptions, VideoDummyOptions) - - try: - parsed = json.loads(val) - except json.JSONDecodeError as e: - raise ValueError( - f"Invalid JSON format for --limit-mm-per-prompt: {e}") from e - - if not isinstance(parsed, dict): - raise ValueError("--limit-mm-per-prompt must be a JSON object") - - result = {} - for modality, options in parsed.items(): - # Normalize int to dict first - if isinstance(options, int): - options = {"count": options} - elif not isinstance(options, dict): - raise ValueError( - f"Invalid options type for {modality}: {type(options)}. 
" - f"Must be int or dict.") - - # Single path: create BaseDummyOptions from dict - try: - if modality == "video": - result[modality] = VideoDummyOptions(**options) - elif modality == "image": - result[modality] = ImageDummyOptions(**options) - elif modality == "audio": - result[modality] = AudioDummyOptions(**options) - else: - # Unknown modality - use BaseDummyOptions for OOT models - result[modality] = BaseDummyOptions(**options) - except TypeError as e: - raise ValueError(f"Invalid options for {modality}") from e - - return result - - def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]: """Check if the type hint is a specific type.""" return type_hint is type or get_origin(type_hint) is type @@ -419,7 +376,7 @@ class EngineArgs: quantization: Optional[QuantizationMethods] = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce - limit_mm_per_prompt: dict[str, int] = \ + limit_mm_per_prompt: dict[str, Union[int, dict[str, int]]] = \ get_field(MultiModalConfig, "limit_per_prompt") interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings media_io_kwargs: dict[str, dict[str, @@ -830,18 +787,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # Multimodal related configs multimodal_kwargs = get_kwargs(MultiModalConfig) - # Override the parser for limit_per_prompt to support configurable - # options - multimodal_kwargs["limit_per_prompt"][ - "type"] = parse_limit_mm_per_prompt - multimodal_kwargs["limit_per_prompt"]["help"] += ( - "\n\nSupports both legacy count-only format and " - "configurable options format:" - "\n Legacy: '{\"image\": 5, \"video\": 1}'" - "\n Configurable: '{\"video\": {\"count\": 1, " - "\"num_frames\": 32}}'" - "\n Mixed: '{\"image\": 5, \"video\": {\"count\": 1, " - "\"num_frames\": 32}}'") multimodal_group = parser.add_argument_group( title="MultiModalConfig", description=MultiModalConfig.__doc__, From ee66a9e4738f1103e5ab10affff18dda91851e8b Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 2 Oct 2025 03:57:07 +0000 Subject: [PATCH 24/27] add all models Signed-off-by: wwl2755 --- docs/contributing/model/multimodal.md | 12 ++++++++++-- vllm/model_executor/models/aria.py | 7 ++++++- vllm/model_executor/models/aya_vision.py | 7 ++++++- vllm/model_executor/models/blip2.py | 7 ++++++- vllm/model_executor/models/chameleon.py | 7 ++++++- vllm/model_executor/models/cohere2_vision.py | 7 ++++++- vllm/model_executor/models/deepseek_vl2.py | 7 ++++++- vllm/model_executor/models/dots_ocr.py | 7 ++++++- vllm/model_executor/models/ernie45_vl.py | 11 +++++++++-- vllm/model_executor/models/fuyu.py | 7 ++++++- vllm/model_executor/models/gemma3_mm.py | 7 ++++++- vllm/model_executor/models/gemma3n_mm.py | 12 ++++++++++-- vllm/model_executor/models/glm4_1v.py | 10 +++++++++- vllm/model_executor/models/glm4v.py | 7 ++++++- vllm/model_executor/models/granite_speech.py | 5 +++++ vllm/model_executor/models/hyperclovax_vision.py | 8 ++++++++ vllm/model_executor/models/idefics3.py | 7 ++++++- vllm/model_executor/models/interns1.py | 11 +++++++++-- vllm/model_executor/models/internvl.py | 15 ++++++++++++--- vllm/model_executor/models/keye.py | 7 +++++++ vllm/model_executor/models/kimi_vl.py | 7 ++++++- vllm/model_executor/models/llava.py | 7 ++++++- vllm/model_executor/models/llava_next_video.py | 5 +++++ vllm/model_executor/models/llava_onevision.py | 9 ++++++++- vllm/model_executor/models/midashenglm.py | 7 ++++++- 
vllm/model_executor/models/minicpmo.py | 10 ++++++++-- vllm/model_executor/models/minicpmv.py | 11 +++++++++-- vllm/model_executor/models/mistral3.py | 7 ++++++- vllm/model_executor/models/mllama4.py | 7 ++++++- vllm/model_executor/models/molmo.py | 7 ++++++- vllm/model_executor/models/nano_nemotron_vl.py | 15 ++++++++++++--- vllm/model_executor/models/nvlm_d.py | 7 ++++++- vllm/model_executor/models/ovis.py | 7 ++++++- vllm/model_executor/models/ovis2_5.py | 10 +++++++++- vllm/model_executor/models/paligemma.py | 7 ++++++- vllm/model_executor/models/phi3v.py | 7 ++++++- vllm/model_executor/models/phi4_multimodal.py | 11 +++++++++-- vllm/model_executor/models/phi4mm.py | 11 +++++++++-- vllm/model_executor/models/pixtral.py | 10 ++++++++-- .../model_executor/models/qwen2_5_omni_thinker.py | 15 ++++++++++++--- vllm/model_executor/models/qwen3_vl.py | 10 +++++++++- vllm/model_executor/models/qwen_vl.py | 7 ++++++- vllm/model_executor/models/rvl.py | 8 +++++++- vllm/model_executor/models/skyworkr1v.py | 7 ++++++- vllm/model_executor/models/step3_vl.py | 7 ++++++- vllm/model_executor/models/terratorch.py | 11 +++++++++++ vllm/model_executor/models/transformers.py | 7 ++++++- vllm/model_executor/models/ultravox.py | 8 +++++++- vllm/model_executor/models/voxtral.py | 11 +++++++++-- vllm/model_executor/models/whisper.py | 8 +++++++- 50 files changed, 368 insertions(+), 61 deletions(-) diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 1d72fe97b966..724dc2284e28 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -258,17 +258,21 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } ``` @@ -438,16 +442,20 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } ``` diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 82f35d889605..a75d9424fe21 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -23,6 +23,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) +from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -429,17 +430,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = 
None, ) -> MultiModalDataDict: vision_config = self.info.get_vision_config() max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size, height=max_image_size, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 81bab5b34bc6..5d57da06f1c5 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -18,6 +18,7 @@ from vllm.config import VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems +from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -166,16 +167,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=image_size.width, height=image_size.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 334743a7358c..d6927b2a8ca2 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,6 +15,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) +from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -440,6 +441,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -447,11 +449,14 @@ def get_dummy_mm_data( max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size, height=max_image_size, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 86dbf63fa5df..918d46d9bf8c 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -32,6 +32,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) +from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -92,17 +93,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: config = 
self.info.get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=width, height=height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 7162571c08d9..7b471fde2efe 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -16,6 +16,7 @@ Cohere2VisionProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import MulAndSilu from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) @@ -209,16 +210,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=image_size.width, height=image_size.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 0f87fb34bf32..3914907b1765 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -14,6 +14,7 @@ from transformers import BatchFeature from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -192,16 +193,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) max_image_size = self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size.width, height=max_image_size.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 4845f19bcbc4..08c0ed071345 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -11,6 +11,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import utils as dist_utils from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -86,17 +87,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 ) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - 
num_images=num_images), + num_images=num_images, + overrides=image_overrides), } diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index a73ec4f88ffe..cc72710418af 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -36,6 +36,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger @@ -1185,6 +1186,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1194,16 +1196,21 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos(width=target_width, height=target_height, num_frames=target_num_frames, - num_videos=num_videos) + num_videos=num_videos, + overrides=video_overrides) } diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 9e491c0b50d2..4e3c724775be 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -27,6 +27,7 @@ FuyuProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.multimodal import MULTIMODAL_REGISTRY @@ -139,16 +140,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 36f8651371ba..bce32fb8f256 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -11,6 +11,7 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -241,17 +242,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + 
num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 101e083ac123..262672565e45 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -16,6 +16,7 @@ from transformers.models.siglip import SiglipImageProcessorFast from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm @@ -141,6 +142,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_audios = mm_counts.get("audio", 0) @@ -151,13 +153,19 @@ def get_dummy_mm_data( img_width = image_processor.size.get("width", 224) img_height = image_processor.size.get("height", 224) + image_overrides = mm_options.get("image") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + return { "image": self._get_dummy_images(width=img_width, height=img_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 722f1e428be7..d6174692d59f 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -48,6 +48,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import (get_tensor_model_parallel_world_size, parallel_state) from vllm.distributed import utils as dist_utils @@ -1107,6 +1108,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1115,17 +1117,23 @@ def get_dummy_mm_data( self.info.get_image_size_with_most_features()) target_num_frames = self.info.get_num_frames_with_most_features( seq_len, mm_counts) + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ), } diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 22ddb1d75160..678c17f4b85e 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -19,6 +19,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -466,6 +467,7 @@ def get_dummy_mm_data( self, 
seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -473,11 +475,14 @@ def get_dummy_mm_data( target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 0ec451356f5e..683b21931f57 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -33,6 +33,7 @@ from transformers import BatchFeature, PretrainedConfig from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -183,13 +184,17 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": self._get_dummy_audios( length=self.info.get_max_audio_len(), num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index b0f9d5e2657e..10d3bc8464ba 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -29,6 +29,7 @@ from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -149,6 +150,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -156,12 +158,17 @@ def get_dummy_mm_data( target_width, target_height = \ self.info.get_image_size_with_most_features() target_num_frames = 32 + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, + overrides=image_overrides, ), "video": self._get_dummy_videos( @@ -169,6 +176,7 @@ def get_dummy_mm_data( height=target_height - 1, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index dddf1c6fb626..567793e9b7ee 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -26,6 +26,7 @@ Idefics3Processor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import 
LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -292,17 +293,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) hf_processor = self.info.get_hf_processor() image_processor: Idefics3ImageProcessor = hf_processor.image_processor longest_edge = image_processor.max_image_size['longest_edge'] + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=longest_edge, height=longest_edge, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 0292845f819c..de9e69f91c76 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -20,6 +20,7 @@ InternVLVideoProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interns1_vit import InternS1VisionModel from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -270,6 +271,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() @@ -281,16 +283,21 @@ def get_dummy_mm_data( config = self.info.get_hf_config() image_size_h, image_size_w = config.vision_config.image_size + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos(width=image_size_w, height=image_size_h, num_frames=target_num_frames, - num_videos=num_videos), + num_videos=num_videos, + overrides=video_overrides), } diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0c95c49f90b1..f06811be13d1 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -20,6 +20,7 @@ from transformers import BatchEncoding, PretrainedConfig, TensorType from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, @@ -751,16 +752,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } @@ -917,21 +922,25 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data(seq_len=seq_len, - 
mm_counts=mm_counts) + mm_counts=mm_counts, + mm_options=mm_options) if self.info.supports_video: config = self.info.get_hf_config() image_size: int = config.vision_config.image_size target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) num_videos = mm_counts.get("video", 0) + video_overrides = mm_options.get("video") if mm_options else None dummy_video = { "video": self._get_dummy_videos(width=image_size, height=image_size, num_frames=target_num_frames, - num_videos=num_videos) + num_videos=num_videos, + overrides=video_overrides) } else: dummy_video = {} diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 10b5c45169f4..20d915d6fd38 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -19,6 +19,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -1170,6 +1171,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1179,12 +1181,16 @@ def get_dummy_mm_data( target_num_frames = self.info.get_num_frames_with_most_features( seq_len) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + mm_data = { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, + overrides=image_overrides, ), "video": self._get_dummy_videos( @@ -1192,6 +1198,7 @@ def get_dummy_mm_data( height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ), } diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 30ec9029f74f..a47bdd2f5ab5 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -54,6 +54,7 @@ from transformers.activations import GELUActivation from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ReplicatedLinear @@ -212,14 +213,18 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=MaxImageTokenMeta.width, height=MaxImageTokenMeta.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d823e5cb58d2..c398f88797b9 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -15,6 +15,7 @@ from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -208,17 +209,21 @@ 
def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 697b8e819707..7aabef32b4a9 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -11,6 +11,7 @@ LlavaNextVideoProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.models.clip import CLIPVisionModel from vllm.multimodal import MULTIMODAL_REGISTRY @@ -150,6 +151,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_videos = mm_counts.get("video", 0) @@ -158,6 +160,8 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + video_overrides = mm_options.get("video") if mm_options else None + return { "video": self._get_dummy_videos( @@ -165,6 +169,7 @@ def get_dummy_mm_data( height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 924f8ba3585f..4379f24da1bf 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -13,6 +13,7 @@ get_anyres_image_grid_shape, unpad_image) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -254,6 +255,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -264,17 +266,22 @@ def get_dummy_mm_data( self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 33bd64df5b53..65b3ee1c0e18 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -36,6 +36,7 @@ from transformers import BatchFeature from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -539,13 +540,17 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": self._get_dummy_audios(length=self.info.get_max_audio_len(), - num_audios=num_audios) + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index e5333fb652b1..74b2a2e62cd5 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -36,6 +36,7 @@ WhisperEncoder) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, NestedTensors) @@ -237,18 +238,23 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) audio_len = self.info.get_max_audio_chunks_with_most_features() * \ self.info.get_default_audio_sampling_rate() + audio_overrides = mm_options.get("audio") if mm_options else None + audio_mm_data = { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } return { - **super().get_dummy_mm_data(seq_len, mm_counts), + **super().get_dummy_mm_data(seq_len, mm_counts, mm_options), **audio_mm_data, } diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index eaa3839af37b..8bef1ec514ab 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -39,6 +39,7 @@ from typing_extensions import TypeVar from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig @@ -679,6 +680,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -690,15 +692,20 @@ def get_dummy_mm_data( num_video_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=image_width, height=image_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": [ self._get_dummy_images(width=video_width, height=video_height, - num_images=num_video_frames) + num_images=num_video_frames, + overrides=video_overrides) ] * num_videos, } diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index e932f7f007f5..d7c48758cca7 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -13,6 +13,7 @@ from transformers.models.pixtral 
import PixtralProcessor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -208,17 +209,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index db5a9fbc6a33..9864ca2dc474 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -31,6 +31,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, @@ -689,17 +690,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) (target_width, target_height) = self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 0227a83a1f55..a77a2eb0f5a8 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -22,6 +22,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -1226,16 +1227,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 2d0ebdc90277..fb90645b0e45 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -22,6 +22,7 @@ TensorType) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import ReLUSquaredActivation from vllm.model_executor.layers.layernorm import RMSNorm from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -814,6 +815,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: # Use default max_num_tiles for dummy data generation max_num_tiles = 12 @@ -821,11 +823,14 @@ def get_dummy_mm_data( self.info.get_image_size_with_most_features(max_num_tiles)) num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } @@ -842,21 +847,25 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data(seq_len=seq_len, - mm_counts=mm_counts) + mm_counts=mm_counts, + mm_options=mm_options) if self.info.supports_video: config = self.info.get_hf_config() image_size: int = config.force_image_size target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) num_videos = mm_counts.get("video", 0) + video_overrides = mm_options.get("video") if mm_options else None dummy_video = { "video": self._get_dummy_videos(width=image_size, height=image_size, num_frames=target_num_frames, - num_videos=num_videos) + num_videos=num_videos, + overrides=video_overrides) } else: dummy_video = {} diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 3bbf4c67604c..dcbbcf3383fa 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -14,6 +14,7 @@ import torch.nn as nn from transformers import PretrainedConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems @@ -86,16 +87,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 2e8e4a44102f..f8674b4f0e3f 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -28,6 +28,7 @@ from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.aimv2 import AIMv2Model @@ -283,17 +284,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + mm_data = { 
"image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } return mm_data diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 9c8adb617310..18dea14379a6 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -10,6 +10,7 @@ from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.ovis import (OvisImagePatchInputs, @@ -290,6 +291,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -298,17 +300,23 @@ def get_dummy_mm_data( self.info.get_image_size_with_most_features() target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } return mm_data diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index d118e6c89ab5..744015551c0c 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -8,6 +8,7 @@ from transformers import BatchFeature, PaliGemmaConfig from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -106,6 +107,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -113,11 +115,14 @@ def get_dummy_mm_data( num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size, height=max_image_size, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 59977796e2af..1895a30d2500 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -25,6 +25,7 @@ ProcessorMixin) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -356,17 +357,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) 
target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index a4f9f96cb951..a5cc87d327b5 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -17,6 +17,7 @@ Phi4MultimodalAudioRelativeAttentionBias, adaptive_enc_mask, unfold_tensor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn @@ -980,6 +981,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) @@ -987,14 +989,19 @@ def get_dummy_mm_data( target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - num_audios=num_audios), + num_audios=num_audios, + overrides=audio_overrides), } return mm_data diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 8ccc7129ddb2..e3529dc393cf 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -11,6 +11,7 @@ SequenceFeatureExtractor, SiglipVisionConfig) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -749,6 +750,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) @@ -756,14 +758,19 @@ def get_dummy_mm_data( target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - num_audios=num_audios), + num_audios=num_audios, + overrides=audio_overrides), } return mm_data diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index bf451c5005b7..1c6e3a31d985 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -24,6 +24,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import VllmConfig +from vllm.config.multimodal import 
BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -228,28 +229,33 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) dummy_images = dummy_mm_data.get("image", []) tokenization_kwargs = {"truncation": False} diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 8f069710b0f9..3ddb9c874c6b 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -39,6 +39,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -212,6 +213,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) @@ -228,19 +230,26 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { "audio": self._get_dummy_audios(length=target_audio_length, - num_audios=num_audios), + num_audios=num_audios, + overrides=audio_overrides), "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos(width=target_width, height=target_height, num_frames=target_num_frames, - num_videos=num_videos), + num_videos=num_videos, + overrides=video_overrides), } return mm_data diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index ce92557d6424..efcde63701c7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -46,6 +46,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY 
@@ -734,6 +735,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -748,17 +750,23 @@ def get_dummy_mm_data( num_frames=target_num_frames, image_processor=self.info.get_video_processor(), ) + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_video_size.width, height=target_video_size.height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ), } diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 924119ed63ab..a94e1e700c67 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -24,6 +24,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, @@ -567,6 +568,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.visual @@ -574,11 +576,14 @@ def get_dummy_mm_data( target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py index efdb01004663..594d018f6bb6 100644 --- a/vllm/model_executor/models/rvl.py +++ b/vllm/model_executor/models/rvl.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping +from typing import Optional import torch import torch.nn as nn from transformers.activations import GELUActivation from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict @@ -38,17 +40,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = ( self.info.get_image_size_with_most_features()) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index f03022aa719c..2c23685739dd 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -17,6 +17,7 @@ from transformers import BatchEncoding, PretrainedConfig, TensorType from vllm.config import VllmConfig 
+from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig @@ -507,16 +508,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index ad295ef44732..da507e0d9732 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -17,6 +17,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -496,16 +497,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 938b02e3e04b..8436cab4ef10 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -28,6 +28,8 @@ from transformers import BatchFeature from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.logger import init_logger from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import AutoWeightsLoader @@ -48,6 +50,8 @@ SupportsMultiModal) from .interfaces_base import default_pooling_type +logger = init_logger(__name__) + def _terratorch_field_names(pretrained_cfg: dict): input_definition = InputDefinition(**pretrained_cfg["input"]) @@ -97,9 +101,16 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: # Dummy data is generated based on the 'input' section # defined in the HF configuration file + + if mm_options: + logger.warning("Configurable multimodal profiling " + "options are not supported for Terratorch. 
" + "They are ignored for now.") + return self.dummy_data_generator.get_dummy_mm_data() diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 00d87f560e70..cec052b761c0 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -31,6 +31,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, VllmConfig) +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger @@ -256,16 +257,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_max_image_size() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 70aabf6dfe78..7744a19946a2 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -14,6 +14,7 @@ from transformers.models.whisper.modeling_whisper import WhisperEncoder from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader import DefaultModelLoader @@ -114,6 +115,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() @@ -122,9 +124,13 @@ def get_dummy_mm_data( _MAX_ENCODER_BATCH_SIZE) num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 1edeaeb0f319..ad494a7a7ec9 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -21,6 +21,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig @@ -204,25 +205,31 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) target_length = self.info.get_max_audio_array_len() + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=target_length, num_audios=num_audios) + self._get_dummy_audios(length=target_length, + num_audios=num_audios, + overrides=audio_overrides) } def 
get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) dummy_audios = dummy_mm_data.get("audio", []) audio_chunks: list[AudioChunk] = [] diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 1eecac7ed76b..d349d91dfd76 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -18,6 +18,7 @@ from vllm.attention.layers.cross_attention import CrossAttention from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig) +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -691,6 +692,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() @@ -698,9 +700,13 @@ def get_dummy_mm_data( audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } From bd74d03d25620c069d1cae6013f2d9f7d9ee207f Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 2 Oct 2025 20:34:21 +0000 Subject: [PATCH 25/27] fix import error Signed-off-by: wwl2755 --- vllm/model_executor/models/aria.py | 2 +- vllm/model_executor/models/aya_vision.py | 2 +- vllm/model_executor/models/blip2.py | 2 +- vllm/model_executor/models/chameleon.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 305746527d6d..188624e606ff 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -10,6 +10,7 @@ from transformers.models.aria.processing_aria import AriaProcessor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE @@ -23,7 +24,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) -from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 309fdaeed95b..a682252f4a2b 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -16,9 +16,9 @@ get_optimal_tiled_canvas) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems -from vllm.multimodal.options import 
BaseDummyOptions from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 536c5f463ed4..3d057654cca7 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -10,12 +10,12 @@ apply_chunking_to_forward) from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) -from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 0b30cd40c49e..b1432dcb9d6d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -14,6 +14,7 @@ from vllm.attention import Attention from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -32,7 +33,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) -from vllm.multimodal.options import BaseDummyOptions from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, From 8b0bea0535512e91d9676a54dddbf2af6f027e83 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 2 Oct 2025 23:25:43 +0000 Subject: [PATCH 26/27] fix tests Signed-off-by: wwl2755 --- .../multimodal/processing/test_common.py | 22 +++++++++++++-- .../processing/test_tensor_schema.py | 17 ++++++++++- vllm/model_executor/models/glm4_1v.py | 28 +++++++++++++++++-- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ddc675b0849c..e8c28afee7e3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -12,6 +12,8 @@ from PIL import Image from vllm.config import ModelConfig +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs @@ -112,12 +114,26 @@ def _test_processing_correctness( processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() - limit_mm_per_prompt = { + # Keep integer limits for local data generation + limit_mm_per_prompt_ints = { modality: 3 if limit is None else limit for modality, limit in supported_mm_limits.items() } - model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions: + if modality == "video": + 
return VideoDummyOptions(count=count) + if modality == "image": + return ImageDummyOptions(count=count) + if modality == "audio": + return AudioDummyOptions(count=count) + return BaseDummyOptions(count=count) + + # Assign normalized DummyOptions to the model config + model_config.get_multimodal_config().limit_per_prompt = { + modality: _to_dummy_options(modality, count) + for modality, count in limit_mm_per_prompt_ints.items() + } baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) @@ -150,7 +166,7 @@ def _test_processing_correctness( k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) for _ in range(rng.randint(limit + 1))] - for k, limit in limit_mm_per_prompt.items() + for k, limit in limit_mm_per_prompt_ints.items() } mm_counts = {k: len(vs) for k, vs in mm_data.items()} diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 9d132ecc34b7..6061e4538c95 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -15,6 +15,8 @@ from PIL import Image from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) @@ -236,7 +238,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str): modality: 3 if limit is None else limit for modality, limit in supported_mm_limits.items() } - model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + + def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions: + if modality == "video": + return VideoDummyOptions(count=count) + if modality == "image": + return ImageDummyOptions(count=count) + if modality == "audio": + return AudioDummyOptions(count=count) + return BaseDummyOptions(count=count) + + model_config.get_multimodal_config().limit_per_prompt = { + modality: _to_dummy_options(modality, count) + for modality, count in limit_mm_per_prompt.items() + } processor = factories.build_processor(ctx, cache=None) with initialize_dummy_model(model_cls, model_config) as model: diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index d6174692d59f..3587ce217997 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -29,7 +29,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import Annotated, Any, Callable, Literal, Optional, Union +from typing import Annotated, Any, Callable, Literal, Optional, Union, override import numpy as np import torch @@ -48,7 +48,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions +from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import (get_tensor_model_parallel_world_size, parallel_state) from vllm.distributed import utils as dist_utils @@ -1144,7 +1144,31 @@ def _get_dummy_videos( height: int, num_frames: int, num_videos: int, + overrides: Optional[VideoDummyOptions] = None, ) -> list[VideoItem]: + if overrides: + if overrides.num_frames: + if overrides.num_frames > num_frames: + logger.warning( + "video.num_frames override 
(%d) exceeds model's "
+                        "maximum number of frames (%d), will be ignored",
+                        overrides.num_frames, num_frames)
+                num_frames = min(num_frames, overrides.num_frames)
+            if overrides.width:
+                if overrides.width > width:
+                    logger.warning(
+                        "video.width override (%d) exceeds model's "
+                        "maximum width (%d), will be ignored", overrides.width,
+                        width)
+                width = min(width, overrides.width)
+            if overrides.height:
+                if overrides.height > height:
+                    logger.warning(
+                        "video.height override (%d) exceeds model's "
+                        "maximum height (%d), will be ignored",
+                        overrides.height, height)
+                height = min(height, overrides.height)
+
         video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
         video_items = []
         for i in range(num_videos):

From 79343c37ae023de2bdc1ec374d7bbe35c22b63b9 Mon Sep 17 00:00:00 2001
From: wwl2755
Date: Fri, 3 Oct 2025 05:30:08 +0000
Subject: [PATCH 27/27] fix mllama4 test

Signed-off-by: wwl2755

---
 tests/models/multimodal/processing/test_mllama4.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index e7b28ff8ec7f..a155ada35e92 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -17,23 +17,23 @@ def test_profiling(model_id: str, max_model_len: int):
     model_config_kwargs = {
         "max_model_len": max_model_len,
     }
+    mm_counts = {"image": 1}
     ctx = build_model_context(
         model_id,
         model_config_kwargs=model_config_kwargs,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt=mm_counts,
     )

-    mm_config = ctx.get_mm_config()
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     profiler = MultiModalProfiler(processor)

     decoder_dummy_data = profiler.get_decoder_dummy_data(
         max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
+        mm_counts=mm_counts,
     )
     dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
         max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
+        mm_counts=mm_counts,
     )

     hf_config = ctx.get_hf_config(Llama4Config)
@@ -58,7 +58,7 @@ def test_profiling(model_id: str, max_model_len: int):

     profiled_tokens = profiler.get_mm_max_contiguous_tokens(
         max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
+        mm_counts=mm_counts,
     )

     assert total_tokens == profiled_tokens["image"]
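
A minimal, self-contained sketch of the override-clamping behavior that the glm4_1v hunk above adds to `_get_dummy_videos`: user-supplied dummy-data overrides may shrink the profiling shape but never exceed the model's maxima, and an oversized value only logs a warning. The `VideoDummyOptions` dataclass below is a stand-in for the real class in `vllm.config.multimodal`; field names follow the hunk, and `clamp_video_dummy_shape` is a hypothetical helper used only for illustration.

import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class VideoDummyOptions:
    # Stand-in for vllm.config.multimodal.VideoDummyOptions (assumed fields).
    count: Optional[int] = None
    num_frames: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None


def clamp_video_dummy_shape(
    num_frames: int,
    width: int,
    height: int,
    overrides: Optional[VideoDummyOptions] = None,
) -> tuple[int, int, int]:
    """Apply per-modality overrides without exceeding the model's maxima."""
    if overrides is None:
        return num_frames, width, height
    # Warn about any override that asks for more than the model supports.
    for field, limit in (("num_frames", num_frames), ("width", width),
                         ("height", height)):
        requested = getattr(overrides, field)
        if requested and requested > limit:
            logger.warning(
                "video.%s override (%d) exceeds model's maximum (%d), "
                "will be ignored", field, requested, limit)
    # Overrides can only shrink the dummy shape, never grow it.
    if overrides.num_frames:
        num_frames = min(num_frames, overrides.num_frames)
    if overrides.width:
        width = min(width, overrides.width)
    if overrides.height:
        height = min(height, overrides.height)
    return num_frames, width, height


# Example: clamp_video_dummy_shape(32, 640, 480, VideoDummyOptions(num_frames=8))
# returns (8, 640, 480); VideoDummyOptions(num_frames=64) would leave the
# defaults untouched and log a warning instead.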