diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 1d72fe97b966..724dc2284e28 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -258,17 +258,21 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } ``` @@ -438,16 +442,20 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } ``` diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ddc675b0849c..e8c28afee7e3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -12,6 +12,8 @@ from PIL import Image from vllm.config import ModelConfig +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs @@ -112,12 +114,26 @@ def _test_processing_correctness( processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() - limit_mm_per_prompt = { + # Keep integer limits for local data generation + limit_mm_per_prompt_ints = { modality: 3 if limit is None else limit for modality, limit in supported_mm_limits.items() } - model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions: + if modality == "video": + return VideoDummyOptions(count=count) + if modality == "image": + return ImageDummyOptions(count=count) + if modality == "audio": + return AudioDummyOptions(count=count) + return BaseDummyOptions(count=count) + + # Assign normalized DummyOptions to the model config + model_config.get_multimodal_config().limit_per_prompt = { + modality: _to_dummy_options(modality, count) + for modality, count in limit_mm_per_prompt_ints.items() + } baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) @@ -150,7 +166,7 @@ def _test_processing_correctness( k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) for _ in range(rng.randint(limit + 1))] - for k, limit in limit_mm_per_prompt.items() + for k, limit in limit_mm_per_prompt_ints.items() } mm_counts = {k: len(vs) for k, vs in mm_data.items()} diff --git a/tests/models/multimodal/processing/test_mllama4.py 
b/tests/models/multimodal/processing/test_mllama4.py index e7b28ff8ec7f..a155ada35e92 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -17,23 +17,23 @@ def test_profiling(model_id: str, max_model_len: int): model_config_kwargs = { "max_model_len": max_model_len, } + mm_counts = {"image": 1} ctx = build_model_context( model_id, model_config_kwargs=model_config_kwargs, - limit_mm_per_prompt={"image": 1}, + limit_mm_per_prompt=mm_counts, ) - mm_config = ctx.get_mm_config() processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) profiler = MultiModalProfiler(processor) decoder_dummy_data = profiler.get_decoder_dummy_data( max_model_len, - mm_counts=mm_config.limit_per_prompt, + mm_counts=mm_counts, ) dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs( max_model_len, - mm_counts=mm_config.limit_per_prompt, + mm_counts=mm_counts, ) hf_config = ctx.get_hf_config(Llama4Config) @@ -58,7 +58,7 @@ def test_profiling(model_id: str, max_model_len: int): profiled_tokens = profiler.get_mm_max_contiguous_tokens( max_model_len, - mm_counts=mm_config.limit_per_prompt, + mm_counts=mm_counts, ) assert total_tokens == profiled_tokens["image"] diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 9d132ecc34b7..6061e4538c95 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -15,6 +15,8 @@ from PIL import Image from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) @@ -236,7 +238,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str): modality: 3 if limit is None else limit for modality, limit in supported_mm_limits.items() } - model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + + def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions: + if modality == "video": + return VideoDummyOptions(count=count) + if modality == "image": + return ImageDummyOptions(count=count) + if modality == "audio": + return AudioDummyOptions(count=count) + return BaseDummyOptions(count=count) + + model_config.get_multimodal_config().limit_per_prompt = { + modality: _to_dummy_options(modality, count) + for modality, count in limit_mm_per_prompt.items() + } processor = factories.build_processor(ctx, cache=None) with initialize_dummy_model(model_cls, model_config) as model: diff --git a/vllm/config/model.py b/vllm/config/model.py index e9d5b58ff2c2..be1fd9a94afa 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -276,7 +276,9 @@ class ModelConfig: multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. 
If `None`, this will be inferred from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int, + dict[str, + int]]]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None mm_processor_cache_gb: InitVar[Optional[float]] = None diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 569de9579900..fd62d2411ade 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -4,15 +4,45 @@ import hashlib from collections.abc import Mapping from dataclasses import field -from typing import Any, Literal, Optional +from typing import Any, Literal, Optional, Union +from pydantic import ConfigDict, Field, field_validator from pydantic.dataclasses import dataclass -import vllm.envs as envs from vllm.config.utils import config + +@dataclass +class BaseDummyOptions: + """Base options for generating dummy data during profiling.""" + count: int = Field(999, ge=0) + + +@dataclass(config=ConfigDict(extra="forbid")) +class VideoDummyOptions(BaseDummyOptions): + """Options for generating dummy video data during profiling.""" + num_frames: Optional[int] = Field(None, gt=0) + width: Optional[int] = Field(None, gt=0) + height: Optional[int] = Field(None, gt=0) + + +@dataclass(config=ConfigDict(extra="forbid")) +class ImageDummyOptions(BaseDummyOptions): + """Options for generating dummy image data during profiling.""" + width: Optional[int] = Field(None, gt=0) + height: Optional[int] = Field(None, gt=0) + + +@dataclass(config=ConfigDict(extra="forbid")) +class AudioDummyOptions(BaseDummyOptions): + """Options for generating dummy audio data during profiling.""" + length: Optional[int] = Field(None, gt=0) + + MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] +DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions, + AudioDummyOptions] @config @@ -20,12 +50,22 @@ class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: dict[str, int] = field(default_factory=dict) - """The maximum number of input items allowed per prompt for each modality. - Defaults to 1 (V0) or 999 (V1) for each modality. + limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict) + """The maximum number of input items and options allowed per + prompt for each modality. + Defaults to 999 for each modality. + + Legacy format (count only): + {"image": 16, "video": 2} + + Configurable format (with options): + {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, + "image": {"count": 5, "width": 512, "height": 512}} - For example, to allow up to 16 images and 2 videos per prompt: - `{"image": 16, "video": 2}`""" + Mixed format (combining both): + {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, + "height": 512}} + """ media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) """Additional args passed to process media inputs, keyed by modalities. For example, to set num_frames for video, set @@ -84,6 +124,27 @@ class MultiModalConfig: from each video to be pruned. 
""" + @field_validator("limit_per_prompt", mode="before") + @classmethod + def _validate_limit_per_prompt( + cls, value: dict[str, Union[int, + dict[str, + int]]]) -> dict[str, DummyOptions]: + for k, v in value.items(): + # Handle legacy format where only count is specified + if isinstance(v, int): + v = {"count": v} + # Convert to the appropriate DummyOptions subclass + if k == "video": + value[k] = VideoDummyOptions(**v) + elif k == "image": + value[k] = ImageDummyOptions(**v) + elif k == "audio": + value[k] = AudioDummyOptions(**v) + else: + value[k] = BaseDummyOptions(**v) + return value + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -106,12 +167,22 @@ def compute_hash(self) -> str: def get_limit_per_prompt(self, modality: str) -> int: """ Get the maximum number of input items allowed per prompt - for the given modality. + for the given modality (backward compatible). + """ + limit_data = self.limit_per_prompt.get(modality) + + if limit_data is None: + # Unspecified modality is set to 999 by default + return 999 + return limit_data.count + + def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]: + """ + Get the configurable dummy data options for a modality. + Returns None if no options are configured for this modality. """ - return self.limit_per_prompt.get( - modality, - 999 if envs.VLLM_USE_V1 else 1, - ) + # All values are now DummyOptions after normalization + return self.limit_per_prompt.get(modality) def merge_mm_processor_kwargs( self, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ec61fc4b9b06..678f06f2502f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -376,7 +376,7 @@ class EngineArgs: quantization: Optional[QuantizationMethods] = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce - limit_mm_per_prompt: dict[str, int] = \ + limit_mm_per_prompt: dict[str, Union[int, dict[str, int]]] = \ get_field(MultiModalConfig, "limit_per_prompt") interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings media_io_kwargs: dict[str, dict[str, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index e4a11a3d4a51..188624e606ff 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -10,6 +10,7 @@ from transformers.models.aria.processing_aria import AriaProcessor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE @@ -431,17 +432,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: vision_config = self.info.get_vision_config() max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size, height=max_image_size, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 04100fb8c412..a682252f4a2b 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -16,6 +16,7 @@ 
get_optimal_tiled_canvas) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, @@ -166,16 +167,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=image_size.width, height=image_size.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 940d39c7a67c..3d057654cca7 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -10,6 +10,7 @@ apply_chunking_to_forward) from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -435,6 +436,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -442,11 +444,14 @@ def get_dummy_mm_data( max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size, height=max_image_size, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 4999f7904b14..b1432dcb9d6d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -14,6 +14,7 @@ from vllm.attention import Attention from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -92,17 +93,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: config = self.info.get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=width, height=height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 78a7f0a52c72..70f2a3fd339a 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -16,6 +16,7 @@ Cohere2VisionProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import MulAndSilu from vllm.model_executor.layers.linear import 
(MergedColumnParallelLinear, RowParallelLinear) @@ -209,16 +210,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=image_size.width, height=image_size.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 3e3b4e59f833..107949df2270 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -14,6 +14,7 @@ from transformers import BatchFeature from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -191,16 +192,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) max_image_size = self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size.width, height=max_image_size.height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index e68777aab6bf..5f9bf87456d6 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -11,6 +11,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import utils as dist_utils from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -90,17 +91,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 ) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index c62658fa4c21..624168f67a47 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -36,6 +36,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger @@ -1186,6 +1187,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = 
mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1195,16 +1197,21 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos(width=target_width, height=target_height, num_frames=target_num_frames, - num_videos=num_videos) + num_videos=num_videos, + overrides=video_overrides) } diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 2ab2cf9b17b3..a0f8d0659c59 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -27,6 +27,7 @@ FuyuProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.multimodal import MULTIMODAL_REGISTRY @@ -136,16 +137,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 36f8651371ba..bce32fb8f256 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -11,6 +11,7 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -241,17 +242,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 101e083ac123..262672565e45 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -16,6 +16,7 @@ from transformers.models.siglip import SiglipImageProcessorFast from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm @@ -141,6 +142,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = 
mm_counts.get("image", 0) num_audios = mm_counts.get("audio", 0) @@ -151,13 +153,19 @@ def get_dummy_mm_data( img_width = image_processor.size.get("width", 224) img_height = image_processor.size.get("height", 224) + image_overrides = mm_options.get("image") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + return { "image": self._get_dummy_images(width=img_width, height=img_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 722f1e428be7..3587ce217997 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -29,7 +29,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import Annotated, Any, Callable, Literal, Optional, Union +from typing import Annotated, Any, Callable, Literal, Optional, Union, override import numpy as np import torch @@ -48,6 +48,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import (get_tensor_model_parallel_world_size, parallel_state) from vllm.distributed import utils as dist_utils @@ -1107,6 +1108,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1115,17 +1117,23 @@ def get_dummy_mm_data( self.info.get_image_size_with_most_features()) target_num_frames = self.info.get_num_frames_with_most_features( seq_len, mm_counts) + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ), } @@ -1136,7 +1144,31 @@ def _get_dummy_videos( height: int, num_frames: int, num_videos: int, + overrides: Optional[VideoDummyOptions] = None, ) -> list[VideoItem]: + if overrides: + if overrides.num_frames: + if overrides.num_frames > num_frames: + logger.warning( + "video.num_frames override (%d) exceeds model's " + "maximum number of frames (%d), will be ignored", + overrides.num_frames, num_frames) + num_frames = min(num_frames, overrides.num_frames) + if overrides.width: + if overrides.width > width: + logger.warning( + "video.width override (%d) exceeds model's " + "maximum width (%d), will be ignored", overrides.width, + width) + width = min(width, overrides.width) + if overrides.height: + if overrides.height > height: + logger.warning( + "video.height override (%d) exceeds model's " + "maximum height (%d), will be ignored", + overrides.height, height) + height = min(height, override.height) + video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) video_items = [] for i in range(num_videos): diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 
22ddb1d75160..678c17f4b85e 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -19,6 +19,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -466,6 +467,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -473,11 +475,14 @@ def get_dummy_mm_data( target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 0ec451356f5e..683b21931f57 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -33,6 +33,7 @@ from transformers import BatchFeature, PretrainedConfig from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -183,13 +184,17 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": self._get_dummy_audios( length=self.info.get_max_audio_len(), num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index b0f9d5e2657e..10d3bc8464ba 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -29,6 +29,7 @@ from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -149,6 +150,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -156,12 +158,17 @@ def get_dummy_mm_data( target_width, target_height = \ self.info.get_image_size_with_most_features() target_num_frames = 32 + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, + overrides=image_overrides, ), "video": self._get_dummy_videos( @@ -169,6 +176,7 @@ def get_dummy_mm_data( height=target_height - 1, num_frames=target_num_frames, 
num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index dddf1c6fb626..567793e9b7ee 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -26,6 +26,7 @@ Idefics3Processor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -292,17 +293,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) hf_processor = self.info.get_hf_processor() image_processor: Idefics3ImageProcessor = hf_processor.image_processor longest_edge = image_processor.max_image_size['longest_edge'] + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=longest_edge, height=longest_edge, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 0292845f819c..de9e69f91c76 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -20,6 +20,7 @@ InternVLVideoProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interns1_vit import InternS1VisionModel from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -270,6 +271,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() @@ -281,16 +283,21 @@ def get_dummy_mm_data( config = self.info.get_hf_config() image_size_h, image_size_w = config.vision_config.image_size + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos(width=image_size_w, height=image_size_h, num_frames=target_num_frames, - num_videos=num_videos), + num_videos=num_videos, + overrides=video_overrides), } diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0c95c49f90b1..f06811be13d1 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -20,6 +20,7 @@ from transformers import BatchEncoding, PretrainedConfig, TensorType from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, @@ -751,16 +752,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ 
self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } @@ -917,21 +922,25 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data(seq_len=seq_len, - mm_counts=mm_counts) + mm_counts=mm_counts, + mm_options=mm_options) if self.info.supports_video: config = self.info.get_hf_config() image_size: int = config.vision_config.image_size target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) num_videos = mm_counts.get("video", 0) + video_overrides = mm_options.get("video") if mm_options else None dummy_video = { "video": self._get_dummy_videos(width=image_size, height=image_size, num_frames=target_num_frames, - num_videos=num_videos) + num_videos=num_videos, + overrides=video_overrides) } else: dummy_video = {} diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 10b5c45169f4..20d915d6fd38 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -19,6 +19,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -1170,6 +1171,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1179,12 +1181,16 @@ def get_dummy_mm_data( target_num_frames = self.info.get_num_frames_with_most_features( seq_len) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + mm_data = { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, + overrides=image_overrides, ), "video": self._get_dummy_videos( @@ -1192,6 +1198,7 @@ def get_dummy_mm_data( height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ), } diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 30ec9029f74f..a47bdd2f5ab5 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -54,6 +54,7 @@ from transformers.activations import GELUActivation from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ReplicatedLinear @@ -212,14 +213,18 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=MaxImageTokenMeta.width, height=MaxImageTokenMeta.height, - num_images=num_images) + num_images=num_images, + 
overrides=image_overrides) } diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 78c413b77051..883af345aab5 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -15,6 +15,7 @@ from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -195,17 +196,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 697b8e819707..7aabef32b4a9 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -11,6 +11,7 @@ LlavaNextVideoProcessor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.models.clip import CLIPVisionModel from vllm.multimodal import MULTIMODAL_REGISTRY @@ -150,6 +151,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_videos = mm_counts.get("video", 0) @@ -158,6 +160,8 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + video_overrides = mm_options.get("video") if mm_options else None + return { "video": self._get_dummy_videos( @@ -165,6 +169,7 @@ def get_dummy_mm_data( height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 924f8ba3585f..4379f24da1bf 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -13,6 +13,7 @@ get_anyres_image_grid_shape, unpad_image) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -254,6 +255,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -264,17 +266,22 @@ def get_dummy_mm_data( self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( 
width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 33bd64df5b53..65b3ee1c0e18 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -36,6 +36,7 @@ from transformers import BatchFeature from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -539,13 +540,17 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": self._get_dummy_audios(length=self.info.get_max_audio_len(), - num_audios=num_audios) + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index e5333fb652b1..74b2a2e62cd5 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -36,6 +36,7 @@ WhisperEncoder) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, NestedTensors) @@ -237,18 +238,23 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) audio_len = self.info.get_max_audio_chunks_with_most_features() * \ self.info.get_default_audio_sampling_rate() + audio_overrides = mm_options.get("audio") if mm_options else None + audio_mm_data = { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } return { - **super().get_dummy_mm_data(seq_len, mm_counts), + **super().get_dummy_mm_data(seq_len, mm_counts, mm_options), **audio_mm_data, } diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index eaa3839af37b..8bef1ec514ab 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -39,6 +39,7 @@ from typing_extensions import TypeVar from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig @@ -679,6 +680,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -690,15 +692,20 @@ def get_dummy_mm_data( num_video_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": 
self._get_dummy_images(width=image_width, height=image_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": [ self._get_dummy_images(width=video_width, height=video_height, - num_images=num_video_frames) + num_images=num_video_frames, + overrides=video_overrides) ] * num_videos, } diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index e932f7f007f5..d7c48758cca7 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -13,6 +13,7 @@ from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -208,17 +209,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index db5a9fbc6a33..9864ca2dc474 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -31,6 +31,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, @@ -689,17 +690,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) (target_width, target_height) = self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 0227a83a1f55..a77a2eb0f5a8 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -22,6 +22,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -1226,16 +1227,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - 
num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 2d0ebdc90277..fb90645b0e45 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -22,6 +22,7 @@ TensorType) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import ReLUSquaredActivation from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -814,6 +815,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: # Use default max_num_tiles for dummy data generation max_num_tiles = 12 @@ -821,11 +823,14 @@ def get_dummy_mm_data( self.info.get_image_size_with_most_features(max_num_tiles)) num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } @@ -842,21 +847,25 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data(seq_len=seq_len, - mm_counts=mm_counts) + mm_counts=mm_counts, + mm_options=mm_options) if self.info.supports_video: config = self.info.get_hf_config() image_size: int = config.force_image_size target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) num_videos = mm_counts.get("video", 0) + video_overrides = mm_options.get("video") if mm_options else None dummy_video = { "video": self._get_dummy_videos(width=image_size, height=image_size, num_frames=target_num_frames, - num_videos=num_videos) + num_videos=num_videos, + overrides=video_overrides) } else: dummy_video = {} diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 3bbf4c67604c..dcbbcf3383fa 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -14,6 +14,7 @@ import torch.nn as nn from transformers import PretrainedConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems @@ -86,16 +87,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 2e8e4a44102f..f8674b4f0e3f 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -28,6 +28,7 @@ from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig +from vllm.config.multimodal import 
BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.aimv2 import AIMv2Model @@ -283,17 +284,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } return mm_data diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 9c8adb617310..18dea14379a6 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -10,6 +10,7 @@ from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.ovis import (OvisImagePatchInputs, @@ -290,6 +291,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -298,17 +300,23 @@ def get_dummy_mm_data( self.info.get_image_size_with_most_features() target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } return mm_data diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index d118e6c89ab5..744015551c0c 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -8,6 +8,7 @@ from transformers import BatchFeature, PaliGemmaConfig from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -106,6 +107,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -113,11 +115,14 @@ def get_dummy_mm_data( num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=max_image_size, height=max_image_size, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 59977796e2af..1895a30d2500 100644 --- 
a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -25,6 +25,7 @@ ProcessorMixin) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -356,17 +357,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index a4f9f96cb951..a5cc87d327b5 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -17,6 +17,7 @@ Phi4MultimodalAudioRelativeAttentionBias, adaptive_enc_mask, unfold_tensor) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn @@ -980,6 +981,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) @@ -987,14 +989,19 @@ def get_dummy_mm_data( target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - num_audios=num_audios), + num_audios=num_audios, + overrides=audio_overrides), } return mm_data diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 8ccc7129ddb2..e3529dc393cf 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -11,6 +11,7 @@ SequenceFeatureExtractor, SiglipVisionConfig) from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -749,6 +750,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) @@ -756,14 +758,19 @@ def get_dummy_mm_data( target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { "image": self._get_dummy_images(width=target_width, height=target_height, - 
num_images=num_images), + num_images=num_images, + overrides=image_overrides), "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - num_audios=num_audios), + num_audios=num_audios, + overrides=audio_overrides), } return mm_data diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index bf451c5005b7..1c6e3a31d985 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -24,6 +24,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -228,28 +229,33 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) dummy_images = dummy_mm_data.get("image", []) tokenization_kwargs = {"truncation": False} diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index b5e82c9b21cd..6b1254030009 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -39,6 +39,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -212,6 +213,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) @@ -228,19 +230,26 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { "audio": self._get_dummy_audios(length=target_audio_length, - num_audios=num_audios), + num_audios=num_audios, + overrides=audio_overrides), "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos(width=target_width, height=target_height, num_frames=target_num_frames, - num_videos=num_videos), + num_videos=num_videos, + overrides=video_overrides), } return 
mm_data diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f9136863b8d6..f407692e1151 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -34,6 +34,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (AudioItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -144,6 +145,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() @@ -151,9 +153,13 @@ def get_dummy_mm_data( audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 38435a69444e..4ed06bae9642 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -43,6 +43,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger @@ -1038,6 +1039,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1047,17 +1049,22 @@ def get_dummy_mm_data( target_num_frames = \ self.info.get_num_frames_with_most_features(seq_len, mm_counts) + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_width, height=target_height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ) } diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 00de89811cc7..60f17699fe0f 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -46,6 +46,7 @@ from vllm.attention.layer import check_upstream_fa_availability from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY @@ -734,6 +735,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -748,17 +750,23 @@ def get_dummy_mm_data( num_frames=target_num_frames, 
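For mixed-modality models such as the Qwen2.5-Omni and Qwen2-VL builders above, the same lookup is simply repeated per modality, and a modality absent from `mm_options` falls back to model defaults. A compact sketch of that dispatch, with stand-in option classes (not vLLM's real types):

```python
# Per-modality override lookup, as in the audio/image/video hunks above.
# VideoOpts and AudioOpts are illustrative stand-ins, not vLLM's classes.
from dataclasses import dataclass
from typing import Mapping, Optional


@dataclass
class VideoOpts:
    num_frames: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None


@dataclass
class AudioOpts:
    length: Optional[int] = None


def resolve_overrides(mm_options: Optional[Mapping[str, object]]):
    # Each modality is resolved independently; None means "use model defaults".
    image_overrides = mm_options.get("image") if mm_options else None
    video_overrides = mm_options.get("video") if mm_options else None
    audio_overrides = mm_options.get("audio") if mm_options else None
    return image_overrides, video_overrides, audio_overrides


# Only video and audio overrides are supplied here; image stays at defaults.
print(resolve_overrides({"video": VideoOpts(num_frames=8, width=320, height=240),
                         "audio": AudioOpts(length=16000)}))
print(resolve_overrides(None))  # legacy call sites pass nothing at all
```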
image_processor=self.info.get_video_processor(), ) + + image_overrides = mm_options.get("image") if mm_options else None + video_overrides = mm_options.get("video") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), "video": self._get_dummy_videos( width=target_video_size.width, height=target_video_size.height, num_frames=target_num_frames, num_videos=num_videos, + overrides=video_overrides, ), } diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 924119ed63ab..a94e1e700c67 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -24,6 +24,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, @@ -567,6 +568,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.visual @@ -574,11 +576,14 @@ def get_dummy_mm_data( target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py index efdb01004663..594d018f6bb6 100644 --- a/vllm/model_executor/models/rvl.py +++ b/vllm/model_executor/models/rvl.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping +from typing import Optional import torch import torch.nn as nn from transformers.activations import GELUActivation from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict @@ -38,17 +40,21 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = ( self.info.get_image_size_with_most_features()) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index f03022aa719c..2c23685739dd 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -17,6 +17,7 @@ from transformers import BatchEncoding, PretrainedConfig, TensorType from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig @@ -507,16 +508,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: 
Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index ad295ef44732..da507e0d9732 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -17,6 +17,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -496,16 +497,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images) + num_images=num_images, + overrides=image_overrides) } diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 938b02e3e04b..8436cab4ef10 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -28,6 +28,8 @@ from transformers import BatchFeature from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.logger import init_logger from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import AutoWeightsLoader @@ -48,6 +50,8 @@ SupportsMultiModal) from .interfaces_base import default_pooling_type +logger = init_logger(__name__) + def _terratorch_field_names(pretrained_cfg: dict): input_definition = InputDefinition(**pretrained_cfg["input"]) @@ -97,9 +101,16 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: # Dummy data is generated based on the 'input' section # defined in the HF configuration file + + if mm_options: + logger.warning("Configurable multimodal profiling " + "options are not supported for Terratorch. 
" + "They are ignored for now.") + return self.dummy_data_generator.get_dummy_mm_data() diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 070c77073bb0..c3760b8f951b 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -31,6 +31,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, VllmConfig) +from vllm.config.multimodal import BaseDummyOptions from vllm.config.utils import getattr_iter from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.utils import get_pp_indices @@ -253,16 +254,20 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_max_image_size() + image_overrides = mm_options.get("image") if mm_options else None + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=num_images), + num_images=num_images, + overrides=image_overrides), } diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 70aabf6dfe78..7744a19946a2 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -14,6 +14,7 @@ from transformers.models.whisper.modeling_whisper import WhisperEncoder from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader import DefaultModelLoader @@ -114,6 +115,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() @@ -122,9 +124,13 @@ def get_dummy_mm_data( _MAX_ENCODER_BATCH_SIZE) num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 1edeaeb0f319..ad494a7a7ec9 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -21,6 +21,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig @@ -204,25 +205,31 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) target_length = self.info.get_max_audio_array_len() + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=target_length, num_audios=num_audios) + self._get_dummy_audios(length=target_length, + num_audios=num_audios, + overrides=audio_overrides) } def 
get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) dummy_audios = dummy_mm_data.get("audio", []) audio_chunks: list[AudioChunk] = [] diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 1eecac7ed76b..d349d91dfd76 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -18,6 +18,7 @@ from vllm.attention.layers.cross_attention import CrossAttention from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig) +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -691,6 +692,7 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: feature_extractor = self.info.get_feature_extractor() @@ -698,9 +700,13 @@ def get_dummy_mm_data( audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + return { "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) + self._get_dummy_audios(length=audio_len, + num_audios=num_audios, + overrides=audio_overrides) } diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 26c5d188964c..74dc2314d2eb 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -10,6 +10,8 @@ from PIL import Image import vllm.envs as envs +from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions, + ImageDummyOptions, VideoDummyOptions) from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, @@ -73,10 +75,19 @@ def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalDataDict: """ Build the multimodal input which, after processing, results in the maximum possible number of placeholder tokens. + + Args: + seq_len: Sequence length + mm_counts: Count of items per modality + mm_options: Configurable options per modality (optional). + If None, use model defaults for backward compatibility. + If provided, models can use these to customize dummy + data generation. """ raise NotImplementedError @@ -84,13 +95,22 @@ def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> ProcessorInputs: """ Build the input which, after processing, results in the maximum possible number of placeholder tokens. 
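Builders that override `get_dummy_processor_inputs` themselves (Pixtral and Voxtral above) have to accept `mm_options` and pass it straight through to `get_dummy_mm_data`; dropping the argument would silently discard any configured overrides. A small self-contained sketch of that pass-through, with simplified stand-ins for the vLLM types:

```python
# Pass-through of mm_options in a custom get_dummy_processor_inputs override.
# ProcessorInputs and DummyOpts are simplified stand-ins, not the vLLM types.
from dataclasses import dataclass, field
from typing import Any, Mapping, Optional


@dataclass
class DummyOpts:
    count: Optional[int] = None


@dataclass
class ProcessorInputs:
    prompt: str
    mm_data: dict[str, Any] = field(default_factory=dict)


class MyDummyInputsBuilder:
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        return "<image>" * mm_counts.get("image", 0)

    def get_dummy_mm_data(self, seq_len: int, mm_counts: Mapping[str, int],
                          mm_options: Optional[Mapping[str, DummyOpts]] = None) -> dict:
        # Real builders generate images/audio here; counts are enough for the sketch.
        return {"image": ["<dummy image>"] * mm_counts.get("image", 0)}

    def get_dummy_processor_inputs(self, seq_len: int, mm_counts: Mapping[str, int],
                                   mm_options: Optional[Mapping[str, DummyOpts]] = None
                                   ) -> ProcessorInputs:
        dummy_text = self.get_dummy_text(mm_counts)
        # The important line: forward mm_options instead of dropping it.
        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
        return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data)


print(MyDummyInputsBuilder().get_dummy_processor_inputs(4096, {"image": 2}))
```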
+ + Args: + seq_len: Sequence length + mm_counts: Count of items per modality + mm_options: Configurable options per modality (optional) """ dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + + # Use the unified function for both legacy and configurable cases + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) + tokenization_kwargs = {"truncation": False} return ProcessorInputs(prompt=dummy_text, @@ -102,9 +122,17 @@ def _get_dummy_audios( *, length: int, num_audios: int, + overrides: Optional[AudioDummyOptions] = None, ) -> list[npt.NDArray]: if num_audios == 0: return [] + if overrides and overrides.length: + if overrides.length > length: + logger.warning( + "audio.length override (%d) exceeds model's " + "maximum length (%d), will be ignored", overrides.length, + length) + length = min(length, overrides.length) audio = np.zeros((length, )) return [audio] * num_audios @@ -114,9 +142,25 @@ def _get_dummy_images( width: int, height: int, num_images: int, + overrides: Optional[ImageDummyOptions] = None, ) -> list[Image.Image]: if num_images == 0: return [] + if overrides: + if overrides.width: + if overrides.width > width: + logger.warning( + "image.width override (%d) exceeds model's " + "maximum width (%d), will be ignored", overrides.width, + width) + width = min(width, overrides.width) + if overrides.height: + if overrides.height > height: + logger.warning( + "image.height override (%d) exceeds model's " + "maximum height (%d), will be ignored", + overrides.height, height) + height = min(height, overrides.height) image = Image.new("RGB", (width, height), color=255) return [image] * num_images @@ -127,9 +171,32 @@ def _get_dummy_videos( height: int, num_frames: int, num_videos: int, + overrides: Optional[VideoDummyOptions] = None, ) -> list[npt.NDArray]: if num_videos == 0: return [] + if overrides: + if overrides.num_frames: + if overrides.num_frames > num_frames: + logger.warning( + "video.num_frames override (%d) exceeds model's " + "maximum number of frames (%d), will be ignored", + overrides.num_frames, num_frames) + num_frames = min(num_frames, overrides.num_frames) + if overrides.width: + if overrides.width > width: + logger.warning( + "video.width override (%d) exceeds model's " + "maximum width (%d), will be ignored", overrides.width, + width) + width = min(width, overrides.width) + if overrides.height: + if overrides.height > height: + logger.warning( + "video.height override (%d) exceeds model's " + "maximum height (%d), will be ignored", + overrides.height, height) + height = min(height, overrides.height) video = np.full((num_frames, width, height, 3), 255) return [video] * num_videos @@ -162,13 +229,14 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> MultiModalInputs: if mm_counts is None: mm_counts = self.get_mm_limits() factory = self.dummy_inputs processor_inputs = factory.get_dummy_processor_inputs( - seq_len, mm_counts) + seq_len, mm_counts, mm_options) return self.processor.apply( prompt=processor_inputs.prompt, @@ -195,8 +263,9 @@ def get_encoder_dummy_data( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> DummyEncoderData: - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) mm_inputs = cast(MultiModalEncDecInputs, 
mm_inputs) # For encoder-decoder models, use encoder prompt token ids instead of @@ -228,8 +297,9 @@ def get_decoder_dummy_data( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, ) -> DummyDecoderData: - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options) prompt_token_ids = mm_inputs["prompt_token_ids"] total_len = len(prompt_token_ids) @@ -274,7 +344,7 @@ def get_mm_max_contiguous_tokens( ` [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] ` Returns 9, even when the number of image embeddings is 6. - + This is important to take into account when profiling and initializing the encoder cache size. """ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 2bbc0078ad13..24d3baa9b4e7 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,6 +6,7 @@ import torch.nn as nn +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) @@ -52,7 +53,7 @@ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: ... -class MultiModalProcessorFactory(Protocol[_I]): +class MultiModalProcessorFactory(Protocol[_I]): # type: ignore[misc] """ Constructs a [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor] @@ -95,6 +96,28 @@ def __init__(self) -> None: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() + def _extract_mm_options( + self, + model_config: "ModelConfig", + ) -> Optional[Mapping[str, BaseDummyOptions]]: + """ + Extract multimodal dummy options from model config. + + Returns None if no configurable options are found, otherwise returns + a mapping of modality names to their dummy options. + """ + if not model_config.multimodal_config: + return None + + mm_options = { + m: opt + for m in model_config.multimodal_config.limit_per_prompt + if (opt := model_config.multimodal_config.get_dummy_options(m) + ) is not None + } + + return mm_options if len(mm_options) > 0 else None + def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. @@ -135,7 +158,7 @@ def get_max_tokens_per_item_by_modality( return {} processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) + profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) @@ -189,7 +212,7 @@ def get_mm_limits_per_prompt( return {} processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) + profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() def register_processor( @@ -285,8 +308,15 @@ def get_decoder_dummy_data( The model is identified by ``model_config``. """ processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) - dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) + profiler: MultiModalProfiler = MultiModalProfiler(processor) + + # Extract configurable options from multimodal config. + # Only include modalities that use advanced option types so legacy + # count-only behavior remains unchanged. 
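`_extract_mm_options` above only produces a mapping when at least one modality actually carries configurable options, so count-only configurations keep taking the legacy `None` path. A rough sketch of that filtering, assuming a convention where plain integer limits carry no dummy options (the class name is illustrative):

```python
# Filtering configurable dummy options out of a per-modality limit config.
# ImageOptions and the plain-int convention mirror the registry change above
# but are assumptions, not vLLM's exact implementation.
from dataclasses import dataclass
from typing import Mapping, Optional, Union


@dataclass
class ImageOptions:
    count: int = 1
    width: Optional[int] = None
    height: Optional[int] = None


LimitEntry = Union[int, ImageOptions]


def extract_mm_options(limit_per_prompt: Mapping[str, LimitEntry]
                       ) -> Optional[Mapping[str, ImageOptions]]:
    # Plain integer limits carry no dummy options; only structured entries do.
    mm_options = {
        modality: entry
        for modality, entry in limit_per_prompt.items()
        if isinstance(entry, ImageOptions)
    }
    return mm_options or None


# Legacy count-only config: nothing to extract, profiling keeps its old behavior.
print(extract_mm_options({"image": 3, "audio": 1}))  # -> None
# Configured image options: only that modality is forwarded to the profiler.
print(extract_mm_options({"image": ImageOptions(count=3, width=512, height=512),
                          "audio": 1}))
```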
+ mm_options = self._extract_mm_options(model_config) + + dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, + mm_options) # Having more tokens is over-conservative but otherwise fine token_ids = dummy_data.prompt_token_ids @@ -311,8 +341,15 @@ def get_encoder_dummy_data( The model is identified by ``model_config``. """ processor = self.create_processor(model_config, cache=cache) - profiler = MultiModalProfiler(processor) - dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) + profiler: MultiModalProfiler = MultiModalProfiler(processor) + + # Extract configurable options from multimodal config. + # Only include modalities that use advanced option types so legacy + # count-only behavior remains unchanged. + mm_options = self._extract_mm_options(model_config) + + dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, + mm_options) # Having more tokens is over-conservative but otherwise fine token_ids = dummy_data.prompt_token_ids
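The profiler helpers shown earlier treat overrides as requests bounded by the model's own maximum: anything larger is logged and ignored, so dummy inputs can only shrink. A self-contained sketch of that clamping rule, using the standard `logging` module in place of vLLM's logger:

```python
# Clamping rule used by the dummy-data helpers: overrides may shrink the dummy
# input but are ignored (with a warning) when they exceed the model maximum.
import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class ImageOverride:  # stand-in for the image dummy-options type
    width: Optional[int] = None
    height: Optional[int] = None


def clamp_image_size(max_width: int, max_height: int,
                     overrides: Optional[ImageOverride] = None) -> tuple[int, int]:
    width, height = max_width, max_height
    if overrides:
        if overrides.width:
            if overrides.width > width:
                logger.warning(
                    "image.width override (%d) exceeds model's maximum width "
                    "(%d), will be ignored", overrides.width, width)
            width = min(width, overrides.width)
        if overrides.height:
            if overrides.height > height:
                logger.warning(
                    "image.height override (%d) exceeds model's maximum height "
                    "(%d), will be ignored", overrides.height, height)
            height = min(height, overrides.height)
    return width, height


print(clamp_image_size(1024, 1024, ImageOverride(width=256)))   # -> (256, 1024)
print(clamp_image_size(1024, 1024, ImageOverride(width=4096)))  # -> (1024, 1024), warns
```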