diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8192c3ce05dd..6ef01f333554 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -79,7 +79,7 @@
 logger = init_logger(__name__)
 
 # For profile run
-_MAX_FRAMES_PER_VIDEO = 32
+_MAX_FRAMES_PER_VIDEO = 14
 
 # === Vision Inputs === #
 
@@ -932,6 +932,7 @@ def get_num_image_tokens(
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
+            num_frames=1,
             image_processor=image_processor,
         )
         return num_image_tokens
@@ -956,6 +957,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
             image_width=9999999,
             image_height=9999999,
+            num_frames=1,
             image_processor=None,
         )
         return max_image_size
@@ -969,10 +971,12 @@ def get_max_image_tokens(self) -> int:
             image_processor=None,
         )
 
-    def _get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self,
+                              max_tokens: int,
+                              start_num_frames: int = 1) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
 
-        num_frames = 0
+        num_frames = start_num_frames
 
         while True:
             next_num_frames = num_frames + 1
@@ -994,12 +998,13 @@ def get_num_frames_with_most_features(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        max_frames_per_video: int = _MAX_FRAMES_PER_VIDEO,
     ) -> int:
         max_videos = mm_counts.get("video", 0)
 
         max_total_frames = self._get_max_video_frames(seq_len)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+                                   max_frames_per_video)
 
         return max(max_frames_per_video, 1)
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 5d0b66f91ace..c8f91dd48969 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -33,11 +33,14 @@
 import torch.nn.functional as F
 from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
+    smart_resize as image_smart_resize)
 from transformers.models.qwen3_vl import (Qwen3VLProcessor,
                                           Qwen3VLVideoProcessor)
 from transformers.models.qwen3_vl.configuration_qwen3_vl import (
     Qwen3VLConfig, Qwen3VLVisionConfig)
+from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
+    smart_resize as video_smart_resize)
 from transformers.video_utils import VideoMetadata
 
 from vllm.attention.layer import check_upstream_fa_availability
@@ -85,6 +88,9 @@
 
 logger = init_logger(__name__)
 
+# Official recommended max pixels is 24576 * 32 * 32
+_MAX_FRAMES_PER_VIDEO = 24576
+
 
 class Qwen3_VisionPatchEmbed(nn.Module):
 
@@ -593,11 +599,16 @@ def _get_vision_info(
         image_height: int,
         num_frames: int = 2,
         do_resize: bool = True,
-        image_processor: Optional[Qwen2VLImageProcessorFast],
+        image_processor: Optional[Union[Qwen2VLImageProcessorFast,
+                                        Qwen3VLVideoProcessor]],
    ) -> tuple[ImageSize, int]:
-        if image_processor is None:
+        if image_processor is None and num_frames > 1:
+            image_processor = self.get_video_processor()
+        elif image_processor is None:
             image_processor = self.get_image_processor()
 
+        is_video = isinstance(image_processor, Qwen3VLVideoProcessor)
+
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
@@ -605,12 +616,22 @@ def _get_vision_info(
         temporal_patch_size = vision_config.temporal_patch_size
 
         if do_resize:
+            if is_video:
+                smart_resize = video_smart_resize
+                extra_kwargs = {
+                    "num_frames": num_frames,
+                    "temporal_factor": temporal_patch_size
+                }
+            else:
+                smart_resize = image_smart_resize
+                extra_kwargs = {}
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * merge_size,
                 min_pixels=image_processor.size["shortest_edge"],
                 max_pixels=image_processor.size["longest_edge"],
+                **extra_kwargs,
             )
             preprocessed_size = ImageSize(width=resized_width,
                                           height=resized_height)
@@ -629,6 +650,39 @@ def _get_vision_info(
 
         return preprocessed_size, num_vision_tokens
 
+    def _get_max_video_frames(self,
+                              max_tokens: int,
+                              start_num_frames: int = 2) -> int:
+        return super()._get_max_video_frames(max_tokens,
+                                             start_num_frames=start_num_frames)
+
+    def get_num_frames_with_most_features(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        return super().get_num_frames_with_most_features(
+            seq_len, mm_counts, max_frames_per_video=_MAX_FRAMES_PER_VIDEO)
+
+    def get_max_video_tokens(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+        video_soft_tokens = self.get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self.get_num_frames_with_most_features(
+                seq_len, mm_counts),
+            image_processor=None,
+        )
+
+        # NOTE: By default in Qwen3-VL, one video token is converted to
+        # "<{timestamp} seconds>" (on average 9.5 tokens) + vision_start_token + video_token + vision_end_token # noqa: E501
+        formatted_video_soft_tokens = video_soft_tokens * 12.5
+        return int(formatted_video_soft_tokens)
+
     def _calculate_timestamps(self, indices: list[int] | torch.Tensor,
                               video_fps: float, merge_size: int):
         if not isinstance(indices, list):
@@ -698,16 +752,22 @@ def get_dummy_mm_data(
             self.info.get_image_size_with_most_features())
         target_num_frames = self.info.get_num_frames_with_most_features(
             seq_len, mm_counts)
+        target_video_size, _ = self.info._get_vision_info(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=target_num_frames,
+            image_processor=self.info.get_video_processor(),
+        )
 
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
                                    num_images=num_images),
             "video":
             self._get_dummy_videos(
-                width=target_width,
-                height=target_height,
+                width=target_video_size.width,
+                height=target_video_size.height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
             ),
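
Why the new start_num_frames parameter exists: _get_max_video_frames probes increasing frame counts until the next candidate would exceed the token budget, and the Qwen3-VL override starts the probe at 2 (matching its temporal_patch_size) instead of the old hard-coded start. A minimal sketch of that loop, with a hypothetical flat tokens_per_frame cost standing in for the real get_num_video_tokens call:

def max_video_frames(tokens_per_frame: int, max_tokens: int,
                     start_num_frames: int = 1) -> int:
    # Probe upward from the smallest encodable frame count (1 for
    # Qwen2-VL, 2 for Qwen3-VL) until the budget would be exceeded.
    num_frames = start_num_frames
    while (num_frames + 1) * tokens_per_frame <= max_tokens:
        num_frames += 1
    return num_frames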
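Why _get_vision_info now dispatches between two smart_resize variants: the video smart_resize in transformers additionally takes num_frames and temporal_factor, so the pixel budget applies to the whole temporally padded clip rather than a single frame, which is also why get_dummy_mm_data pre-resizes its dummy videos through the video processor. The stand-ins below are a simplified illustration of that budgeting under assumed rounding rules, not the transformers implementations:

import math

def image_smart_resize(height, width, factor, min_pixels, max_pixels):
    # Snap both sides to multiples of `factor`, then rescale so the
    # area lands inside [min_pixels, max_pixels].
    h = round(height / factor) * factor
    w = round(width / factor) * factor
    if h * w > max_pixels:
        scale = math.sqrt(height * width / max_pixels)
        h = math.floor(height / scale / factor) * factor
        w = math.floor(width / scale / factor) * factor
    elif h * w < min_pixels:
        scale = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * scale / factor) * factor
        w = math.ceil(width * scale / factor) * factor
    return h, w

def video_smart_resize(height, width, factor, min_pixels, max_pixels,
                       num_frames, temporal_factor):
    # Pad the frame count to a multiple of temporal_factor, then split
    # the per-video pixel budget across those frames.
    t = math.ceil(num_frames / temporal_factor) * temporal_factor
    return image_smart_resize(height, width, factor,
                              min_pixels // t, max_pixels // t)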
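The 12.5 multiplier in get_max_video_tokens is just the arithmetic from the NOTE in the patch: each video soft token expands to "<{timestamp} seconds>" (about 9.5 text tokens on average) plus the vision_start, video, and vision_end tokens, i.e. 9.5 + 3 = 12.5 prompt tokens per soft token. A worked check:

def formatted_video_tokens(video_soft_tokens: int) -> int:
    # 9.5 avg tokens for "<{timestamp} seconds>" plus 3 tokens for
    # vision_start / video / vision_end = 12.5 per soft token.
    return int(video_soft_tokens * (9.5 + 3.0))

assert formatted_video_tokens(1000) == 12500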