13 changes: 9 additions & 4 deletions vllm/model_executor/models/qwen2_vl.py
@@ -79,7 +79,7 @@
 logger = init_logger(__name__)

 # For profile run
-_MAX_FRAMES_PER_VIDEO = 32
+_MAX_FRAMES_PER_VIDEO = 14

 # === Vision Inputs === #

@@ -932,6 +932,7 @@ def get_num_image_tokens(
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
+            num_frames=1,
             image_processor=image_processor,
         )
         return num_image_tokens
@@ -956,6 +957,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
             image_width=9999999,
             image_height=9999999,
+            num_frames=1,
             image_processor=None,
         )
         return max_image_size
@@ -969,10 +971,12 @@ def get_max_image_tokens(self) -> int:
             image_processor=None,
         )

-    def _get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self,
+                              max_tokens: int,
+                              start_num_frames: int = 1) -> int:
         target_width, target_height = self.get_image_size_with_most_features()

-        num_frames = 0
+        num_frames = start_num_frames

         while True:
             next_num_frames = num_frames + 1
@@ -994,12 +998,13 @@ def get_num_frames_with_most_features(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
+        max_frames_per_video: int = _MAX_FRAMES_PER_VIDEO,
     ) -> int:
         max_videos = mm_counts.get("video", 0)

         max_total_frames = self._get_max_video_frames(seq_len)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+                                   max_frames_per_video)

         return max(max_frames_per_video, 1)
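The qwen2_vl.py changes above make the profiling frame cap a parameter and let the frame search start above zero. A minimal standalone sketch of that budgeting logic, assuming a flat hypothetical tokens_per_frame in place of the per-frame count that the real _get_vision_info computes:

# A minimal sketch, not vLLM's implementation: tokens_per_frame is a
# hypothetical stand-in for the real per-frame token count.
def max_video_frames(max_tokens: int, tokens_per_frame: int,
                     start_num_frames: int = 1) -> int:
    # Grow the frame count while one more frame still fits the token budget
    # (the PR changes the starting point from 0 to start_num_frames).
    num_frames = start_num_frames
    while (num_frames + 1) * tokens_per_frame <= max_tokens:
        num_frames += 1
    return num_frames

def frames_with_most_features(seq_len: int, num_videos: int,
                              tokens_per_frame: int,
                              max_frames_per_video: int) -> int:
    # Split the total frame budget evenly across videos, then apply the cap
    # that the PR turns into a parameter (default _MAX_FRAMES_PER_VIDEO).
    max_total_frames = max_video_frames(seq_len, tokens_per_frame)
    per_video = min(max_total_frames // max(num_videos, 1),
                    max_frames_per_video)
    return max(per_video, 1)

# Example: a 32768-token budget at ~768 tokens/frame across 2 videos,
# capped at 14 frames/video -> min(42 // 2, 14) == 14.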
70 changes: 65 additions & 5 deletions vllm/model_executor/models/qwen3_vl.py
@@ -33,11 +33,14 @@
 import torch.nn.functional as F
 from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
+    smart_resize as image_smart_resize)
 from transformers.models.qwen3_vl import (Qwen3VLProcessor,
                                           Qwen3VLVideoProcessor)
 from transformers.models.qwen3_vl.configuration_qwen3_vl import (
     Qwen3VLConfig, Qwen3VLVisionConfig)
+from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
+    smart_resize as video_smart_resize)
 from transformers.video_utils import VideoMetadata

 from vllm.attention.layer import check_upstream_fa_availability
@@ -85,6 +88,9 @@

 logger = init_logger(__name__)

+# Official recommended max pixels is 24576 * 32 * 32
+_MAX_FRAMES_PER_VIDEO = 24576
+

 class Qwen3_VisionPatchEmbed(nn.Module):

@@ -593,24 +599,39 @@ def _get_vision_info(
         image_height: int,
         num_frames: int = 2,
         do_resize: bool = True,
-        image_processor: Optional[Qwen2VLImageProcessorFast],
+        image_processor: Optional[Union[Qwen2VLImageProcessorFast,
+                                        Qwen3VLVideoProcessor]],
     ) -> tuple[ImageSize, int]:
-        if image_processor is None:
+        if image_processor is None and num_frames > 1:
+            image_processor = self.get_video_processor()
+        elif image_processor is None:
             image_processor = self.get_image_processor()

+        is_video = isinstance(image_processor, Qwen3VLVideoProcessor)
+
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = vision_config.temporal_patch_size

         if do_resize:
+            if is_video:
+                smart_resize = video_smart_resize
+                extra_kwargs = {
+                    "num_frames": num_frames,
+                    "temporal_factor": temporal_patch_size
+                }
+            else:
+                smart_resize = image_smart_resize
+                extra_kwargs = {}
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * merge_size,
                 min_pixels=image_processor.size["shortest_edge"],
                 max_pixels=image_processor.size["longest_edge"],
+                **extra_kwargs,
             )
             preprocessed_size = ImageSize(width=resized_width,
                                           height=resized_height)
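For context on the dispatch above: both smart_resize variants round each side to a multiple of factor and rescale into a pixel budget; the video variant additionally pads the frame count to temporal_patch_size. A simplified sketch of the shared spatial core, not the transformers implementation, with assumed defaults (factor = patch_size * merge_size = 16 * 2 = 32, max_pixels = 24576 * 32 * 32 from the constant above, min_pixels arbitrary):

import math

# Simplified sketch of smart_resize's core idea; defaults are assumptions,
# not values read from a real processor config.
def toy_smart_resize(height: int, width: int, factor: int = 32,
                     min_pixels: int = 4096,
                     max_pixels: int = 24576 * 32 * 32) -> tuple[int, int]:
    # Round each side to the nearest multiple of factor.
    h = round(height / factor) * factor
    w = round(width / factor) * factor
    if h * w > max_pixels:
        # Too large: shrink both sides, flooring to a multiple of factor.
        beta = math.sqrt((height * width) / max_pixels)
        h = math.floor(height / beta / factor) * factor
        w = math.floor(width / beta / factor) * factor
    elif h * w < min_pixels:
        # Too small: grow both sides, ceiling to a multiple of factor.
        beta = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * beta / factor) * factor
        w = math.ceil(width * beta / factor) * factor
    return h, w

# e.g. toy_smart_resize(9999999, 9999999) -> (4992, 4992), staying within the
# pixel budget; this is how get_image_size_with_most_features probes the
# maximum usable size with oversized inputs.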
@@ -629,6 +650,39 @@ def _get_vision_info(

         return preprocessed_size, num_vision_tokens

+    def _get_max_video_frames(self,
+                              max_tokens: int,
+                              start_num_frames: int = 2) -> int:
+        return super()._get_max_video_frames(max_tokens,
+                                             start_num_frames=start_num_frames)
+
+    def get_num_frames_with_most_features(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        return super().get_num_frames_with_most_features(
+            seq_len, mm_counts, max_frames_per_video=_MAX_FRAMES_PER_VIDEO)
+
+    def get_max_video_tokens(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+        video_soft_tokens = self.get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self.get_num_frames_with_most_features(
+                seq_len, mm_counts),
+            image_processor=None,
+        )
+
+        # NOTE: By default in Qwen3-VL, one video token is converted to
+        # "<{timestamp} seconds>" (on average 9.5 tokens) + vision_start_token + video_token + vision_end_token  # noqa: E501
+        formatted_video_soft_tokens = video_soft_tokens * 12.5
+        return int(formatted_video_soft_tokens)
+
     def _calculate_timestamps(self, indices: list[int] | torch.Tensor,
                               video_fps: float, merge_size: int):
         if not isinstance(indices, list):
@@ -698,15 +752,21 @@ def get_dummy_mm_data(
             self.info.get_image_size_with_most_features())
         target_num_frames = self.info.get_num_frames_with_most_features(
             seq_len, mm_counts)
+        target_video_size, _ = self.info._get_vision_info(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=target_num_frames,
+            image_processor=self.info.get_video_processor(),
+        )
Review comment on lines +755 to +760, by @ywang96 (Member), Sep 28, 2025:

This is in fact pretty important. Previously we were sending a [32, 4096, 4096, 3] input tensor, which would OOM if we turned on DP ViT; this is now corrected to [24576, 32, 32, 3].

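Rough size arithmetic behind that comment (uint8 elements, shapes taken from the comment itself; note that 24576 * 32 * 32 is exactly the recommended max-pixels constant introduced above):

# Byte counts for the dummy video tensors named in the comment (one uint8
# byte per element); shapes come from the comment, not recomputed here.
def nbytes(shape: tuple[int, ...]) -> int:
    n = 1
    for dim in shape:
        n *= dim
    return n

before = nbytes((32, 4096, 4096, 3))  # 1,610,612,736 B, ~1.5 GiB per video
after = nbytes((24576, 32, 32, 3))    # 75,497,472 B, ~72 MiB per video
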
         return {
             "image":
             self._get_dummy_images(width=target_width,
                                    height=target_height,
                                    num_images=num_images),
             "video":
             self._get_dummy_videos(
-                width=target_width,
-                height=target_height,
+                width=target_video_size.width,
+                height=target_video_size.height,
                 num_frames=target_num_frames,
                 num_videos=num_videos,
             ),
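One step in get_max_video_tokens worth spelling out is the 12.5 multiplier. A worked version, using the PR's own stated average of 9.5 text tokens for the "<{timestamp} seconds>" label plus the three wrapper tokens named in the NOTE:

# Worked version of the 12.5x factor; 9.5 is the PR's stated average for the
# "<{timestamp} seconds>" text, and 3 covers vision_start_token +
# video_token + vision_end_token.
TIMESTAMP_TOKENS = 9.5
WRAPPER_TOKENS = 3.0
PER_TOKEN_FACTOR = TIMESTAMP_TOKENS + WRAPPER_TOKENS  # 12.5

def max_formatted_video_tokens(video_soft_tokens: int) -> int:
    # Upper bound on prompt tokens once every video token is formatted.
    return int(video_soft_tokens * PER_TOKEN_FACTOR)

assert max_formatted_video_tokens(1000) == 12500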