From a3bd4e2c89c261b50b44919066eccb92d43c6ee7 Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenia Date: Wed, 8 Oct 2025 11:32:09 +0300 Subject: [PATCH 1/2] Allow passing "mm_processor_kwargs": dict(max_num_tiles=2), Signed-off-by: Eugene Khvedchenia --- .../model_executor/models/nano_nemotron_vl.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 039ffbddf8db..7c6b82a4b313 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -93,6 +93,7 @@ # Profiling MAX_FRAMES = 16 +DEFAULT_NUM_TILES = 12 class NanoNemotronVLImagePixelInputs(TypedDict): @@ -255,13 +256,19 @@ class BaseNanoNemotronVLProcessor(ABC): """ def __init__( - self, config: PretrainedConfig, tokenizer: AnyTokenizer, *args, **kwargs + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *args, + max_num_tiles: Optional[int] = None, + **kwargs, ) -> None: super().__init__() self.config = config self.tokenizer = tokenizer + self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES image_size: int = config.force_image_size patch_size: int = config.patch_size @@ -361,7 +368,7 @@ def __call__( ) -> BatchFeature: # Use default if not provided if max_num_tiles is None: - max_num_tiles = 12 + max_num_tiles = self.max_num_tiles text, images = [self._make_batch_input(x) for x in (text, images)] @@ -390,6 +397,7 @@ def __init__( config: PretrainedConfig, tokenizer: AnyTokenizer, *, + max_num_tiles: Optional[int] = None, min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, @@ -399,6 +407,7 @@ def __init__( super().__init__( config=config, tokenizer=tokenizer, + max_num_tiles=max_num_tiles, min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, @@ -506,7 +515,7 @@ def __call__( ) -> BatchFeature: # Use default if not provided if max_num_tiles is None: - max_num_tiles = 12 + max_num_tiles = self.max_num_tiles text, images, videos = [ self._make_batch_input(x) for x in (text, images, videos) @@ -635,7 +644,7 @@ def get_image_size_with_most_features(self, max_num_tiles: int) -> ImageSize: def get_max_image_tokens(self) -> int: processor = self.get_hf_processor() # Use default max_num_tiles for max tokens calculation - max_num_tiles = 12 + max_num_tiles = processor.max_num_tiles target_width, target_height = self.get_image_size_with_most_features( max_num_tiles ) @@ -768,7 +777,9 @@ def get_replacement_custom(item_idx: int): else: image_size = images.get_image_size(item_idx) # Extract max_num_tiles from kwargs, default to 12 - max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12) + max_num_tiles = hf_processor_mm_kwargs.get( + "max_num_tiles", hf_processor.max_num_tiles + ) feature_size = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, From 16568945e0635f06580ae69554ac095b9f008fea Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenia Date: Wed, 8 Oct 2025 11:53:39 +0300 Subject: [PATCH 2/2] Ensure video modality always uses 1 tile (performance optimization) Signed-off-by: Eugene Khvedchenia --- vllm/model_executor/models/nano_nemotron_vl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 7c6b82a4b313..91dfa6735534 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -228,6 +228,8 @@ def video_to_pixel_values( max_num_tiles: int = 1, use_thumbnail: bool, ) -> torch.Tensor: + assert max_num_tiles == 1, "Video modality always uses one tile" + # Convert each frame to a single resized tile tensor consistent # with image path frames_tensors: list[torch.Tensor] = [] @@ -530,7 +532,7 @@ def __call__( text, video_inputs = self._preprocess_video( text=text, videos=videos, - max_num_tiles=max_num_tiles, + max_num_tiles=1, dynamic_image_size=dynamic_image_size, )