From a3bd4e2c89c261b50b44919066eccb92d43c6ee7 Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Wed, 8 Oct 2025 11:32:09 +0300
Subject: [PATCH 1/2] Allow overriding max_num_tiles via
 "mm_processor_kwargs": dict(max_num_tiles=2)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 .../model_executor/models/nano_nemotron_vl.py | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 039ffbddf8db..7c6b82a4b313 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -93,6 +93,7 @@
 
 # Profiling
 MAX_FRAMES = 16
+DEFAULT_NUM_TILES = 12
 
 
 class NanoNemotronVLImagePixelInputs(TypedDict):
@@ -255,13 +256,19 @@ class BaseNanoNemotronVLProcessor(ABC):
     """
 
     def __init__(
-        self, config: PretrainedConfig, tokenizer: AnyTokenizer, *args, **kwargs
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *args,
+        max_num_tiles: Optional[int] = None,
+        **kwargs,
     ) -> None:
         super().__init__()
 
         self.config = config
         self.tokenizer = tokenizer
 
+        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
         image_size: int = config.force_image_size
         patch_size: int = config.patch_size
 
@@ -361,7 +368,7 @@ def __call__(
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles
 
         text, images = [self._make_batch_input(x) for x in (text, images)]
 
@@ -390,6 +397,7 @@ def __init__(
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
+        max_num_tiles: Optional[int] = None,
         min_dynamic_patch: Optional[int] = None,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
@@ -399,6 +407,7 @@ def __init__(
         super().__init__(
             config=config,
             tokenizer=tokenizer,
+            max_num_tiles=max_num_tiles,
             min_dynamic_patch=min_dynamic_patch,
             max_dynamic_patch=max_dynamic_patch,
             dynamic_image_size=dynamic_image_size,
@@ -506,7 +515,7 @@ def __call__(
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles
 
         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
@@ -635,7 +644,7 @@ def get_image_size_with_most_features(self, max_num_tiles: int) -> ImageSize:
     def get_max_image_tokens(self) -> int:
         processor = self.get_hf_processor()
         # Use default max_num_tiles for max tokens calculation
-        max_num_tiles = 12
+        max_num_tiles = processor.max_num_tiles
         target_width, target_height = self.get_image_size_with_most_features(
             max_num_tiles
         )
@@ -768,7 +777,9 @@ def get_replacement_custom(item_idx: int):
             else:
                 image_size = images.get_image_size(item_idx)
                 # Extract max_num_tiles from kwargs, default to 12
-                max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12)
+                max_num_tiles = hf_processor_mm_kwargs.get(
+                    "max_num_tiles", hf_processor.max_num_tiles
+                )
                 feature_size = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,

From 16568945e0635f06580ae69554ac095b9f008fea Mon Sep 17 00:00:00 2001
From: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Date: Wed, 8 Oct 2025 11:53:39 +0300
Subject: [PATCH 2/2] Ensure video modality always uses 1 tile (performance
 optimization)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
---
 vllm/model_executor/models/nano_nemotron_vl.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 7c6b82a4b313..91dfa6735534 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -228,6 +228,8 @@ def video_to_pixel_values(
     max_num_tiles: int = 1,
     use_thumbnail: bool,
 ) -> torch.Tensor:
+    assert max_num_tiles == 1, "Video modality always uses one tile"
+
     # Convert each frame to a single resized tile tensor consistent
     # with image path
     frames_tensors: list[torch.Tensor] = []
@@ -530,7 +532,7 @@ def __call__(
         text, video_inputs = self._preprocess_video(
             text=text,
             videos=videos,
-            max_num_tiles=max_num_tiles,
+            max_num_tiles=1,
             dynamic_image_size=dynamic_image_size,
         )