25 changes: 19 additions & 6 deletions vllm/model_executor/models/nano_nemotron_vl.py
@@ -93,6 +93,7 @@

 # Profiling
 MAX_FRAMES = 16
+DEFAULT_NUM_TILES = 12


 class NanoNemotronVLImagePixelInputs(TypedDict):
@@ -227,6 +228,8 @@ def video_to_pixel_values(
     max_num_tiles: int = 1,
     use_thumbnail: bool,
 ) -> torch.Tensor:
+    assert max_num_tiles == 1, "Video modality always uses one tile"
+
     # Convert each frame to a single resized tile tensor consistent
     # with image path
     frames_tensors: list[torch.Tensor] = []
@@ -255,13 +258,19 @@ class BaseNanoNemotronVLProcessor(ABC):
     """

     def __init__(
-        self, config: PretrainedConfig, tokenizer: AnyTokenizer, *args, **kwargs
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *args,
+        max_num_tiles: Optional[int] = None,
+        **kwargs,
     ) -> None:
         super().__init__()

         self.config = config
         self.tokenizer = tokenizer

+        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
         image_size: int = config.force_image_size
         patch_size: int = config.patch_size

@@ -361,7 +370,7 @@ def __call__(
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles

         text, images = [self._make_batch_input(x) for x in (text, images)]

@@ -390,6 +399,7 @@ def __init__(
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
+        max_num_tiles: Optional[int] = None,
         min_dynamic_patch: Optional[int] = None,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
@@ -399,6 +409,7 @@ def __init__(
         super().__init__(
             config=config,
             tokenizer=tokenizer,
+            max_num_tiles=max_num_tiles,
             min_dynamic_patch=min_dynamic_patch,
             max_dynamic_patch=max_dynamic_patch,
             dynamic_image_size=dynamic_image_size,
@@ -506,7 +517,7 @@ def __call__(
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles

         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
@@ -521,7 +532,7 @@ def __call__(
             text, video_inputs = self._preprocess_video(
                 text=text,
                 videos=videos,
-                max_num_tiles=max_num_tiles,
+                max_num_tiles=1,
                 dynamic_image_size=dynamic_image_size,
             )

@@ -635,7 +646,7 @@ def get_image_size_with_most_features(self, max_num_tiles: int) -> ImageSize:
     def get_max_image_tokens(self) -> int:
         processor = self.get_hf_processor()
         # Use default max_num_tiles for max tokens calculation
-        max_num_tiles = 12
+        max_num_tiles = processor.max_num_tiles
         target_width, target_height = self.get_image_size_with_most_features(
             max_num_tiles
         )
@@ -768,7 +779,9 @@ def get_replacement_custom(item_idx: int):
         else:
             image_size = images.get_image_size(item_idx)
             # Extract max_num_tiles from kwargs, default to 12
-            max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12)
+            max_num_tiles = hf_processor_mm_kwargs.get(
+                "max_num_tiles", hf_processor.max_num_tiles
+            )
             feature_size = self.info.get_num_image_tokens(
                 image_width=image_size.width,
                 image_height=image_size.height,
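For context, a minimal sketch of how the new knob is expected to be exercised end to end, assuming vLLM's standard mm_processor_kwargs override path; the checkpoint name and tile count below are illustrative, not taken from this PR:

# Hypothetical usage sketch: overriding the processor's default tiling
# at engine construction time.
from vllm import LLM

llm = LLM(
    model="nvidia/NVIDIA-Nemotron-Nano-VL-8B-V1",  # illustrative checkpoint
    # Forwarded into BaseNanoNemotronVLProcessor.__init__; falls back to
    # DEFAULT_NUM_TILES (12) when omitted. Video inputs are always
    # preprocessed with a single tile, regardless of this value.
    mm_processor_kwargs={"max_num_tiles": 6},
)

Design note: the "max_num_tiles or DEFAULT_NUM_TILES" fallback preserves the previously hardcoded behavior of 12 tiles for callers that pass nothing, while the new assert in video_to_pixel_values turns the single-tile video invariant into a loud failure instead of silent mis-tiling.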