10 changes: 5 additions & 5 deletions vllm/model_executor/layers/quantization/torchao.py
@@ -144,9 +144,9 @@ def torchao_quantize_param_data(param: torch.Tensor,
     """Quantize a Tensor with torchao quantization specified by torchao_config

     Args:
-        `param`: weight parameter of the linear module
-        `torchao_config`: type of quantization and their arguments we want to
-            use to quantize the Tensor
+        param: weight parameter of the linear module
+        torchao_config: type of quantization and their arguments we want to
+            use to quantize the Tensor
     """
     from torchao.core.config import AOBaseConfig
     from torchao.quantization import quantize_
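For context on the helper this docstring documents: it quantizes a bare weight by wrapping it in a throwaway `nn.Linear` and applying the torchao config via `quantize_`. A minimal sketch of that pattern, assuming torchao's config-style API; `Int8WeightOnlyConfig` here is just an illustrative `AOBaseConfig` subclass, not something this PR prescribes:

```python
import torch
from torchao.quantization import Int8WeightOnlyConfig, quantize_

# Stand-in for a linear module's weight parameter.
param = torch.randn(1024, 1024)

# quantize_ operates on modules, so wrap the raw tensor in a dummy Linear,
# then let torchao swap its weight for a quantized tensor subclass in place.
dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
dummy_linear.weight = torch.nn.Parameter(param, requires_grad=False)
quantize_(dummy_linear, Int8WeightOnlyConfig())

quantized_weight = dummy_linear.weight  # quantized tensor subclass
print(type(quantized_weight))
```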
@@ -172,8 +172,8 @@ class TorchAOLinearMethod(LinearMethodBase):
     """Linear method for torchao.

     Args:
-        torchao_config: The torchao quantization config, a string
-            that encodes the type of quantization and all relevant arguments.
+        quant_config: The torchao quantization config, a string that encodes
+            the type of quantization and all relevant arguments.
     """

     def __init__(self, quant_config: TorchAOConfig):
@@ -423,7 +423,7 @@ def w8a8_block_int8_matmul(
         Bs: The per-block quantization scale for `B`.
         block_size: The block size for per-block quantization. It should be
             2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.

     Returns:
         torch.Tensor: The result of matmul.
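For readers of the corrected docstring: the kernel multiplies int8 activations and weights with block-wise scales. Below is a rough pure-PyTorch reference of that math, under assumed layouts (`As: [M, K/block_k]` per-token-group scales for `A`, `Bs: [N/block_n, K/block_k]` per-block scales for `B`); the Triton kernel's exact contract lives in the source, this is only a readability sketch:

```python
import torch

def w8a8_block_int8_ref(A, As, B, Bs, block_size, output_dtype=torch.float16):
    """Reference for a block-wise int8 matmul: C = dequant(A) @ dequant(B).T."""
    block_n, block_k = block_size
    M, K = A.shape
    N = B.shape[0]
    C = torch.zeros(M, N, dtype=torch.float32, device=A.device)
    for kb in range(0, K, block_k):
        ki = kb // block_k
        # Dequantize one K-block of A with its per-token scale column.
        a = A[:, kb:kb + block_k].float() * As[:, ki:ki + 1]
        for nb in range(0, N, block_n):
            ni = nb // block_n
            # Dequantize the matching (N-block, K-block) tile of B.
            b = B[nb:nb + block_n, kb:kb + block_k].float() * Bs[ni, ki]
            C[:, nb:nb + block_n] += a @ b.t()
    return C.to(output_dtype)

# Example shapes with the docstring's [128, 128] block size:
M, N, K = 4, 256, 512
A = torch.randint(-128, 128, (M, K), dtype=torch.int8)
B = torch.randint(-128, 128, (N, K), dtype=torch.int8)
As = torch.rand(M, K // 128) * 0.01
Bs = torch.rand(N // 128, K // 128) * 0.01
out = w8a8_block_int8_ref(A, As, B, Bs, [128, 128])  # [M, N], float16
```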
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -135,8 +135,8 @@ def triton_mrope(
     """Qwen2VL mrope kernel.

     Args:
-        query: [num_tokens, num_heads * head_size]
-        key: [num_tokens, num_kv_heads * head_size]
+        q: [num_tokens, num_heads * head_size]
+        k: [num_tokens, num_kv_heads * head_size]
         cos: [3, num_tokens, head_size //2 ]
             (T/H/W positions with multimodal inputs)
         sin: [3, num_tokens, head_size //2 ]
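To make the fixed shape annotations concrete, a shape-only sketch of the tensors this kernel expects, taken directly from the docstring (all values are random placeholders):

```python
import torch

num_tokens, num_heads, num_kv_heads, head_size = 16, 8, 2, 64

# Flattened query/key, as the renamed q/k annotations describe.
q = torch.randn(num_tokens, num_heads * head_size)
k = torch.randn(num_tokens, num_kv_heads * head_size)

# One cos/sin table per rotary section: temporal/height/width (T/H/W),
# each covering half the head dimension.
cos = torch.randn(3, num_tokens, head_size // 2)
sin = torch.randn(3, num_tokens, head_size // 2)
```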
89 changes: 45 additions & 44 deletions vllm/model_executor/model_loader/tensorizer.py
@@ -171,51 +171,52 @@ class TensorizerConfig(MutableMapping):
     _is_sharded: bool = field(init=False, default=False)
     _fields: ClassVar[tuple[str, ...]]
     _keys: ClassVar[frozenset[str]]
-    """
-    Args for the TensorizerConfig class. These are used to configure the
-    behavior of model serialization and deserialization using Tensorizer.
+    """Configuration class for Tensorizer settings.

-    Args:
-        tensorizer_uri: Path to serialized model tensors. Can be a local file
-            path or a S3 URI. This is a required field unless lora_dir is
-            provided and the config is meant to be used for the
-            `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or
-            `lora_dir` is passed to this object's initializer, this is a required
-            argument.
-        tensorizer_dir: Path to a directory containing serialized model tensors,
-            and all other potential model artifacts to load the model, such as
-            configs and tokenizer files. Can be passed instead of `tensorizer_uri`
-            where the `model.tensors` file will be assumed to be in this
-            directory.
-        vllm_tensorized: If True, indicates that the serialized model is a
-            vLLM model. This is used to determine the behavior of the
-            TensorDeserializer when loading tensors from a serialized model.
-            It is far faster to deserialize a vLLM model as it utilizes
-            tensorizer's optimized GPU loading. Note that this is now
-            deprecated, as serialized vLLM models are now automatically
-            inferred as vLLM models.
-        verify_hash: If True, the hashes of each tensor will be verified against
-            the hashes stored in the metadata. A `HashMismatchError` will be
-            raised if any of the hashes do not match.
-        num_readers: Controls how many threads are allowed to read concurrently
-            from the source file. Default is `None`, which will dynamically set
-            the number of readers based on the number of available
-            resources and model size. This greatly increases performance.
-        encryption_keyfile: File path to a binary file containing a
-            binary key to use for decryption. `None` (the default) means
-            no decryption. See the example script in
-            examples/others/tensorize_vllm_model.py.
-        s3_access_key_id: The access key for the S3 bucket. Can also be set via
-            the S3_ACCESS_KEY_ID environment variable.
-        s3_secret_access_key: The secret access key for the S3 bucket. Can also
-            be set via the S3_SECRET_ACCESS_KEY environment variable.
-        s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
-            S3_ENDPOINT_URL environment variable.
-        lora_dir: Path to a directory containing LoRA adapter artifacts for
-            serialization or deserialization. When serializing LoRA adapters
-            this is the only necessary parameter to pass to this object's
-            initializer.
-    """
+    These settings configure the behavior of model serialization and
+    deserialization using Tensorizer.
+
+    Attributes:
+        tensorizer_uri: Path to serialized model tensors. Can be a local file
+            path or a S3 URI. This is a required field unless lora_dir is
+            provided and the config is meant to be used for the
+            `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or
+            `lora_dir` is passed to this object's initializer, this is
+            a required argument.
+        tensorizer_dir: Path to a directory containing serialized model tensors,
+            and all other potential model artifacts to load the model, such as
+            configs and tokenizer files. Can be passed instead of
+            `tensorizer_uri` where the `model.tensors` file will be assumed
+            to be in this directory.
+        vllm_tensorized: If True, indicates that the serialized model is a
+            vLLM model. This is used to determine the behavior of the
+            TensorDeserializer when loading tensors from a serialized model.
+            It is far faster to deserialize a vLLM model as it utilizes
+            tensorizer's optimized GPU loading. Note that this is now
+            deprecated, as serialized vLLM models are now automatically
+            inferred as vLLM models.
+        verify_hash: If True, the hashes of each tensor will be verified
+            against the hashes stored in the metadata. A `HashMismatchError`
+            will be raised if any of the hashes do not match.
+        num_readers: Controls how many threads are allowed to read concurrently
+            from the source file. Default is `None`, which will dynamically set
+            the number of readers based on the number of available
+            resources and model size. This greatly increases performance.
+        encryption_keyfile: File path to a binary file containing a
+            binary key to use for decryption. `None` (the default) means
+            no decryption. See the example script in
+            examples/others/tensorize_vllm_model.py.
+        s3_access_key_id: The access key for the S3 bucket. Can also be set via
+            the S3_ACCESS_KEY_ID environment variable.
+        s3_secret_access_key: The secret access key for the S3 bucket. Can also
+            be set via the S3_SECRET_ACCESS_KEY environment variable.
+        s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
+            S3_ENDPOINT_URL environment variable.
+        lora_dir: Path to a directory containing LoRA adapter artifacts for
+            serialization or deserialization. When serializing LoRA adapters
+            this is the only necessary parameter to pass to this object's
+            initializer.
+    """

     def __post_init__(self):
         # check if the configuration is for a sharded vLLM model
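As a usage illustration for the reworded docstring, constructing the config from the documented attributes might look like the sketch below; the URI and endpoint are placeholders, and credentials are left to the documented environment variables:

```python
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

# Deserialize from S3, with hash verification and a dynamically chosen
# reader count. Credentials could equally come from the S3_ACCESS_KEY_ID /
# S3_SECRET_ACCESS_KEY / S3_ENDPOINT_URL environment variables.
config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/vllm/model.tensors",  # placeholder URI
    verify_hash=True,
    num_readers=None,  # None: pick reader count from resources/model size
    s3_endpoint="http://localhost:9000",  # placeholder endpoint
)
```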
16 changes: 4 additions & 12 deletions vllm/model_executor/models/aria.py
@@ -143,16 +143,8 @@ class AriaProjector(nn.Module):
     projects ViT's outputs into MoE's inputs.

     Args:
-        patch_to_query_dict (dict): Maps patch numbers to their corresponding
-            query numbers,
-            e.g., {1225: 128, 4900: 256}. This allows for different query sizes
-            based on image resolution.
-        embed_dim (int): Embedding dimension.
-        num_heads (int): Number of attention heads.
-        kv_dim (int): Dimension of key and value.
-        ff_dim (int): Hidden dimension of the feed-forward network.
-        output_dim (int): Output dimension.
-        norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm.
+        config: [AriaConfig](https://huggingface.co/docs/transformers/main/model_doc/aria#transformers.AriaConfig)
+            containing projector configuration parameters.

     Outputs:
         A tensor with the shape of (batch_size, query_number, output_dim)
@@ -282,8 +274,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         Forward pass of the MoE Layer.

         Args:
-            hidden_states (torch.Tensor): Input tensor of shape (batch_size,
-                sequence_length, hidden_size).
+            hidden_states: Input tensor of shape
+                (batch_size, sequence_length, hidden_size).

         Returns:
             torch.Tensor: Output tensor after passing through the MoE layer.
104 changes: 40 additions & 64 deletions vllm/model_executor/models/bart.py
@@ -401,8 +401,7 @@ def __init__(
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            hidden_states
-                torch.Tensor of *encoder* input embeddings.
+            hidden_states: torch.Tensor of *encoder* input embeddings.
         Returns:
             Encoder layer output torch.Tensor
         """
@@ -490,10 +489,8 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            decoder_hidden_states
-                torch.Tensor of *decoder* input embeddings.
-            encoder_hidden_states
-                torch.Tensor of *encoder* input embeddings.
+            decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
+            encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
         Returns:
             Decoder layer output torch.Tensor
         """
@@ -584,12 +581,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *encoder* input sequence tokens in the
+                vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *encoder* input sequence tokens.
         Returns:
             Decoder output torch.Tensor
         """
@@ -663,14 +658,11 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            decoder_input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            decoder_positions
-                Positions of *decoder* input sequence tokens.
-            encoder_hidden_states:
-                Tensor of encoder output embeddings
+            decoder_input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            decoder_positions: Positions of *decoder* input sequence tokens.
+            encoder_hidden_states: Tensor of encoder output embeddings.
         Returns:
             Decoder output torch.Tensor
         """
@@ -732,16 +724,13 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 encoder_positions: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *decoder* input sequence tokens.
-            encoder_input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-            encoder_positions:
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *decoder* input sequence tokens.
+            encoder_input_ids: Indices of *encoder* input sequence tokens
+                in the vocabulary.
+            encoder_positions: Positions of *encoder* input sequence tokens.
         Returns:
             Model output torch.Tensor
         """
@@ -848,14 +837,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_input_ids: torch.Tensor of *encoder* input token ids.
+            encoder_positions: torch.Tensor of *encoder* position indices.
         Returns:
             Output torch.Tensor
         """
@@ -912,8 +897,7 @@ class MBartEncoderLayer(BartEncoderLayer):
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            hidden_states
-                torch.Tensor of *encoder* input embeddings.
+            hidden_states: torch.Tensor of *encoder* input embeddings.
         Returns:
             Encoder layer output torch.Tensor
         """
@@ -1035,12 +1019,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *encoder* input sequence tokens in the
+                vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *encoder* input sequence tokens.
         Returns:
             Decoder output torch.Tensor
         """
@@ -1116,14 +1098,11 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            decoder_input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            decoder_positions
-                Positions of *decoder* input sequence tokens.
-            encoder_hidden_states:
-                Tensor of encoder output embeddings
+            decoder_input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            decoder_positions: Positions of *decoder* input sequence tokens.
+            encoder_hidden_states: Tensor of encoder output embeddings.
         Returns:
             Decoder output torch.Tensor
         """
@@ -1185,16 +1164,13 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 encoder_positions: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *decoder* input sequence tokens.
-            encoder_input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-            encoder_positions:
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *decoder* input sequence tokens.
+            encoder_input_ids: Indices of *encoder* input sequence tokens
+                in the vocabulary.
+            encoder_positions: Positions of *encoder* input sequence tokens.
         Returns:
             Model output torch.Tensor
         """
1 change: 0 additions & 1 deletion vllm/model_executor/models/blip2.py
@@ -678,7 +678,6 @@ def forward(
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            pixel_values: The pixels in each input image.

         Info:
             [Blip2ImageInputs][]
18 changes: 6 additions & 12 deletions vllm/model_executor/models/donut.py
@@ -79,10 +79,8 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
         Returns:
             Output torch.Tensor
         """
@@ -351,14 +349,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_input_ids: torch.Tensor of *encoder* input token ids.
+            encoder_positions: torch.Tensor of *encoder* position indices
         Returns:
             Output torch.Tensor
         """