diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 3498d2994c2a..2efb605f203f 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -144,9 +144,9 @@ def torchao_quantize_param_data(param: torch.Tensor,
     """Quantize a Tensor with torchao quantization specified by torchao_config
 
     Args:
-        `param`: weight parameter of the linear module
-        `torchao_config`: type of quantization and their arguments we want to
-        use to quantize the Tensor
+        param: weight parameter of the linear module
+        torchao_config: type of quantization and their arguments we want to
+            use to quantize the Tensor
     """
     from torchao.core.config import AOBaseConfig
     from torchao.quantization import quantize_
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 6840cabbf1ae..62e458ec3c93 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -423,7 +423,7 @@ def w8a8_block_int8_matmul(
         Bs: The per-block quantization scale for `B`.
         block_size: The block size for per-block quantization. It should be
             2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.
 
     Returns:
         torch.Tensor: The result of matmul.
diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index 27ca010cdc3a..786a6e1b3e12 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -135,8 +135,8 @@ def triton_mrope(
     """Qwen2VL mrope kernel.
 
     Args:
-        query: [num_tokens, num_heads * head_size]
-        key: [num_tokens, num_kv_heads * head_size]
+        q: [num_tokens, num_heads * head_size]
+        k: [num_tokens, num_kv_heads * head_size]
         cos: [3, num_tokens, head_size //2 ]
             (T/H/W positions with multimodal inputs)
         sin: [3, num_tokens, head_size //2 ]
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 3d491be3156b..58296131fadb 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -171,51 +171,52 @@ class TensorizerConfig(MutableMapping):
     _is_sharded: bool = field(init=False, default=False)
     _fields: ClassVar[tuple[str, ...]]
     _keys: ClassVar[frozenset[str]]
-    """
-    Args for the TensorizerConfig class. These are used to configure the
-    behavior of model serialization and deserialization using Tensorizer.
+    """Configuration class for Tensorizer settings.
 
-    Args:
-        tensorizer_uri: Path to serialized model tensors. Can be a local file
-            path or a S3 URI. This is a required field unless lora_dir is
-            provided and the config is meant to be used for the
-            `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or
-            `lora_dir` is passed to this object's initializer, this is a required
-            argument.
-        tensorizer_dir: Path to a directory containing serialized model tensors,
-            and all other potential model artifacts to load the model, such as
-            configs and tokenizer files. Can be passed instead of `tensorizer_uri`
-            where the `model.tensors` file will be assumed to be in this
-            directory.
-        vllm_tensorized: If True, indicates that the serialized model is a
-            vLLM model. This is used to determine the behavior of the
-            TensorDeserializer when loading tensors from a serialized model.
-            It is far faster to deserialize a vLLM model as it utilizes
-            tensorizer's optimized GPU loading. Note that this is now
-            deprecated, as serialized vLLM models are now automatically
-            inferred as vLLM models.
-        verify_hash: If True, the hashes of each tensor will be verified against
-            the hashes stored in the metadata. A `HashMismatchError` will be
-            raised if any of the hashes do not match.
-        num_readers: Controls how many threads are allowed to read concurrently
-            from the source file. Default is `None`, which will dynamically set
-            the number of readers based on the number of available
-            resources and model size. This greatly increases performance.
-        encryption_keyfile: File path to a binary file containing a
-            binary key to use for decryption. `None` (the default) means
-            no decryption. See the example script in
-            examples/others/tensorize_vllm_model.py.
-        s3_access_key_id: The access key for the S3 bucket. Can also be set via
-            the S3_ACCESS_KEY_ID environment variable.
-        s3_secret_access_key: The secret access key for the S3 bucket. Can also
-            be set via the S3_SECRET_ACCESS_KEY environment variable.
-        s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
-            S3_ENDPOINT_URL environment variable.
-        lora_dir: Path to a directory containing LoRA adapter artifacts for
-            serialization or deserialization. When serializing LoRA adapters
-            this is the only necessary parameter to pass to this object's
-            initializer.
-    """
+    These settings configure the behavior of model serialization and
+    deserialization using Tensorizer.
+
+    Attributes:
+        tensorizer_uri: Path to serialized model tensors. Can be a local file
+            path or an S3 URI. This is a required field unless lora_dir is
+            provided and the config is meant to be used for the
+            `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or
+            `lora_dir` is passed to this object's initializer, this is
+            a required argument.
+        tensorizer_dir: Path to a directory containing serialized model tensors,
+            and all other potential model artifacts to load the model, such as
+            configs and tokenizer files. Can be passed instead of
+            `tensorizer_uri` where the `model.tensors` file will be assumed
+            to be in this directory.
+        vllm_tensorized: If True, indicates that the serialized model is a
+            vLLM model. This is used to determine the behavior of the
+            TensorDeserializer when loading tensors from a serialized model.
+            It is far faster to deserialize a vLLM model as it utilizes
+            tensorizer's optimized GPU loading. Note that this is now
+            deprecated, as serialized vLLM models are now automatically
+            inferred as vLLM models.
+        verify_hash: If True, the hashes of each tensor will be verified
+            against the hashes stored in the metadata. A `HashMismatchError`
+            will be raised if any of the hashes do not match.
+        num_readers: Controls how many threads are allowed to read concurrently
+            from the source file. Default is `None`, which will dynamically set
+            the number of readers based on the number of available
+            resources and model size. This greatly increases performance.
+        encryption_keyfile: File path to a binary file containing a
+            binary key to use for decryption. `None` (the default) means
+            no decryption. See the example script in
+            examples/others/tensorize_vllm_model.py.
+        s3_access_key_id: The access key for the S3 bucket. Can also be set via
+            the S3_ACCESS_KEY_ID environment variable.
+        s3_secret_access_key: The secret access key for the S3 bucket. Can also
+            be set via the S3_SECRET_ACCESS_KEY environment variable.
+        s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
+            S3_ENDPOINT_URL environment variable.
+        lora_dir: Path to a directory containing LoRA adapter artifacts for
+            serialization or deserialization. When serializing LoRA adapters
+            this is the only necessary parameter to pass to this object's
+            initializer.
+    """
 
     def __post_init__(self):
         # check if the configuration is for a sharded vLLM model
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 1c7960fa3e0a..db262447d7fa 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -143,16 +143,8 @@ class AriaProjector(nn.Module):
     projects ViT's outputs into MoE's inputs.
 
     Args:
-        patch_to_query_dict (dict): Maps patch numbers to their corresponding
-        query numbers,
-        e.g., {1225: 128, 4900: 256}. This allows for different query sizes
-        based on image resolution.
-        embed_dim (int): Embedding dimension.
-        num_heads (int): Number of attention heads.
-        kv_dim (int): Dimension of key and value.
-        ff_dim (int): Hidden dimension of the feed-forward network.
-        output_dim (int): Output dimension.
-        norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm.
+        config: [AriaConfig](https://huggingface.co/docs/transformers/main/model_doc/aria#transformers.AriaConfig)
+            containing projector configuration parameters.
 
     Outputs:
         A tensor with the shape of (batch_size, query_number, output_dim)
@@ -282,8 +274,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         Forward pass of the MoE Layer.
 
         Args:
-            hidden_states (torch.Tensor): Input tensor of shape (batch_size,
-            sequence_length, hidden_size).
+            hidden_states: Input tensor of shape
+                (batch_size, sequence_length, hidden_size).
 
         Returns:
             torch.Tensor: Output tensor after passing through the MoE layer.
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index fd4d820a01e9..242530817c64 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -401,8 +401,7 @@ def __init__(
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            hidden_states
-                torch.Tensor of *encoder* input embeddings.
+            hidden_states: torch.Tensor of *encoder* input embeddings.
         Returns:
             Encoder layer output torch.Tensor
         """
@@ -490,10 +489,8 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            decoder_hidden_states
-                torch.Tensor of *decoder* input embeddings.
-            encoder_hidden_states
-                torch.Tensor of *encoder* input embeddings.
+            decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
+            encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
         Returns:
             Decoder layer output torch.Tensor
         """
@@ -584,12 +581,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *encoder* input sequence tokens in the
+                vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *encoder* input sequence tokens.
         Returns:
             Decoder output torch.Tensor
         """
@@ -663,14 +658,11 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            decoder_input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            decoder_positions
-                Positions of *decoder* input sequence tokens.
-            encoder_hidden_states:
-                Tensor of encoder output embeddings
+            decoder_input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            decoder_positions: Positions of *decoder* input sequence tokens.
+            encoder_hidden_states: Tensor of encoder output embeddings.
         Returns:
             Decoder output torch.Tensor
         """
@@ -732,16 +724,13 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 encoder_positions: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *decoder* input sequence tokens.
-            encoder_input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-            encoder_positions:
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *decoder* input sequence tokens.
+            encoder_input_ids: Indices of *encoder* input sequence tokens
+                in the vocabulary.
+            encoder_positions: Positions of *encoder* input sequence tokens.
         Returns:
             Model output torch.Tensor
         """
@@ -848,14 +837,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
        Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_input_ids: torch.Tensor of *encoder* input token ids.
+            encoder_positions: torch.Tensor of *encoder* position indices.
         Returns:
             Output torch.Tensor
         """
@@ -912,8 +897,7 @@ class MBartEncoderLayer(BartEncoderLayer):
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            hidden_states
-                torch.Tensor of *encoder* input embeddings.
+            hidden_states: torch.Tensor of *encoder* input embeddings.
         Returns:
             Encoder layer output torch.Tensor
         """
@@ -1035,12 +1019,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *encoder* input sequence tokens in the
+                vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *encoder* input sequence tokens.
         Returns:
             Decoder output torch.Tensor
         """
@@ -1116,14 +1098,11 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            decoder_input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            decoder_positions
-                Positions of *decoder* input sequence tokens.
-            encoder_hidden_states:
-                Tensor of encoder output embeddings
+            decoder_input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            decoder_positions: Positions of *decoder* input sequence tokens.
+            encoder_hidden_states: Tensor of encoder output embeddings.
         Returns:
             Decoder output torch.Tensor
         """
@@ -1185,16 +1164,13 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 encoder_positions: torch.Tensor) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
-                Padding will be ignored by default should you
-                provide it.
-            positions
-                Positions of *decoder* input sequence tokens.
-            encoder_input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-            encoder_positions:
-                Positions of *encoder* input sequence tokens.
+            input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
+                Padding will be ignored by default should you provide it.
+            positions: Positions of *decoder* input sequence tokens.
+            encoder_input_ids: Indices of *encoder* input sequence tokens
+                in the vocabulary.
+            encoder_positions: Positions of *encoder* input sequence tokens.
         Returns:
             Model output torch.Tensor
         """
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index ed98a3008c56..c1e7a7d498b1 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -678,7 +678,6 @@ def forward(
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to
                 a batch.
-            pixel_values: The pixels in each input image.
 
         Info:
             [Blip2ImageInputs][]
diff --git a/vllm/model_executor/models/donut.py b/vllm/model_executor/models/donut.py
index c00db52371b6..23f4c6a4f93f 100644
--- a/vllm/model_executor/models/donut.py
+++ b/vllm/model_executor/models/donut.py
@@ -79,10 +79,8 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
         Returns:
             Output torch.Tensor
         """
@@ -351,14 +349,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_input_ids: torch.Tensor of *encoder* input token ids.
+            encoder_positions: torch.Tensor of *encoder* position indices.
         Returns:
             Output torch.Tensor
         """
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index d0881231fb1e..5e05e0c60f41 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -631,16 +631,14 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                Indices of *decoder* input sequence tokens in the vocabulary.
+            input_ids: Indices of *decoder* input sequence tokens
+                in the vocabulary.
                 Padding will be ignored by default should you provide it.
-            positions
-                Positions of *decoder* input sequence tokens.
-            encoder_input_ids
-                Indices of *encoder* input sequence tokens in the vocabulary.
-            encoder_positions:
-                Positions of *encoder* input sequence tokens.
+            positions: Positions of *decoder* input sequence tokens.
+            encoder_input_ids: Indices of *encoder* input sequence tokens
+                in the vocabulary.
+            encoder_positions: Positions of *encoder* input sequence tokens.
 
         Returns:
             Model output torch.Tensor
         """
@@ -699,14 +697,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_input_ids: torch.Tensor of *encoder* input token ids.
+            encoder_positions: torch.Tensor of *encoder* position indices.
         Returns:
             Output torch.Tensor
         """
@@ -1068,14 +1062,10 @@ def forward(
     ) -> torch.Tensor:
         r"""
         Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
+            input_ids: torch.Tensor of *decoder* input token ids.
+            positions: torch.Tensor of *decoder* position indices.
+            encoder_input_ids: torch.Tensor of *encoder* input token ids.
+            encoder_positions: torch.Tensor of *encoder* position indices.
         Returns:
             Output torch.Tensor
         """
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 539381b61800..e47216590ce0 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1583,17 +1583,10 @@ def forward(
                 **NOTE**: If mrope is enabled (default setting for GLM-4V
                 opensource models), the shape will be `(3, seq_len)`,
                 otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
-            second_per_grid_ts: Tensor `(num_videos)` of video time interval (
-                in seconds) for each grid along the temporal dimension in the
-                3D position IDs. `None` if no videos are passed.
+            intermediate_tensors: Optional intermediate tensors for pipeline
+                parallelism.
+            inputs_embeds: Optional pre-computed input embeddings.
+            **kwargs: Additional keyword arguments.
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
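Note: as a usage illustration for the `torchao_quantize_param_data` docstring touched above, here is a minimal, hypothetical sketch of the flow it describes: the bare weight is wrapped in a throwaway `nn.Linear` so that `quantize_`, which operates on modules, can rewrite it in place using an `AOBaseConfig` instance. This sketch is not part of the patch, and `Int8WeightOnlyConfig` is an assumption about the installed torchao version.

```python
# Hypothetical sketch of the quantization flow described in the
# torchao_quantize_param_data docstring; not part of this patch.
import torch
from torchao.core.config import AOBaseConfig
from torchao.quantization import Int8WeightOnlyConfig, quantize_


def quantize_weight(param: torch.Tensor,
                    torchao_config: AOBaseConfig) -> torch.Tensor:
    # Wrap the bare weight in a dummy nn.Linear so quantize_ can mutate
    # the module; weight shape is [out_features, in_features].
    dummy = torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
    dummy.weight = torch.nn.Parameter(param, requires_grad=False)
    # quantize_ replaces the weight with a torchao-quantized tensor
    # subclass configured by torchao_config.
    quantize_(dummy, torchao_config)
    return dummy.weight


weight = torch.randn(128, 256, dtype=torch.bfloat16)
qweight = quantize_weight(weight, Int8WeightOnlyConfig())
```

Wrapping the tensor in a dummy module is what lets a module-level API quantize a single parameter; the quantized subclass returned here is what vLLM would register back onto the linear layer.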