44 changes: 23 additions & 21 deletions docs/configuration/conserving_memory.md
@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
```python
from vllm import LLM

llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
tensor_parallel_size=2)
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
```

!!! warning
@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
```python
from vllm import LLM

llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
```

## Reduce CUDA Graphs
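
Capturing fewer CUDA graph shapes also trims memory before resorting to eager mode. A minimal sketch, assuming your vLLM release exposes `CompilationConfig` and its `cudagraph_capture_sizes` field (verify against your installed version):

```python
from vllm import LLM
from vllm.config import CompilationConfig

# Assumption: CompilationConfig.cudagraph_capture_sizes is available in this vLLM release.
# Capture CUDA graphs only for a few small batch sizes instead of the full default set.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(cudagraph_capture_sizes=[1, 2, 4, 8]),
)
```
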
@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python
from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
enforce_eager=True)
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
```

## Adjust cache size
@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM

# Accept up to 3 images and 1 video per prompt
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"image": 3, "video": 1})
llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"image": 3, "video": 1},
)
```

You can go a step further and disable unused modalities completely by setting their limit to zero.
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM

# Accept any number of images but no videos
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"video": 0})
llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"video": 0},
)
```

You can even run a multi-modal model for text-only inference:
@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM

# Don't accept images. Just text.
llm = LLM(model="google/gemma-3-27b-it",
limit_mm_per_prompt={"image": 0})
llm = LLM(
model="google/gemma-3-27b-it",
limit_mm_per_prompt={"image": 0},
)
```

### Configurable options
@@ -173,14 +175,14 @@ Here are some examples:
from vllm import LLM

# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})
llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28
)

# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
llm = LLM(
model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12
)
```
24 changes: 15 additions & 9 deletions docs/configuration/optimization.md
@@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM(
model="meta-llama/Llama-3.3-70B-Instruct,
tensor_parallel_size=4,
pipeline_parallel_size=2
pipeline_parallel_size=2,
)
```

@@ -257,18 +257,24 @@ Examples:

```python
# Use a larger cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=8)
llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=8,
)

# Use a shared-memory based IPC cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
tensor_parallel_size=2,
mm_processor_cache_type="shm",
mm_processor_cache_gb=8)
llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
tensor_parallel_size=2,
mm_processor_cache_type="shm",
mm_processor_cache_gb=8,
)

# Disable the cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=0)
llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=0,
)
```

### Cache Placement
4 changes: 2 additions & 2 deletions docs/contributing/model/basic.md
@@ -73,8 +73,8 @@ def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor:
...
```
42 changes: 23 additions & 19 deletions docs/contributing/model/multimodal.md
@@ -16,7 +16,7 @@ Further update the model as follows:
...

@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"):
return "<image>"

@@ -45,14 +45,14 @@ Further update the model as follows:
...

def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:

assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

self,
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt:

```python
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None, "video": 1}
```

@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python
def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"],
height=image_processor.size["height"])
return ImageSize(
width=image_processor.size["width"],
height=image_processor.size["height"],
)
```

Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
self._get_dummy_images(
width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides,
)
}
```

@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY

+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder)
+ @MULTIMODAL_REGISTRY.register_processor(
+ YourMultiModalProcessor,
+ info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```

2 changes: 1 addition & 1 deletion docs/contributing/model/registration.md
@@ -42,7 +42,7 @@ def register():

ModelRegistry.register_model(
"YourModelForCausalLM",
"your_code:YourModelForCausalLM"
"your_code:YourModelForCausalLM",
)
```
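
A minimal usage sketch of exercising this registration inline in a script before the engine is constructed (the `your_code` module path and the checkpoint path below are placeholders):

```python
from vllm import LLM, ModelRegistry

# Illustrative only: register the out-of-tree architecture first, then build the engine.
# "your_code:YourModelForCausalLM" and the model path are placeholders.
ModelRegistry.register_model(
    "YourModelForCausalLM",
    "your_code:YourModelForCausalLM",
)

llm = LLM(model="/path/to/your-model-checkpoint")
```

Depending on how vLLM spawns its worker processes, the `register()` plugin entry point shown above may be required instead of inline registration.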

12 changes: 11 additions & 1 deletion docs/contributing/model/transcription.md
@@ -15,6 +15,7 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).

??? code "supported_languages and supports_transcription_only"

```python
from typing import ClassVar, Mapping, Literal
import numpy as np
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model:

??? code "get_speech_to_text_config()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:

??? code "get_generation_prompt()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:

??? code "get_generation_prompt()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper):

??? code "validate_language()"

```python
@classmethod
def validate_language(cls, language: str | None) -> str | None:
if language is None:
logger.warning(
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
"Defaulting to language='en'. If you wish to transcribe "
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en"
return super().validate_language(language)
```
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics:

??? code "get_num_audio_tokens()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic:

??? code "_preprocess_speech_to_text()"

```python
# vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...):
4 changes: 2 additions & 2 deletions docs/deployment/frameworks/cerebrium.md
@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference

??? console "Command"

```python
```bash
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \
@@ -81,7 +81,7 @@ You should get a response like:

??? console "Response"

```python
```json
{
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": {
4 changes: 2 additions & 2 deletions docs/deployment/frameworks/dstack.md
@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:

client = OpenAI(
base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
)

completion = client.chat.completions.create(
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.",
}
]
],
)

print(completion.choices[0].message.content)
2 changes: 1 addition & 1 deletion docs/deployment/frameworks/haystack.md
@@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs = {"max_tokens": 512}
generation_kwargs={"max_tokens": 512},
)

response = generator.run(