[Hybrid]: Decouple Logical Block Size from Physical Page Size #24486
@@ -361,16 +361,34 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             block_size=model_config.max_model_len,
         ).page_size_bytes

-        # some attention backends (e.g. FA) only support setting
-        # block size to multiple of 16, so let's suggest a value
-        # that would work (note: FA is currently not compatible
-        # with mamba layers, use FlashInfer instead).
-        attn_block_size = 16 * cdiv(mamba_page_size,
-                                    16 * attn_page_size_1_token)
+        # The model may be marked as is_hybrid, but mamba
+        # may be skipped via config; in that case,
+        # return directly.
+        if mamba_page_size == 0:
+            return
+
+        # Attention backend constraints:
+        # - FlashAttention (FA) requires the block size to be a multiple of 16
+        # - MLA (Multi-head Latent Attention) requires larger alignment:
+        #   * CUTLASS_MLA backend: block size must be a multiple of 128
+        #   * Other MLA backends: block size must be a multiple of 64
+        if model_config.use_mla:
+            use_cutlass_mla = (envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA")
+            kernel_block_alignment_size = 128 if use_cutlass_mla else 64
+        else:
+            kernel_block_alignment_size = 16
Comment on lines +375 to +379: Is there a reason we can't get this info from the attention backend directly?

Comment on lines +378 to +379: I think we also need to take FlashInfer into account here. If FlashInfer uses TRTLLM (the default on Blackwell), then the block size cannot be more than 128. Is there any way we could get this info from the attention backend directly? Or can't we, because we are at init time here?
+
+        # Calculate minimum attention block size that satisfies both:
+        # 1. Backend alignment requirements (kernel_block_alignment_size)
+        # 2. Mamba page size compatibility (attn_page_size >= mamba_page_size)
+        attn_block_size = kernel_block_alignment_size * cdiv(
+            mamba_page_size,
+            kernel_block_alignment_size * attn_page_size_1_token)

         # override attention block size if either (a) the
         # user has not set it or (b) the user has set it
         # too small.
         #
         if (cache_config.block_size is None
                 or cache_config.block_size < attn_block_size):
             cache_config.block_size = attn_block_size
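To make the rounding above concrete, here is a standalone sketch with made-up sizes (the byte values are purely illustrative and not taken from any real model; the local `cdiv` mirrors vLLM's ceiling-division helper):

```python
def cdiv(a: int, b: int) -> int:
    # Ceiling division; mirrors vllm.utils.cdiv.
    return -(a // -b)

# Hypothetical sizes, chosen only to make the rounding visible.
attn_page_size_1_token = 2048        # bytes of attention KV cache per token
mamba_page_size = 400_000            # bytes of mamba state per sequence
kernel_block_alignment_size = 16     # FlashAttention-style alignment

attn_block_size = kernel_block_alignment_size * cdiv(
    mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token)

print(attn_block_size)  # 208: smallest multiple of 16 with 208 * 2048 >= 400_000
assert attn_block_size % kernel_block_alignment_size == 0
assert attn_block_size * attn_page_size_1_token >= mamba_page_size
# One alignment step smaller would no longer cover the mamba page:
assert (attn_block_size - kernel_block_alignment_size) * attn_page_size_1_token < mamba_page_size
```

With `kernel_block_alignment_size = 128` (the CUTLASS_MLA case) the same numbers would round up to 256 instead of 208.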
@@ -128,7 +128,11 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:

         # TODO(lucas): handle this more gracefully
         # Note: model_config may be None during testing
-        if model_config is not None and model_config.use_mla:
+        # Note: block_size is initialized in
+        # HybridAttentionMambaModelConfig.verify_and_update_config
+        # and doesn't need to be reinitialized here
Comment on lines +131 to +133: This statement is true for hybrid models only, right?
+        if model_config is not None and model_config.use_mla \
+                and cache_config.block_size is not None:
             # If `VLLM_ATTENTION_BACKEND` is not set and we are using MLA,
             # then we default to FlashMLA backend for non-blackwell GPUs,
             # else we default to CutlassMLA. For each case, we force the
@@ -159,17 +163,18 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:

         from vllm.attention.ops.flashmla import is_flashmla_supported
         if use_flashmla and is_flashmla_supported()[0] \
-                and cache_config.block_size != 64:
+                and cache_config.block_size % 64 != 0:
             cache_config.block_size = 64
             logger.info(
                 "Forcing kv cache block size to 64 for FlashMLA backend.")

-        if use_cutlass_mla and cache_config.block_size != 128:
+        if use_cutlass_mla and cache_config.block_size % 128 != 0:
             cache_config.block_size = 128
             logger.info("Forcing kv cache block size to 128 for "
                         "CUTLASS_MLA backend.")

-        if use_flashinfer_mla and cache_config.block_size not in [32, 64]:
+        if use_flashinfer_mla and cache_config.block_size != 32 and \
+                cache_config.block_size % 64 != 0:
             cache_config.block_size = 64
             logger.info(
                 "Forcing kv cache block size to 64 for FlashInferMLA "
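The three checks above move from exact equality to divisibility, so a block size that the hybrid config has already rounded up (e.g. to a multiple of 64 or 128) is no longer clobbered. A minimal sketch of the FlashMLA case, using a hypothetical helper name rather than the actual config path:

```python
def normalize_flashmla_block_size(block_size: int) -> int:
    # Hypothetical helper mirroring the relaxed check above.
    # Old behaviour forced anything != 64 back to 64; the new check
    # only forces block sizes that are not 64-aligned.
    return block_size if block_size % 64 == 0 else 64

assert normalize_flashmla_block_size(64) == 64
assert normalize_flashmla_block_size(192) == 192   # previously reset to 64
assert normalize_flashmla_block_size(208) == 64    # not 64-aligned, still reset
```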
@@ -237,10 +242,10 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,

         use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or (
             selected_backend is None and cls.is_device_capability(100)
-            and block_size == 128)
+            and block_size % 128 == 0)
         use_flashinfermla = selected_backend == _Backend.FLASHINFER_MLA or (
-            selected_backend is None and cls.is_device_capability(100)
-            and block_size in [32, 64])
+            selected_backend is None and cls.is_device_capability(100) and
+            (block_size == 32 or block_size % 64 == 0))
         use_flashmla = selected_backend == _Backend.FLASHMLA or (
             selected_backend is None and is_flashmla_supported()[0])
         use_flashattn = selected_backend == _Backend.FLASH_ATTN_MLA or (
@@ -260,7 +265,7 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
             return ("vllm.v1.attention.backends.mla."
                     "flashinfer_mla.FlashInferMLABackend")
         if use_flashmla:
-            if block_size != 64:
+            if block_size % 64 != 0:
                 logger.warning(
                     "FlashMLA backend is not supported for block size %d"
                     " (currently only supports block size 64).",
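With divisibility in the selection predicates, a hybrid model whose computed block size is, say, 256 can still auto-select CUTLASS_MLA on a capability-100 device. A simplified sketch of the resulting auto-selection when `selected_backend` is `None`, assuming the CUTLASS_MLA flag is consulted first (which matches the order the flags are checked later in the file); the function name and string return values are illustrative, not vLLM's API:

```python
def pick_mla_backend_on_blackwell(block_size: int) -> str:
    # Simplified: assumes device capability 100 and no user-selected backend.
    if block_size % 128 == 0:
        return "CUTLASS_MLA"
    if block_size == 32 or block_size % 64 == 0:
        return "FLASHINFER_MLA"
    return "FLASHMLA"  # fallback; warns later if block_size % 64 != 0

assert pick_mla_backend_on_blackwell(256) == "CUTLASS_MLA"
assert pick_mla_backend_on_blackwell(192) == "FLASHINFER_MLA"
assert pick_mla_backend_on_blackwell(48) == "FLASHMLA"
```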
@@ -17,7 +17,7 @@

 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionType)
+                                              AttentionType, MultipleOf)
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -154,6 +154,10 @@ def get_supported_head_sizes(cls) -> list[int]:
         # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
         return [64, 128, 256]

+    @staticmethod
+    def get_supported_block_size() -> list[Union[int, MultipleOf]]:
+        return [MultipleOf(1)]
Comment on lines +158 to +159: I think we need to take into account here whether FlashInfer is using TRTLLM (it has two different paths). When TRTLLM is enabled, there are additional constraints on the block size (it can't be larger than 128).
+
     @classmethod
     def validate_head_size(cls, head_size: int) -> None:
         supported_head_sizes = cls.get_supported_head_sizes()
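The new `get_supported_block_size()` hook lets each backend advertise its block-size constraint either as exact values or as a `MultipleOf` wrapper. The diff does not show how callers consume the list, so the following is only a sketch of one plausible check, with `MultipleOf` modelled as a simple dataclass (the real class lives in `vllm.attention.backends.abstract` and may differ):

```python
from dataclasses import dataclass
from typing import Union


@dataclass(frozen=True)
class MultipleOf:
    # Assumed shape: valid block sizes must be a multiple of `base`.
    base: int


def block_size_ok(block_size: int,
                  supported: list[Union[int, MultipleOf]]) -> bool:
    for entry in supported:
        if isinstance(entry, MultipleOf):
            if block_size % entry.base == 0:
                return True
        elif block_size == entry:
            return True
    return False


assert block_size_ok(208, [MultipleOf(1)])   # FlashInfer-style: any block size
assert block_size_ok(128, [128])             # CutlassMLA-style exact value
assert not block_size_ok(256, [128])         # an exact reading rejects multiples
```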
@@ -8,6 +8,7 @@

 import vllm._custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
+                                              MultipleOf,
                                               is_quantized_kv_cache)
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
@@ -39,6 +40,10 @@ def get_impl_cls() -> type["CutlassMLAImpl"]:
     def get_builder_cls() -> type["CutlassMLAMetadataBuilder"]:
         return CutlassMLAMetadataBuilder

+    @staticmethod
+    def get_supported_block_size() -> list[Union[int, MultipleOf]]:
+        return [128]
Comment on lines +44 to +45: Do I understand correctly that this means the block size has to be exactly 128? Or can it be a multiple? If the latter, should we return MultipleOf(128) instead?

Comment on lines +44 to +45: I ask because the changes in platform/cuda.py imply that it can be a multiple of 128.
+

 class SM100Workspace:
Review comment: which model?