From 95df807efffd8e8460a39590a8ce83b97d9ec10e Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Fri, 19 Sep 2025 13:49:22 -0700 Subject: [PATCH] allow disabling FlashInfer prefill Summary: FlashInfer prefill on GB200 is not compatible with CutlassMLA FP8, so allow disabling it for now by setting VLLM_DISABLE_FLASHINFER_PREFILL=1. Differential Revision: D81994905 Signed-off-by: Lu Fang --- vllm/envs.py | 3 +++ vllm/v1/attention/backends/mla/common.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 19e2f8635275..294a0b920fb7 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -32,6 +32,7 @@ VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_NO_USAGE_STATS: bool = False + VLLM_DISABLE_FLASHINFER_PREFILL: bool = False VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 @@ -479,6 +480,8 @@ def get_vllm_port() -> Optional[int]: lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DISABLE_FLASHINFER_PREFILL": + lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1", "VLLM_DO_NOT_TRACK": lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( "DO_NOT_TRACK", None) or "0") == "1", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a990cb2f1a97..5b307810de93 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -412,7 +412,8 @@ def __post_init__(self): def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. - return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available + and not envs.VLLM_USE_CUDNN_PREFILL and current_platform.is_device_capability(100))