Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion examples/offline_inference/qwen_1m.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from vllm import LLM, SamplingParams

os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"


Expand Down
5 changes: 3 additions & 2 deletions tests/compile/test_fusion_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,9 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
[7, 256, 533] if current_platform.is_cuda() else [8])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("model_name, model_class", MODELS)
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
current_platform.is_cuda() else [_Backend.ROCM_FLASH])
@pytest.mark.parametrize("backend",
[_Backend.FLASHINFER] if current_platform.is_cuda()
else [_Backend.TRITON_ATTN_VLLM_V1])
@pytest.mark.parametrize(
"split_attention",
[False, True] if current_platform.is_rocm() else [False])
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/attention/test_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

from vllm.attention.backends.xformers import _make_alibi_bias
from tests.kernels.utils import make_alibi_bias

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
Expand Down Expand Up @@ -429,8 +429,8 @@ def test_multi_query_kv_attention(
alibi_bias = None
if use_alibi:
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
seq_lens)
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
seq_lens)
output = torch.empty_like(query)
start = 0
# Dynamic sequence length not supported with custom attn_bias.
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/attention/test_attention_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def generate_params():
return params


@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
@pytest.mark.parametrize("device, name, use_mla, block_size",
generate_params())
def test_env(
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/attention/test_prefix_prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask

from vllm.attention.backends.xformers import _make_alibi_bias
from tests.kernels.utils import make_alibi_bias
from vllm.attention.ops.chunked_prefill_paged_decode import (
chunked_prefill_paged_decode)
from vllm.attention.ops.prefix_prefill import context_attention_fwd
Expand Down Expand Up @@ -470,7 +470,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
key = key.unsqueeze(0)
value = value.unsqueeze(0)

attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
output_ref = torch.empty_like(output)
seq_start = 0
query_start = 0
Expand All @@ -479,7 +479,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
# FIXME(DefTruth): Because xformers does not support dynamic sequence
# lengths with custom attention bias, we process each prompt one by
# one. This is inefficient, especially when we have many short prompts.
# modified from: vllm/attention/backends/xformers.py#L343
# modified from: vllm/v1/attention/backends/xformers.py#L343
for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
seq_end = seq_start + seq_len
query_end = query_start + query_len
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/attention/test_rocm_attention_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def clear_cache():
_cached_get_attn_backend.cache_clear()


@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
def test_selector(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
Expand Down
66 changes: 56 additions & 10 deletions tests/kernels/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,10 +513,6 @@ def make_backend(backend_name: str) -> AttentionBackend:
Construct the backend instance determined by the backend_name string
argument.

"XFORMERS" -> construct xformers backend

TODO: other backends

Note: at time of writing the Attention wrapper automatically selects
its own backend for Attention.forward(); so the backend instance which
you generate with this function is not meant to be used for *running*
Expand All @@ -528,18 +524,68 @@ def make_backend(backend_name: str) -> AttentionBackend:

* Backend instance
'''
if backend_name == STR_XFORMERS_ATTN_VAL:
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
from vllm.attention.backends.xformers import XFormersBackend
return XFormersBackend()
elif backend_name == STR_FLASH_ATTN_VAL:
from vllm.attention.backends.flash_attn import FlashAttentionBackend
if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
from vllm.v1.attention.backends.xformers import (
XFormersAttentionBackend)
return XFormersAttentionBackend()
if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
return FlashAttentionBackend()
if backend_name == "TRITON_ATTN_VLLM_V1":
from vllm.v1.attention.backends.triton_attn import (
TritonAttentionBackend)
return TritonAttentionBackend()
if backend_name == "FLEX_ATTENTION":
from vllm.v1.attention.backends.flex_attention import (
FlexAttentionBackend)
return FlexAttentionBackend()
if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
return TorchSDPABackend()
if backend_name == "FLASHINFER":
from vllm.v1.attention.backends.flashinfer import FlashInferBackend
return FlashInferBackend()

raise AssertionError(
f"Unrecognized backend_name {backend_name} for unit test")


def make_alibi_bias(
alibi_slopes: torch.Tensor,
num_kv_heads: int,
dtype: torch.dtype,
seq_lens: list[int],
) -> list[Any]:
"""Create ALiBi biases compatible with xFormers attention tests."""
from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias

if alibi_slopes is None:
return [None for _ in seq_lens]

attn_biases: list[Any] = []
num_heads = alibi_slopes.shape[0]
assert num_heads >= num_kv_heads, (
"ALiBi slopes expect at least as many heads as KV heads")

for seq_len in seq_lens:
bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
bias = bias[None, :] - bias[:, None]

padded_len = (seq_len + 7) // 8 * 8
bias_tensor = torch.empty(
1,
num_heads,
seq_len,
padded_len,
device=alibi_slopes.device,
dtype=dtype,
)[:, :, :, :seq_len].copy_(bias)
bias_tensor.mul_(alibi_slopes[:, None, None])
attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor))

return attn_biases


def _make_metadata_tensors(
seq_lens: Optional[list[int]],
context_lens: Optional[list[int]],
Expand Down
5 changes: 2 additions & 3 deletions tests/models/test_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,8 @@ def _initialize_kv_caches_v1(self, vllm_config):
return

if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
# Phi4FlashForCausalLM and MotifForCausalLM
# only supports DIFFERENTIAL_FLASH_ATTN backend
m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
pytest.skip(
"Differential Flash Attention backend has been removed.")
if model_arch == "GptOssForCausalLM":
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
Expand Down
Loading