388 changes: 388 additions & 0 deletions tests/kernels/test_fla_layernorm_guard.py
@@ -0,0 +1,388 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch
import torch.nn.functional as F

from vllm.model_executor.layers.fla.ops.layernorm_guard import (
layer_norm_fwd,
layernorm_fn,
rms_norm_ref,
)
from vllm.platforms import current_platform


def layer_norm_ref(
x,
weight,
bias,
z=None,
eps=1e-6,
group_size=None,
norm_before_gate=True,
is_rms_norm=False,
):
"""Reference implementation for both layer norm and RMS norm."""
if is_rms_norm:
# Use the imported rms_norm_ref for RMS norm cases
return rms_norm_ref(
x,
weight,
bias,
z=z,
eps=eps,
group_size=group_size,
norm_before_gate=norm_before_gate,
upcast=True,
)

# Layer norm implementation
dtype = x.dtype
x = x.float()
weight = weight.float()
bias = bias.float() if bias is not None else None
z = z.float() if z is not None else None

if z is not None and not norm_before_gate:
x = x * F.silu(z)

if group_size is None:
# Layer norm: subtract mean
mean = x.mean(dim=-1, keepdim=True)
var = ((x - mean).square()).mean(dim=-1, keepdim=True)
rstd = 1 / torch.sqrt(var + eps)
out = (x - mean) * rstd * weight
if bias is not None:
out = out + bias
else:
# Group norm
from einops import rearrange

x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
mean = x_group.mean(dim=-1, keepdim=True)
var = ((x_group - mean).square()).mean(dim=-1, keepdim=True)
rstd = 1 / torch.sqrt(var + eps)
x_group = (x_group - mean) * rstd
out = rearrange(x_group, "... g d -> ... (g d)") * weight
if bias is not None:
out = out + bias

if z is not None and norm_before_gate:
out *= F.silu(z)

return out.to(dtype)
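
# For reference, the math exercised by the reference paths above (a restatement
# of the code, not a separate implementation; the RMS case delegates to the
# imported rms_norm_ref):
#   LayerNorm: y = (x - mean(x)) / sqrt(var(x) + eps) * weight + bias
#   RMSNorm:   y = x / sqrt(mean(x**2) + eps) * weight (+ bias, if provided)
#   Gating:    out *= silu(z) after the norm when norm_before_gate=True,
#              x   *= silu(z) before the norm when norm_before_gate=False
#   Group mode normalizes each contiguous block of `group_size` channels
#   independently, then applies the full-width weight/bias.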


DTYPES = [torch.bfloat16, torch.float32]
# Test various M sizes to ensure rows_per_block logic works correctly
NUM_TOKENS = [
1,
7,
16,
63,
128,
256,
512,
1024,
2048,
4096,
5789,
8189,
8191,
16383,
32767,
]
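# The larger values above (8189, 8191, 16383, 32767) sit just below powers of
# two and, together with the odd sizes, are intended to hit boundary cases in
# the kernel's rows_per_block partitioning.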
HIDDEN_SIZES = [64, 128, 256, 1024]
GROUP_SIZES = [None, 64, 128] # None means full hidden size
NORM_BEFORE_GATE = [True, False]
IS_RMS_NORM = [True, False]
SEEDS = [0, 42]


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_basic(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
seed: int,
is_rms_norm: bool,
) -> None:
"""Test basic layer norm forward pass without z (gate) tensor."""
current_platform.seed_everything(seed)
device = torch.device("cuda:0")

# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

# Run the triton kernel
out, mean, rstd = layer_norm_fwd(
x, weight, bias, eps, z=None, is_rms_norm=is_rms_norm
)

# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=is_rms_norm)

# Check outputs
assert out.shape == x.shape
assert out.dtype == x.dtype
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

# Check mean and rstd shapes
if not is_rms_norm:
assert mean.shape == (num_tokens,)
assert rstd.shape == (num_tokens,)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", [128, 256, 1024])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("norm_before_gate", NORM_BEFORE_GATE)
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_with_gate(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
norm_before_gate: bool,
is_rms_norm: bool,
) -> None:
"""Test layer norm forward pass with z (gate) tensor."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")

# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
z = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

# Run the triton kernel
out, mean, rstd = layer_norm_fwd(
x,
weight,
bias,
eps,
z=z,
norm_before_gate=norm_before_gate,
is_rms_norm=is_rms_norm,
)

# Run reference implementation
ref_out = layer_norm_ref(
x,
weight,
bias,
z=z,
eps=eps,
norm_before_gate=norm_before_gate,
is_rms_norm=is_rms_norm,
)

# Check outputs
assert out.shape == x.shape
assert out.dtype == x.dtype
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)


@pytest.mark.parametrize("num_tokens", [128, 512])
@pytest.mark.parametrize("hidden_size", [512, 1024])
@pytest.mark.parametrize("group_size", [64, 128, 256])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
@torch.inference_mode()
def test_layer_norm_fwd_with_groups(
num_tokens: int,
hidden_size: int,
group_size: int,
dtype: torch.dtype,
is_rms_norm: bool,
) -> None:
"""Test layer norm forward pass with group normalization."""
if hidden_size % group_size != 0:
pytest.skip(
f"hidden_size {hidden_size} not divisible by group_size {group_size}"
)

current_platform.seed_everything(42)
device = torch.device("cuda:0")

# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

ngroups = hidden_size // group_size

# Run the triton kernel
out, mean, rstd = layer_norm_fwd(
x, weight, bias, eps, z=None, group_size=group_size, is_rms_norm=is_rms_norm
)

# Run reference implementation
ref_out = layer_norm_ref(
x, weight, bias, z=None, eps=eps, group_size=group_size, is_rms_norm=is_rms_norm
)

# Check outputs
assert out.shape == x.shape
assert out.dtype == x.dtype
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

# With groups, mean/rstd hold one statistic per (row, group), flattened to ngroups * num_tokens
if not is_rms_norm:
assert mean.shape == (ngroups * num_tokens,)
assert rstd.shape == (ngroups * num_tokens,)


@pytest.mark.parametrize("num_tokens", [7, 63, 128, 513, 1024, 2049])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_layer_norm_rows_per_block(
num_tokens: int,
dtype: torch.dtype,
) -> None:
"""Test that rows_per_block logic works correctly for various M sizes."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
hidden_size = 1024

# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

# Run the triton kernel
out, mean, rstd = layer_norm_fwd(x, weight, bias, eps, z=None, is_rms_norm=False)

# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)

# Check outputs
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)


@pytest.mark.parametrize("dtype", [torch.bfloat16])
@torch.inference_mode()
def test_strided_input(dtype: torch.dtype) -> None:
"""Test that the kernel handles non-contiguous (strided)
inputs correctly."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
num_tokens = 128
hidden_size = 1024

# Create a larger tensor and take a strided slice
x_large = torch.randn(num_tokens, hidden_size * 2, dtype=dtype, device=device)
x = x_large[:, :hidden_size]

# Make it contiguous for the kernel
x_contiguous = x.contiguous()

weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

# Run the triton kernel with contiguous input
out, mean, rstd = layer_norm_fwd(
x_contiguous, weight, bias, eps, z=None, is_rms_norm=False
)

# Run reference implementation
ref_out = layer_norm_ref(
x_contiguous, weight, bias, z=None, eps=eps, is_rms_norm=False
)

# Check outputs
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)


@pytest.mark.parametrize("num_tokens", [1, 128, 2048])
@pytest.mark.parametrize("hidden_size", [768, 4096])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_output_buffer_provided(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
) -> None:
"""Test that the kernel works when an output buffer is provided."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")

# Create inputs
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

# Pre-allocate output buffer
out_buffer = torch.empty_like(x)

# Run the triton kernel with provided output
out, mean, rstd = layer_norm_fwd(
x, weight, bias, eps, z=None, out=out_buffer, is_rms_norm=False
)

# Check that the provided buffer was used
assert out.data_ptr() == out_buffer.data_ptr()

# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)

# Check outputs
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)


@pytest.mark.parametrize(
"shape",
[
(4, 16, 1024), # 3D tensor
(2, 8, 512, 256), # 4D tensor
],
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_multidimensional_input(
shape: tuple,
dtype: torch.dtype,
) -> None:
"""Test that the autograd function handles multidimensional inputs."""
current_platform.seed_everything(42)
device = torch.device("cuda:0")
hidden_size = shape[-1]

# Create inputs
x = torch.randn(*shape, dtype=dtype, device=device)
weight = torch.randn(hidden_size, dtype=dtype, device=device)
bias = torch.randn(hidden_size, dtype=dtype, device=device)
eps = 1e-6

# Run through autograd function
out = layernorm_fn(x, weight, bias, z=None, eps=eps)

# Run reference implementation
ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)

# Check outputs
assert out.shape == x.shape
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)


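# The direct calls below are a quick manual smoke test; the full parametrized
# matrix is meant to run under pytest on a machine with a CUDA device, e.g.:
#   pytest tests/kernels/test_fla_layernorm_guard.py -v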
if __name__ == "__main__":
# Run a quick smoke test
test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False)
test_layer_norm_fwd_with_gate(128, 1024, torch.float16, True, False)
test_layer_norm_rows_per_block(513, torch.float16)
print("All smoke tests passed!")