# In [None]:
import triton
import triton.language as tl
import torch

# Define the Triton kernel
@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Add one block of elements: ``output[i] = x[i] + y[i]``.

    Each program instance covers the ``BLOCK_SIZE`` consecutive flat indices
    that start at ``program_id * BLOCK_SIZE``. Indices at or beyond
    ``n_elements`` are masked out of both loads and the store.
    """
    # Flat element indices handled by this program instance.
    idx = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)

    # Lanes that fall past the end of the data must not touch memory.
    in_bounds = idx < n_elements

    lhs = tl.load(x_ptr + idx, mask=in_bounds)
    rhs = tl.load(y_ptr + idx, mask=in_bounds)

    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)

# Function to launch the kernel
def vector_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Return ``x + y`` computed element-wise by the ``add_kernel`` Triton kernel.

    Args:
        x: First operand.
        y: Second operand; must have the same shape and device as ``x``.

    Returns:
        A new contiguous tensor with the shape, dtype and device of ``x``.

    Raises:
        AssertionError: If ``x`` and ``y`` have different shapes.
        ValueError: If ``x`` and ``y`` are on different devices.
    """
    # Raised explicitly rather than via `assert` so the check survives
    # `python -O`; with mismatched shapes the kernel could read past the end
    # of the smaller input.
    if x.shape != y.shape:
        raise AssertionError("Input tensors must have the same shape")
    if x.device != y.device:
        raise ValueError(
            f"Input tensors must be on the same device, got {x.device} and {y.device}"
        )

    # The kernel addresses every buffer as flat, densely packed memory
    # (`ptr + offsets`). Strided/transposed views, or two inputs with different
    # layouts, would otherwise pair up the wrong elements or read outside the
    # view. `.contiguous()` returns the tensor itself when it is already
    # contiguous, so the common case pays no copy.
    x = x.contiguous()
    y = y.contiguous()
    output = torch.empty_like(x, memory_format=torch.contiguous_format)

    # Number of elements
    n_elements = x.numel()
    if n_elements == 0:
        # Nothing to compute; skip the kernel launch entirely.
        return output

    # Define block size (tune this based on your GPU)
    BLOCK_SIZE = 1024

    # One program instance per BLOCK_SIZE chunk, rounding up so the tail is
    # covered; the kernel masks the lanes beyond n_elements.
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)

    # Launch the kernel
    add_kernel[grid](
        x_ptr=x,
        y_ptr=y,
        output_ptr=output,
        n_elements=n_elements,
        BLOCK_SIZE=BLOCK_SIZE
    )

    return output

# Test the kernel
def test_vector_add():
    """Check ``vector_add`` against PyTorch's own addition on CUDA tensors.

    Requires a CUDA device. Several sizes are used on purpose: 4096 is an
    exact multiple of the 1024-element block used by the launcher, while the
    others leave a partial final block (or are smaller than one block), so the
    kernel's out-of-bounds mask is actually exercised.

    Raises:
        AssertionError: If any Triton result differs from the PyTorch result.
    """
    for size in (4096, 4099, 1000, 1):
        # Create sample input tensors on GPU
        x = torch.rand(size, device='cuda')
        y = torch.rand(size, device='cuda')

        # Run Triton kernel
        triton_output = vector_add(x, y)

        # Verify with PyTorch addition
        torch_output = x + y

        # Check if results match (within floating-point tolerance)
        assert torch.allclose(triton_output, torch_output, atol=1e-6), "Results don't match!"

    print("Test passed! Triton output matches PyTorch output.")

# Run the self-test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_vector_add()