## Load Dataset

In [1]:
from datasets import load_dataset

# Specify the dataset name and the cache directory
dataset_name = "SakanaAI/AI-CUDA-Engineer-Archive"
cache_dir = "./cache_dir"

# Load the dataset
dataset = load_dataset(dataset_name, cache_dir=cache_dir)

# Print the dataset to verify
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    level_1: Dataset({
        features: ['Op_Name', 'Level_ID', 'Task_ID', 'Kernel_Name', 'CUDA_Runtime', 'PyTorch_Native_Runtime', 'PyTorch_Compile_Runtime', 'CUDA_Speedup_Native', 'CUDA_Speedup_Compile', 'CUDA_Code', 'PyTorch_Code_Module', 'PyTorch_Code_Functional', 'Correct', 'Max_Diff', 'Error', 'NCU_Profile', 'Torch_Profile', 'Clang_Tidy', '__index_level_0__'],
        num_rows: 12157
    })
    level_2: Dataset({
        features: ['Op_Name', 'Level_ID', 'Task_ID', 'Kernel_Name', 'CUDA_Runtime', 'PyTorch_Native_Runtime', 'PyTorch_Compile_Runtime', 'CUDA_Speedup_Native', 'CUDA_Speedup_Compile', 'CUDA_Code', 'PyTorch_Code_Module', 'PyTorch_Code_Functional', 'Correct', 'Max_Diff', 'Error', 'NCU_Profile', 'Torch_Profile', 'Clang_Tidy', '__index_level_0__'],
        num_rows: 12938
    })
    level_3: Dataset({
        features: ['Op_Name', 'Level_ID', 'Task_ID', 'Kernel_Name', 'CUDA_Runtime', 'PyTorch_Native_Runtime', 'PyTorch_Compile_Runtime', 'CUDA_Speedup_Native', '

In [2]:
df_l1 = dataset["level_1"].to_pandas()

In [3]:
print(df_l1.iloc[0][["CUDA_Code"]].item())

#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#define TILE_SIZE 16

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_FLOAT(x) TORCH_CHECK(x.scalar_type() == torch::kFloat32, #x " must be a float32 tensor")

__global__ void matmul_tiled_kernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int N) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = blockIdx.y * TILE_SIZE + ty;
    int col = blockIdx.x * TILE_SIZE + tx;

    float C_value = 0.0f;

    for (int m = 0; m < (N + TILE_SIZE - 1) / TILE_SIZE; ++m) {
        // Load tiles into shared memory
        if (row < N && m * TILE_SIZE + tx < N)
    

In [4]:
print(df_l1.iloc[0][["PyTorch_Code_Functional"]].item())

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single square matrix multiplication (C = A * B).

    Args:
        A (torch.Tensor): Input matrix A of shape (N, N).
        B (torch.Tensor): Input matrix B of shape (N, N).

    Returns:
        torch.Tensor: Output matrix C of shape (N, N).
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single square matrix multiplication (C = A * B)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


N = 2048


def get_inputs():
    A = torch.randn(N, N)
    B = torch.randn(N, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed



In [5]:
# Evaluation script for CUDA kernel
# 95_CrossEntropyLoss
# Evaluation script for CUDA kernel
# 12_Matmul_with_diagonal_matrices_
import os
import torch
import argparse
from torch.utils.cpp_extension import load
from torch.utils._pytree import tree_map
import importlib.util
from torch.utils.benchmark import Timer


def easy_to_device(pytree, device):
    return tree_map(
        lambda x: x.to(device) if isinstance(x, torch.Tensor) else x, pytree
    )


def load_module_from_path(path):
    spec = importlib.util.spec_from_file_location("module", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--op_atol", type=float, default=1e-3)
    parser.add_argument("--op_rtol", type=float, default=1e-1)
    parser.add_argument("--rep_time", type=int, default=10000)
    parser.add_argument("--warmup_time", type=int, default=25)
    args = parser.parse_args()

    # Get task name from info.txt
    with open("task/info.txt", "r") as f:
        task_name = f.readline().strip()
        task_name = "_".join(task_name.split("_")[1:])  # Remove problem ID

    # Import the task module
    task_files = [f for f in os.listdir("task") if f.endswith("_functional.py")]
    if not task_files:
        raise RuntimeError("No functional task file found")

    task = load_module_from_path(os.path.join("task", task_files[0]))

    # Initialize model and inputs
    device_1 = torch.device("cuda:0")
    device_2 = torch.device("cuda:1")
    torch.manual_seed(0)
    inputs = task.get_inputs()
    init_inputs = task.get_init_inputs()
    model = task.Model(*init_inputs)

    # Load CUDA kernel
    kernel_files = [f for f in os.listdir("kernel") if f.endswith(".cu")]
    if not kernel_files:
        raise RuntimeError("No CUDA kernel file found")

    cuda_module = load(
        name=task_name,
        sources=[os.path.join("kernel", kernel_files[0])],
        extra_cuda_cflags=["-O3", "--use_fast_math"],
        with_cuda=True,
        verbose=True,
    )

    # Test for correctness
    with torch.no_grad():
        cuda_output = model.to(device_1)(
            *easy_to_device(inputs, device_1), fn=cuda_module.forward
        )
        torch_output = model.to(device_2)(
            *easy_to_device(inputs, device_2), fn=task.module_fn
        )

    correct = torch.allclose(
        torch_output.cpu(),
        cuda_output.cpu(),
        rtol=args.op_rtol,
        atol=args.op_atol,
    )
    max_diff = torch.max(torch.abs(torch_output.cpu() - cuda_output.cpu())).item()
    print(f"Tested CUDA kernel - Correct: {correct}, Max Diff: {max_diff}")

    if correct:
        # Evaluate CUDA kernel performance
        cuda_timer = Timer(
            stmt="model(*inputs, fn=cuda_module.forward)",
            globals={
                "model": model.to(device_1),
                "inputs": easy_to_device(inputs, device_1),
                "cuda_module": cuda_module,
            },
        )
        cuda_runtime = cuda_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated CUDA kernel - Runtime: {cuda_runtime:.3f} ms")

        # Evaluate PyTorch baseline performance
        torch_timer = Timer(
            stmt="model(*inputs, fn=task.module_fn)",
            globals={
                "model": model.to(device_2),
                "inputs": easy_to_device(inputs, device_2),
                "task": task,
            },
        )
        torch_runtime = torch_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated PyTorch baseline - Runtime: {torch_runtime:.3f} ms")

        # Evaluate torch compile performance
        torch_fn = task.module_fn
        compile_fn = torch.compile(torch_fn, mode="max-autotune")
        torch_compile_timer = Timer(
            stmt="model(*inputs, fn=compile_fn)",
            globals={
                "model": model.to(device_2),
                "inputs": easy_to_device(inputs, device_2),
                "compile_fn": compile_fn,
            },
        )

        torch_compile_runtime = torch_compile_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated torch compile - Runtime: {torch_compile_runtime:.3f} ms")

        print(f"Speedup over PyTorch: {torch_runtime/cuda_runtime:.2f}x")
        print(f"Speedup over torch compile: {torch_compile_runtime/cuda_runtime:.2f}x")

        import json

        # Store the speedup times as a json file
        file_path = os.path.join(os.path.dirname(__file__), "speedup_times.json")
        with open(file_path, "w") as f:
            json.dump(
                {
                    "max_diff": max_diff,
                    "cuda_runtime": cuda_runtime,
                    "torch_runtime": torch_runtime,
                    "torch_compile_runtime": torch_compile_runtime,
                    "speedup_over_pytorch": torch_runtime / cuda_runtime,
                    "speedup_over_torch_compile": torch_compile_runtime / cuda_runtime,
                },
                f,
            )
        print(f"Speedup times stored in {file_path}")


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--op_atol OP_ATOL] [--op_rtol OP_RTOL]
                             [--rep_time REP_TIME] [--warmup_time WARMUP_TIME]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/abhiv/.local/share/jupyter/runtime/kernel-v396b2fa7030d847950f3f68082040ffefb083a289.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Load Model

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

model_q14 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
model_q7 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
# model_q1 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_q7,torch_dtype="auto", cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_q7, torch_dtype="auto", cache_dir=cache_dir)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.25it/s]


In [27]:
import gc
gc.collect()

import torch
with torch.cuda.device(0):  # explicitly set GPU 0 if needed
    torch.cuda.empty_cache()
    
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [5]:
import torch
device = torch.device('cuda:0')
model = model.to(device)
# tokenizer = tokenizer.to(device)

## Misc Testing

In [9]:
# Define the prompt
prompt = "what is the solution of x^2 - 2x + 1 = 0?<think>"

# prompt = "what is the second planet from the Sun?<think>"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
outputs = model.generate(**inputs, max_length=10_000)

# Decode the generated tokens to get the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

inputs = inputs.to('cpu')

print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


what is the solution of x^2 - 2x + 1 = 0?<think>
To solve the quadratic equation \( x^2 - 2x + 1 = 0 \), I first recognize that it is a perfect square trinomial.

I can rewrite the equation as \( (x - 1)^2 = 0 \).

Taking the square root of both sides gives \( x - 1 = 0 \).

Finally, solving for \( x \) yields \( x = 1 \).
</think>

To solve the quadratic equation:

\[
x^2 - 2x + 1 = 0
\]

we can follow these steps:

1. **Recognize the Perfect Square Trinomial:**

   Notice that the equation can be written as a perfect square:

   \[
   (x - 1)^2 = 0
   \]

2. **Take the Square Root of Both Sides:**

   Taking the square root of both sides gives:

   \[
   x - 1 = 0
   \]

3. **Solve for \( x \):**

   Adding 1 to both sides of the equation:

   \[
   x = 1
   \]

Therefore, the solution to the equation is:

\[
\boxed{1}
\]


## Prompting and Evaluation

In [25]:
df_l1.iloc[0].PyTorch_Code_Functional

'import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\ndef module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:\n    """\n    Performs a single square matrix multiplication (C = A * B).\n\n    Args:\n        A (torch.Tensor): Input matrix A of shape (N, N).\n        B (torch.Tensor): Input matrix B of shape (N, N).\n\n    Returns:\n        torch.Tensor: Output matrix C of shape (N, N).\n    """\n    return torch.matmul(A, B)\n\n\nclass Model(nn.Module):\n    """\n    Simple model that performs a single square matrix multiplication (C = A * B)\n    """\n\n    def __init__(self):\n        super(Model, self).__init__()\n\n    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:\n        return fn(A, B)\n\n\nN = 2048\n\n\ndef get_inputs():\n    A = torch.randn(N, N)\n    B = torch.randn(N, N)\n    return [A, B]\n\n\ndef get_init_inputs():\n    return []  # No special initialization inputs needed\n'

In [45]:
# Define the prompt
prompt = '''Given the following PyTorch code, output code for a CUDA kernel that has the same functionality as 'module_fn'. The output should have no extra text at the beginning or end and no main function. Simply an implementation of the below code. In CUDA, I want the specified function to be called 'forward' that returns a torch Tensor. The end of the file should also look like:

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 
    m.def("forward", &forward, DESCRIPTION); 
}

where DESCRIPTION is a less than 50 character string that describes what operation 'forward' / 'module_fn' does.

Again, the function that implements the CUDA version of 'module_fn' should be called 'forward' and return a torch::Tensor. 

DO EXACTLY WHAT I HAVE TOLD YOU. DO NOT GIVE AN EXPLANATION AS YOUR FINAL OUTPUT, JUST GIVE THE CODE I WANT. Here is the code that contains module_fn:


'''

functional_str = df_l1.iloc[0].PyTorch_Code_Functional
prompt += functional_str
print(prompt)

# Tokenize the input promptb
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
outputs = model.generate(**inputs, max_length=10_000)

# Decode the generated tokens to get the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

inputs = inputs.to('cpu')

# print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Given the following PyTorch code, output code for a CUDA kernel that has the same functionality as 'module_fn'. The output should have no extra text at the beginning or end and no main function. Simply an implementation of the below code. In CUDA, I want the specified function to be called 'forward' that returns a torch Tensor. The end of the file should also look like:

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 
    m.def("forward", &forward, DESCRIPTION); 
}

where DESCRIPTION is a less than 50 character string that describes what operation 'forward' / 'module_fn' does.

Again, the function that implements the CUDA version of 'module_fn' should be called 'forward' and return a torch::Tensor. 

DO EXACTLY WHAT I HAVE TOLD YOU. DO NOT GIVE AN EXPLANATION AS YOUR FINAL OUTPUT, JUST GIVE THE CODE I WANT. Here is the code that contains module_fn:


import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """

In [46]:
print(functional_str[functional_str.find('def module_fn'):functional_str.find('class Model')])

def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single square matrix multiplication (C = A * B).

    Args:
        A (torch.Tensor): Input matrix A of shape (N, N).
        B (torch.Tensor): Input matrix B of shape (N, N).

    Returns:
        torch.Tensor: Output matrix C of shape (N, N).
    """
    return torch.matmul(A, B)





In [53]:
# print(response)

In [54]:
print(response[response.find("</think>")+len("</think>"):])



```
// __global__ __device__ void matmulKernel(const float *A, const float *B, float *C, int N) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     int j = blockIdx.y * blockDim.y + threadIdx.y;
//     if (i >= N || j >= N) return;
//     float sum = 0.0f;
//     for (int k = 0; k < N; ++k) {
//         sum += A[i * N + k] * B[k * N + j];
//     }
//     C[i * N + j] = sum;
// }

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Perform matrix multiplication of two square matrices");
}
```


In [55]:
with open(f"tasks/{df_l1.iloc[0].Op_Name}.py", "w") as f:
    f.write(df_l1.iloc[0].PyTorch_Code_Functional)
with open(f"kernels/{df_l1.iloc[0].Op_Name}.cu", "w") as f:
    f.write(df_l1.iloc[0].CUDA_Code)

File format is:

eval_kernel

task/
- torch nn module.py
- functional.py
- info.txt (dont really need this rn)

kernel/
- kernel.cu 

In [71]:
# Evaluation script for CUDA kernel
# 12_Matmul_with_diagonal_matrices_
# Evaluation script for CUDA kernel
# 12_Matmul_with_diagonal_matrices_
import os
import torch
import argparse
from torch.utils.cpp_extension import load
from torch.utils._pytree import tree_map
import importlib.util
from torch.utils.benchmark import Timer


def easy_to_device(pytree, device):
    return tree_map(
        lambda x: x.to(device) if isinstance(x, torch.Tensor) else x, pytree
    )


def load_module_from_path(path):
    spec = importlib.util.spec_from_file_location("module", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def evaluate(op_name: str):
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--op_atol", type=float, default=1e-3)
    # parser.add_argument("--op_rtol", type=float, default=1e-1)
    # parser.add_argument("--rep_time", type=int, default=10000)
    # parser.add_argument("--warmup_time", type=int, default=25)
    # args = parser.parse_args()

    # # Get task name from info.txt
    # with open("task/info.txt", "r") as f:
    #     task_name = f.readline().strip()
    #     task_name = "_".join(task_name.split("_")[1:])  # Remove problem ID

    # Import the task module
    # task_files = [f for f in os.listdir("tasks") if f.endswith("_functional.py")]
    # if not task_files:
    #     raise RuntimeError("No functional task file found")

    task = load_module_from_path(os.path.join("tasks", op_name+'.py'))

    # Initialize model and inputs
    device_1 = torch.device("cuda:0")
    torch.manual_seed(0)
    inputs = task.get_inputs()
    init_inputs = task.get_init_inputs()
    model = task.Model(*init_inputs)

    # Load CUDA kernel
    # kernel_files = [f for f in os.listdir("kernel") if f.endswith(".cu")]
    # if not kernel_files:
    #     raise RuntimeError("No CUDA kernel file found")
    
    task_name = "_".join(op_name.split("_")[1:])  # Remove problem ID
    cuda_module = load(
        name=task_name,
        sources=[os.path.join("kernels", op_name+'.cu')],
        extra_cuda_cflags=["-O3", "--use_fast_math"],
        with_cuda=True,
        verbose=True,
    )

    # Test for correctness
    with torch.no_grad():
        cuda_output = model.to(device_1)(
            *easy_to_device(inputs, device_1), fn=cuda_module.forward
        )
        torch_output = model.to(device_1)(
            *easy_to_device(inputs, device_1), fn=task.module_fn
        )

    rtol_default = 1e-1
    atol_default = 1e-3

    correct = torch.allclose(
        torch_output.cpu(),
        cuda_output.cpu(),
        rtol=rtol_default,
        atol=atol_default,
    )
    max_diff = torch.max(torch.abs(torch_output.cpu() - cuda_output.cpu())).item()
    print(f"Tested CUDA kernel - Correct: {correct}, Max Diff: {max_diff}")

    if correct:
        # Evaluate CUDA kernel performance
        cuda_timer = Timer(
            stmt="model(*inputs, fn=cuda_module.forward)",
            globals={
                "model": model.to(device_1),
                "inputs": easy_to_device(inputs, device_1),
                "cuda_module": cuda_module,
            },
        )
        cuda_runtime = cuda_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated CUDA kernel - Runtime: {cuda_runtime:.3f} ms")

        # Evaluate PyTorch baseline performance
        torch_timer = Timer(
            stmt="model(*inputs, fn=task.module_fn)",
            globals={
                "model": model.to(device_1),
                "inputs": easy_to_device(inputs, device_1),
                "task": task,
            },
        )
        torch_runtime = torch_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated PyTorch baseline - Runtime: {torch_runtime:.3f} ms")

        # Evaluate torch compile performance
        torch_fn = task.module_fn
        compile_fn = torch.compile(torch_fn, mode="max-autotune")
        torch_compile_timer = Timer(
            stmt="model(*inputs, fn=compile_fn)",
            globals={
                "model": model.to(device_1),
                "inputs": easy_to_device(inputs, device_1),
                "compile_fn": compile_fn,
            },
        )

        torch_compile_runtime = torch_compile_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated torch compile - Runtime: {torch_compile_runtime:.3f} ms")

        print(f"Speedup over PyTorch: {torch_runtime/cuda_runtime:.2f}x")
        print(f"Speedup over torch compile: {torch_compile_runtime/cuda_runtime:.2f}x")

        import json

        # Store the speedup times as a json file
        file_path = os.path.join(os.path.dirname('speedups'), f"{op_name}.json")
        with open(file_path, "w") as f:
            json.dump(
                {
                    "max_diff": max_diff,
                    "cuda_runtime": cuda_runtime,
                    "torch_runtime": torch_runtime,
                    "torch_compile_runtime": torch_compile_runtime,
                    "speedup_over_pytorch": torch_runtime / cuda_runtime,
                    "speedup_over_torch_compile": torch_compile_runtime / cuda_runtime,
                },
                f,
            )
        print(f"Speedup times stored in {file_path}")

In [None]:
evaluate(df_l1.iloc[0].Op_Name)

In [110]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, a, b):
        return a + b


def get_inputs():
    # randomly generate input tensors based on the model architecture
    a = torch.randn(1, 128).cuda()
    b = torch.randn(1, 128).cuda()
    return [a, b]


def get_init_inputs():
    # randomly generate tensors required for initialization based on the model architecture
    return []


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline

# Define the custom CUDA kernel for element-wise addition
elementwise_add_source = """
#include <torch/extension.h>
#include <cuda_runtime.h>

__global__ void elementwise_add_kernel(const float* a, const float* b, float* out, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        out[idx] = a[idx] + b[idx];
    }
}

torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b) {
    auto size = a.numel();
    auto out = torch::zeros_like(a);

    const int block_size = 256;
    const int num_blocks = (size + block_size - 1) / block_size;

    elementwise_add_kernel<<<num_blocks, block_size>>>(a.data_ptr<float>(), b.data_ptr<float>(), out.data_ptr<float>(), size);

    return out;
}
"""

elementwise_add_cpp_source = (
    "torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b);"
)

# Compile the inline CUDA code for element-wise addition
elementwise_add = load_inline(
    name="elementwise_add",
    cpp_sources=elementwise_add_cpp_source,
    cuda_sources=elementwise_add_source,
    functions=["elementwise_add_cuda"],
    verbose=True,
    extra_cflags=[""],
    extra_ldflags=[""],
)


class ModelNew(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.elementwise_add = elementwise_add

    def forward(self, a, b):
        return self.elementwise_add.elementwise_add_cuda(a, b)
    
a, b = get_inputs()
torchm = Model()
cudam = ModelNew()
torch.allclose(torchm.forward(a,b), cudam.forward(a,b) )

Using /home/abhiv/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
No modifications detected for re-loaded extension module elementwise_add, skipping build step...
Loading extension module elementwise_add...


True

In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single square matrix multiplication (C = A * B).

    Args:
        A (torch.Tensor): Input matrix A of shape (N, N).
        B (torch.Tensor): Input matrix B of shape (N, N).

    Returns:
        torch.Tensor: Output matrix C of shape (N, N).
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single square matrix multiplication (C = A * B)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


N = 2048


def get_inputs():
    A = torch.randn(N, N).cuda()
    B = torch.randn(N, N).cuda()
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline

cuda_source = """
#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#define TILE_SIZE 16

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_FLOAT(x) TORCH_CHECK(x.scalar_type() == torch::kFloat32, #x " must be a float32 tensor")

__global__ void matmul_tiled_kernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int N) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = blockIdx.y * TILE_SIZE + ty;
    int col = blockIdx.x * TILE_SIZE + tx;

    float C_value = 0.0f;

    for (int m = 0; m < (N + TILE_SIZE - 1) / TILE_SIZE; ++m) {
        // Load tiles into shared memory
        if (row < N && m * TILE_SIZE + tx < N)
            As[ty][tx] = A[row * N + m * TILE_SIZE + tx];
        else
            As[ty][tx] = 0.0f;

        if (col < N && m * TILE_SIZE + ty < N)
            Bs[ty][tx] = B[(m * TILE_SIZE + ty) * N + col];
        else
            Bs[ty][tx] = 0.0f;

        __syncthreads();

        // Compute partial product
        for (int k = 0; k < TILE_SIZE; ++k) {
            C_value += As[ty][k] * Bs[k][tx];
        }

        __syncthreads();
    }

    // Write the result
    if (row < N && col < N)
        C[row * N + col] = C_value;
}

torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
    CHECK_INPUT(A);
    CHECK_INPUT(B);
    CHECK_FLOAT(A);
    CHECK_FLOAT(B);

    TORCH_CHECK(A.dim() == 2 && A.size(0) == A.size(1), "A must be a square matrix");
    TORCH_CHECK(B.dim() == 2 && B.size(0) == B.size(1), "B must be a square matrix");
    TORCH_CHECK(A.size(0) == B.size(0), "A and B must be of the same size");

    int64_t N = A.size(0);

    auto C = torch::zeros({N, N}, A.options());

    const float* A_data = A.data_ptr<float>();
    const float* B_data = B.data_ptr<float>();
    float* C_data = C.data_ptr<float>();

    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matmul_tiled_kernel<<<blocksPerGrid, threadsPerBlock>>>(A_data, B_data, C_data, N);

    // Check for kernel launch errors
    C10_CUDA_CHECK(cudaGetLastError());

    return C;
}
"""

cuda_cpp_source = (
    "torch::Tensor forward(torch::Tensor A, torch::Tensor B);"
)

# Compile the inline CUDA code 
cuda_mod = load_inline(
    name="matmul",
    cpp_sources=cuda_cpp_source,
    cuda_sources=cuda_source,
    functions=["forward"],
    verbose=True,
    extra_cflags=[""],
    extra_ldflags=[""],
)


class ModelNew(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.cuda_mod = cuda_mod

    def forward(self, a, b):
        return self.cuda_mod.forward(a, b)
    
torch_mod = Model()
cuda_mod = ModelNew()

a, b = get_inputs()
# a, b, = torch.eye(N).cuda(), torch.eye(N).cuda()
torch.allclose(torch_mod.forward(a, b), cuda_mod.forward(a, b), rtol=1e-1, atol=1e-3)
# torch_mod.forward(a, b), cuda_mod.forward(a, b)

Using /home/abhiv/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
No modifications detected for re-loaded extension module matmul_v3, skipping build step...
Loading extension module matmul_v3...


True

## TODOs
- ~~evaluation stuff~~
- prompting qwen for good outputs 
- ~~KernelBench method - NA~~
- coding the RL portion