## Load Dataset

In [1]:
from datasets import load_dataset

# Specify the dataset name and the cache directory
dataset_name = "SakanaAI/AI-CUDA-Engineer-Archive"
cache_dir = "./cache_dir"

# Load the dataset
dataset = load_dataset(dataset_name, cache_dir=cache_dir)

# Print the dataset to verify
print(dataset)

DatasetDict({
    level_1: Dataset({
        features: ['Op_Name', 'Level_ID', 'Task_ID', 'Kernel_Name', 'CUDA_Runtime', 'PyTorch_Native_Runtime', 'PyTorch_Compile_Runtime', 'CUDA_Speedup_Native', 'CUDA_Speedup_Compile', 'CUDA_Code', 'PyTorch_Code_Module', 'PyTorch_Code_Functional', 'Correct', 'Max_Diff', 'Error', 'NCU_Profile', 'Torch_Profile', 'Clang_Tidy', '__index_level_0__'],
        num_rows: 12157
    })
    level_2: Dataset({
        features: ['Op_Name', 'Level_ID', 'Task_ID', 'Kernel_Name', 'CUDA_Runtime', 'PyTorch_Native_Runtime', 'PyTorch_Compile_Runtime', 'CUDA_Speedup_Native', 'CUDA_Speedup_Compile', 'CUDA_Code', 'PyTorch_Code_Module', 'PyTorch_Code_Functional', 'Correct', 'Max_Diff', 'Error', 'NCU_Profile', 'Torch_Profile', 'Clang_Tidy', '__index_level_0__'],
        num_rows: 12938
    })
    level_3: Dataset({
        features: ['Op_Name', 'Level_ID', 'Task_ID', 'Kernel_Name', 'CUDA_Runtime', 'PyTorch_Native_Runtime', 'PyTorch_Compile_Runtime', 'CUDA_Speedup_Native', '

In [2]:
df_l1 = dataset["level_1"].to_pandas()

In [4]:
print(df_l1.iloc[0][["CUDA_Code"]].item())

#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#define TILE_SIZE 16

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_FLOAT(x) TORCH_CHECK(x.scalar_type() == torch::kFloat32, #x " must be a float32 tensor")

__global__ void matmul_tiled_kernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int N) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = blockIdx.y * TILE_SIZE + ty;
    int col = blockIdx.x * TILE_SIZE + tx;

    float C_value = 0.0f;

    for (int m = 0; m < (N + TILE_SIZE - 1) / TILE_SIZE; ++m) {
        // Load tiles into shared memory
        if (row < N && m * TILE_SIZE + tx < N)
    

In [5]:
print(df_l1.iloc[0][["PyTorch_Code_Functional"]].item())

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single square matrix multiplication (C = A * B).

    Args:
        A (torch.Tensor): Input matrix A of shape (N, N).
        B (torch.Tensor): Input matrix B of shape (N, N).

    Returns:
        torch.Tensor: Output matrix C of shape (N, N).
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single square matrix multiplication (C = A * B)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


N = 2048


def get_inputs():
    A = torch.randn(N, N)
    B = torch.randn(N, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed



In [None]:
# Evaluation script for CUDA kernel
# 95_CrossEntropyLoss
# Evaluation script for CUDA kernel
# 12_Matmul_with_diagonal_matrices_
import os
import torch
import argparse
from torch.utils.cpp_extension import load
from torch.utils._pytree import tree_map
import importlib.util
from torch.utils.benchmark import Timer


def easy_to_device(pytree, device):
    return tree_map(
        lambda x: x.to(device) if isinstance(x, torch.Tensor) else x, pytree
    )


def load_module_from_path(path):
    spec = importlib.util.spec_from_file_location("module", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--op_atol", type=float, default=1e-3)
    parser.add_argument("--op_rtol", type=float, default=1e-1)
    parser.add_argument("--rep_time", type=int, default=10000)
    parser.add_argument("--warmup_time", type=int, default=25)
    args = parser.parse_args()

    # Get task name from info.txt
    with open("task/info.txt", "r") as f:
        task_name = f.readline().strip()
        task_name = "_".join(task_name.split("_")[1:])  # Remove problem ID

    # Import the task module
    task_files = [f for f in os.listdir("task") if f.endswith("_functional.py")]
    if not task_files:
        raise RuntimeError("No functional task file found")

    task = load_module_from_path(os.path.join("task", task_files[0]))

    # Initialize model and inputs
    device_1 = torch.device("cuda:0")
    device_2 = torch.device("cuda:1")
    torch.manual_seed(0)
    inputs = task.get_inputs()
    init_inputs = task.get_init_inputs()
    model = task.Model(*init_inputs)

    # Load CUDA kernel
    kernel_files = [f for f in os.listdir("kernel") if f.endswith(".cu")]
    if not kernel_files:
        raise RuntimeError("No CUDA kernel file found")

    cuda_module = load(
        name=task_name,
        sources=[os.path.join("kernel", kernel_files[0])],
        extra_cuda_cflags=["-O3", "--use_fast_math"],
        with_cuda=True,
        verbose=True,
    )

    # Test for correctness
    with torch.no_grad():
        cuda_output = model.to(device_1)(
            *easy_to_device(inputs, device_1), fn=cuda_module.forward
        )
        torch_output = model.to(device_2)(
            *easy_to_device(inputs, device_2), fn=task.module_fn
        )

    correct = torch.allclose(
        torch_output.cpu(),
        cuda_output.cpu(),
        rtol=args.op_rtol,
        atol=args.op_atol,
    )
    max_diff = torch.max(torch.abs(torch_output.cpu() - cuda_output.cpu())).item()
    print(f"Tested CUDA kernel - Correct: {correct}, Max Diff: {max_diff}")

    if correct:
        # Evaluate CUDA kernel performance
        cuda_timer = Timer(
            stmt="model(*inputs, fn=cuda_module.forward)",
            globals={
                "model": model.to(device_1),
                "inputs": easy_to_device(inputs, device_1),
                "cuda_module": cuda_module,
            },
        )
        cuda_runtime = cuda_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated CUDA kernel - Runtime: {cuda_runtime:.3f} ms")

        # Evaluate PyTorch baseline performance
        torch_timer = Timer(
            stmt="model(*inputs, fn=task.module_fn)",
            globals={
                "model": model.to(device_2),
                "inputs": easy_to_device(inputs, device_2),
                "task": task,
            },
        )
        torch_runtime = torch_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated PyTorch baseline - Runtime: {torch_runtime:.3f} ms")

        # Evaluate torch compile performance
        torch_fn = task.module_fn
        compile_fn = torch.compile(torch_fn, mode="max-autotune")
        torch_compile_timer = Timer(
            stmt="model(*inputs, fn=compile_fn)",
            globals={
                "model": model.to(device_2),
                "inputs": easy_to_device(inputs, device_2),
                "compile_fn": compile_fn,
            },
        )

        torch_compile_runtime = torch_compile_timer.timeit(args.rep_time).mean * 1000
        print(f"Evaluated torch compile - Runtime: {torch_compile_runtime:.3f} ms")

        print(f"Speedup over PyTorch: {torch_runtime/cuda_runtime:.2f}x")
        print(f"Speedup over torch compile: {torch_compile_runtime/cuda_runtime:.2f}x")

        import json

        # Store the speedup times as a json file
        file_path = os.path.join(os.path.dirname(__file__), "speedup_times.json")
        with open(file_path, "w") as f:
            json.dump(
                {
                    "max_diff": max_diff,
                    "cuda_runtime": cuda_runtime,
                    "torch_runtime": torch_runtime,
                    "torch_compile_runtime": torch_compile_runtime,
                    "speedup_over_pytorch": torch_runtime / cuda_runtime,
                    "speedup_over_torch_compile": torch_compile_runtime / cuda_runtime,
                },
                f,
            )
        print(f"Speedup times stored in {file_path}")


if __name__ == "__main__":
    main()


## Load Model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

model_q14 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
model_q7 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

tokenizer = AutoTokenizer.from_pretrained(model_q7, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_q7, cache_dir=cache_dir)

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

In [25]:
import gc
gc.collect()

import torch
with torch.cuda.device(0):  # explicitly set GPU 0 if needed
    torch.cuda.empty_cache()

In [21]:
import torch
device = torch.device('cuda:0')
model = model.to(device)
# tokenizer = tokenizer.to(device)

In [24]:
# Define the prompt
prompt = "what is the solution of x^2 - 2x + 1 = 0?"

prompt = "what is the second planet from the Sun?"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
outputs = model.generate(**inputs)

# Decode the generated tokens to get the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


what is the second planet from the Sun? I know the first is Mercury, so the second must be Venus. But wait, is it Venus


## TODOs
- evaluation stuff
- KernelBench method
- coding the RL portion