In [1]:
from rich.pretty import pprint
from datasets import load_dataset
from triton_eval.utils import run_script_on_gpu, get_tests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "train" split of the annotated PyTorch/Triton dataset.
ds = load_dataset("tcapelle/annotated_dataset_o3_train_pytorch_triton", split="train")

In [3]:
# Take both implementations from the first dataset row.
first_row = ds[0]
triton_code = first_row["final_triton_code"]
pt_code = first_row["final_pytorch_code"]

Extract the tests from the PyTorch code, in case we want to run them against the generated Triton kernel.

In [4]:
# Extract the test code from the PyTorch source and print it for inspection.
tests = get_tests(pt_code)
print(tests)

def test_matmul_pytorch():
    """
    Test function for pure PyTorch matrix multiplication on DEVICE.

    Returns:
      dict: Dictionary storing test results for each test case.
    """
    results = {}

    # Test Case 1: Small square matrices
    A1 = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device=DEVICE)
    B1 = torch.tensor([[5.0, 6.0], [7.0, 8.0]], device=DEVICE)
    C1 = matmul_pytorch(A1, B1)
    results['test_case_1'] = C1

    # Test Case 2: Rectangular matrices
    A2 = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], device=DEVICE)
    B2 = torch.tensor([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]], device=DEVICE)
    C2 = matmul_pytorch(A2, B2)
    results['test_case_2'] = C2

    # Test Case 3: Larger matrices
    torch.manual_seed(42)
    A3 = torch.randn(64, 128, device=DEVICE)
    B3 = torch.randn(128, 32, device=DEVICE)
    C3 = matmul_pytorch(A3, B3)
    expected_C3 = torch.mm(A3, B3)
    results['test_case_3'] = {
        'result': C3,
        'expected': expected_C3,
 

## CPU test

In [5]:
# Run the PyTorch script on its own (no extra test content) with gpu_id=None.
# NOTE(review): the captured output of the next cell still reports
# device='cuda:0', so gpu_id=None does not appear to force CPU execution —
# confirm what gpu_id=None means in run_script_on_gpu.
success, results, file_name = run_script_on_gpu(pt_code, test_content="", file_name="test.py", gpu_id=None)

In [6]:
# Show the script's stdout on success; on failure print stderr instead of
# silently showing nothing, so a broken run is visible in the notebook.
if success:
    print(results.stdout)
else:
    print(results.stderr)

{'test_case_1': tensor([[19., 22.],
        [43., 50.]], device='cuda:0'), 'test_case_2': tensor([[ 58.,  64.],
        [139., 154.]], device='cuda:0'), 'test_case_3': {'result': tensor([[  9.3524,  20.1801,   1.3200,  ..., -21.0338,   3.0357,  -8.3879],
        [ -5.5521,   5.0191, -26.5503,  ...,  -5.4739,  -7.3350,  -0.0405],
        [  2.6591,  -5.7370,   2.5628,  ...,  22.7629,   1.0609,  -6.0721],
        ...,
        [  0.7112,  11.1433,   7.8263,  ...,  -8.2718,  -5.5668,  -6.1661],
        [ 17.1974,  -6.1684,   1.1457,  ...,  -6.9263, -12.8880,   5.2832],
        [-10.5624,   2.1081, -10.1488,  ...,   7.4583,  -1.6897,  -1.7082]],
       device='cuda:0'), 'expected': tensor([[  9.3524,  20.1801,   1.3200,  ..., -21.0338,   3.0357,  -8.3879],
        [ -5.5521,   5.0191, -26.5503,  ...,  -5.4739,  -7.3350,  -0.0405],
        [  2.6591,  -5.7370,   2.5628,  ...,  22.7629,   1.0609,  -6.0721],
        ...,
        [  0.7112,  11.1433,   7.8263,  ...,  -8.2718,  -5.5668,  -6.1661

## GPU test

In [7]:
# Run the same PyTorch script again, this time with gpu_id=0.
success_gpu, results_gpu, _ = run_script_on_gpu(pt_code, test_content="", file_name="test.py", gpu_id=0)

In [8]:
# Show the script's stdout on success; on failure print stderr instead of
# silently showing nothing, so a broken run is visible in the notebook.
if success_gpu:
    print(results_gpu.stdout)
else:
    print(results_gpu.stderr)

{'test_case_1': tensor([[19., 22.],
        [43., 50.]], device='cuda:0'), 'test_case_2': tensor([[ 58.,  64.],
        [139., 154.]], device='cuda:0'), 'test_case_3': {'result': tensor([[  9.3524,  20.1801,   1.3200,  ..., -21.0338,   3.0357,  -8.3879],
        [ -5.5521,   5.0191, -26.5503,  ...,  -5.4739,  -7.3350,  -0.0405],
        [  2.6591,  -5.7370,   2.5628,  ...,  22.7629,   1.0609,  -6.0721],
        ...,
        [  0.7112,  11.1433,   7.8263,  ...,  -8.2718,  -5.5668,  -6.1661],
        [ 17.1974,  -6.1684,   1.1457,  ...,  -6.9263, -12.8880,   5.2832],
        [-10.5624,   2.1081, -10.1488,  ...,   7.4583,  -1.6897,  -1.7082]],
       device='cuda:0'), 'expected': tensor([[  9.3524,  20.1801,   1.3200,  ..., -21.0338,   3.0357,  -8.3879],
        [ -5.5521,   5.0191, -26.5503,  ...,  -5.4739,  -7.3350,  -0.0405],
        [  2.6591,  -5.7370,   2.5628,  ...,  22.7629,   1.0609,  -6.0721],
        ...,
        [  0.7112,  11.1433,   7.8263,  ...,  -8.2718,  -5.5668,  -6.1661

## Map

In [44]:
from concurrent.futures import ProcessPoolExecutor, as_completed

def run_one(row, gpus=(0, 1)):
    """Run the PyTorch and Triton code of one dataset row and compare their output.

    Both scripts are submitted to a two-worker process pool so they run
    concurrently: the PyTorch code with ``gpu_id=gpus[0]`` and the Triton code
    with ``gpu_id=gpus[1]``.

    Args:
        row: Mapping with the keys ``"final_triton_code"`` and
            ``"final_pytorch_code"``.
        gpus: Sequence of two GPU ids, ``(pytorch_gpu, triton_gpu)``.

    Returns:
        dict: ``pytorch_runs`` / ``triton_runs`` (the success flags returned by
        ``run_script_on_gpu``), ``pytorch_output`` / ``triton_output``
        (``{"stdout": ..., "stderr": ...}``) and ``outputs_match``, which is
        True only when both runs succeeded and produced identical stdout.
    """
    jobs = {
        "pytorch": (row["final_pytorch_code"], gpus[0]),
        "triton": (row["final_triton_code"], gpus[1]),
    }

    outcomes = {}
    with ProcessPoolExecutor(max_workers=2) as executor:
        # NOTE(review): both jobs pass file_name="test.py". If run_script_on_gpu
        # writes the script to that path, concurrent runs (here and across the
        # workers of a parallel map) could overwrite each other — confirm.
        future_to_label = {
            executor.submit(
                run_script_on_gpu,
                code,
                test_content="",
                file_name="test.py",
                gpu_id=gpu_id,
            ): label
            for label, (code, gpu_id) in jobs.items()
        }
        for future in as_completed(future_to_label):
            success, results, _ = future.result()
            outcomes[future_to_label[future]] = (success, results)

    success_pytorch, results_pytorch = outcomes["pytorch"]
    success_triton, results_triton = outcomes["triton"]

    # Two failed runs can both leave stdout empty; only report a match when
    # both scripts actually ran successfully.
    outputs_match = (
        bool(success_pytorch)
        and bool(success_triton)
        and results_pytorch.stdout == results_triton.stdout
    )

    return {
        "pytorch_runs": success_pytorch,
        "pytorch_output": {"stdout": results_pytorch.stdout, "stderr": results_pytorch.stderr},
        "triton_runs": success_triton,
        "triton_output": {"stdout": results_triton.stdout, "stderr": results_triton.stderr},
        "outputs_match": outputs_match,
    }


In [45]:
# Evaluate the first 10 rows with run_one, using 4 map worker processes.
sample_ds = ds.select(range(10)).map(run_one, num_proc=4)

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 10/10 [00:21<00:00,  2.20s/ examples]


In [55]:
# Pretty-print one mapped row (index 4) to inspect the fields added by run_one.
pprint(sample_ds[4])