## Load Mojo Kernels

In [7]:
import os, numpy, torch
from pathlib import Path
from max.torch import CustomOpLibrary

# Stop early when PyTorch cannot see a CUDA device: later cells place tensors
# on "cuda" / "cuda:0".
assert torch.cuda.is_available()
# Absolute path of the `operations` directory, resolved against the current
# working directory. Later cells wrap it in `Path` and pass it to
# `CustomOpLibrary` and to `Graph(custom_extensions=...)`.
op_dir = os.path.abspath('operations')

## Simple `add_one` Operation

In [2]:
# Open the custom-op library found under `op_dir`.
op_lib = CustomOpLibrary(Path(op_dir))
# Pick the `my_add_constant` op, parameterised with {"value": 1}.
# NOTE(review): presumably "value" is the constant the kernel adds — confirm
# against the kernel source in `operations`.
add_one = op_lib.my_add_constant[{"value": 1}]

In [3]:
import time

def torch_add_one(inputs):
    """PyTorch baseline: return a new tensor holding ``inputs + 1``."""
    incremented = inputs + 1
    return incremented

def mojo_add_one(inputs):
    """Mojo counterpart of `torch_add_one`, built on the `add_one` custom op.

    `add_one` is called with a destination tensor first and the source tensor
    second, so a zero-filled buffer shaped like `inputs` is allocated for it
    and returned once the op has been invoked.
    """
    result = torch.zeros_like(inputs)
    add_one(result, inputs)
    return result

# Time 1000 chained applications of each implementation on each device and
# print the final tensor together with the elapsed wall-clock seconds.
for device in ["cpu", "cuda"]:
    for op in [torch_add_one, mojo_add_one]:
        x = torch.zeros(1024, device=device)
        x = op(x) # warm-up
        # CUDA kernels are launched asynchronously: without waiting for the
        # device, the timer would mostly capture launch overhead instead of
        # the actual execution time. Drain pending work before starting.
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(1000):
            x = op(x)
        # Wait for the queued kernels to finish before stopping the clock.
        if device == "cuda":
            torch.cuda.synchronize()
        end = time.perf_counter()
        print(op.__name__, device, x, end - start)

torch_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.003813645918853581
mojo_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.15939615399111062
torch_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.008109500049613416
mojo_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.23655258293729275


## Different MatMul Operations

In [8]:
from max.driver import CPU, Accelerator, accelerator_count, Tensor
import torch
# Problem size: an (M x K) matrix times a (K x N) matrix.
M, K, N = 4096, 6144, 2048

# Run on an accelerator when MAX reports at least one, otherwise on the CPU.
device = Accelerator() if accelerator_count() > 0 else CPU()

# Random host-side operands and the PyTorch reference product as a NumPy array.
torch_A = torch.randn(M, K)
torch_B = torch.randn(K, N)
torch_result = torch.matmul(torch_A, torch_B).numpy()

# The same operands wrapped as MAX tensors and moved to `device`.
A, B = (Tensor.from_numpy(t.numpy()).to(device) for t in (torch_A, torch_B))

Build the MAX graph that wraps our custom MatMul kernel, compile it, and run it once as a test:

In [9]:
from max.graph import Graph, TensorType, DeviceRef, ops
def build_graph(session, algorithm):
    """Build the `my_matmul` custom-op graph and compile it with `session`.

    The graph takes two inputs typed after the notebook-level tensors `A` and
    `B` (their dtype and shape) on the notebook-level `device`, passes them to
    the custom op named "my_matmul" (extensions are loaded from `op_dir`), and
    exposes the op's first result as the graph output.

    Args:
        session: Object whose `load(graph)` compiles the graph; this notebook
            passes an `InferenceSession`.
        algorithm: Value forwarded to the op as its "algorithm" parameter.

    Returns:
        Whatever `session.load(graph)` returns.
    """
    device_ref = DeviceRef.from_device(device)
    lhs_type = TensorType(dtype=A.dtype, shape=A.shape, device=device_ref)
    rhs_type = TensorType(dtype=B.dtype, shape=B.shape, device=device_ref)

    with Graph(
        "matmul_graph",
        input_types=[lhs_type, rhs_type],
        custom_extensions=[Path(op_dir)],
    ) as graph:
        lhs, rhs = graph.inputs
        # The result has the row count of the left operand and the column
        # count of the right operand.
        result_type = TensorType(
            dtype=A.dtype,
            shape=[lhs.tensor.shape[0], rhs.tensor.shape[1]],
            device=device_ref,
        )
        results = ops.custom(
            name="my_matmul",
            device=device_ref,
            values=[lhs, rhs],
            out_types=[result_type],
            parameters={"algorithm": algorithm},
        )
        graph.output(results[0].tensor)

    return session.load(graph)  # compile the graph

from max.engine import InferenceSession
# Create an inference session for the device selected earlier.
session = InferenceSession(devices=[device])
print('building cuda graph...')
# Compile the graph with the "naive" value for the op's "algorithm" parameter.
# Note that `graph` holds the object returned by `session.load`, not the
# `Graph` description itself.
graph =  build_graph(session, "naive")
# Execute once on the prepared operands and keep the first output as a NumPy
# array; the next cell compares it with the PyTorch reference.
mojo_result = graph.execute(A, B)[0].to_numpy() # test run

building cuda graph...
algo: naive


Now verify the kernel's result against the PyTorch reference:

In [10]:
# Show both results, then check them element-wise with an absolute tolerance.
print(mojo_result, end="\n\n")
print("reference:\n", torch_result)
# `assert_allclose` reports the number of mismatching elements and the maximum
# error on failure, and is not stripped when Python runs with -O (unlike a
# bare `assert`). `equal_nan=False` keeps the NaN handling of `numpy.allclose`.
numpy.testing.assert_allclose(
    mojo_result, torch_result, rtol=0, atol=0.002, equal_nan=False
)

[[ -68.15777    -88.05523   -217.07161   ...  -44.82082     -6.8497744
   184.864    ]
 [  61.421253    -0.8291675 -120.87447   ...   86.0836     146.33058
     9.873138 ]
 [  79.264366    69.59208    -53.364     ...  204.6813       6.614575
    68.28843  ]
 ...
 [ 106.035545   -73.1687     -16.831264  ...  -40.18564     56.917828
    59.74314  ]
 [-158.22717     13.744788    41.3687    ... -117.1607    -109.138405
   120.45917  ]
 [ -36.116673   123.74782    -28.38967   ...  -56.09248    -54.63964
     1.0272372]]

reference:
 [[ -68.15774    -88.05525   -217.07114   ...  -44.820915    -6.849769
   184.86397  ]
 [  61.421227    -0.8291502 -120.8744    ...   86.08326    146.3305
     9.873069 ]
 [  79.26427     69.59227    -53.364048  ...  204.6814       6.614544
    68.28847  ]
 ...
 [ 106.03558    -73.16875    -16.831312  ...  -40.185654    56.91782
    59.74322  ]
 [-158.22719     13.744926    41.368725  ... -117.1608    -109.13831
   120.459175 ]
 [ -36.116646   123.74789    -28.38

Compare the speed of the Mojo kernel against PyTorch on the GPU:

In [21]:
# Accumulated wall-clock seconds over all timed iterations, per backend.
record = dict(torch=0, mojo=0)

# Untimed warm-up for the PyTorch side: the first CUDA matmul would otherwise
# charge one-time initialisation to the first timed iteration, whereas the
# Mojo graph was already executed once in the build cell.
_warmup = torch.randn(M, K, device='cuda:0') @ torch.randn(K, N, device='cuda:0')
torch.cuda.synchronize()
del _warmup

for _ in range(5):
    # Generate the operands once on the host and hand the same data to both
    # backends (avoids copying to the GPU and straight back for NumPy).
    host_A = torch.randn(M, K)
    host_B = torch.randn(K, N)
    torch_A = host_A.to('cuda:0')
    torch_B = host_B.to('cuda:0')
    A = Tensor.from_numpy(host_A.numpy()).to(device)
    B = Tensor.from_numpy(host_B.numpy()).to(device)
    # Make sure pending CUDA work (e.g. the uploads) is done before timing.
    torch.cuda.synchronize()
    # torch: matmul plus the copy of the result back to a NumPy array
    begin = time.perf_counter()
    torch_result = (torch_A @ torch_B).cpu().numpy()
    record['torch'] += time.perf_counter() - begin
    # mojo: graph execution plus the conversion of the result to NumPy
    begin = time.perf_counter()
    mojo_result = graph.execute(A, B)[0].to_numpy()
    record['mojo'] += time.perf_counter() - begin
    assert numpy.allclose(mojo_result, torch_result, rtol=0, atol=0.002)
record

algo: naive
algo: naive
algo: naive
algo: naive
algo: naive


{'torch': 0.09531373414210975, 'mojo': 2.617269338108599}

## Reference
[1] https://github.com/modular/modular/blob/main/examples/custom_ops/kernels/matrix_multiplication.mojo