## Load Mojo Kernels

In [1]:
import os, numpy, torch
from pathlib import Path
from max.torch import CustomOpLibrary

# The "cuda" benchmarks and the 'cuda:0' tensors used further down need a GPU
# that PyTorch can see, so stop here with an explicit message if there is none.
assert torch.cuda.is_available(), "This notebook requires a CUDA device visible to PyTorch."
# Absolute path of the `operations` directory (resolved against the current
# working directory); it is the location the custom operations are loaded from.
op_dir = os.path.abspath('operations')

## Simple `add_one` Operation

In [2]:
# Open the custom-op library that lives in the operations directory.
op_lib = CustomOpLibrary(
    Path(op_dir),
)
# Select `my_add_constant` with its `value` parameter set to 1, i.e. an "add one" op.
add_one = op_lib.my_add_constant[dict(value=1)]

In [3]:
import time

def torch_add_one(inputs):
    """Baseline implementation: return `inputs + 1` using the plain `+` operator."""
    incremented = inputs + 1
    return incremented

def mojo_add_one(inputs):
    """Apply the custom `add_one` op to `inputs` and return the result tensor.

    A zero-filled tensor shaped like `inputs` is allocated, passed to `add_one`
    as the first argument (with `inputs` as the second), and then returned.
    """
    # Presumably the op writes its result into this tensor in place.
    result = torch.zeros_like(inputs)
    add_one(result, inputs)
    return result

# Time 1000 chained applications of each implementation on each device.
for device in ["cpu", "cuda"]:
    for op in [torch_add_one, mojo_add_one]:
        x = torch.zeros(1024, device=device)
        x = op(x) # warm-up
        # CUDA work is queued asynchronously: without a synchronize the timer
        # would only bracket the kernel launches, not their execution. Drain
        # the device before starting and before stopping the clock.
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(1000):
            x = op(x)
        if device == "cuda":
            torch.cuda.synchronize()
        end = time.perf_counter()
        # Prints: implementation name, device, final tensor, elapsed seconds.
        print(op.__name__, device, x, end - start)

torch_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.003813645918853581
mojo_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.15939615399111062
torch_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.008109500049613416
mojo_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.23655258293729275


## Different MatMul Algorithms

In [3]:
from max.driver import CPU, Accelerator, accelerator_count, Tensor
import torch
# Problem size: A is (M, K) and B is (K, N), so A @ B is (M, N).
M, K, N = 4096, 6144, 2048
# Use an accelerator when MAX reports at least one, otherwise fall back to the CPU.
device = Accelerator() if accelerator_count() > 0 else CPU()
# Random operands created with PyTorch (default device and dtype).
torch_A = torch.randn(M, K)
torch_B = torch.randn(K, N)
# PyTorch product, kept as a NumPy array to compare against the custom kernel.
torch_result = torch.matmul(torch_A, torch_B).numpy()
# The same operand values as MAX driver tensors placed on `device`.
A = Tensor.from_numpy(torch_A.numpy()).to(device)
B = Tensor.from_numpy(torch_B.numpy()).to(device)

Build a MAX graph around our custom MatMul kernel, compile it, and check one run against the PyTorch result:

In [6]:
from max.graph import Graph, TensorType, DeviceRef, ops
def build_graph(session, algorithm):
    """Build a graph around the `my_matmul` custom op and load it into `session`.

    Args:
        session: object whose `load(graph)` method is called on the finished graph.
        algorithm: value forwarded to the custom op as its "algorithm" parameter.

    Returns:
        Whatever `session.load` returns for the constructed graph.

    Reads the notebook-level globals `A`, `B`, `device` and `op_dir`: the graph
    inputs take their dtype and shape from `A` and `B`, everything is placed on
    `device`, and the custom op is looked up in `op_dir`.
    """
    print('building cuda graph for', algorithm)
    device_ref = DeviceRef.from_device(device)
    input_types = [
        TensorType(dtype=operand.dtype, shape=operand.shape, device=device_ref)
        for operand in (A, B)
    ]
    with Graph("matmul_graph",
               input_types=input_types,
               custom_extensions=[Path(op_dir)]) as graph:
        lhs, rhs = graph.inputs
        # Output is (rows of the first input) x (columns of the second input).
        result_type = TensorType(
            dtype=A.dtype,
            shape=[lhs.tensor.shape[0], rhs.tensor.shape[1]],
            device=device_ref,
        )
        results = ops.custom(
            name="my_matmul",
            device=device_ref,
            values=[lhs, rhs],
            out_types=[result_type],
            parameters={"algorithm": algorithm},
        )
        graph.output(results[0].tensor)
    print('loading cuda graph...')
    return session.load(graph) # compile the graph

from max.engine import InferenceSession
# Create a session on the selected device and load the "tiled" variant of the graph.
session = InferenceSession(devices=[device])
graph = build_graph(session, algorithm="tiled")
# Execute once and convert the first output to a NumPy array.
mojo_result = graph.execute(A, B)[0].to_numpy()
print("test run:\n", mojo_result)
print()
print("reference:\n", torch_result)
# Element-wise comparison with an absolute tolerance only (rtol is disabled).
assert numpy.isclose(mojo_result, torch_result, rtol=0, atol=0.005).all()

building cuda graph for tiled
loading cuda graph...
test run:
 [[ 5.31484146e+01  1.24493752e+02  4.53797836e+01 ...  1.43312485e+02
   2.52162476e+01  4.28258781e+01]
 [ 1.10106285e+02 -7.76783752e+01  1.87938080e+01 ...  1.09180450e+02
  -1.57604156e+02 -3.84989090e+01]
 [ 5.55642433e+01 -4.84877930e+01 -4.57554703e+01 ...  7.03113861e+01
   5.75862312e+01 -3.67812233e+01]
 ...
 [-1.48519577e+02  4.56038933e+01  1.04608299e+02 ...  4.83763046e+01
  -4.22827759e+01  3.00679798e+01]
 [ 6.41466370e+01 -5.78322220e+01 -6.82179260e+01 ... -1.12710510e+02
   6.42970276e+01  1.08137108e+02]
 [-5.26530571e+01 -1.15051851e-01 -9.58864288e+01 ... -1.25526764e+02
  -1.25661659e+02  1.05335484e+01]]

reference:
 [[ 5.31484299e+01  1.24493721e+02  4.53799553e+01 ...  1.43312698e+02
   2.52162552e+01  4.28259659e+01]
 [ 1.10106384e+02 -7.76783142e+01  1.87938328e+01 ...  1.09180618e+02
  -1.57603943e+02 -3.84988136e+01]
 [ 5.55643730e+01 -4.84877243e+01 -4.57554626e+01 ...  7.03114395e+01
   5.758

In [139]:
# Report every row of the custom-kernel result that differs from the reference
# by more than the absolute tolerance used in the check above.
row_count = torch_result.shape[0]
for row in range(row_count):
    row_matches = numpy.allclose(torch_result[row], mojo_result[row], rtol=0, atol=0.005)
    if not row_matches:
        print('mismatch row:', row)

The tiled matmul process looks like this:
![](tile-mma.png)

Run a complete benchmark for different algorithms:

In [140]:
import time
# Benchmark each kernel variant against torch's matmul. For every algorithm the
# graph is rebuilt, then `samples` fresh random problems are timed and the mean
# wall-clock seconds per call is accumulated in `record`.
samples = 5
for algo in ["naive", "coalescing", "tiled"]:
    graph =  build_graph(session, algo)
    record = dict(torch=0, mojo=0)
    for _ in range(samples):
        # Draw the operands on the host once and hand the same values to both
        # back ends; this avoids copying them to the GPU and straight back.
        host_A = torch.randn(M, K)
        host_B = torch.randn(K, N)
        torch_A = host_A.to('cuda:0')
        torch_B = host_B.to('cuda:0')
        A = Tensor.from_numpy(host_A.numpy()).to(device)
        B = Tensor.from_numpy(host_B.numpy()).to(device)
        # torch
        torch.cuda.synchronize()  # drain pending work (e.g. the uploads above)
        begin = time.perf_counter()
        torch_result = torch_A @ torch_B
        torch.cuda.synchronize()  # wait for the matmul itself to finish
        record['torch'] += (time.perf_counter() - begin) / samples
        # mojo
        # NOTE(review): this assumes torch.cuda.synchronize() also waits for any
        # work still in flight from graph.execute — confirm.
        torch.cuda.synchronize()
        begin = time.perf_counter()
        mojo_result = graph.execute(A, B)
        torch.cuda.synchronize()
        record['mojo'] += (time.perf_counter() - begin) / samples
        # Both results must agree within an absolute tolerance of 0.005.
        assert numpy.allclose(mojo_result[0].to_numpy(), torch_result.cpu().numpy(), rtol=0, atol=0.005)
    print(algo, record)

building cuda graph for naive
loading cuda graph...
naive {'torch': 0.013435979800124186, 'mojo': 0.5127615508004966}
building cuda graph for coalescing
loading cuda graph...
coalescing {'torch': 0.013407805200404254, 'mojo': 0.13278580000041984}
building cuda graph for tiled
loading cuda graph...
tiled {'torch': 0.01342437559942482, 'mojo': 0.10924591420043725}


## References
[1] https://github.com/modular/modular/blob/main/examples/custom_ops/kernels/matrix_multiplication.mojo

[2] https://docs.modular.com/max/tutorials/custom-ops-matmul