## Load Mojo Kernels

In [7]:
import os, numpy, torch
from pathlib import Path
from max.torch import CustomOpLibrary

# The benchmarks below run on the "cuda" device, so stop right away with a
# readable message (instead of an empty AssertionError) when no GPU is visible.
assert torch.cuda.is_available(), "this notebook needs a CUDA device visible to PyTorch"
# Absolute path of the `operations` directory, resolved against the current
# working directory; later cells pass it to the custom-op loaders.
op_dir = os.path.abspath('operations')

## Simple `add_one` Operation

In [2]:
# Open the custom-op library located in the `op_dir` directory.
op_lib = CustomOpLibrary(Path(op_dir))
# Look up the `my_add_constant` op and bind its `value` parameter to 1.
# The resulting callable is invoked later as add_one(output, input).
add_one = op_lib.my_add_constant[{"value": 1}]

In [3]:
import time

def torch_add_one(inputs):
    """PyTorch baseline: return a new value equal to ``inputs`` plus one."""
    incremented = inputs + 1
    return incremented

def mojo_add_one(inputs):
    """Mojo counterpart of ``torch_add_one``.

    Allocates a zero-filled tensor shaped like ``inputs``, hands it to the
    ``add_one`` custom op as the first argument (with ``inputs`` as the
    second), and returns that tensor.
    """
    result = torch.zeros_like(inputs)
    add_one(result, inputs)
    return result

# Time 1000 chained applications of each implementation on each device.
# The loop variable is `device_name` rather than `device` so that this cell
# does not overwrite the `device` object the matmul cells define and use.
for device_name in ["cpu", "cuda"]:
    on_gpu = device_name == "cuda"
    for op in [torch_add_one, mojo_add_one]:
        x = torch.zeros(1024, device=device_name)
        x = op(x) # warm-up
        # CUDA work is launched asynchronously: drain the queue before the
        # clock starts and again before it stops, otherwise only the launch
        # overhead is measured instead of the completed kernels.
        if on_gpu:
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(1000):
            x = op(x)
        if on_gpu:
            torch.cuda.synchronize()
        end = time.perf_counter()
        print(op.__name__, device_name, x, end - start)

torch_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.003813645918853581
mojo_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.15939615399111062
torch_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.008109500049613416
mojo_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.23655258293729275


## Different MatMul Algorithms

In [12]:
from max.driver import CPU, Accelerator, accelerator_count, Tensor
import torch
# Problem size: A is (M, K) and B is (K, N), so A @ B is (M, N).
M = 4096
K = 6144
N = 2048
# Use the accelerator when MAX reports at least one, otherwise the CPU.
device = CPU() if accelerator_count() == 0 else Accelerator()
# Seed the generator so the inputs (and every printed result derived from
# them) are the same after a kernel restart.
torch.manual_seed(0)
torch_A = torch.randn(M, K)
torch_B = torch.randn(K, N)
# Reference product computed by PyTorch on the host, kept as a NumPy array.
torch_result = (torch_A @ torch_B).numpy()
# The same inputs wrapped as MAX driver tensors and moved to `device`.
A = Tensor.from_numpy(torch_A.numpy()).to(device)
B = Tensor.from_numpy(torch_B.numpy()).to(device)

Build the MAX graph that wraps our MatMul kernel, then run it once and check the result against PyTorch:

In [15]:
from max.graph import Graph, TensorType, DeviceRef, ops
def build_graph(session, algorithm):
    """Build and load a graph whose single node is the ``my_matmul`` custom op.

    The input types copy dtype and shape from the notebook-level tensors
    ``A`` and ``B``; every tensor type and the op itself are placed on the
    notebook-level ``device``. The kernel is looked up in ``op_dir``.

    Args:
        session: Object whose ``load`` method is called with the finished
            graph (an ``InferenceSession`` in this notebook).
        algorithm: Value passed to the op as its ``algorithm`` parameter,
            e.g. ``"tiled_register"``.

    Returns:
        Whatever ``session.load`` returns for the graph.
    """
    print('building cuda graph for', algorithm)
    target = DeviceRef.from_device(device)
    lhs_type = TensorType(dtype=A.dtype, shape=A.shape, device=target)
    rhs_type = TensorType(dtype=B.dtype, shape=B.shape, device=target)
    with Graph(
        "matmul_graph",
        input_types=[lhs_type, rhs_type],
        custom_extensions=[Path(op_dir)],
    ) as graph:
        lhs, rhs = graph.inputs
        # Output shape: first dim of the left input by second dim of the right.
        product_type = TensorType(
            dtype=A.dtype,
            shape=[lhs.tensor.shape[0], rhs.tensor.shape[1]],
            device=target,
        )
        results = ops.custom(
            name="my_matmul",
            device=target,
            values=[lhs, rhs],
            out_types=[product_type],
            parameters={"algorithm": algorithm},
        )
        graph.output(results[0].tensor)
    print('loading cuda graph...')
    return session.load(graph) # compile the graph

from max.engine import InferenceSession
# One inference session bound to the selected device; later cells reuse it.
session = InferenceSession(devices=[device])
graph =  build_graph(session, "tiled_register")
# Run the kernel once and pull the first output back as a NumPy array.
mojo_result = graph.execute(A, B)[0].to_numpy()
print("test run:\n", mojo_result, end="\n\n")
print("reference:\n", torch_result)
# assert_allclose reports how many elements differ and the largest error,
# whereas `assert numpy.allclose(...)` fails with an empty AssertionError.
# It also rejects a shape mismatch instead of silently broadcasting.
# equal_nan=False keeps numpy.allclose's default treatment of NaNs.
numpy.testing.assert_allclose(
    mojo_result, torch_result, rtol=0, atol=0.005, equal_nan=False
)

building cuda graph for tiled_register
loading cuda graph...
test run:
 [[ 413.29004     56.92129     83.400345  ...  -12.304021   -52.580948
   -50.107594 ]
 [  50.026173   332.2174      67.8422    ...  -13.157701    71.953575
   -32.62087  ]
 [ -36.76945    -91.15466    299.37857   ...  -10.059785   -74.83853
     6.8866305]
 ...
 [  -9.585925   -24.193306     1.5354619 ...  400.3204     -90.23229
     7.2631245]
 [-117.70533    -40.119106   -37.276913  ...   15.177331   357.47803
    69.10169  ]
 [  32.88124   -116.07284    -79.88674   ...  -32.81472    -15.833847
   402.2895   ]]

reference:
 [[ 6.1341324e+00 -7.7510292e+01  1.7075717e+02 ...  2.9965069e+01
  -7.1365023e+00 -2.2929005e+01]
 [ 2.8462984e+01  4.3787811e+01  1.4333394e+02 ... -5.7538811e+01
   1.7837200e+01  4.4863251e+01]
 [ 1.0912779e+02  2.0891026e+02 -5.2901581e+01 ...  3.0110598e-02
   4.2478676e+01 -6.2596178e+00]
 ...
 [-5.0698322e+01 -1.5537634e+02 -5.5040268e+01 ...  9.0997597e+01
  -2.7807766e+01  2.2023298e

AssertionError: 

In [None]:
# Compare the two results row by row and report the index of every row that
# is not within the absolute tolerance, to show where the kernel goes wrong.
for row, (expected_row, actual_row) in enumerate(zip(torch_result, mojo_result)):
    if not numpy.allclose(expected_row, actual_row, rtol=0, atol=0.005):
        print('mismatch row:', row)

The tiled matmul process looks like this:
![](tile-mma.png)

The *tiled_register* version goes one step further and computes its results in "subtiles":
![](subtile.png)

Run a complete benchmark for different algorithms:

In [140]:
import time
# Average wall-clock time of one matmul over `samples` fresh random inputs,
# measured for PyTorch and for each Mojo algorithm, with a correctness check
# on every sample.
for algo in ["naive", "coalescing", "tiled"]:
    # The names used below deliberately differ from `graph`, `A`, `B`,
    # `torch_A`, `torch_B`, `torch_result` and `mojo_result`, so re-running
    # the correctness cells after this benchmark still finds the objects
    # (and types) those cells created.
    model = build_graph(session, algo)
    record = dict(torch=0, mojo=0)
    samples = 5
    for _ in range(samples):
        # Generate on the host once; PyTorch gets a CUDA copy and MAX gets a
        # copy on `device`, avoiding a GPU -> CPU round trip.
        host_A = torch.randn(M, K)
        host_B = torch.randn(K, N)
        cuda_A = host_A.to('cuda:0')
        cuda_B = host_B.to('cuda:0')
        max_A = Tensor.from_numpy(host_A.numpy()).to(device)
        max_B = Tensor.from_numpy(host_B.numpy()).to(device)
        # torch
        torch.cuda.synchronize()
        begin = time.perf_counter()
        expected = cuda_A @ cuda_B
        torch.cuda.synchronize()
        record['torch'] += (time.perf_counter() - begin) / samples
        # mojo
        # NOTE(review): this assumes torch.cuda.synchronize() also waits for
        # the work launched by model.execute() -- confirm for the MAX runtime.
        torch.cuda.synchronize()
        begin = time.perf_counter()
        outputs = model.execute(max_A, max_B)
        torch.cuda.synchronize()
        record['mojo'] += (time.perf_counter() - begin) / samples
        # Informative failure (mismatch count, max error) instead of an empty
        # AssertionError; equal_nan=False matches numpy.allclose's default.
        numpy.testing.assert_allclose(
            outputs[0].to_numpy(), expected.cpu().numpy(),
            rtol=0, atol=0.005, equal_nan=False,
        )
    print(algo, record)

building cuda graph for naive
loading cuda graph...
naive {'torch': 0.013435979800124186, 'mojo': 0.5127615508004966}
building cuda graph for coalescing
loading cuda graph...
coalescing {'torch': 0.013407805200404254, 'mojo': 0.13278580000041984}
building cuda graph for tiled
loading cuda graph...
tiled {'torch': 0.01342437559942482, 'mojo': 0.10924591420043725}


## References
[1] https://github.com/modular/modular/blob/main/examples/custom_ops/kernels/matrix_multiplication.mojo

[2] https://docs.modular.com/max/tutorials/custom-ops-matmul