## Load Mojo Kernels

In [1]:
import os, numpy, torch
from pathlib import Path
from max.torch import CustomOpLibrary

assert torch.cuda.is_available()
op_dir = os.path.abspath('operations')

## Simple `add_one` Operation

In [2]:
op_lib = CustomOpLibrary(Path(op_dir))
add_one = op_lib.my_add_constant[{"value": 1}]

In [3]:
import time

def torch_add_one(inputs):
    return inputs + 1

def mojo_add_one(inputs):
    outputs = torch.zeros_like(inputs)
    add_one(outputs, inputs)
    return outputs

for device in ["cpu", "cuda"]:
    for op in [torch_add_one, mojo_add_one]:
        x = torch.zeros(1024, device=device)
        x = op(x) # warm-up
        start = time.perf_counter()
        for _ in range(1000):
            x = op(x)
        end = time.perf_counter()
        print(op.__name__, device, x, end - start)

torch_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.004054727032780647
mojo_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.14450858801137656
torch_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.007304457016289234
mojo_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.11011582997161895


## Different MatMul Algorithms

In [4]:
from max.driver import Accelerator, accelerator_count, Tensor
import torch
M, K, N = 8, 8, 16 # debug
M, K, N = 4096, 6144, 2048 # real-world scale
device = Accelerator()
torch_A = torch.randn(M, K)
torch_B = torch.randn(K, N)
torch_result = (torch_A @ torch_B).detach().cpu().numpy()
A = Tensor.from_numpy(torch_A.numpy()).to(device)
B = Tensor.from_numpy(torch_B.numpy()).to(device)

Build and test executing the CUDA graph for our MatMul kernel:

In [5]:
from max.graph import Graph, TensorType, DeviceRef, ops
def build_graph(session, algorithm):
    print('building cuda graph for', algorithm)
    with Graph("matmul_graph",
               input_types=[
                   TensorType(dtype=A.dtype, shape=A.shape, device=DeviceRef.from_device(device)),
                   TensorType(dtype=B.dtype, shape=B.shape, device=DeviceRef.from_device(device))
               ],
               custom_extensions=[Path(op_dir)]) as graph:
        A_value, B_value = graph.inputs
        output = ops.custom(
            name="my_matmul",
            device=DeviceRef.from_device(device),
            values=[A_value, B_value],
            out_types=[
                TensorType(dtype=A.dtype, shape=[
                        A_value.tensor.shape[0], B_value.tensor.shape[1]
                    ], device=DeviceRef.from_device(device))
            ],
            parameters={"algorithm": algorithm},
        )
        graph.output(output[0].tensor)
    print('loading cuda graph...')
    return session.load(graph) # compile the graph

from max.engine import InferenceSession
session = InferenceSession(devices=[device])
graph =  build_graph(session, "tensor_core_matmul") # Change this to test a different algorithm
mojo_result = graph.execute(A, B)[0].to_numpy()
print("test run:\n", mojo_result, end="\n\n")
print("reference:\n", torch_result)

building cuda graph for tensor_core_matmul
loading cuda graph...
test run:
 [[ -72.80523      43.381332     -0.48134637 ...    0.5543761
    57.04894     -37.91275   ]
 [  33.03406     -26.666258     35.28915    ...  210.54146
  -114.84794      -6.355719  ]
 [-102.443634   -119.57497    -110.37925    ...   11.614085
    73.72262     -20.905973  ]
 ...
 [-151.94147      -5.1379514    -3.2820497  ...   57.451336
    14.925808     65.16257   ]
 [ -28.677015    -31.575888     30.7144     ...   17.16105
    40.508167    -76.59762   ]
 [  88.376976    -30.413141   -110.30627    ...    4.386332
   105.50478       5.332529  ]]

reference:
 [[ -72.85087      43.41088      -0.4847288  ...    0.59809685
    57.12476     -37.933838  ]
 [  33.067142    -26.697542     35.307568   ...  210.66296
  -114.961754     -6.3434296 ]
 [-102.53347    -119.66125    -110.51503    ...   11.600237
    73.74789     -20.949192  ]
 ...
 [-152.0698       -5.1372375    -3.2726097  ...   57.500652
    14.920652     65.

Verify kernel results:

In [6]:
if not numpy.allclose(mojo_result, torch_result, rtol=0, atol=0.5):
    for row in range(torch_result.shape[0]):
        if numpy.allclose(torch_result[row], mojo_result[row], rtol=0, atol=1.0): continue
        print('obviously mismatch row:', row, 'delta:', numpy.absolute(torch_result[row] - mojo_result[row]).max())
        print(torch_result[row])
        print(mojo_result[row])
    else:
        print('almost matched!')
else:
    print('all matched!')

all matched!


The tiled kernels can be visualized using the [matmul_visualization.mojo](./matmul_visualization.mojo) and `matmul_visualization_gui/*` scripts.

Here are some example screenshots:
![](./block_tiled_matrix_multiplication.png) 
![](./tensor_core_matmul_kernel.png)

Run a complete benchmark for different algorithms:

In [9]:
import time
for algo in ["naive", "coalescing", "tiled", "tiled_register", "block_tiling", "block_tiling_vectorized", "tensor_core_matmul"]:
    graph =  build_graph(session, algo)
    record = dict(torch=0, mojo=0)
    sampels = 5
    for _ in range(sampels):
        torch_A = torch.randn(M, K).to('cuda:0')
        torch_B = torch.randn(K, N).to('cuda:0')
        A = Tensor.from_numpy(torch_A.cpu().numpy()).to(device)
        B = Tensor.from_numpy(torch_B.cpu().numpy()).to(device)
        # torch
        torch.cuda.synchronize()
        begin = time.perf_counter()
        torch_result = (torch_A @ torch_B).detach().cpu().numpy()
        torch.cuda.synchronize()
        record['torch'] += (time.perf_counter() - begin) / sampels
        # mojo
        torch.cuda.synchronize()
        begin = time.perf_counter()
        mojo_result = graph.execute(A, B)
        torch.cuda.synchronize()
        record['mojo'] += (time.perf_counter() - begin) / sampels
        assert numpy.allclose(mojo_result[0].to_numpy(), torch_result, rtol=0, atol=1.0)
    print(algo, record)

building cuda graph for naive
loading cuda graph...
naive {'torch': 0.019287150795571507, 'mojo': 1.0276804403867572}
building cuda graph for coalescing
loading cuda graph...
coalescing {'torch': 0.01778892583679408, 'mojo': 0.13523418738041074}
building cuda graph for tiled
loading cuda graph...
tiled {'torch': 0.018305375589989126, 'mojo': 0.1113776879850775}
building cuda graph for tiled_register
loading cuda graph...
tiled_register {'torch': 0.01800552138593048, 'mojo': 0.4363975091837347}
building cuda graph for block_tiling
loading cuda graph...
block_tiling {'torch': 0.017901817173697057, 'mojo': 0.025444415980018675}
building cuda graph for block_tiling_vectorized
loading cuda graph...
block_tiling_vectorized {'torch': 0.01904884921386838, 'mojo': 0.03678875456098467}
building cuda graph for tensor_core_matmul
loading cuda graph...
tensor_core_matmul {'torch': 0.01817250440362841, 'mojo': 0.14949885180685668}


## Reference
[1] https://github.com/modular/modular/blob/main/examples/custom_ops/kernels/matrix_multiplication.mojo

[2] https://docs.modular.com/max/tutorials/custom-ops-matmul

[3] Tensor Core Intro: https://www.nvidia.com/en-us/on-demand/session/gtcsj20-s21745/