## Load Mojo Kernels

In [6]:
import os, torch
from pathlib import Path
from max.torch import CustomOpLibrary

# The benchmarks below run on both CPU and GPU, so stop early when PyTorch
# reports that no CUDA device is usable.
assert torch.cuda.is_available()

# Absolute path of the `operations` directory (resolved against the current
# working directory); later cells pass it to the custom-op loaders.
op_dir = os.path.abspath("operations")

## Simple `add_one` Operation

In [14]:
# Load the custom-op library from the operations directory.
operations_path = Path(op_dir)
op_lib = CustomOpLibrary(operations_path)

# Specialize `my_add_constant` with the parameter value=1 to obtain `add_one`.
add_one_params = {"value": 1}
add_one = op_lib.my_add_constant[add_one_params]

In [15]:
import time

def torch_add_one(inputs):
    """Baseline implementation: return `inputs + 1` using the operand's own `+`."""
    incremented = inputs + 1
    return incremented

def mojo_add_one(inputs):
    """Run the custom `add_one` op on `inputs` and return its output buffer.

    A zero-filled tensor matching `inputs` is allocated and handed to
    `add_one` as the first argument, with `inputs` as the second; that
    tensor is what gets returned.
    """
    result = torch.zeros_like(inputs)
    add_one(result, inputs)  # presumably writes inputs + 1 into `result` -- confirm against the kernel
    return result

# Time 1000 chained applications of each implementation on each device.
#
# The loop variable is called `device_name` (not `device`) so that it does not
# overwrite the notebook-global `device` that the matmul cells read.
for device_name in ["cpu", "cuda"]:
    for op in [torch_add_one, mojo_add_one]:
        x = torch.zeros(1024, device=device_name)
        x = op(x)  # warm-up
        # CUDA work is queued asynchronously: drain the queue so the warm-up
        # is not counted in the timed region.
        if device_name == "cuda":
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(1000):
            x = op(x)
        # Wait for all queued GPU work to finish; otherwise the timer would
        # stop as soon as the launches were enqueued.
        if device_name == "cuda":
            torch.cuda.synchronize()
        end = time.perf_counter()
        print(op.__name__, device_name, x, end - start)

torch_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.004273012978956103
mojo_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.15462623699568212
torch_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.007748979958705604
mojo_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.2673781750490889


## Different MatMul Operations

In [7]:
from max.driver import CPU, Accelerator, accelerator_count, Tensor
import torch
# Problem size: A is (M, K), B is (K, N).
M, K, N = 4096, 6144, 2048

# Use an accelerator when at least one is reported, otherwise fall back to CPU.
device = Accelerator() if accelerator_count() != 0 else CPU()

# Random operands are created with torch, then handed to MAX through numpy and
# moved onto the selected device.
torch_A, torch_B = torch.randn(M, K), torch.randn(K, N)
A = Tensor.from_numpy(torch_A.numpy()).to(device)
B = Tensor.from_numpy(torch_B.numpy()).to(device)

In [33]:
from max.graph import Graph, TensorType, DeviceRef, ops
def build_graph(session, algorithm):
    """Build a single-op graph around the custom `my_matmul` kernel and load it.

    The two graph inputs take their dtype and shape from the notebook-level
    tensors `A` and `B`, and every tensor type is placed on the notebook-level
    `device`. Custom kernels are looked up in `op_dir`.

    Args:
        session: object whose `load(graph)` compiles the graph (an
            `InferenceSession` in this notebook).
        algorithm: forwarded to the kernel as its `algorithm` parameter.

    Returns:
        The result of `session.load(graph)`.
    """
    device_ref = DeviceRef.from_device(device)
    input_types = [
        TensorType(dtype=A.dtype, shape=A.shape, device=device_ref),
        TensorType(dtype=B.dtype, shape=B.shape, device=device_ref),
    ]

    with Graph(
        "matmul_graph",
        input_types=input_types,
        custom_extensions=[Path(op_dir)],
    ) as graph:
        lhs, rhs = graph.inputs
        # Result shape: first dimension of the left operand by second
        # dimension of the right operand.
        result_type = TensorType(
            dtype=A.dtype,
            shape=[lhs.tensor.shape[0], rhs.tensor.shape[1]],
            device=device_ref,
        )
        results = ops.custom(
            name="my_matmul",
            device=device_ref,
            values=[lhs, rhs],
            out_types=[result_type],
            parameters={"algorithm": algorithm},
        )
        graph.output(results[0].tensor)

    # Loading the graph into the session is what compiles it.
    return session.load(graph)

from max.engine import InferenceSession
# One inference session bound to the device chosen earlier in the notebook.
session = InferenceSession(devices=[device])

print("building cuda graph...")
graph = build_graph(session, "naive")

# Test run: execute once on the existing operands and display the first output.
first_output = graph.execute(A, B)[0]
first_output.to_numpy()

building cuda graph...
algo: naive
1 2 55.196094


array([[  96.123146, -182.77394 ,  -27.199825, ...,   39.727497,
          29.179317,   24.906761],
       [   0.      ,    0.      ,    0.      , ...,    0.      ,
           0.      ,    0.      ],
       [   0.      ,    0.      ,    0.      , ...,    0.      ,
           0.      ,    0.      ],
       ...,
       [   0.      ,    0.      ,    0.      , ...,    0.      ,
           0.      ,    0.      ],
       [   0.      ,    0.      ,    0.      , ...,    0.      ,
           0.      ,    0.      ],
       [   0.      ,    0.      ,    0.      , ...,    0.      ,
           0.      ,    0.      ]], shape=(4096, 2048), dtype=float32)

In [None]:
# Draw fresh random operands and move copies onto the MAX device.
torch_A, torch_B = torch.randn(M, K), torch.randn(K, N)
A = Tensor.from_numpy(torch_A.numpy()).to(device)
B = Tensor.from_numpy(torch_B.numpy()).to(device)

# Custom-kernel result first, then the torch product of the same operands for
# a side-by-side comparison.
kernel_result = graph.execute(A, B)[0].to_numpy()
print(kernel_result, end="\n\n")
torch_reference = (torch_A @ torch_B).numpy()
print("reference:\n", torch_reference)

## Reference
[1] https://github.com/modular/modular/blob/main/examples/custom_ops/kernels/matrix_multiplication.mojo