## Load Mojo Kernels

In [13]:
import os, torch
from pathlib import Path
from max.torch import CustomOpLibrary

# Fail fast when no CUDA device is visible: the benchmark later in this
# notebook allocates tensors on the "cuda" device.
assert torch.cuda.is_available()

In [14]:
# Absolute path of the 'operations' directory, resolved against the current
# working directory.
op_dir = os.path.abspath('operations')
# Build a CustomOpLibrary from that directory. Presumably this is where the
# Mojo kernels are discovered/compiled -- TODO confirm against max.torch docs.
op_lib = CustomOpLibrary(Path(op_dir))

## Simple `add_one` Operation

In [15]:
# Look up `my_add_constant` on the op library and index it with {"value": 1}.
# Presumably this binds the op's `value` parameter to 1, yielding an
# "add one" callable -- TODO confirm against the kernel source.
add_one = op_lib.my_add_constant[{"value": 1}]

In [16]:
import time

def torch_add_one(inputs):
    """Return ``inputs + 1`` using the ``+`` operator (baseline implementation)."""
    incremented = inputs + 1
    return incremented

def mojo_add_one(inputs):
    """Run the custom ``add_one`` op on ``inputs`` and return its output tensor.

    A zero-filled tensor shaped like ``inputs`` is allocated and handed to
    ``add_one`` as the first argument, with ``inputs`` as the second; that
    tensor is then returned.  Presumably the op writes its result into the
    first argument -- TODO confirm against the kernel source.
    """
    result = torch.zeros_like(inputs)
    add_one(result, inputs)
    return result

# Time 1000 chained applications of each implementation on each device and
# print the final tensor together with the elapsed wall-clock seconds.
for device in ["cpu", "cuda"]:
    for op in [torch_add_one, mojo_add_one]:
        x = torch.zeros(1024, device=device)
        x = op(x)  # warm-up: keep one-time setup cost out of the timed region
        if device == "cuda":
            # CUDA kernels are launched asynchronously; drain pending work so
            # the warm-up does not leak into the measurement.
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(1000):
            x = op(x)
        if device == "cuda":
            # Wait for the queued kernels to actually finish; without this the
            # timer stops after launch rather than after execution.
            torch.cuda.synchronize()
        end = time.perf_counter()
        print(op.__name__, device, x, end - start)

torch_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.003918775008060038
mojo_add_one cpu tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.]) 0.20928580197505653
torch_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.007804271066561341
mojo_add_one cuda tensor([1001., 1001., 1001.,  ..., 1001., 1001., 1001.], device='cuda:0') 0.28747972194105387
