In [10]:
!cat ./mlir_tensor_lib.spy

from mlir import MLIR_Type, MLIR_op, MLIR_asm


# Define dynamically sized 1D tensor in MLIR.
# `dtype` is the element type; it is passed to MLIR_Type together with the
# "tensor<?x{}>" template, where `?` marks the single dimension as dynamic.
# NOTE(review): presumably MLIR_Type substitutes `dtype` for `{}` -- confirm in `mlir`.
@blue.generic
def MLIR_tensor_1d(dtype: MLIR_Type):
    return MLIR_Type("tensor<?x{}>", dtype)

# Define dynamically sized 1D memref in MLIR.
# `dtype` is the element type; it is passed to MLIR_Type together with the
# "memref<?x{}>" template, where `?` marks the single dimension as dynamic.
# NOTE(review): presumably MLIR_Type substitutes `dtype` for `{}` -- confirm in `mlir`.
@blue.generic
def MLIR_memref_1d(dtype: MLIR_Type):
    return MLIR_Type("memref<?x{}>", dtype)


# Define a Pythonic wrapper to define the binary operations
@blue.generic
def make_tensor_type(DTYPE: MLIR_Type):
    # Define the lower level MLIR type of the tensor
    T = MLIR_tensor_1d[DTYPE]

    T_index = MLIR_Type("index")
    # MLIR_asm are directly inline MLIR asm.
    mlir_zero_index = MLIR_asm("arith.constant {value=0:index}", T_index, ())
    mlir_tensor_dim = MLIR_asm("tensor.dim", T_index, (T, T_index))
    mlir_tensor_empty = MLIR_asm("tensor.empty", T, (T_index,))
    # linalg.add is more complicated as it has regions
    # so using MLIR_op uses the MLIR python binding to fill in all the necessary
    # attributes.
    mlir_linalg_add = MLIR_

Compile the SPy file

In [11]:
# Assigning a `!` shell command to a name captures its output in `stdout`
# instead of displaying it, so compiler messages are not shown in this cell.
stdout = !python -m nbcc.compiler -shared ./mlir_tensor_lib.spy mlir_tensor_lib.so

Disassemble the compiled file.

Note: the implementation is SIMD-vectorized.

In [12]:
# `-d` asks llvm-objdump to disassemble the code in the compiled library.
!llvm-objdump -d mlir_tensor_lib.so


mlir_tensor_lib.so:	file format mach-o arm64

Disassembly of section __TEXT,__text:

00000000000003b8 <_spy_mlir_tensor_lib$export_tensor_f64_add>:
     3b8: a9bd57f6     	stp	x22, x21, [sp, #-0x30]!
     3bc: a9014ff4     	stp	x20, x19, [sp, #0x10]
     3c0: a9027bfd     	stp	x29, x30, [sp, #0x20]
     3c4: aa0603f4     	mov	x20, x6
     3c8: aa0303f5     	mov	x21, x3
     3cc: aa0103f3     	mov	x19, x1
     3d0: d37df068     	lsl	x8, x3, #3
     3d4: 91010100     	add	x0, x8, #0x40
     3d8: 9400013a     	bl	0x8c0 <_malloc+0x8c0>
     3dc: 9100fc08     	add	x8, x0, #0x3f
     3e0: 927ae501     	and	x1, x8, #0xffffffffffffffc0
     3e4: 937ffea8    

Load the compiled library with `dlopen` (via `ctypes`) so the function can be called from this notebook.

In [None]:
import ctypes

# Open the shared library produced by the compile step.
lib = ctypes.CDLL('./mlir_tensor_lib.so')

# The exported symbol contains `$`, which is not valid in a Python identifier,
# so it has to be looked up with getattr rather than attribute syntax.
# (The matching `..._add` export can be fetched the same way:
#  getattr(lib, "_mlir_ciface_spy_mlir_tensor_lib$export_tensor_f64_add").)
ARRAYEXPR_SYMBOL = "_mlir_ciface_spy_mlir_tensor_lib$export_tensor_f64_arrayexpr"
tensor_arrayexpr = getattr(lib, ARRAYEXPR_SYMBOL)

In [4]:
from ctypes import CDLL, c_double, byref
from mlir.runtime import make_nd_memref_descriptor, get_ranked_memref_descriptor, ranked_memref_to_numpy
import numpy as np


In [5]:

NELEM = 200  # number of float64 elements in each input array

# Descriptor type for a rank-1 memref of doubles; instantiated later to
# receive the kernel's result.
memref_1d_f64 = make_nd_memref_descriptor(1, c_double)

# Use three *different* inputs of the same dtype and length. When A, B and C
# hold identical values, (A + B) * (C + B) gives the same answer no matter
# how the operands are permuted, so the correctness check further down
# could not catch a kernel that mixes up its arguments.
A = np.arange(NELEM, dtype=np.float64) / NELEM
B = A + 1.0
C = 2.0 * A + 3.0

In [7]:

# Wrap each NumPy input in a ranked memref descriptor.
argA, argB, argC = (get_ranked_memref_descriptor(arr) for arr in (A, B, C))

# Calling convention used here: the result descriptor goes first (the
# one-element ctypes array is passed as a pointer), followed by a reference
# to each input descriptor.
out_memref = (memref_1d_f64 * 1)()
args = [out_memref] + [byref(desc) for desc in (argA, argB, argC)]
tensor_arrayexpr(*args)

# Convert the result descriptor to a NumPy array and check it against the
# same expression evaluated by NumPy.
output = ranked_memref_to_numpy(out_memref)
np.testing.assert_allclose(output, (A + B) * (C + B))


In [8]:
# Baseline: time NumPy evaluating the expression used in the correctness check.
%timeit (A + B) * (C + B)

974 ns ± 1.08 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [9]:
out_memref = (memref_1d_f64 * 1)()
args = [out_memref, byref(argA), byref(argB), byref(argC)]

%timeit tensor_arrayexpr(*args)

350 ns ± 3.08 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
