# TVM 张量表达式

In [1]:
import numpy as np
import tvm
from tvm import te

In [2]:
# Element-wise vector addition C[i] = A[i] + B[i] over a fixed length of 1024.
# (A symbolic length would instead be declared with `n = te.var("n")`.)
n = tvm.runtime.convert(1024)
vec_shape = (n,)
A = te.placeholder(vec_shape, name="A")
B = te.placeholder(vec_shape, name="B")
C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

s = te.create_schedule(C.op)
stage = s[C]
(i_axis,) = C.op.axis

# NOTE(review): this marks the un-split axis as parallel, but the same axis is
# split right below and `outer` is marked parallel again — this first call looks
# superseded; confirm before removing it.
stage.parallel(i_axis)

# This factor should be chosen to match the number of threads appropriate for
# your CPU. That varies with the architecture, but a good rule of thumb is to
# set it equal to the number of available CPU cores.
factor = 16
outer, inner = stage.split(i_axis, factor=factor)
stage.parallel(outer)
stage.vectorize(inner)

# Print the lowered IR for the scheduled computation.
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

In [3]:
# Build the scheduled vector add for an LLVM target and run it on device 0.
tgt = tvm.target.Target(target="llvm", host="llvm")
dev = tvm.device(tgt.kind.name, 0)

# Random inputs and a zeroed output buffer, using the dtypes of the TE tensors.
n = 1024
a_host = np.random.uniform(size=n).astype(A.dtype)
a = tvm.nd.array(a_host, dev)
b_host = np.random.uniform(size=n).astype(B.dtype)
b = tvm.nd.array(b_host, dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

func = tvm.build(s, [A, B, C], tgt, name="myadd")
func(a, b, c)
expected = a.numpy() + b.numpy()
np.testing.assert_allclose(c.numpy(), expected)

# Time the compiled entry point (10 runs per measurement).
evaluator = func.time_evaluator(func.entry_name, dev, number=10)
evaluator(a, b, c)

# Round-trip through a shared library: export, reload, and re-check the result.
temp = tvm.contrib.utils.tempdir()
lib_path = temp.relpath("myadd_pack.so")
func.export_library(lib_path)
f = tvm.runtime.load_module(lib_path)
f(a, b, c)
np.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

## 矩阵乘法

In [10]:
import numpy as np

# Problem size: A is (M, K), B is (K, N), C is (M, N).
M, K, N = 256, 1024, 512

# The default tensor data type in tvm
dtype = "float32"

# Matrix multiplication expressed with TE: C[x, y] = sum_k A[x, k] * B[k, y].
k = te.reduce_axis((0, K), "k")
A = te.placeholder((M, K), name="A")
B = te.placeholder((K, N), name="B")
C = te.compute(
    (M, N),
    lambda x, y: te.sum(A[x, k] * B[k, y], axis=k),
    name="C",
)

# Build with the default (untransformed) schedule for an LLVM target.
target = tvm.target.Target(target="llvm", host="llvm")
dev = tvm.device(target.kind.name, 0)
s = te.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name="mmult")

# Random operands and a zeroed output buffer on the chosen device.
a_host = np.random.rand(M, K).astype(dtype)
a = tvm.nd.array(a_host, dev)
b_host = np.random.rand(K, N).astype(dtype)
b = tvm.nd.array(b_host, dev)
c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)

# Run once and compare against NumPy's result.
func(a, b, c)
answer = np.dot(a.numpy(), b.numpy())
np.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)

# Benchmark the compiled entry point (10 runs per measurement).
evaluator = func.time_evaluator(func.entry_name, dev, number=10)
evaluator(a, b, c)

BenchmarkResult(min=0.2685959028, mean=0.2685959028, median=0.2685959028, max=0.2685959028, std=0.0, results=(0.2685959028,))

In [13]:
# Re-declare the TE matrix multiplication so the schedule experiments below
# start from fresh tensors: C[x, y] = sum over k of A[x, k] * B[k, y].
k = te.reduce_axis((0, K), "k")
A = te.placeholder((M, K), name="A")
B = te.placeholder((K, N), name="B")
C = te.compute(
    (M, N),
    lambda x, y: te.sum(A[x, k] * B[k, y], axis=k),
    name="C",
)

默认调度:

In [14]:
# Default schedule: no transformations applied; print the lowered IR.
s = te.create_schedule(C.op)
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

循环分块:

In [17]:
# Loop tiling: tile the two output axes of C with factors 32 and 16.
s = te.create_schedule(C.op)
row_axis, col_axis = C.op.axis
xo, yo, xi, yi = s[C].tile(row_axis, col_axis, x_factor=32, y_factor=16)
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

分割归约轴:

In [26]:
# Tile the output axes, then split the reduction axis by a factor of 16.
s = te.create_schedule(C.op)
stage = s[C]
row_axis, col_axis = C.op.axis
xo, yo, xi, yi = stage.tile(row_axis, col_axis, 32, 16)
k = stage.op.reduce_axis[0]
ko, ki = stage.split(k, factor=16)
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

重排轴顺序:

In [27]:
# Same tiling and reduction split as before, followed by an explicit loop order.
s = te.create_schedule(C.op)
stage = s[C]
row_axis, col_axis = C.op.axis
xo, yo, xi, yi = stage.tile(row_axis, col_axis, 32, 16)
k = stage.op.reduce_axis[0]
ko, ki = stage.split(k, factor=16)
# Order passed to reorder: outer tiles, both reduction parts, then inner tiles.
loop_order = (xo, yo, ko, ki, xi, yi)
stage.reorder(*loop_order)
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

向量化:

In [28]:
# Tiling + reduction split + reorder as before, then vectorize the yi axis.
s = te.create_schedule(C.op)
stage = s[C]
row_axis, col_axis = C.op.axis
xo, yo, xi, yi = stage.tile(row_axis, col_axis, 32, 16)
k = stage.op.reduce_axis[0]
ko, ki = stage.split(k, factor=16)
stage.reorder(xo, yo, ko, ki, xi, yi)
# yi is the last axis in the reorder call above and is the one vectorized.
stage.vectorize(yi)
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

数组打包:

In [30]:
# Array packing: the algorithm itself has to be rewritten so that C reads B
# through a re-laid-out copy, packedB[x, y, z] = B[y, x * bn + z].
bn = 32
# Use floor division for the leading extent: `N / bn` is true division in
# Python 3 and would hand te.compute a float (16.0) instead of an integer.
packedB = te.compute((N // bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
# `k` is the reduction axis left bound by the earlier cells.
C = te.compute(
    (M, N),
    lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
    name="C",
)

s = te.create_schedule(C.op)

# Tile both output axes by bn and split the reduction axis by 4.
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
(k,) = s[C].op.reduce_axis
ko, ki = s[C].split(k, factor=4)

s[C].reorder(xo, yo, ko, xi, ki, yi)
s[C].vectorize(yi)

# Schedule the packing stage itself: vectorize its last axis and parallelize
# its first axis.
x, y, z = s[packedB].op.axis
s[packedB].vectorize(z)
s[packedB].parallel(x)

# Here is the generated IR after array packing.
tvm.lower(s, [A, B, C], simple_mode=True).show()

In [8]:
# from vta.testing import simulator
# simulator.clear_stats()
# cost = evaluator(a, b, c)
# stats = simulator.stats()