# VTA 简单入门

In [1]:
import tvm
from tvm import te
import vta
from tvm.ir.module import IRModule
from tvm.script import tir as T
import numpy as np
from vta.testing import simulator # 此处一定要有

In [2]:
# VTA environment object; its attributes (acc_dtype, inp_dtype, BATCH, BLOCK_OUT,
# acc_scope, dma_copy, alu) are read by the cells below.
env = vta.get_env()
# Local RPC session, used later to upload/load the compiled module and to obtain
# the ext_dev device context.
remote = tvm.rpc.LocalSession()

In [3]:
# Problem size: m rows of n elements each.
m = 1
n = 1024
# Seeded generator so that "Restart Kernel -> Run All" always produces the same inputs.
rng = np.random.default_rng(0)
# Random integers in [-128, 128), stored in the accumulator dtype.
A_orig = rng.integers(-128, 128, size=(m, n)).astype(env.acc_dtype)
B_orig = rng.integers(-128, 128, size=(m, n)).astype(env.acc_dtype)

为了适配 VTA 设备，需要对数据应用 packing，将 A 和 B 数组从 2D 布局转换为 4D packed 布局：

In [4]:
# Split each axis into (tile index, position inside tile), then swap the two middle
# axes so the layout becomes (batch tiles, channel tiles, BATCH, BLOCK_OUT).
tiled_shape = (m // env.BATCH, env.BATCH, n // env.BLOCK_OUT, env.BLOCK_OUT)
A_packed, B_packed = (
    flat.reshape(tiled_shape).transpose((0, 2, 1, 3)) for flat in (A_orig, B_orig)
)

In [5]:
# Output-channel tiling factor: n // BLOCK_OUT tiles of BLOCK_OUT channels each
# (64 x 16 = 1024 output channels, assuming BLOCK_OUT = 16 -- TODO confirm).
_m = n // env.BLOCK_OUT
# Batch tiling factor: m // BATCH tiles of BATCH rows each (1 x 1 = 1, since m = 1).
_o = m // env.BATCH
# Shape of the packed VTA vector data: (batch tiles, channel tiles, BATCH, BLOCK_OUT).
shape = (_o, _m, env.BATCH, env.BLOCK_OUT)

# Placeholders for the two packed input tensors.
A, B = (te.placeholder(shape, name=label, dtype=env.acc_dtype) for label in ("A", "B"))

# Element-wise copies of A and B; the schedule below assigns them to env.acc_scope.
A_buf = te.compute(shape, lambda *idx: A[idx], name="A_buf")
B_buf = te.compute(shape, lambda *idx: B[idx], name="B_buf")

# The addition itself, tagged further down with the env.alu pragma.
C_buf = te.compute(
    shape,
    lambda *idx: A_buf[idx].astype(env.acc_dtype) + B_buf[idx].astype(env.acc_dtype),
    name="C_buf",
)
# Cast the sum to env.inp_dtype; this stage is the output tensor of the kernel.
C = te.compute(shape, lambda *idx: C_buf[idx].astype(env.inp_dtype), name="C")

s = te.create_schedule(C.op)

# Put the three intermediate buffers into the accumulator scope.
for on_chip in (A_buf, B_buf, C_buf):
    s[on_chip].set_scope(env.acc_scope)

# Tag the copies into A_buf/B_buf and the final stage C with the DMA-copy pragma.
for moved in (A_buf, B_buf, C):
    s[moved].pragma(s[moved].op.axis[0], env.dma_copy)

# Tag the addition stage with the ALU pragma.
s[C_buf].pragma(s[C_buf].op.axis[0], env.alu)

In [6]:
# Lower the final schedule and print the resulting IR for inspection.
lowered = tvm.lower(s, [A, B, C], simple_mode=True)
lowered.show()

In [7]:
from tvm.topi.utils import get_const_tuple

# Build the scheduled kernel for the external-device target.
target = "ext_dev"
my_vadd = vta.build(s, [A, B, C], target=target, name="my_vadd")

# Write the object file into a temporary directory, push it through the RPC
# session and load it back as a callable module.
temp = tvm.contrib.utils.tempdir()
obj_path = temp.relpath("vadd.o")
my_vadd.save(obj_path)
remote.upload(obj_path)
f = remote.load_module("vadd.o")

# Device arrays: the two packed inputs plus an empty output with C's shape and dtype.
ctx = remote.ext_dev(0)
A_nd, B_nd = (tvm.nd.array(packed, ctx) for packed in (A_packed, B_packed))
C_nd = tvm.nd.empty(get_const_tuple(C.shape), C.dtype, ctx)
f(A_nd, B_nd, C_nd)

# NumPy reference: add in the accumulator dtype, cast to C's dtype, then apply
# the same 2D -> 4D packing as the inputs before comparing.
packed_dims = (m // env.BATCH, env.BATCH, n // env.BLOCK_OUT, env.BLOCK_OUT)
C_ref = np.add(A_orig.astype(env.acc_dtype), B_orig.astype(env.acc_dtype)).astype(C.dtype)
C_ref = C_ref.reshape(packed_dims).transpose((0, 2, 1, 3))
np.testing.assert_equal(C_ref, C_nd.numpy())
print("ALU 加法测试成功！")

2023-04-17 09:09:29.921 INFO load_module /tmp/tmpe_ff8p1d/vadd.o


ALU 加法测试成功！


In [8]:
# Collect the simulator's data-movement counters for the ALU kernel.
# The counters are cleared right before the timed call and read right after it, so
# they cover every invocation made by the evaluator (number=20, presumably plus a
# warm-up run -- TODO confirm), not a single kernel launch.
time_f = f.time_evaluator(f.entry_name, ctx, number=20)
simulator.clear_stats()
cost = time_f(A_nd, B_nd, C_nd)
stats = simulator.stats()



In [9]:
# Display the counters gathered by the simulator during the timed runs.
stats

{'inp_load_nbytes': 0,
 'wgt_load_nbytes': 0,
 'acc_load_nbytes': 172032,
 'uop_load_nbytes': 84,
 'out_store_nbytes': 21504,
 'gemm_counter': 0,
 'alu_counter': 1344}

In [1]:
# alu_counter (1344) * 16 reproduces out_store_nbytes (21504) from the stats output;
# 16 is presumably the number of bytes stored per ALU operation -- TODO confirm.
1344 * 16

21504

In [4]:
# acc_load_nbytes (172032) / 64 = 2688, i.e. twice alu_counter (1344);
# 64 is presumably the number of bytes per accumulator load, with one load each
# for A and B per ALU operation -- TODO confirm.
172032/64

2688.0

In [5]:
from tvm import relay
from tvm.ir import IRModule

# Two untyped Relay variables and the expression that adds them.
data, bias = (relay.var(label) for label in ("data", "bias"))
add_op = relay.add(data, bias)

In [12]:
# Start from an empty module and register the addition as the global function AddFunc.
mod = IRModule()
mod["AddFunc"] = relay.Function([data, bias], add_op)
add_gvar = mod.get_global_var("AddFunc")

# Variables that become the parameters of main.
a, b, c = (relay.var(label) for label in "abc")

# main(a, b, c) = AddFunc(c, AddFunc(a, b)), expressed as two calls to the global function.
add_01 = relay.Call(add_gvar, [a, b])
add_012 = relay.Call(add_gvar, [c, add_01])
mod["main"] = relay.Function([a, b, c], add_012)
print(mod)

def @AddFunc(%data, %bias) {
  add(%data, %bias)
}

def @main(%a, %b, %c) {
  %0 = @AddFunc(%a, %b);
  @AddFunc(%c, %0)
}

