# VTA

In [1]:
import set_env

import os
import tvm
from tvm import te
import vta
import numpy as np

In [2]:
# Fetch the current VTA environment object; its attributes (TARGET, BATCH,
# BLOCK_OUT, acc_dtype, ...) parameterize the rest of this notebook.
# NOTE: `env` is rebound in the next cell after TARGET is overridden to 'sim'.
env = vta.get_env()

In [3]:
import json
# Locate the vta_config.json shipped under the VTA hardware path and parse it.
config_path = f'{vta.environment.get_vta_hw_path()}/config/vta_config.json'
with open(config_path) as fp:
    cfg = json.load(fp)

# Override the target so that everything below runs against the simulator,
# then install the modified configuration as the current VTA environment.
cfg['TARGET'] = 'sim'
vta.environment.Environment.current = vta.environment.Environment(cfg)

# Re-fetch the environment and display its TVM target as the cell output.
env = vta.get_env()
env.target

ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17

In [4]:
# 需要 TVM RPC 模块和 VTA 仿真器模块
from tvm import rpc
from tvm.contrib import utils
from vta.testing import simulator

# Read the Pynq RPC host IP address and port number from the OS environment
# (falling back to the defaults below when the variables are not set).
host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_RPC_PORT", "9091"))

if env.TARGET in ("pynq", "de10nano"):
    # On a physical board, configure the bitstream and the runtime system so
    # that they match the VTA configuration specified by vta_config.json.

    # Make sure that TVM was compiled with RPC=1
    assert tvm.runtime.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

elif env.TARGET in ("sim", "tsim", "intelfocl"):
    # In simulation mode, host the RPC server locally.
    remote = rpc.LocalSession()

    if env.TARGET == "intelfocl":
        # Program the intelfocl aocx
        vta.program_fpga(remote, bitstream="vta.bitstream")

else:
    # Fail here rather than leaving `remote` undefined (or stale from an
    # earlier run of this cell), which would only surface later as a
    # confusing error when the module is uploaded.
    raise ValueError(
        f"Unsupported VTA target {env.TARGET!r}; this notebook handles "
        "'pynq', 'de10nano', 'sim', 'tsim' and 'intelfocl'"
    )

In [5]:
# Output channel factor m - total 64 x 16 = 1024 output channels
m = 64
# Batch factor o - total 1 x 1 = 1
o = 1

# Both operands share the same tiled data layout.
tiled_shape = (o, m, env.BATCH, env.BLOCK_OUT)
# Placeholder tensor A in tiled data format
A = te.placeholder(tiled_shape, dtype=env.acc_dtype, name="A")
# Placeholder tensor B in tiled data format
B = te.placeholder(tiled_shape, dtype=env.acc_dtype, name="B")

In [6]:
# Element-wise copy of A into an intermediate buffer
A_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *idx: A(*idx),
    name="A_buf",
)
# Element-wise copy of B into an intermediate buffer
B_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *idx: B(*idx),
    name="B_buf",
)

In [7]:
# Describe the in-VTA vector addition: both buffered operands are cast to the
# accumulator dtype and summed element by element.
C_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *idx: (
        A_buf(*idx).astype(env.acc_dtype)
        + B_buf(*idx).astype(env.acc_dtype)
    ),
    name="C_buf",
)

In [8]:
# Cast the accumulated result to the output type, and send it to main memory
C = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *idx: C_buf(*idx).astype(env.inp_dtype),
    name="C",
)

In [9]:
# Create the default schedule for the output operation
s = te.create_schedule(C.op)

# Lower it as-is and print the resulting IR for inspection
default_ir = tvm.lower(s, [A, B, C], simple_mode=True)
print(default_ir)

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(int32), int32, [1024], []),
             B: Buffer(B_2: Pointer(int32), int32, [1024], []),
             C: Buffer(C_2: Pointer(int8), int8, [1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  allocate(A_buf: Pointer(global int32), int32, [1024]), storage_scope = global;
  allocate(B_buf: Pointer(global int32), int32, [1024]), storage_scope = global {
    for (i1: int32, 0, 64) {
      for (i3: int32, 0, 16) {
        let cse_var_1: int32 = ((i1*16) + i3)
        A_buf_1: Buffer(A_buf, int32, [1024], [])[cse_var_1] = A[cse_var_1]
      }
    }
    for (i1_1: int32, 0, 64) {
      for (i3_1: int32, 0, 16) {
        let cse_var_2: int32 = ((i1_1*16) + i3_1)
        B_buf_1: Buffer(B_buf, int32, [1024], [])[cse_var_2] = B[cse_var_2]
      }
    }
    for (i1_2: int32, 0, 64) {
      for (i3_2: int32, 0

In [10]:
# Place the two input copy buffers in the accumulator scope
for input_buf in (A_buf, B_buf):
    s[input_buf].set_scope(env.acc_scope)

# Same for the result buffer; kept as the cell's final expression so the
# notebook still displays the returned C_buf stage.
s[C_buf].set_scope(env.acc_scope)

stage(C_buf, compute(C_buf, body=[(A_buf[i0, i1, i2, i3] + B_buf[i0, i1, i2, i3])], axis=[iter_var(i0, range(min=0, ext=1)), iter_var(i1, range(min=0, ext=64)), iter_var(i2, range(min=0, ext=1)), iter_var(i3, range(min=0, ext=16))], reduce_axis=[], tag=, attrs={}))

In [11]:
# Mark the outermost axis of each copy stage (loads of A and B, store of C)
# with the DMA pragma so that the copy loop is mapped to a DMA transfer
for copy_tensor in (A_buf, B_buf, C):
    copy_stage = s[copy_tensor]
    copy_stage.pragma(copy_stage.op.axis[0], env.dma_copy)

In [12]:
# Tag the outermost axis of the addition stage with the ALU pragma, telling
# TVM that this computation has to be performed on VTA's vector ALU
alu_axis = C_buf.op.axis[0]
s[C_buf].pragma(alu_axis, env.alu)

# Lower the finalized schedule through VTA and print the resulting IR
final_ir = vta.lower(s, [A, B, C], simple_mode=True)
print(final_ir)

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(int32), int32, [1024], []),
             B: Buffer(B_2: Pointer(int32), int32, [1024], []),
             C: Buffer(C_2: Pointer(int8), int8, [1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
    @tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), A_2, 0, 64, 1, 64, 0, 0, 0, 0, 0, 3, dtype=int32)
    @tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), B_2, 0, 64, 1, 64, 0, 0, 0, 0, 64, 3, dtype=int32)
    attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushALUOp" {
      @tir.call_extern("VTAUopLoopBegin", 64, 1, 1, 0, dtype=int32)
      @tir.vta.uop_push(1, 0, 0,

In [13]:
# Compile the schedule for the "ext_dev" target, using the environment's
# target host for the host side, into a module named "my_vadd".
vadd_target = tvm.target.Target("ext_dev", host=env.target_host)
my_vadd = vta.build(s, [A, B, C], vadd_target, name="my_vadd")

In [14]:
# Save the compiled module as an object file inside a temporary directory
temp = utils.tempdir()
vadd_obj_path = temp.relpath("vadd.o")
my_vadd.save(vadd_obj_path)

# Ship that object file to the remote side over RPC
remote.upload(vadd_obj_path)

In [15]:
# Load the object file that was uploaded as "vadd.o" through the `remote`
# session; the returned module `f` is invoked later with the A/B/C arrays.
f = remote.load_module("vadd.o")

In [16]:
# Get the remote device context
ctx = remote.ext_dev(0)

# Use a seeded generator so the random test vectors are identical on every
# Restart & Run All, making any mismatch reproducible.
SEED = 0
rng = np.random.default_rng(SEED)

# Initialize the A and B arrays randomly in the half-open int range
# [-128, 128), i.e. -128 is included and 128 is excluded.
flat_shape = (o * env.BATCH, m * env.BLOCK_OUT)
A_orig = rng.integers(-128, 128, size=flat_shape).astype(A.dtype)
B_orig = rng.integers(-128, 128, size=flat_shape).astype(B.dtype)

# Apply packing to the A and B arrays from a 2D to a 4D packed layout:
# (o * BATCH, m * BLOCK_OUT) -> (o, BATCH, m, BLOCK_OUT) -> (o, m, BATCH, BLOCK_OUT)
A_packed = A_orig.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
B_packed = B_orig.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))

# Format the input/output arrays with tvm.nd.array to the DLPack standard.
# The output buffer is allocated directly in C's dtype (no float64 temporary).
A_nd = tvm.nd.array(A_packed, ctx)
B_nd = tvm.nd.array(B_packed, ctx)
C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT), dtype=C.dtype), ctx)

# Invoke the module to perform the computation; the result is written to C_nd
f(A_nd, B_nd, C_nd)

In [17]:
# Reference result with numpy: add in the accumulator dtype, then narrow
# to the output dtype, exactly as the VTA computation is described.
acc_sum = A_orig.astype(env.acc_dtype) + B_orig.astype(env.acc_dtype)
C_ref = acc_sum.astype(C.dtype)

# Bring the 2D reference into the same 4D packed layout as the VTA output
C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))

# Raises AssertionError on any element mismatch
np.testing.assert_equal(C_ref, C_nd.numpy())
print("Successful vector add test!")

Successful vector add test!
