In [1]:
import onnx

import glob
import os
import numpy as np
import time

import tvm
from tvm import relay, auto_scheduler
from tvm.contrib import graph_executor
from tvm.auto_scheduler.utils import request_remote
from tvm.contrib import utils, ndk

from tvm import te
from tvm import relay
from tvm.relay.testing.darknet import __darknetffi__

In [2]:
# RPC tracker connection settings.
# Replace device_key with the device key registered in your tracker.
device_key = "v9h"
rpc_host, rpc_port = "192.168.105.70", 9190

# Echo the settings so the saved notebook records which device was used.
print("device:", device_key)
print("rpc_host: %s:%s" % (rpc_host, rpc_port))

device: v9h
rpc_host: 192.168.105.70:9190


In [3]:
# Tuning switches.
# The "turn_" prefix is kept as-is because later cells read these exact names.
turn_enable = False          # True: extract tasks and run the auto-scheduler search
turn_trials = 4000           # passed as num_measure_trials to TuningOptions
preload_log_file = False     # True: seed the task scheduler from an earlier log file
load_local_log_file = False  # True: compile with a hard-coded log instead of this run's log

# Set this to True if you use ndk tools for cross compiling
# (selects the "ndk" build function of the LocalBuilder).
use_ndk = False
# Path to cross compiler
# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"

In [4]:
# Neural network selection and compilation target.
# The model-loading cell handles "mobilenet", "resnet50", "op9_dla" and "yolov3".
network = "yolov3"
# network = "op9_dla"
# network = "mobilenet"

batch_size, dtype = 1, "float32"

# Tensor layout; choosing "NHWC" activates the layout-conversion cell.
layout = "NCHW"
# layout = "NHWC"

# "aarch64" builds for the ARM CPU; "opencl" builds OpenCL kernels with an ARM CPU host.
target_type = "aarch64"
# target_type = "opencl"

In [18]:
# Load the selected network and convert it to a Relay module.
# Produces: relay_model, params, input_name, input_shape, shape_dict.

# ONNX networks: name -> (model path, graph input name, trailing two input dims).
# NOTE(review): 244x244 for mobilenet/resnet50 looks like a typo for the common
# 224x224 input size — confirm against the .onnx files before changing it.
onnx_models = {
    "mobilenet": ("../mobilenet/mobilenetv2-7.onnx", "input", (244, 244)),
    "resnet50": ("../resnet50/resnet50.onnx", "input", (244, 244)),
    "op9_dla": ("../op9_dla/20210622_320_192_op9_dla.onnx", "input.1", (320, 192)),
}

if network in onnx_models:
    onnx_path, input_name, spatial_dims = onnx_models[network]
    tune_model = onnx.load(onnx_path)
    input_shape = (batch_size, 3) + spatial_dims
    shape_dict = {input_name: input_shape}
    print("shape_dict: ", shape_dict)
    relay_model, params = relay.frontend.from_onnx(tune_model, shape_dict)
elif network == "yolov3":
    cfg_path = "../yolov3/yolov3.cfg" # cfg path
    weights_path = "../yolov3/yolov3.weights"# weights path
    lib_path = "../yolov3/libdarknet2.0.so" # lib path
    DARKNET_LIB = __darknetffi__.dlopen(lib_path)
    print(DARKNET_LIB)
    net = DARKNET_LIB.load_network(cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0)
    print(net)
    # `data` is only used for its shape; the input dimensions come from the darknet network.
    data = np.empty([batch_size, net.c, net.h, net.w], dtype)
    input_name = "data"
    input_shape = data.shape
    shape_dict = {input_name: input_shape}
    print(shape_dict)
    print(net.layers[net.n - 1].classes)
    print("Converting darknet to relay function...")
    relay_model, params = relay.frontend.from_darknet(net, dtype=dtype, shape=data.shape)
else:
    # Fail loudly instead of leaving relay_model undefined (or stale from an earlier run).
    raise ValueError(
        "Unsupported network %r; expected one of %s"
        % (network, sorted(list(onnx_models) + ["yolov3"]))
    )

<cffi.api._make_ffi_library.<locals>.FFILibrary object at 0x7f48d9da39b0>
<cdata 'network *' 0x20254040>
{'data': (1, 3, 416, 416)}
80
Converting darknet to relay function...


In [6]:
# Optional graph rewrite: run nn.conv2d in NHWC instead of NCHW.
if layout == 'NHWC':
    desired_layouts = {'nn.conv2d': ['NHWC', 'default']}

    # RemoveUnusedFunctions cleans up the graph before ConvertLayout rewrites it.
    seq = tvm.transform.Sequential(
        [
            relay.transform.RemoveUnusedFunctions(),
            relay.transform.ConvertLayout(desired_layouts),
        ]
    )

    # Apply the pass pipeline; relay_model is replaced by the converted module.
    with tvm.transform.PassContext(opt_level=3):
        relay_model = seq(relay_model)

    print(relay_model)

In [7]:
# Build the TVM target and derive the tuning-log file name for this run.

# ARM CPU target string, used directly for "aarch64" and as the host for "opencl".
arm_cpu_target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon"

if target_type == "aarch64":
    target = tvm.target.Target(arm_cpu_target)
elif target_type == "opencl":
    target = tvm.target.Target("opencl", host=arm_cpu_target)
else:
    # Fail loudly instead of leaving `target` undefined (or stale from an earlier run).
    raise ValueError("Unsupported target_type %r; expected 'aarch64' or 'opencl'" % (target_type,))

# Log name encodes network, layout, batch size, target kind, trial count and a
# minute-resolution local timestamp, so each run writes to its own file.
log_file = "%s-%s-B%d-%s-C%s-T%s.json" % (
    network,
    layout,
    batch_size,
    target.kind.name,
    turn_trials,
    time.strftime('%y-%m-%d-%H-%M', time.localtime(time.time())),
)
print("log file:", log_file)

log file: yolov3-NCHW-B1-llvm-C4000-T21-08-17-08-50.json


In [9]:
# Disabled helper: connects to the remote board and prints the properties of its
# OpenCL device. The tuning cell below hard-codes max_shared_memory_per_block,
# max_threads_per_block and warp_size rather than reading them from `dev`.
# remote = request_remote(device_key, rpc_host, rpc_port)
# dev = remote.cl()
# print("device_name:", dev.device_name)
# print("compute_version:", dev.compute_version)
# print("max_clock_rate:", dev.max_clock_rate)
# print("multi_processor_count:", dev.multi_processor_count)
# print("max_thread_dimensions:", dev.max_thread_dimensions)
# max_shared_memory_per_block = dev.max_shared_memory_per_block
# print("max_shared_memory_per_block:", max_shared_memory_per_block)
# max_threads_per_block = dev.max_threads_per_block
# print("max_threads_per_block:", max_threads_per_block)
# warp_size = dev.warp_size
# print("warp_size: ", warp_size)

In [9]:
# Extract tuning tasks from the Relay module and build the tuning options.
# Everything here (including the task listing) runs only when tuning is enabled,
# so the cell is a no-op instead of a NameError when turn_enable is False.
if turn_enable:
    # Keyword arguments of the RPC runner that are common to every target.
    runner_kwargs = dict(
        host=rpc_host,
        port=rpc_port,
        repeat=1,
        timeout=30,
        min_repeat_ms=200,
    )

    if target_type == "opencl":
        # Hand-written hardware description of the OpenCL device.
        max_shared_memory_per_block = 4096
        print("max_shared_memory_per_block:", max_shared_memory_per_block)
        max_threads_per_block = 512
        print("max_threads_per_block:", max_threads_per_block)
        warp_size = 2
        print("warp_size: ", warp_size)
        # There is no explicit local memory limitation on this device.
        # NOTE(review): the original comment suggested INT32_MAX to disable the
        # local-memory check, but the value actually used is 4096000.
        max_local_memory_per_block = 4096000
        print("max_local_memory_per_block:", max_local_memory_per_block)
        max_vthread_extent = 2 #int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size
        print("max_vthread_extent:", max_vthread_extent)
        num_cores = 2
        print("number of cores:", num_cores)
        vector_unit_bytes = 16
        print("vector unit bytes:", vector_unit_bytes)
        cache_line_bytes = 64
        print("cache line bytes:", cache_line_bytes)
        hardware_params = auto_scheduler.HardwareParams(num_cores,
                                                        vector_unit_bytes,
                                                        cache_line_bytes,
                                                        max_shared_memory_per_block,
                                                        max_local_memory_per_block,
                                                        max_threads_per_block,
                                                        max_vthread_extent,
                                                        warp_size)

        tasks, task_weights = auto_scheduler.extract_tasks(relay_model["main"], params, target,
                                                           hardware_params=hardware_params)
    elif target_type == "aarch64":
        tasks, task_weights = auto_scheduler.extract_tasks(relay_model["main"], params, target)
        # Only the CPU configuration asks the runner to flush the CPU cache.
        runner_kwargs["enable_cpu_cache_flush"] = True
    else:
        raise ValueError("Unsupported target_type %r; expected 'aarch64' or 'opencl'" % (target_type,))

    # Built once for both targets; only the runner keyword arguments differ.
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=turn_trials,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.RPCRunner(device_key, **runner_kwargs),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    # List every extracted task together with its compute DAG.
    for idx, task in enumerate(tasks):
        print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

Get devices for measurement successfully!
placeholder = PLACEHOLDER [1, 256, 52, 52]
PadInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
data_vec(n, h, w, ci, vh, vw) = PadInput[n, ci, ((h*4) + vh), ((w*2) + vw)]
placeholder = PLACEHOLDER [255, 256, 1, 1]
kernel_vec(co, ci, kh, kw, vc) = placeholder[((co*3) + vc), ci, kh, kw]
conv(n, co, h, w, vh, vw, vc) += (data_vec[n, h, w, ci, (vh + kh), (vw + kw)]*kernel_vec[co, ci, kh, kw, vc])
output_unpack(n, co, h, w) = conv[n, floordiv(co, 3), floordiv(h, 4), floordiv(w, 2), floormod(h, 4), floormod(w, 2), floormod(co, 3)]
placeholder = PLACEHOLDER [1, 255, 1, 1]
T_add(ax0, ax1, ax2, ax3) = (output_unpack[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, 0, 0])

placeholder = PLACEHOLDER [1, 128, 52, 52]
PadInput(i0, i1, i2, i3) = tir.if_then_else(((((i2 >= 1) && (i2 < 53)) && (i3 >= 1)) && (i3 < 53)), placeholder[i0, i1, (i2 - 1), (i3 - 1)], 0f)
data_vec(n, h, w, ci, vh, vw) = PadInput[n, ci, (h + vh), ((w*4) + vw)]
placeholder = PLACEHOLDER [2

In [10]:
# Run the auto-scheduler search over the extracted tasks (only when tuning is enabled).
if turn_enable:
    print("Begin tuning...")

    # Optional extra arguments for the task scheduler.
    scheduler_kwargs = {}
    if preload_log_file:
        # Seed the scheduler from a previously recorded log.
        load_log_file = "mobilenet-NCHW-B1-opencl-C3000-T21-08-11-21-39.json"
        print("preload file:", load_log_file)
        scheduler_kwargs["load_log_file"] = load_log_file

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights, **scheduler_kwargs)
    tuner.tune(tune_option)

Begin tuning...
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |            - |              - |      0 |
|    1 |            - |              - |      0 |
|    2 |            - |              - |      0 |
|    3 |            - |              - |      0 |
|    4 |            - |              - |      0 |
|    5 |            - |              - |      0 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |            - |              - |      0 |
|   16 |            - |              - |      0 |
|   17 |            - |           



|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |            - |              - |      0 |
|    2 |            - |              - |      0 |
|    3 |            - |              - |      0 |
|    4 |            - |              - |      0 |
|    5 |            - |              - |      0 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |            - |              - |      0 |
|   16 |            - |              - |      0 |
|   17 |            - |              - |      0 |


......******
......******
......******
......******
......T*****
.....T.T****
......******
......T*****
.....T.T****
....T***
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |      102.845 |          15.52 |     64 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |       24.240 |           7.32 |     64 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |            - | 

......***T***
......***T***
......******
......******
......*T*****
......******
......******
......**T****
......******
......******
....T***
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |      102.845 |          15.52 |     64 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |       24.240 |           7.32 |     64 |
|    6 |      213.294 |           7.48 |     64 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |      165.042 |           9.67 |     64 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 

......******
......******
......******
......*****T*
......***T***
......******
......T*****
......***T***
......******
......T*****
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |      102.845 |          15.52 |     64 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |       24.240 |           7.32 |     64 |
|    6 |      213.294 |           7.48 |     64 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |      165.042 |           9.67 |     64 |
|   11 |      123.836 |          12.88 |     64 |
|   12 |      187.206 |           8.52 |     64 |
|   13 |      129.154 |          12.36 |     64 |
|   14 |      193.185 |           8.26 |     64 |
|   15 |

......******
......****T**
......******
......T*****
......T*T****
......******T
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |      102.845 |          15.52 |     64 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |       24.240 |           7.32 |     64 |
|    6 |      213.294 |           7.48 |     64 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |      165.042 |           9.67 |     64 |
|   11 |      123.836 |          12.88 |     64 |
|   12 |      187.206 |           8.52 |     64 |
|   13 |      129.154 |          12.36 |     64 |
|   14 |      193.185 |           8.26 |     64 |
|   15 |

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |      102.845 |          15.52 |     64 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |       24.240 |           7.32 |     64 |
|    6 |      213.294 |           7.48 |     64 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |      165.042 |           9.67 |     64 |
|   11 |      123.836 |          12.88 |     64 |
|   12 |      187.206 |           8.52 |     64 |
|   13 |      129.154 |          12.36 |     64 |
|   14 |      193.185 |           8.26 |     64 |
|   15 |   

......******
......******
......******
.....T.T****
......******
....T.T.****
......T*****
......******
......******
.....T.T****
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |      102.845 |          15.52 |     64 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |       24.240 |           7.32 |     64 |
|    6 |       74.292 |          21.48 |    128 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |       59.534 |          26.80 |    128 |
|   11 |       50.917 |          31.33 |    128 |
|   12 |      187.206 |           8.52 |     64 |
|   13 |      122.897 |          12.99 |    128 |
|   14 |      193.185 |           8.26 |     64 |
|   15 |   

......******
......******
......******
......******
......******
......T*****
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |       73.536 |          21.71 |    128 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |        9.020 |          19.69 |    128 |
|    6 |       74.292 |          21.48 |    128 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |       59.534 |          26.80 |    128 |
|   11 |       50.917 |          31.33 |    128 |
|   12 |      187.206 |           8.52 |     64 |
|   13 |       73.195 |          21.80 |    256 |
|   14 |      193.185 |           8.26 |     64 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******
......T*****
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |       73.536 |          21.71 |    128 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |        9.020 |          19.69 |    128 |
|    6 |       74.292 |          21.48 |    128 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |       23.970 |           7.40 |     64 |
|   10 |       59.534 |          26.80 |    128 |
|   11 |       50.917 |          31.33 |    128 |
|   12 |       98.212 |          16.24 |    128 |
|   13 |       48.706 |          32.77 |    320 |
|   14 |      105.142 |          15.18 |    128 |
|   15 |   

......******
......******
.....T.T****
......******
......T*****
......******
......T*****
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |       73.536 |          21.71 |    128 |
|    2 |       14.061 |          12.65 |     64 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |        9.020 |          19.69 |    128 |
|    6 |       74.292 |          21.48 |    128 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |        9.560 |          18.56 |    128 |
|   10 |       59.534 |          26.80 |    128 |
|   11 |       50.917 |          31.33 |    128 |
|   12 |       98.212 |          16.24 |    128 |
|   13 |       48.706 |          32.77 |    320 |
|   14 |      105.142 |          15.18 |    128 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |       65.711 |          24.29 |    192 |
|    2 |       10.828 |          16.43 |    128 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |        9.020 |          19.69 |    128 |
|    6 |       50.548 |          31.57 |    192 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |        9.560 |          18.56 |    128 |
|   10 |       59.534 |          26.80 |    128 |
|   11 |       50.917 |          31.33 |    128 |
|   12 |       98.212 |          16.24 |    128 |
|   13 |       44.413 |          35.93 |    384 |
|   14 |      105.142 |          15.18 |    128 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |       65.711 |          24.29 |    192 |
|    2 |       10.828 |          16.43 |    128 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |        9.020 |          19.69 |    128 |
|    6 |       50.548 |          31.57 |    192 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |        9.560 |          18.56 |    128 |
|   10 |       53.031 |          30.08 |    192 |
|   11 |       43.914 |          36.33 |    192 |
|   12 |       98.212 |          16.24 |    128 |
|   13 |       44.322 |          36.01 |    448 |
|   14 |      105.142 |          15.18 |    128 |
|   15 |   

......******
......******
......T*****
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |       35.558 |           9.95 |     64 |
|    1 |       64.465 |          24.76 |    256 |
|    2 |       10.828 |          16.43 |    128 |
|    3 |       25.034 |          10.65 |     64 |
|    4 |        3.458 |          12.86 |     64 |
|    5 |        9.020 |          19.69 |    128 |
|    6 |       50.548 |          31.57 |    192 |
|    7 |       24.082 |          11.05 |     64 |
|    8 |        3.470 |          12.79 |     64 |
|    9 |        9.560 |          18.56 |    128 |
|   10 |       53.031 |          30.08 |    192 |
|   11 |       42.419 |          37.61 |    320 |
|   12 |       98.212 |          16.24 |    128 |
|   13 |       43.625 |          36.58 |    512 |
|   14 |      105.142 |          15.18 |    128 |
|   15 |   

In [11]:
# Compile the whole network with the best schedules found in the tuning log.
print("Compile...")
if load_local_log_file:
    # Use a previously recorded log instead of the one named for this run.
    log_file = "mobilenet-NCHW-B1-opencl-C20000-T21-08-13-20-16.json"
    # log_file = "mobilenet-NCHW-B1-opencl-C3000-T21-08-11-21-39.json" # 76ms -> opencl
    print("Load Local File:", log_file)
else:
    print("Load Tune File:", log_file)

# The log named for this run only exists if tuning actually ran; make a missing
# file visible instead of discovering it through unexpected performance numbers.
# NOTE(review): how ApplyHistoryBest reacts to a missing file is not verified here.
if not os.path.exists(log_file):
    print("Warning: tuning log not found:", log_file)

# `lib` is the module that the following cells export, upload and benchmark.
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(relay_model, target, params=params)

Compile...
Load Tune File: yolov3-NCHW-B1-llvm-C4000-T21-08-17-08-50.json


In [12]:
# Export the compiled library into a temporary directory.
# `filename` is reused by the upload cell to load the module on the device.
filename = target_type + "_deploy_lib.tar"
temp = utils.tempdir()
path_lib = temp.relpath(filename)

lib.export_library(path_lib)
# lib.export_library(path_lib, ndk.create_shared)

In [13]:
# Request an RPC session through the tracker, push the exported library to the
# device and load it there by file name.
print("Upload...")
remote = request_remote(
    device_key,
    rpc_host,
    rpc_port,
    timeout=10000,
)
remote.upload(path_lib)
loaded_lib = remote.load_module(filename)

Upload...


In [19]:
# Create the graph executor on the remote device and feed it a random input.
if target_type == "aarch64":
    dev = remote.cpu()
elif target_type == "opencl":
    dev = remote.cl()
else:
    # Fail loudly instead of leaving `dev` undefined (or stale from an earlier run).
    raise ValueError("Unsupported target_type %r; expected 'aarch64' or 'opencl'" % (target_type,))

module = graph_executor.GraphModule(loaded_lib["default"](dev))

# The input is random data of the model's input shape and dtype.
random_input = np.random.uniform(size=input_shape).astype(dtype)
module.set_input(input_name, tvm.nd.array(random_input))

In [20]:
# Build the timing function for the module's "run" entry point on the remote device.
# Only the evaluator is created here; calling ftimer() performs the measurement.
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator(
    "run",
    dev,
    number=50,
    repeat=3,
    min_repeat_ms=50,
)

Evaluate inference time cost...


In [23]:
# Run the measurement and report mean / standard deviation over the repeats.
prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
mean_ms, std_ms = np.mean(prof_res), np.std(prof_res)
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (mean_ms, std_ms))

Mean inference time (std dev): 2578.05 ms (3.26 ms)


## Evaluate inference time cost
|device|log file | Mean inference time(ms)|
|------|---------| -----------------------|
| opencl | op9_dla-NCHW-B1-opencl-C4000-T21-08-16-15-38.json | 139 |
| aarch64 | yolov3-NCHW-B1-llvm-C4000-T21-08-17-08-50.json | 2578|