In [1]:
import onnx
import glob
import os
import numpy as np
import time

import tvm
from tvm import relay, auto_scheduler
# import tvm.relay.testing
from tvm.contrib import graph_executor
from tvm.auto_scheduler.utils import request_remote

In [2]:
# Read the MobileNetV2 ONNX graph from the sibling "mobilenet" directory.
mobilenet_model = onnx.load(
    "../mobilenet/mobilenetv2-7.onnx"
)

In [3]:
# Name of the graph input and the (N, C, H, W) shape handed to the Relay importer.
# NOTE(review): 244x244 is unusual for MobileNetV2 (224x224 is the common
# input size) -- confirm this is intended before changing it, since existing
# tuning logs were recorded against this shape.
input_name = "input"
input_shape = (1, 3, 244, 244)
shape_dict = dict([(input_name, input_shape)])
print("shape_dict: ", shape_dict)

shape_dict:  {'input': (1, 3, 244, 244)}


In [4]:
# Import the ONNX graph into Relay using the fixed input shape; yields the
# Relay module and the dictionary of weights.
model, params = relay.frontend.from_onnx(
    mobilenet_model,
    shape=shape_dict,
)

In [5]:
# --- RPC tracker -----------------------------------------------------------
# Replace device_key with the key your device is registered under in the tracker.
device_key = "v9h"
rpc_host = "192.168.105.70"
rpc_port = 9190

# --- Neural network description -------------------------------------------
network = "mobilenet"
batch_size = 1
layout = "NCHW"
dtype = "float32"

# --- Tuning switches --------------------------------------------------------
turn_enable = True       # run the auto-scheduler search before compiling
turn_trials = 20000      # measurement-trial budget given to the tuner
preload_log_file = True  # resume the search from an earlier tuning log
# Set this to True if you use ndk tools for cross compiling
use_ndk = False
# Path to cross compiler
# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"

# --- Compilation target: OpenCL kernels with an AArch64 Linux LLVM host ----
target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu")

# The log name encodes network, layout, batch size, target kind, trial budget
# and the local start time (minute resolution), so each run gets its own file.
log_file = "%s-%s-B%d-%s-C%s-T%s.json" % (
    network,
    layout,
    batch_size,
    target.kind.name,
    turn_trials,
    time.strftime("%y-%m-%d-%H-%M"),
)

print("device:", device_key)
print("rpc_host: %s:%s" % (rpc_host, rpc_port))
print("log file:", log_file)

device: v9h
rpc_host: 192.168.105.70:9190
log file: mobilenet-NCHW-B1-opencl-C20000-T21-08-13-15-24.json


In [6]:
# The imported graph is NCHW; it is rewritten only when NHWC was requested.
if layout == 'NHWC':
    # Requested layouts for nn.conv2d: NHWC data, 'default' for the second entry.
    desired_layouts = {'nn.conv2d': ['NHWC', 'default']}

    # RemoveUnusedFunctions cleans up the graph, then ConvertLayout applies
    # the NCHW -> NHWC conversion.
    seq = tvm.transform.Sequential(
        [
            relay.transform.RemoveUnusedFunctions(),
            relay.transform.ConvertLayout(desired_layouts),
        ]
    )

    with tvm.transform.PassContext(opt_level=3):
        model = seq(model)

In [7]:
# remote = request_remote(device_key, rpc_host, rpc_port)
# dev = remote.cl()
# print("device_name:", dev.device_name)
# print("compute_version:", dev.compute_version)
# print("max_clock_rate:", dev.max_clock_rate)
# print("multi_processor_count:", dev.multi_processor_count)
# print("max_thread_dimensions:", dev.max_thread_dimensions)
# max_shared_memory_per_block = dev.max_shared_memory_per_block
# print("max_shared_memory_per_block:", max_shared_memory_per_block)
# max_threads_per_block = dev.max_threads_per_block
# print("max_threads_per_block:", max_threads_per_block)
# warp_size = dev.warp_size
# print("warp_size: ", warp_size)

In [None]:
# Auto-scheduler search. The whole cell is skipped when turn_enable is False.
if turn_enable:
    # Device limits are entered by hand here instead of being queried from the
    # remote device.
    max_shared_memory_per_block = 4096
    print("max_shared_memory_per_block:", max_shared_memory_per_block)
    max_threads_per_block = 512
    print("max_threads_per_block:", max_threads_per_block)
    warp_size = 2
    print("warp_size: ", warp_size)

    # There is no explicit local memory limitation on this device. A large
    # finite cap is used here; INT32_MAX could be used instead to disable the
    # check on local memory.
    max_local_memory_per_block = 4096000
    print("max_local_memory_per_block:", max_local_memory_per_block)

    # Fixed value; a device-derived alternative would be
    # int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size
    max_vthread_extent = 2
    print("max_vthread_extent:", max_vthread_extent)

    num_cores = 2
    print("number of cores:", num_cores)

    vector_unit_bytes = 16
    print("vector unit bytes:", vector_unit_bytes)

    cache_line_bytes = 64
    print("cache line bytes:", cache_line_bytes)

    # Keyword arguments keep the eight integer limits from being silently
    # swapped, which positional passing would allow.
    hardware_params = auto_scheduler.HardwareParams(
        num_cores=num_cores,
        vector_unit_bytes=vector_unit_bytes,
        cache_line_bytes=cache_line_bytes,
        max_shared_memory_per_block=max_shared_memory_per_block,
        max_local_memory_per_block=max_local_memory_per_block,
        max_threads_per_block=max_threads_per_block,
        max_vthread_extent=max_vthread_extent,
        warp_size=warp_size,
    )

    # Split the network into tuning tasks and their weights.
    tasks, task_weights = auto_scheduler.extract_tasks(
        model["main"], params, target, hardware_params=hardware_params
    )

    print("Begin tuning...")
    scheduler_kwargs = {}
    if preload_log_file:
        load_log_file = "mobilenet-NCHW-B1-opencl-C3000-T21-08-11-21-39.json"
        print("preload file:", load_log_file)
        if os.path.isfile(load_log_file):
            scheduler_kwargs["load_log_file"] = load_log_file
        else:
            # Make a missing log visible instead of quietly tuning from scratch.
            print("WARNING: preload file not found, tuning starts from scratch:", load_log_file)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights, **scheduler_kwargs)

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=turn_trials,  # change this to 20000 to achieve the best performance
        # Candidates are built locally; "ndk" selects the NDK cross compiler.
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        # Measurements run on the device registered under device_key at the tracker.
        runner=auto_scheduler.RPCRunner(
            device_key, host=rpc_host, port=rpc_port, repeat=3, timeout=50
        ),
        # Every measurement made in this run is appended to log_file.
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

max_shared_memory_per_block: 4096
max_threads_per_block: 512
warp_size:  2
max_local_memory_per_block: 4096000
max_vthread_extent: 2
number of cores: 2
vector unit bytes: 16
cache line bytes: 64
Begin tuning...
preload file: mobilenet-NCHW-B1-opencl-C3000-T21-08-11-21-39.json
Get devices for measurement successfully!




|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        2.699 |           0.95 |    128 |
|    1 |        0.380 |          -0.00 |    128 |
|    2 |        2.040 |          25.83 |    128 |
|    3 |        1.381 |          28.50 |    128 |
|    4 |        0.330 |           3.92 |    192 |
|    5 |        0.814 |          24.37 |    192 |
|    6 |        0.954 |          20.63 |    192 |
|    7 |        0.561 |          21.03 |    128 |
|    8 |        0.582 |           1.33 |    128 |
|    9 |        1.242 |          23.15 |    256 |
|   10 |        1.111 |          25.53 |    256 |
|   11 |        0.580 |           5.34 |    128 |
|   12 |        0.633 |          29.85 |    128 |
|   13 |        0.469 |           4.41 |    256 |
|   14 |        0.577 |          22.32 |    192 |
|   15 |        0.454 |          27.80 |    128 |
|   16 |        0.308 |          20.45 |    128 |
|   17 |        0.717 |           1.44 |    128 |


In [None]:
# Compile the whole network
print("Compile...")
# log_file = "mobilenet-NCHW-B1-opencl-C2800-T21-08-12-19-02.json"
# log_file = "mobilenet-NCHW-B1-opencl-C3000-T21-08-11-21-39.json" # 76ms -> opencl
print("Load File:", log_file)

# RecordToFile only writes the measurements made during this run to log_file.
# When tuning resumed from load_log_file, the earlier records (which may hold
# the best schedule for some tasks) live only in that file, so both files are
# applied here.
record_files = [log_file]
if turn_enable and preload_log_file and load_log_file not in record_files:
    print("Preload File:", load_log_file)
    record_files.append(load_log_file)

# Flatten every existing log into one list of (MeasureInput, MeasureResult)
# pairs; files that do not exist are skipped.
tuning_records = [
    record
    for record_file in record_files
    if os.path.isfile(record_file)
    for record in auto_scheduler.load_records(record_file)
]

with auto_scheduler.ApplyHistoryBest(tuning_records):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(model, target, params=params)

In [None]:
# Create graph executor
print("=============== Request Remote ===============")
# request_remote is already imported with the other dependencies in the first
# cell, so it is not re-imported here.

# Ask the tracker for a session on a device registered under device_key, then
# take that session's OpenCL device handle.
remote = request_remote(device_key, rpc_host, rpc_port)
dev = remote.cl()

In [None]:
# Export the compiled library as a tarball, upload it through the RPC session
# and load it on the remote side.
# NDK alternative (needs `from tvm.contrib import ndk`):
#   lib.export_library(path_lib, ndk.create_shared)
filename = "deploy_lib.tar"
path_lib = "./%s" % filename
lib.export_library(path_lib)
remote.upload(path_lib)
loaded_lib = remote.load_module(filename)

# Instantiate the "default" graph factory on the remote device and wrap it in
# a graph executor.
module = graph_executor.GraphModule(loaded_lib["default"](dev))

# Feed one random tensor of the configured shape and dtype as the network input.
data = np.random.uniform(size=input_shape).astype(dtype)
data_tvm = tvm.nd.array(data)
module.set_input(input_name, data_tvm)

In [None]:
# Evaluate
print("Evaluate inference time cost...")
# Build a timer around the executor's "run" function on the remote device.
ftimer = module.module.time_evaluator(
    "run",
    dev,
    repeat=3,
    min_repeat_ms=500,
)

In [None]:
# Run the timer and scale its results to milliseconds.
prof_res = np.array(ftimer().results) * 1000.0
print(
    "Mean inference time (std dev): %.2f ms (%.2f ms)"
    % (prof_res.mean(), prof_res.std())
)