In [1]:
import onnx
import glob
import os
import numpy as np
import time

import tvm
from tvm import relay, auto_scheduler
# import tvm.relay.testing
from tvm.contrib import graph_executor
from tvm.auto_scheduler.utils import request_remote

In [2]:
mobilenet_model = onnx.load('../mobilenet/mobilenetv2-7.onnx')

In [3]:
input_name = "input"
input_shape = (1, 3, 244, 244)
shape_dict = {input_name: input_shape}
print("shape_dict: ", shape_dict)

shape_dict:  {'input': (1, 3, 244, 244)}


In [4]:
model, params = relay.frontend.from_onnx(mobilenet_model, shape_dict)

In [5]:
# Also replace this with the device key in your tracker
device_key = "v9h"
rpc_host = "192.168.105.70"
rpc_port = 9190

# Define the neural network and compilation target.
network = "mobilenet"
batch_size = 1
layout = "NCHW"
turn_trials = 3000
# Set this to True if you use ndk tools for cross compiling
use_ndk = False
# Path to cross compiler
# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
# target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu")
target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon")
dtype = "float32"
print(time.strftime('%y-%m-%d-%H-%M',time.localtime(time.time())))
log_file = "%s-%s-B%d-%s-C%s-T%s.json" % (network, layout, batch_size, target.kind.name, turn_trials, time.strftime('%y-%m-%d-%H-%M',time.localtime(time.time())))
print("device:", device_key)
print("rpc_host: %s:%s" % (rpc_host, rpc_port))
print("log file:", log_file)

21-08-11-11-39
device: v9h
rpc_host: 192.168.105.70:9190
log file: mobilenet-NCHW-B1-llvm-C3000-T21-08-11-11-39.json


In [6]:
if layout == 'NHWC':
    # convert from NCHW to NHWC
    desired_layouts = {'nn.conv2d': ['NHWC', 'default']}

    # Convert the layout to NHWC
    # RemoveUnunsedFunctions is used to clean up the graph.
    seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
                                    relay.transform.ConvertLayout(desired_layouts)])

    with tvm.transform.PassContext(opt_level=3):
        model = seq(model)

In [7]:
tasks, task_weights = auto_scheduler.extract_tasks(model["main"], params, target)

In [8]:
print("Begin tuning...")
tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=turn_trials,  # change this to 20000 to achieve the best performance
    builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
    runner=auto_scheduler.RPCRunner(
        device_key, host=rpc_host, port=rpc_port, repeat=3, timeout=50
    ),
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)

tuner.tune(tune_option)

Begin tuning...
Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |            - |              - |      0 |
|    1 |            - |              - |      0 |
|    2 |            - |              - |      0 |
|    3 |            - |              - |      0 |
|    4 |            - |              - |      0 |
|    5 |            - |              - |      0 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |            - |              - |      0 |
|   16 |            - |              - |  



|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |            - |              - |      0 |
|    2 |            - |              - |      0 |
|    3 |            - |              - |      0 |
|    4 |            - |              - |      0 |
|    5 |            - |              - |      0 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |            - |              - |      0 |
|   16 |            - |              - |      0 |
|   17 |            - |              - |      0 |


......******
......******
......******
......******
......******
......******
......******T
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |  

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        2.066 |           9.52 |     64 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.508 |          19.07 |     64 |
|   10 |        2.637 |          10.75 |     64 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        2.066 |           9.52 |     64 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.508 |          19.07 |     64 |
|   10 |        2.637 |          10.75 |     64 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******
......******
......******
......T*****
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        2.066 |           9.52 |     64 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.508 |          19.07 |     64 |
|   10 |        2.637 |          10.75 |     64 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******T
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        2.066 |           9.52 |     64 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.508 |          19.07 |     64 |
|   10 |        2.637 |          10.75 |     64 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |  

......******
......******
......******T
......******
......******
......****T**
......******
......******
......******
.....T.T****
....***T*
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        2.066 |           9.52 |     64 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.508 |          19.07 |     64 |
|   10 |        2.637 |          10.75 |     64 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        5.050 |          10.43 |     64 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        2.066 |           9.52 |     64 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.508 |          19.07 |     64 |
|   10 |        2.637 |          10.75 |     64 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |   

......******
......******
......******
......******
......******
......******
......******
......*****T*
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        3.026 |          17.41 |    128 |
|    3 |        3.404 |          11.56 |     64 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        1.235 |          16.07 |     64 |
|    6 |        1.391 |          14.15 |    128 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.395 |          20.61 |    128 |
|   10 |        2.187 |          12.97 |    128 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |  

......******
......******
......******
......******
......******
......******
......******
......******
......******
......******
....****
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.550 |           4.66 |     64 |
|    1 |        0.046 |          -0.00 |     64 |
|    2 |        3.026 |          17.41 |    128 |
|    3 |        2.375 |          16.57 |    128 |
|    4 |        0.141 |           9.15 |     64 |
|    5 |        0.836 |          23.73 |    128 |
|    6 |        1.391 |          14.15 |    128 |
|    7 |        0.917 |          12.88 |     64 |
|    8 |        0.170 |           4.55 |     64 |
|    9 |        1.395 |          20.61 |    128 |
|   10 |        2.187 |          12.97 |    128 |
|   11 |        0.358 |           8.65 |     64 |
|   12 |        1.610 |          11.74 |     64 |
|   13 |        0.257 |           8.02 |     64 |
|   14 |        0.791 |          16.27 |     64 |
|   15 |   

In [9]:
# Compile the whole network
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(model, target, params=params)

Compile...


In [16]:
# Create graph executor
print("=============== Request Remote ===============")
from tvm.auto_scheduler.utils import request_remote

remote = request_remote(device_key, rpc_host, rpc_port)
dev = remote.cpu()



In [17]:
from tvm.contrib import utils, ndk
temp = utils.tempdir()
filename = "deploy_cpu.tar"
path_lib = temp.relpath(filename)
# lib.export_library(path_lib, ndk.create_shared)
lib.export_library(path_lib)
remote.upload(path_lib)
loaded_lib = remote.load_module(filename)
module = graph_executor.GraphModule(loaded_lib["default"](dev))
data = (np.random.uniform(size=input_shape)).astype(dtype)
data_tvm = tvm.nd.array(data)
module.set_input(input_name, data_tvm)

In [18]:
# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=1000)

Evaluate inference time cost...


In [19]:
prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))

Mean inference time (std dev): 79.03 ms (1.25 ms)
