In [None]:
import onnx
from tvm.contrib.download import download_testdata
from PIL import Image
import numpy as np
from tvm import relax,relay,auto_scheduler
from tvm.script import tir as T
from tvm.script import relax as R   
import tvm
from tvm.contrib import graph_executor
from tvm.relax.testing import from_relay

import torch
import torchvision
from torch import fx

In [None]:
# Load a pretrained torchvision ResNet-18 and switch it to inference mode.
model_name = "resnet18"
model = getattr(torchvision.models, model_name)(pretrained=True).eval()

# TorchScript the model by tracing it with a random tensor of the expected input shape.
input_shape = [1, 3, 224, 224]
input_data = torch.randn(input_shape)
scripted_model = torch.jit.trace(model, input_data).eval()

from PIL import Image

# Fetch the sample picture (download_testdata returns the local path) and resize it.
img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
img_path = download_testdata(img_url, "cat.png", module="data")
print(img_path)
img = Image.open(img_path).resize((224, 224))

from torchvision import transforms

# Preprocessing chain: resize, center-crop, convert to a tensor, normalize per channel.
my_preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Apply the preprocessing, then prepend a batch axis; `img` ends up as the batched input
# that the later cells reuse (both for its shape and as inference data).
img = np.expand_dims(my_preprocess(img), 0)

In [None]:
from tvm.relax.frontend.torch import from_fx
from tvm import meta_schedule as ms

# Import the FX-traced model into Relax. The single input is declared with the shape of
# the preprocessed image and dtype float32; weights are kept as function inputs.
input_info = [(img.shape, "float32")]
with torch.no_grad():
    fx_module = fx.symbolic_trace(model)
    mod_from_torch = from_fx(fx_module, input_info, keep_params_as_input=True)

# Separate the parameters from the module.
mod_from_torch, params_from_torch = relax.frontend.detach_params(mod_from_torch)

# Lower the module with the default Relax pipeline.
# The previous version of this cell first ran relax.transform.LegalizeOps() on
# mod_from_torch and then immediately overwrote that result with the pipeline output,
# so the extra pass was wasted work and has been dropped.
# NOTE(review): presumably the default pipeline legalizes ops itself -- confirm.
mod = relax.get_pipeline()(mod_from_torch)

# Names of every function in the lowered module except the Relax entry point "main".
# GlobalVar.name_hint gives the name directly, so there is no need to parse str(gv).
mod_list = [gv.name_hint for gv in mod.get_global_vars() if gv.name_hint != "main"]

# Model weights as TVM NDArrays keyed by their PyTorch parameter names; later cells bind
# them into "main" with BindParams.
nd_params = {k: tvm.nd.array(v.detach().numpy()) for k, v in model.named_parameters()}


In [None]:
# CPU target description: LLVM backend with 8 cores.
# NOTE(review): the cells below pass the same target as a string literal rather than
# reading this variable, and `target` is re-assigned further down the notebook.
target = tvm.target.Target("llvm --num-cores=8")

In [None]:
# Tune the lowered module for the 8-core LLVM target with MetaSchedule:
# up to 600 trials overall, 10 per iteration, with artifacts written to ./tune_tmp.
# The resulting `database` is queried in the next cell to fetch tuned schedules.
database = ms.tune_tir(
    mod=mod,
    target="llvm --num-cores=8",
    max_trials_global=600,
    num_trials_per_iter=10,
    work_dir="./tune_tmp",
    # Candidates are measured on the local machine with the default evaluator settings.
    runner=ms.runner.LocalRunner(
        evaluator_config=ms.runner.EvaluatorConfig(),
        alloc_repeat=1,
    ),
    # XGBoost cost model fed by per-store features.
    cost_model=ms.cost_model.XGBModel(
        extractor=ms.feature_extractor.PerStoreFeature(),
        adaptive_training=True,
    ),
    strategy=ms.search_strategy.EvolutionarySearch(),
)

In [None]:
# Bind the trained weights into "main", then swap every function that has a tuned
# schedule in `database` for its tuned version.
MyMod2 = relax.transform.BindParams("main", nd_params)(mod)
for mod_str in mod_list:
    sch = ms.tir_integration.compile_tir(database, mod[mod_str], "llvm --num-cores=8")
    if sch is None:
        # compile_tir gave nothing back for this function: leave it as it is in MyMod2.
        print(f"No tuned schedule found for {mod_str}; keeping the untuned function")
        continue
    # The scheduled function lives under "main" in the schedule's module; restore the
    # original symbol name before putting it back into the full module.
    new_func = sch.mod["main"].with_attr("global_symbol", mod_str)
    gv = MyMod2.get_global_var(mod_str)
    MyMod2.update_func(gv, new_func)

In [None]:
# Random float32 input for the CPU run. Its shape is taken from `img` because the Relax
# module was imported with img.shape as its input signature; the previous hard-coded
# (1, 3, 244, 244) did not match that 224x224 input.
data_nd = np.random.rand(*img.shape).astype(np.float32)
data_nd = tvm.nd.array(data_nd)

# The LLVM target executes on the host CPU.
dev = tvm.cpu(0)

In [None]:
# Build the tuned module for the CPU target and load it into a Relax virtual machine.
ex = relax.build(MyMod2, target="llvm  --num-cores=8")
vm = relax.VirtualMachine(ex, dev)

# One forward pass; the predicted class is the arg-max along axis 1 of the output.
nd_res = vm["main"](data_nd)
pred_kind = nd_res.numpy().argmax(axis=1)
print("MyModuleWithParams2 Prediction:", pred_kind)

# Time "main" (number=10) and report the mean in milliseconds.
ftimer = vm.module.time_evaluator("main", dev, number=10)
print(f"MyModuleWithParams time-cost: {ftimer(data_nd).mean * 1000:g} ms")

GPU version below (Relax)

In [None]:
# Re-point `target` at CUDA for the GPU part of the notebook.
# NOTE(review): this overwrites the CPU target assigned earlier, so re-running earlier
# cells afterwards will see the CUDA target.
target = tvm.target.Target("cuda")

In [None]:
from tvm import dlight as dl

# Run dlight's ApplyDefaultSchedule pass on the lowered module under the CUDA target.
# NOTE(review): no schedule rules are passed to ApplyDefaultSchedule here -- confirm
# whether the pass is expected to change anything without them.
with tvm.target.Target("cuda"):
    gpu_mod = dl.ApplyDefaultSchedule()(mod)

In [None]:
def tune_my_tir(
    mod_str,
    max_trials_global,
    num_trials_per_iter,
    min_repeat_ms,
    target="nvidia/geforce-rtx-4090",
    work_dir="./tune_tmp",
):
    """Tune one function of ``gpu_mod`` with MetaSchedule and return the tuning database.

    Parameters
    ----------
    mod_str : str
        Name of the function inside the notebook-level ``gpu_mod`` to tune.
    max_trials_global : int
        Forwarded to ``ms.tune_tir`` as ``max_trials_global``.
    num_trials_per_iter : int
        Forwarded to ``ms.tune_tir`` as ``num_trials_per_iter``.
    min_repeat_ms : int
        Forwarded to the evaluator config; values above 1000 are capped at 1000.
    target : str, optional
        Tuning target; defaults to the previously hard-coded RTX 4090 tag.
    work_dir : str, optional
        Directory passed to ``ms.tune_tir``; defaults to the previously hard-coded path.
        NOTE(review): the CPU tuning cell also uses "./tune_tmp"; confirm that sharing
        the directory between CPU and GPU runs is intended.

    Returns
    -------
    The database object returned by ``ms.tune_tir``.
    """
    # Cap the per-measurement minimum duration at 1000 ms.
    min_repeat_ms = min(min_repeat_ms, 1000)

    return ms.tune_tir(
        mod=gpu_mod[mod_str],
        target=target,
        max_trials_global=max_trials_global,
        num_trials_per_iter=num_trials_per_iter,
        work_dir=work_dir,
        runner=ms.runner.LocalRunner(
            evaluator_config=ms.runner.EvaluatorConfig(
                number=10,
                repeat=1,
                min_repeat_ms=min_repeat_ms,
            ),
            alloc_repeat=1,
        ),
        cost_model=ms.cost_model.XGBModel(
            extractor=ms.feature_extractor.PerStoreFeature(),
            adaptive_training=True,
        ),
        strategy=ms.search_strategy.EvolutionarySearch(),
    )

In [None]:
# Bind the trained weights into "main", then tune each function for the GPU and swap in
# the tuned version.
MyModgpu = relax.transform.BindParams("main", nd_params)(gpu_mod)
for mod_str in mod_list:
    # Initial tuning budget for this function.
    max_trials_global = 64
    num_trials_per_iter = 64
    min_repeat_ms = 200
    print(mod_str)
    database = tune_my_tir(mod_str, max_trials_global, num_trials_per_iter, min_repeat_ms)
    sch = ms.tir_integration.compile_tir(database, gpu_mod[mod_str], "nvidia/geforce-rtx-4090")

    # No schedule came back: retune with a doubled trial budget (and a 25% longer
    # min_repeat_ms) until one is found or the budget has grown past 10000 trials.
    while sch is None and max_trials_global <= 10000:
        print("retune begin ................")
        max_trials_global *= 2
        num_trials_per_iter *= 2
        min_repeat_ms = int(min_repeat_ms * 1.25)
        database = tune_my_tir(mod_str, max_trials_global, num_trials_per_iter, min_repeat_ms)
        sch = ms.tir_integration.compile_tir(database, gpu_mod[mod_str], "nvidia/geforce-rtx-4090")

    if sch is None:
        # Previously this fell through to `sch.mod[...]` and died with an opaque
        # AttributeError on None; fail with an explicit message instead.
        raise RuntimeError(
            f"No tuned schedule found for {mod_str!r} after retuning with up to "
            f"{max_trials_global} trials"
        )

    # The scheduled function lives under "main" in the schedule's module; restore the
    # original symbol name before putting it back into the full module.
    new_func = sch.mod["main"].with_attr("global_symbol", mod_str)
    gv = MyModgpu.get_global_var(mod_str)
    MyModgpu.update_func(gv, new_func)

In [None]:
# First CUDA device; the preprocessed image is copied onto it as the GPU input.
dev = tvm.device("cuda", 0)
data_nd = tvm.nd.array(img, dev)

In [None]:
# Build the tuned module for CUDA and load it into a Relax virtual machine.
# The executable is named `ex` (as in the CPU cell) instead of `exec`, which shadowed
# the Python builtin for the rest of the kernel session.
ex = relax.build(MyModgpu, target="cuda")
vm = relax.VirtualMachine(ex, dev)

# One forward pass; the predicted class is the arg-max along axis 1 of the output.
nd_res = vm["main"](data_nd)

pred_kind = np.argmax(nd_res.numpy(), axis=1)
print("MyModuleWithGPU Prediction:", pred_kind)

# Time "main" (number=1000) and report the mean in milliseconds. The label now names the
# GPU module; it used to repeat the CPU cell's "MyModuleWithParams" label.
ftimer = vm.module.time_evaluator("main", dev, number=1000)
print("MyModuleWithGPU time-cost: %g ms" % (ftimer(data_nd).mean * 1000))

Next, we import the same model with Relay, tune it with auto_scheduler, and benchmark it

In [None]:
# (input name, shape) pair for the Relay PyTorch frontend.
# NOTE(review): this re-binds `input_info`, which held (shape, dtype) pairs for the
# Relax import earlier in the notebook.
input_info = [("input1", img.shape)]

In [None]:
# Import the TorchScript-traced model into Relay.
# NOTE(review): this re-binds `mod`, which the Relax cells above also use; re-running
# those cells after this one will pick up the Relay module instead.
mod, params = relay.frontend.from_pytorch(scripted_model, input_info)

In [None]:
# CUDA target and input dtype used by the Relay cells below.
target = tvm.target.Target("cuda")
dtype = "float32"

In [None]:
# Extract the auto_scheduler tuning tasks (and their weights) from the Relay main function.
tasks, task_weights = auto_scheduler.extract_tasks(
    mod["main"],
    params=params,
    target=target,
)

In [None]:
def run_tuning(num_measure_trials=200):
    """Tune the extracted auto_scheduler tasks and record the results to ``log_file``.

    Reads the notebook-level ``tasks``, ``task_weights`` and ``log_file``; ``log_file``
    must be defined before this function is called.

    Parameters
    ----------
    num_measure_trials : int, optional
        Total number of measurement trials. Defaults to 200 (the previously hard-coded
        value); raise it to 20000 to achieve the best performance.
    """
    print("Begin Tunning....")
    # Measurements go through a local RPC measure context.
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=num_measure_trials,
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

In [None]:
# File the auto_scheduler tuning records are written to (by run_tuning) and read back
# from (by ApplyHistoryBest in the compile cell).
log_file = "try_to_tune_1.json"

In [None]:
from tvm.contrib import graph_executor

# Build the Relay module at opt_level 3 with the auto_scheduler backend enabled, inside
# an ApplyHistoryBest context created from `log_file`.
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file), tvm.transform.PassContext(
    opt_level=3, config={"relay.backend.use_auto_scheduler": True}
):
    lib = relay.build(mod, target=target, params=params)

# Wrap the compiled library in a graph executor and feed it a uniform-random input
# shaped like the preprocessed image.
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))
data_tvm = tvm.nd.array(np.random.uniform(size=img.shape).astype(dtype))
module.set_input("input1", data_tvm)

# Benchmark end-to-end inference.
print("Evaluate inference time cost...")
print(module.benchmark(dev, repeat=3, min_repeat_ms=500))