# Depthwise-separable 2D Convolution on GPU

## 1. Set-up

In [31]:
!pip install tlcpack-nightly-cu102 -f https://tlcpack.ai/wheels
!pip install "numpy<2.0.0"

Looking in links: https://tlcpack.ai/wheels


## 3. Implement `make_dwsp_conv2d_gpu_scheduler` function in `src.ops`

In that function, you are required to implement the 2D convolution and use TVM to optimize it.
Please use zero padding and unit stride.
You can assume the kernel size is an odd number.
The total padding along each spatial dimension equals the kernel size minus one, i.e. $(K - 1) / 2$ on each side.
In this case, the output image preserves the input image dimensions.

The `make_dwsp_conv2d_gpu_scheduler` function takes the following arguments:
1. Batch size $B$;
2. Input channel size $C$;
3. Input image height $H$;
4. Input image width $W$;
5. Output number of channels $O$;
6. Kernel size $K$

You should return both the TVM scheduler and the TVM operators for
1. Input tensor $x$ with size (B, C, H, W)
2. Input kernel weight $y$ with size (O, 1, K, K)
3. Output $out$ with size (B, O, H, W)

The scheduler should be usable to build a function with signature $func(x, y, out)$.
Please see the following cells for the usage.

In [32]:
import time
import tvm
import numpy as np
from tvm import te
import torch.nn.functional as F
import torch

# benchmark for tvm implementation
def benchmark_conv2d_tvm(schedule_func, B, C, H, W, K, device, a_np, w_np, num_runs=5, repeat=5):
    """Build a TVM schedule for CUDA, run it, and report its mean run time.

    Args:
        schedule_func: callable ``(B, C, H, W, K) -> (s, inp, ker, out)`` that
            returns the schedule and the input, kernel and output tensors.
        B, C, H, W, K: problem sizes forwarded to ``schedule_func``; the output
            buffer is allocated with shape ``(B, C, H, W)``.
        device: accepted for interface compatibility but not used; the function
            always builds for the ``"cuda"`` target and runs on ``tvm.cuda(0)``.
            NOTE(review): consider honouring this argument or removing it.
        a_np: NumPy array holding the input data.
        w_np: NumPy array holding the kernel weights.
        num_runs: ``number`` argument passed to ``time_evaluator``.
        repeat: ``repeat`` argument passed to ``time_evaluator``.

    Returns:
        ``(cost, result, func)``: the ``.mean`` of the evaluator result, the
        output buffer copied back to a NumPy array, and the built function.
    """
    s, inp, ker, out = schedule_func(B, C, H, W, K)
    func = tvm.build(s, [inp, ker, out], "cuda")

    dev = tvm.cuda(0)
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    # Take the dtype from the compiled output tensor instead of relying on a
    # notebook-global `dtype` variable that happens to be defined in a later line.
    b = tvm.nd.array(np.zeros((B, C, H, W), dtype=out.dtype), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=num_runs, repeat=repeat)
    cost = evaluator(a, w, b).mean
    # `.numpy()` matches the accessor used by the test cell of this notebook
    # (`asnumpy()` is the deprecated spelling).
    return cost, b.numpy(), func


# pytorch ref and time usage
def pytorch_depthwise_conv2d(input_data, kernel, device='cuda', number=10, repeat=3, warmup=2):
    """Run a depthwise 2D convolution with PyTorch and measure its run time.

    The convolution uses ``groups = input_data.shape[1]`` and a padding of
    ``(K - 1) // 2`` per side in each spatial dimension.

    Args:
        input_data: NumPy array; ``shape[1]`` is used as the group count.
        kernel: 4-D NumPy array whose last two dimensions are the kernel
            height and width.
        device: torch device the tensors are moved to before running.
        number: convolution calls per timed measurement (must be >= 1).
        repeat: number of timed measurements (must be >= 1).
        warmup: untimed calls executed before measuring.

    Returns:
        ``(output, avg_time)``: the result as a NumPy array and the average
        time of one call in milliseconds.

    Raises:
        ValueError: if ``number`` or ``repeat`` is smaller than 1.
    """
    if number < 1 or repeat < 1:
        raise ValueError("number and repeat must both be at least 1")

    input_tensor = torch.from_numpy(input_data).to(device)
    kernel_tensor = torch.from_numpy(kernel).to(device)
    _, _, K_h, K_w = kernel.shape
    padding = ((K_h - 1) // 2, (K_w - 1) // 2)
    groups = input_data.shape[1]

    # Only synchronise when the data really lives on a CUDA device; calling
    # torch.cuda.synchronize() on a CUDA-less host raises, which made
    # device='cpu' unusable.
    on_cuda = input_tensor.is_cuda

    def sync():
        if on_cuda:
            torch.cuda.synchronize()

    def compute():
        with torch.no_grad():
            return F.conv2d(input_tensor, kernel_tensor,
                            padding=padding,
                            groups=groups)

    for _ in range(warmup):
        output = compute()
        sync()

    times = []
    for _ in range(repeat):
        sync()
        # perf_counter is monotonic and high resolution, unlike time.time().
        start_time = time.perf_counter()

        for _ in range(number):
            output = compute()
            sync()

        elapsed = time.perf_counter() - start_time
        times.append(elapsed / number * 1000)

    avg_time = sum(times) / len(times)
    return output.cpu().numpy(), avg_time

# Problem size used by all benchmark cells below.
B, C, H, W, K = 3, 4, 16, 32, 7
# B, C, H, W, K = 1, 5, 128, 128, 3
dtype = 'float32'

# Seed NumPy so the random input and kernel are the same on every
# Restart & Run All.
np.random.seed(0)
a_np = np.random.rand(B, C, H, W).astype(dtype)
w_np = np.random.rand(C, 1, K, K).astype(dtype)

# PyTorch result used as the reference for every TVM variant below.
ref, pytorch_time = pytorch_depthwise_conv2d(a_np, w_np)
print(f"2DConv PyTorch: {pytorch_time:.4f} ms")

2DConv PyTorch: 0.0696 ms


In [33]:
def base_declaration(B, C, H, W, K):
    """Declare the depthwise convolution in TE and create its default schedule.

    The input placeholder ``A`` has shape ``(B, C, H, W)`` and the kernel
    placeholder ``W`` has shape ``(C, 1, K, K)``. The input is zero-padded by
    ``(K - 1) // 2`` on each side of both spatial dimensions and then reduced
    over the ``K x K`` window, giving an output of shape ``(B, C, H, W)``.

    Args:
        B, C, H, W: input tensor dimensions.
        K: kernel size; must be odd.

    Returns:
        ``(s, inp, ker, out, padded)``: the schedule created from ``out.op``,
        the two placeholders, the output tensor and the padded intermediate.
    """
    assert K % 2 == 1

    inp = te.placeholder((B, C, H, W), name="A")
    ker = te.placeholder((C, 1, K, K), name="W")

    # Same border width is used for height and width.
    border = (K - 1) // 2
    zero = tvm.tir.const(0.0, "float32")

    # Argument names are kept as b, c, h, w on purpose.
    def pad_value(b, c, h, w):
        inside = tvm.tir.all(h >= border, h < H + border, w >= border, w < W + border)
        return tvm.tir.if_then_else(inside, inp[b, c, h - border, w - border], zero)

    padded = te.compute(
        (B, C, H + 2 * border, W + 2 * border),
        pad_value,
        name="padded",
    )

    ry = te.reduce_axis((0, K), name='ry')
    rx = te.reduce_axis((0, K), name='rx')

    def conv_value(b, c, h, w):
        window = padded[b, c, h + ry, w + rx] * ker[c, 0, ry, rx]
        return te.sum(window, axis=[ry, rx])

    out = te.compute((B, C, H, W), conv_value, name="depthwise_conv")

    s = te.create_schedule(out.op)
    return s, inp, ker, out, padded

def make_dwsp_conv2d_gpu_scheduler_naive(B, C, H, W, K, verbose=True):
    """Baseline schedule: bind only the batch axis to ``blockIdx.x``.

    The padding stage is inlined into the convolution, so padded values are
    computed where the output is computed.

    Args:
        B, C, H, W, K: problem sizes forwarded to ``base_declaration``.
        verbose: when true, print the lowered IR between two separator lines.

    Returns:
        ``(s, inp, ker, out)``.
    """
    sched, data, weight, conv, pad_stage = base_declaration(B, C, H, W, K)
    conv_stage = sched[conv]

    batch_axis, _, _, _ = conv_stage.op.axis
    conv_stage.bind(batch_axis, te.thread_axis("blockIdx.x"))

    # compute inline: only compute padding when calculating the out
    sched[pad_stage].compute_inline()

    if verbose:
        separator = "=" * 100
        print(separator)
        print(tvm.lower(sched, [data, weight, conv], simple_mode=True))
        print(separator)

    return sched, data, weight, conv

# Benchmark the naive schedule and check it against the PyTorch reference.
dev = tvm.cuda()
naive_time, naive_res, naive_func = benchmark_conv2d_tvm(
    make_dwsp_conv2d_gpu_scheduler_naive,
    B, C, H, W, K,
    device=dev,
    a_np=a_np,
    w_np=w_np,
    num_runs=5,
    repeat=5,
)
np.testing.assert_allclose(actual=naive_res, desired=ref, rtol=1e-4)
print(f"[TVM Naive] time: {naive_time*1e3:.4f} ms")

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((3, 4, 16, 32), "float32"), W: T.Buffer((4, 1, 7, 7), "float32"), depthwise_conv: T.Buffer((3, 4, 16, 32), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        blockIdx_x = T.launch_thread("blockIdx.x", 3)
        for c, h, w in T.grid(4, 16, 32):
            depthwise_conv_1 = T.Buffer((6144,), data=depthwise_conv.data)
            depthwise_conv_1[blockIdx_x * 2048 + c * 512 + h * 32 + w] = T.float32(0)
            for ry, rx in T.grid(7, 7):
                cse_var_2: T.int32 = h + ry
                cse_var_1: T.int32 = w + rx
                A_1 = T.Buffer((6144,), data=A.data)
                W_1 = T.Buffer((196,), data=W.data)
                depthwise_conv_1[blockIdx_x * 2048 + c * 512 + h * 32 + w] = depthwise_conv_1[blockIdx_x * 2048 + c * 512 + h * 32 + w] + T.if_then_else(3 <= cse

In [34]:
# opt v1: 2d block architecture
def make_dwsp_conv2d_gpu_scheduler_v1(B, C, H, W, K, verbose=True):
    """Schedule v1: bind batch to ``blockIdx.x`` and channel to ``blockIdx.y``.

    The padding stage is inlined into the convolution.

    Args:
        B, C, H, W, K: problem sizes forwarded to ``base_declaration``.
        verbose: when true, print the lowered IR between two separator lines.

    Returns:
        ``(s, inp, ker, out)``.
    """
    sched, data, weight, conv, pad_stage = base_declaration(B, C, H, W, K)
    conv_stage = sched[conv]

    batch_axis, channel_axis, _, _ = conv_stage.op.axis
    conv_stage.bind(batch_axis, te.thread_axis("blockIdx.x"))
    conv_stage.bind(channel_axis, te.thread_axis("blockIdx.y"))

    sched[pad_stage].compute_inline()

    if verbose:
        separator = "=" * 100
        print(separator)
        print(tvm.lower(sched, [data, weight, conv], simple_mode=True))
        print(separator)

    return sched, data, weight, conv

# Benchmark schedule v1 and check it against the PyTorch reference.
dev = tvm.cuda()
t, res, func = benchmark_conv2d_tvm(
    make_dwsp_conv2d_gpu_scheduler_v1,
    B, C, H, W, K,
    device=dev,
    a_np=a_np,
    w_np=w_np,
    num_runs=5,
    repeat=5,
)
np.testing.assert_allclose(actual=res, desired=ref, rtol=1e-4)
print(f"[TVM v1] time: {t*1e3:.4f} ms")

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((3, 4, 16, 32), "float32"), W: T.Buffer((4, 1, 7, 7), "float32"), depthwise_conv: T.Buffer((3, 4, 16, 32), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        blockIdx_x = T.launch_thread("blockIdx.x", 3)
        blockIdx_y = T.launch_thread("blockIdx.y", 4)
        for h, w in T.grid(16, 32):
            depthwise_conv_1 = T.Buffer((6144,), data=depthwise_conv.data)
            depthwise_conv_1[blockIdx_x * 2048 + blockIdx_y * 512 + h * 32 + w] = T.float32(0)
            for ry, rx in T.grid(7, 7):
                cse_var_2: T.int32 = h + ry
                cse_var_1: T.int32 = w + rx
                A_1 = T.Buffer((6144,), data=A.data)
                W_1 = T.Buffer((196,), data=W.data)
                depthwise_conv_1[blockIdx_x * 2048 + blockIdx_y * 512 + h * 32 + w] = depthwise_conv_1[b

In [35]:
# opt v2: block fuse
def make_dwsp_conv2d_gpu_scheduler_v2(B, C, H, W, K, verbose=True):
    """Schedule v2: fuse batch and channel into ``blockIdx.x``, rows into ``blockIdx.y``.

    The padding stage is inlined into the convolution.

    Args:
        B, C, H, W, K: problem sizes forwarded to ``base_declaration``.
        verbose: when true, print the lowered IR between two separator lines.

    Returns:
        ``(s, inp, ker, out)``.
    """
    sched, data, weight, conv, pad_stage = base_declaration(B, C, H, W, K)
    conv_stage = sched[conv]

    batch_axis, channel_axis, row_axis, _ = conv_stage.op.axis
    fused_bc = conv_stage.fuse(batch_axis, channel_axis)
    conv_stage.bind(fused_bc, te.thread_axis("blockIdx.x"))
    conv_stage.bind(row_axis, te.thread_axis("blockIdx.y"))

    sched[pad_stage].compute_inline()

    if verbose:
        separator = "=" * 100
        print(separator)
        print(tvm.lower(sched, [data, weight, conv], simple_mode=True))
        print(separator)

    return sched, data, weight, conv

# Benchmark schedule v2 and check it against the PyTorch reference.
dev = tvm.cuda()
t, res, func = benchmark_conv2d_tvm(
    make_dwsp_conv2d_gpu_scheduler_v2,
    B, C, H, W, K,
    device=dev,
    a_np=a_np,
    w_np=w_np,
    num_runs=5,
    repeat=5,
)
np.testing.assert_allclose(actual=res, desired=ref, rtol=1e-4)
print(f"[TVM v2] time: {t*1e3:.4f} ms")

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((3, 4, 16, 32), "float32"), W: T.Buffer((4, 1, 7, 7), "float32"), depthwise_conv: T.Buffer((3, 4, 16, 32), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        blockIdx_x = T.launch_thread("blockIdx.x", 12)
        blockIdx_y = T.launch_thread("blockIdx.y", 16)
        for w in range(32):
            depthwise_conv_1 = T.Buffer((6144,), data=depthwise_conv.data)
            depthwise_conv_1[blockIdx_x * 512 + blockIdx_y * 32 + w] = T.float32(0)
            for ry, rx in T.grid(7, 7):
                cse_var_1: T.int32 = w + rx
                A_1 = T.Buffer((6144,), data=A.data)
                W_1 = T.Buffer((196,), data=W.data)
                depthwise_conv_1[blockIdx_x * 512 + blockIdx_y * 32 + w] = depthwise_conv_1[blockIdx_x * 512 + blockIdx_y * 32 + w] + T.if_then_else(3 <= blockIdx_y 

In [36]:
# opt v3: v2 + 2d threads
def make_dwsp_conv2d_gpu_scheduler_v3(B, C, H, W, K, verbose=True):
    """Schedule v3: v2 plus a 2-D thread block.

    Batch and channel are fused and bound to ``blockIdx.x``. Height and width
    are each split by a factor of 16; the outer height part is bound to
    ``blockIdx.y`` and the two inner parts to ``threadIdx.y`` / ``threadIdx.x``.
    The outer width part is not bound to any thread axis. The padding stage is
    inlined into the convolution.

    Args:
        B, C, H, W, K: problem sizes forwarded to ``base_declaration``.
        verbose: when true, print the lowered IR between two separator lines.

    Returns:
        ``(s, inp, ker, out)``.
    """
    sched, data, weight, conv, pad_stage = base_declaration(B, C, H, W, K)
    conv_stage = sched[conv]

    batch_axis, channel_axis, row_axis, col_axis = conv_stage.op.axis
    fused_bc = conv_stage.fuse(batch_axis, channel_axis)
    row_outer, row_inner = conv_stage.split(row_axis, factor=16)
    _col_outer, col_inner = conv_stage.split(col_axis, factor=16)

    # Map each scheduled axis to its GPU block/thread axis.
    bindings = (
        (fused_bc, "blockIdx.x"),
        (row_outer, "blockIdx.y"),
        (row_inner, "threadIdx.y"),
        (col_inner, "threadIdx.x"),
    )
    for axis, tag in bindings:
        conv_stage.bind(axis, te.thread_axis(tag))

    sched[pad_stage].compute_inline()

    if verbose:
        separator = "=" * 100
        print(separator)
        print(tvm.lower(sched, [data, weight, conv], simple_mode=True))
        print(separator)

    return sched, data, weight, conv

# Benchmark schedule v3 and check it against the PyTorch reference.
# The results are stored under v3_* names so they no longer overwrite the
# naive_* results produced by the naive benchmark cell.
dev = tvm.cuda()
v3_time, v3_res, v3_func = benchmark_conv2d_tvm(
    make_dwsp_conv2d_gpu_scheduler_v3, B, C, H, W, K, dev, a_np, w_np, num_runs=20, repeat=20
)
np.testing.assert_allclose(v3_res, ref, rtol=1e-4)
print(f"[TVM v3] time: {v3_time*1e3:.4f} ms")

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((3, 4, 16, 32), "float32"), W: T.Buffer((4, 1, 7, 7), "float32"), depthwise_conv: T.Buffer((3, 4, 16, 32), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        blockIdx_x = T.launch_thread("blockIdx.x", 12)
        blockIdx_y = T.launch_thread("blockIdx.y", 1)
        threadIdx_y = T.launch_thread("threadIdx.y", 16)
        for w_outer in range(2):
            threadIdx_x = T.launch_thread("threadIdx.x", 16)
            depthwise_conv_1 = T.Buffer((6144,), data=depthwise_conv.data)
            depthwise_conv_1[blockIdx_x * 512 + threadIdx_y * 32 + w_outer * 16 + threadIdx_x] = T.float32(0)
            for ry, rx in T.grid(7, 7):
                cse_var_1: T.int32 = w_outer * 16
                A_1 = T.Buffer((6144,), data=A.data)
                W_1 = T.Buffer((196,), data=W.data)
           

In [37]:
# opt v4: v3 + fuse at hw outer
def make_dwsp_conv2d_gpu_scheduler_v4(B, C, H, W, K, verbose=True):
    """Schedule v4: v3 with the outer height/width tiles fused into ``blockIdx.y``.

    Batch and channel are fused and bound to ``blockIdx.x``. Height and width
    are each split by a factor of 16; the two outer parts are reordered next
    to each other, fused and bound to ``blockIdx.y``, while the inner parts are
    bound to ``threadIdx.y`` / ``threadIdx.x``. The padding stage is inlined
    into the convolution.

    Args:
        B, C, H, W, K: problem sizes forwarded to ``base_declaration``.
        verbose: when true, print the lowered IR between two separator lines.

    Returns:
        ``(s, inp, ker, out)``.
    """
    sched, data, weight, conv, pad_stage = base_declaration(B, C, H, W, K)
    conv_stage = sched[conv]

    batch_axis, channel_axis, row_axis, col_axis = conv_stage.op.axis
    fused_bc = conv_stage.fuse(batch_axis, channel_axis)

    row_outer, row_inner = conv_stage.split(row_axis, factor=16)
    col_outer, col_inner = conv_stage.split(col_axis, factor=16)
    # The two outer axes must be adjacent before they can be fused.
    conv_stage.reorder(fused_bc, row_outer, col_outer, row_inner, col_inner)
    tile_axis = conv_stage.fuse(row_outer, col_outer)

    # Map each scheduled axis to its GPU block/thread axis.
    bindings = (
        (fused_bc, "blockIdx.x"),
        (tile_axis, "blockIdx.y"),
        (row_inner, "threadIdx.y"),
        (col_inner, "threadIdx.x"),
    )
    for axis, tag in bindings:
        conv_stage.bind(axis, te.thread_axis(tag))

    sched[pad_stage].compute_inline()

    if verbose:
        separator = "=" * 100
        print(separator)
        print(tvm.lower(sched, [data, weight, conv], simple_mode=True))
        print(separator)

    return sched, data, weight, conv

# Benchmark schedule v4 and check it against the PyTorch reference.
dev = tvm.cuda()
v4_time, v4_res, v4_func = benchmark_conv2d_tvm(
    make_dwsp_conv2d_gpu_scheduler_v4,
    B, C, H, W, K,
    device=dev,
    a_np=a_np,
    w_np=w_np,
    num_runs=20,
    repeat=20,
)
np.testing.assert_allclose(actual=v4_res, desired=ref, rtol=1e-4)
print(f"[TVM v4] time: {v4_time*1e3:.4f} ms")


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((3, 4, 16, 32), "float32"), W: T.Buffer((4, 1, 7, 7), "float32"), depthwise_conv: T.Buffer((3, 4, 16, 32), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)})
        blockIdx_x = T.launch_thread("blockIdx.x", 12)
        blockIdx_y = T.launch_thread("blockIdx.y", 2)
        threadIdx_y = T.launch_thread("threadIdx.y", 16)
        threadIdx_x = T.launch_thread("threadIdx.x", 16)
        depthwise_conv_1 = T.Buffer((6144,), data=depthwise_conv.data)
        depthwise_conv_1[blockIdx_x * 512 + threadIdx_y * 32 + blockIdx_y * 16 + threadIdx_x] = T.float32(0)
        for ry, rx in T.grid(7, 7):
            A_1 = T.Buffer((6144,), data=A.data)
            W_1 = T.Buffer((196,), data=W.data)
            depthwise_conv_1[blockIdx_x * 512 + threadIdx_y * 32 + blockIdx_y * 16 + threadIdx_x] = depthwise_conv_

In [38]:
from tvm import te, autotvm

@autotvm.template("depthwise_conv2d_template")
def depthwise_conv2d_template(B, C, H, W, K):
    """AutoTVM schedule template for the convolution built by `base_declaration`.

    Defines the tunable knobs on the active AutoTVM config and applies the
    values of the current config to the schedule. Knobs, in definition order:
    ``tile_h``, ``tile_w``, ``reorder_strategy``, ``optimize_reduce_axis``,
    ``use_local_cache``, ``use_shared_memory``, ``unroll_level``.

    Args:
        B, C, H, W, K: problem sizes forwarded unchanged to `base_declaration`.

    Returns:
        ``(s, [inp, ker, out])``: the schedule and the argument list used to
        build the function.
    """
    s, inp, ker, out, padded = base_declaration(B, C, H, W, K)
    cfg = autotvm.get_config()

    b, c, h, w = s[out].op.axis
    bc = s[out].fuse(b, c)

    # Tunable two-way splits of the height and width axes (tile sizes).
    cfg.define_split("tile_h", h, num_outputs=2)
    cfg.define_split("tile_w", w, num_outputs=2)

    h_outer, h_inner = cfg["tile_h"].apply(s, out, h)
    w_outer, w_inner = cfg["tile_w"].apply(s, out, w)

    # Tunable loop order of the four spatial sub-axes.
    cfg.define_knob("reorder_strategy", [1, 2, 3])
    reorder_strategy = cfg["reorder_strategy"].val

    if reorder_strategy == 1:
        s[out].reorder(bc, h_outer, w_outer, h_inner, w_inner)
    elif reorder_strategy == 2:
        # NOTE(review): here h_outer and w_outer are not adjacent, yet they are
        # fused below -- confirm that fuse accepts this order.
        s[out].reorder(bc, h_outer, h_inner, w_outer, w_inner)
    else:
        s[out].reorder(bc, h_outer, w_outer, w_inner, h_inner)

    hw_outer = s[out].fuse(h_outer, w_outer)

    # Optional handling of the reduction axes: split, reorder and unroll.
    cfg.define_knob("optimize_reduce_axis", [0, 1])
    if cfg["optimize_reduce_axis"].val:
        ry, rx = out.op.reduce_axis
        # Both splits use factor=1, so the inner parts (ryi, rxi) have extent 1.
        # NOTE(review): unrolling extent-1 loops probably has little effect -- confirm intent.
        ryo, ryi = s[out].split(ry, factor=1)
        rxo, rxi = s[out].split(rx, factor=1)
        s[out].reorder(bc, hw_outer, ryo, rxo, h_inner, w_inner, ryi, rxi)
        s[out].unroll(ryi)
        s[out].unroll(rxi)

    # Optional "local" write cache for the output, computed at hw_outer.
    # NOTE(review): cache_write is called after `out` has already been
    # split/fused/reordered; verify that TVM accepts this order, otherwise
    # every config with use_local_cache=1 fails to instantiate.
    cfg.define_knob("use_local_cache", [0, 1])
    if cfg["use_local_cache"].val:
        local_out = s.cache_write(out, "local")
        s[local_out].compute_at(s[out], hw_outer)

    # Optional "shared" read caches for the padded input and the kernel,
    # both computed at hw_outer.
    # NOTE(review): `padded` is read through this cache and is also inlined
    # further down; the cache stages are not bound to any thread axis.
    # Confirm both are intended.
    cfg.define_knob("use_shared_memory", [0, 1])
    if cfg["use_shared_memory"].val:
        padded_shared = s.cache_read(padded, "shared", [out])
        kernel_shared = s.cache_read(ker, "shared", [out])
        s[padded_shared].compute_at(s[out], hw_outer)
        s[kernel_shared].compute_at(s[out], hw_outer)

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")

    # Same axis-to-thread mapping as make_dwsp_conv2d_gpu_scheduler_v4.
    s[out].bind(bc, block_x)
    s[out].bind(hw_outer, block_y)
    s[out].bind(h_inner, thread_y)
    s[out].bind(w_inner, thread_x)

    s[padded].compute_inline()

    # Tunable auto-unroll pragma attached to hw_outer; 0 leaves it unset.
    cfg.define_knob("unroll_level", [0, 512, 1024, 1500])
    if cfg["unroll_level"].val > 0:
        s[out].pragma(hw_outer, "auto_unroll_max_step", cfg["unroll_level"].val)
        s[out].pragma(hw_outer, "unroll_explicit", True)

    return s, [inp, ker, out]

def tune_depthwise_conv2d(B, C, H, W, K, tuning_rounds=200):
    """Tune `depthwise_conv2d_template` with AutoTVM, then build, verify and time the best config.

    Tuning records are written to ``depthwise_conv2d_autotvm.log`` and the
    function is built under ``apply_history_best`` for that log. The result is
    checked against the PyTorch reference before it is timed, like every other
    schedule variant in this notebook.

    Args:
        B, C, H, W, K: problem sizes passed to the template.
        tuning_rounds: number of tuning trials (``n_trial``).

    Returns:
        ``(func, time_ms)``: the built function and its mean run time in
        milliseconds.

    Raises:
        AssertionError: if the tuned kernel's output differs from the PyTorch
            reference (``rtol=1e-4``).
    """
    task = autotvm.task.create("depthwise_conv2d_template",
                               args=(B, C, H, W, K),
                               target="cuda")

    print(task.config_space)

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, repeat=3, min_repeat_ms=100, timeout=4)
    )
    log_file = "depthwise_conv2d_autotvm.log"

    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(
        n_trial=tuning_rounds,
        measure_option=measure_option,
        callbacks=[autotvm.callback.log_to_file(log_file)]
    )

    # Re-instantiate the template with the best logged config and build it.
    with autotvm.apply_history_best(log_file):
        with tvm.target.Target("cuda"):
            s, args = depthwise_conv2d_template(B, C, H, W, K)
            func = tvm.build(s, args)

    dev = tvm.cuda(0)
    a_np = np.random.rand(B, C, H, W).astype("float32")
    w_np = np.random.rand(C, 1, K, K).astype("float32")
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    b = tvm.nd.array(np.zeros((B, C, H, W), dtype="float32"), dev)
    func(a, w, b)

    # A fast but wrong kernel is worthless: validate the tuned result against
    # the PyTorch reference. A single untimed-quality call is enough here.
    expected, _ = pytorch_depthwise_conv2d(a_np, w_np, number=1, repeat=1, warmup=0)
    np.testing.assert_allclose(b.numpy(), expected, rtol=1e-4)

    evaluator = func.time_evaluator(func.entry_name, dev, number=20, repeat=20)
    time_ms = evaluator(a, w, b).mean * 1e3
    print(f"[AutoTVM optimized] time: {time_ms:.4f} ms")

    return func, time_ms

def run_autotvm_optimization(B, C, H, W, K, tuning_rounds=50):
    """Print the problem size, then tune via `tune_depthwise_conv2d`.

    Args:
        B, C, H, W, K: problem sizes forwarded to `tune_depthwise_conv2d`.
        tuning_rounds: number of tuning trials to run.

    Returns:
        ``(best_func, time_ms)`` as produced by `tune_depthwise_conv2d`.
    """
    header = f"AutoTVM: B={B}, C={C}, H={H}, W={W}, K={K}"
    print(header)
    tuned = tune_depthwise_conv2d(B, C, H, W, K, tuning_rounds)
    best_func, time_ms = tuned
    return best_func, time_ms

In [39]:
# Tune for the notebook's problem size with 60 trials.
best_func, time_ms = run_autotvm_optimization(B, C, H, W, K, tuning_rounds=60)

AutoTVM: B=3, C=4, H=16, W=32, K=7
ConfigSpace (len=2880, range_length=2880, space_map=
   0 tile_h: Split(policy=factors, product=16, num_outputs=2) len=5
   1 tile_w: Split(policy=factors, product=32, num_outputs=2) len=6
   2 reorder_strategy: OtherOption([1, 2, 3]) len=3
   3 optimize_reduce_axis: OtherOption([0, 1]) len=2
   4 use_local_cache: OtherOption([0, 1]) len=2
   5 use_shared_memory: OtherOption([0, 1]) len=2
   6 unroll_level: OtherOption([0, 512, 1024, 1500]) len=4
)
[AutoTVM optimized] time: 0.0035 ms


In [40]:
# Install and set up ipytest; the test cell below uses the `%%ipytest` cell magic.
%pip install ipytest
import ipytest
ipytest.autoconfig()




In [43]:
%%ipytest
import tvm
import torch
import pytest
import timeit
import numpy as np
import torch.nn.functional as F


# Schedule under test: `make_func` below builds whatever this name points to.
make_dwsp_conv2d_gpu_scheduler = make_dwsp_conv2d_gpu_scheduler_v4
# Device handle used by the tests below to allocate TVM arrays.
dev = tvm.cuda(0)


def make_func(*args):
    """Build the CUDA function for the schedule under test.

    Args:
        *args: forwarded unchanged to ``make_dwsp_conv2d_gpu_scheduler``.

    Returns:
        The function built by ``tvm.build`` for the ``"cuda"`` target, taking
        the input, kernel and output tensors in that order.
    """
    sched, data, weight, result = make_dwsp_conv2d_gpu_scheduler(*args)
    return tvm.build(sched, [data, weight, result], "cuda")


def ans_torch(a_torch, w_torch):
    """Reference depthwise 2D convolution computed with PyTorch.

    Uses unit stride, no bias, ``groups`` equal to the number of input
    channels and a padding of ``(K - 1) // 2``.

    Args:
        a_torch: 4-D input tensor; dimension 1 is the channel count.
        w_torch: 4-D weight tensor of shape ``(O, 1, K, K)``.

    Returns:
        The tensor returned by ``F.conv2d``.

    Raises:
        AssertionError: if the kernel is not square or its second dimension
            is not 1.
    """
    _, C, _, _ = a_torch.size()
    _, D, K1, K2 = w_torch.size()
    assert K1 == K2
    assert D == 1
    K = K1

    # Synchronise around the call only when CUDA exists: on a CUDA-less host
    # torch.cuda.synchronize() raises, which made this helper unusable there.
    # On a GPU machine the behaviour is unchanged.
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        torch.cuda.synchronize()
    b_torch = F.conv2d(
        a_torch, w_torch, bias=None, stride=1,
        padding=((K - 1)//2), dilation=1, groups=C)
    if has_cuda:
        torch.cuda.synchronize()
    return b_torch


@pytest.mark.parametrize('B', [1, 2, 3, 4, 5, 11, 32])
@pytest.mark.parametrize('C', [1, 3, 4, 64])
@pytest.mark.parametrize('H', [1, 3, 4, 128])
@pytest.mark.parametrize('W', [1, 3, 4, 128])
@pytest.mark.parametrize('K', [1, 3, 5])
def test1_M1_N1(B, C, H, W, K):
    """Check the TVM kernel against the PyTorch reference for one problem size."""
    # Build the kernel for this parameter combination.
    kernel_func = make_func(B, C, H, W, K)

    # Deterministic random input and weights.
    np.random.seed(seed=100)
    input_np = np.random.rand(B, C, H, W).astype(np.float32)
    weight_np = np.random.rand(C, 1, K, K).astype(np.float32)

    # Reference result from PyTorch.
    input_torch = torch.tensor(input_np).float()
    weight_torch = torch.tensor(weight_np).float()
    b_np = ans_torch(input_torch, weight_torch).detach().numpy()

    # Run the TVM kernel on the device and copy the result back.
    input_tvm = tvm.nd.array(input_np, dev)
    weight_tvm = tvm.nd.array(weight_np, dev)
    output_tvm = tvm.nd.array(np.zeros(tuple(b_np.shape), dtype='float32'), dev)
    kernel_func(input_tvm, weight_tvm, output_tvm)
    b_out = output_tvm.numpy()

    assert b_np.shape == b_out.shape, \
        "Shape mismatch: " + str(b_np.shape) + "\t" + str(b_out.shape)
    assert np.allclose(b_np, b_out), "Value mismatch: %s %s" % (b_np, b_out)

@pytest.mark.parametrize(
    'execution_number', [2, 4, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
def test1_speed_torch(execution_number):
    """Assert that the TVM kernel is at least ``execution_number`` times faster than PyTorch.

    Both implementations are timed with ``timeit`` over ``n_repeat`` calls on
    a fixed problem size. The TVM side synchronises the device after every
    call so that kernel execution time is included in the measurement, not
    only the launch; the PyTorch side already synchronises inside
    ``ans_torch``.
    """
    # Define dimension
    B, C, H, W, K = 1, 5, 128, 128, 3
    n_repeat = 100

    # Create random test data
    np.random.seed(seed=1024)
    a_np = np.random.rand(B, C, H, W).astype(np.float32)
    w_np = np.random.rand(C, 1, K, K).astype(np.float32)

    # Torch input
    a_torch = torch.tensor(a_np).float()
    w_torch = torch.tensor(w_np).float()

    # Time the torch implementation
    def torch_time():
        ans_torch(a_torch, w_torch)
    time_torch = timeit.timeit(torch_time, number=n_repeat)
    b_torch = ans_torch(a_torch, w_torch)

    # Time the optimized implementation
    func = make_func(B, C, H, W, K)
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    b = tvm.nd.array(np.zeros(tuple(b_torch.shape), dtype='float32'), dev)
    # Warm-up call; wait for it so it does not leak into the timed region.
    func(a, w, b)
    dev.sync()

    def tvm_time():
        func(a, w, b)
        # Wait for the kernel to finish so its run time is measured.
        dev.sync()
    time_tvm = timeit.timeit(tvm_time, number=n_repeat)

    opt_folds = float(execution_number)
    assert time_tvm * opt_folds <= time_torch, \
        "%dx speed-up failed: TVM Time: %.5es TorchTime: %.5es" \
        % (execution_number, time_tvm, time_torch, )



[33m[33mno tests ran[0m[33m in 0.00s[0m[0m
