In [1]:
import tvm

In [2]:
import numpy as np

# 循環優化

## Loop unroll 循環展開

In [3]:
n = tvm.te.var("n")
A = tvm.te.placeholder((n, n), name='A')
B = tvm.te.placeholder((n, n), name='B')
C = tvm.te.compute((n, n), lambda i, j: A[i, j] + B[i, j], name='C')


In [4]:
s = tvm.te.create_schedule(C.op)
xo, xi = s[C].split(s[C].op.axis[0], factor=4)


In [5]:
print(tvm.lower(s, [A, B, C], simple_mode=True))
print("---------Loop unroll---------")


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (n, n), strides=(stride_2, stride_3), type="auto")
        stride_4 = T.int32()
        stride_5 = T.int32()
        C_1 = T.match_buffer(C, (n, n), strides=(stride_4, stride_5), type="auto")
        for i_outer, i_inner in T.grid((n + 3) // 4, 4):
            if T.likely(i_outer * 4 + i_inner < n):
                for j in range(n):
                    cse_var_1: T.int32 = i_outer * 4 + i_inner
                    C_2 = T.Buffer((stride_4 * n,), data=C_1.data, type="auto")
      

In [6]:
s[C].unroll(xi)
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (n, n), strides=(stride_2, stride_3), type="auto")
        stride_4 = T.int32()
        stride_5 = T.int32()
        C_1 = T.match_buffer(C, (n, n), strides=(stride_4, stride_5), type="auto")
        for i_outer in range((n + 3) // 4):
            C_2 = T.Buffer((stride_4 * n,), data=C_1.data, type="auto")
            A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
            B_2 = T.Buffer((stride_2 * n,), data=B_1.data, type="auto")
            if T.likely(i_outer * 4

## Loop Tiling(循環分塊)

循環分塊（Loop Tiling）
循環分塊是利用cache的數據局部性進行優化的一種方法。現代CPU通常具有多級cache，在記憶體結構中，cache是除CPU寄存器外最接近CPU的存儲層次，相比主記憶體速度更快，但是容量更小。cache中複製了CPU頻繁使用的數據，所以CPU可以進行快速訪問。由於cache的容量有限，數據會在cache中進行換入換出。當訪問的數據在cache中沒有時，產生cache miss，會向低一級存儲層次發出訪問請求，然後該數據存儲進cache，這時訪問數據的時間就大大提高。當訪問數據就在cache中時，會直接使用該數據以進行復用。

循環分塊主要針對大型數據集進行優化，大數據集無法一次全部存入cache中。當遍歷該數據集時，循環按照順序進行訪問，會替換掉之前加載進cache的數據，導致後面的指令對之前的數據無法復用，要重新加載數據，產生大量的cache miss，數據的復用性很差。程序執行時間變長，大量時間花費在載入數據上。

循環分塊將大數據集分成多個小塊以充分進行數據復用。數據塊的內存訪問是一個具有高內存局部性的小鄰域。該數據塊可以一次加載進cache，執行完所有或者盡可能多的計算任務後才被替換出。原始的矩陣乘法存儲訪問模式和分塊後的存儲訪問模式見下圖1。


In [9]:
import tvm
n = tvm.te.var("n")
A = tvm.te.placeholder((n, n), name='A')
B = tvm.te.placeholder((n, n), name='B')
K = tvm.te.reduce_axis((0, n), name='K')
C = tvm.te.compute((n, n), lambda i, j: tvm.te.sum(A[i, K] * B[K, j], axis=K), name='C')

s = tvm.te.create_schedule(C.op)

print(tvm.lower(s, [A, B, C], simple_mode=True))
print("---------Loop Tiling---------")


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (n, n), strides=(stride_2, stride_3), type="auto")
        stride_4 = T.int32()
        stride_5 = T.int32()
        C_1 = T.match_buffer(C, (n, n), strides=(stride_4, stride_5), type="auto")
        for i, j in T.grid(n, n):
            C_2 = T.Buffer((stride_4 * n,), data=C_1.data, type="auto")
            C_2[i * stride_4 + j * stride_5] = T.float32(0)
            for K in range(n):
                A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                B_2 = 

In [10]:
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], 32, 32)

print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (n, n), strides=(stride_2, stride_3), type="auto")
        stride_4 = T.int32()
        stride_5 = T.int32()
        C_1 = T.match_buffer(C, (n, n), strides=(stride_4, stride_5), type="auto")
        for i_outer, j_outer, i_inner, j_inner in T.grid((n + 31) // 32, (n + 31) // 32, 32, 32):
            C_2 = T.Buffer((stride_4 * n,), data=C_1.data, type="auto")
            if T.likely(i_outer * 32 + i_inner < n):
                if T.likely(j_outer * 32 + j_inner < n):
             

## Loop Reorder (循環重排)

循環重排（Loop Reorder）
循環重排序（reorder）是矩陣乘法常見的優化方式，特別是在CNN中卷積層的應用。在矩陣乘法計算中，B是逐列訪問的，在行優先的存儲模式下訪問模式很不友好。切換內層的循環順序可以使得所有元素按順序讀取和寫入。一次計算輸出的一行，得到的是中間結果，全部累加即可得到結果矩陣的一行最終結果，這種方式利用的是內存的空間局部性(spatial locality)。

Loop Reorder藉由內外層循環重排，改善記體體的空間局部性，並最大限度地利用引入cache的數據。對循環進行重新排序，以最大程度減少跨布將訪問模式與記憶體中的數據儲存模式對齊

In [15]:
# 範例：：以矩陣乘法為例，M, N, K三維，往往是將K放在最外層可以最大程度利用局部性。
n = tvm.te.var("n")
dtype = "float32"
A = tvm.te.placeholder((n, n), dtype=dtype, name='A')
B = tvm.te.placeholder((n, n), dtype=dtype, name='B')
C = tvm.te.compute((n, n), lambda i, j: A[i,j] + B[i,j], name='C')

s = tvm.te.create_schedule(C.op)
xo, xi = s[C].split(s[C].op.axis[0], factor=32)
yo, yi = s[C].split(s[C].op.axis[1], factor=32)
print(tvm.lower(s, [A, B], simple_mode=True))
print("-Loop Reorder ------")


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (n, n), strides=(stride_2, stride_3), type="auto")
        C = T.allocate([n * n], "float32", "global")
        for i_outer, i_inner in T.grid((n + 31) // 32, 32):
            if T.likely(i_outer * 32 + i_inner < n):
                for j_outer, j_inner in T.grid((n + 31) // 32, 32):
                    if T.likely(j_outer * 32 + j_inner < n):
                        cse_var_3: T.int32 = j_outer * 32
                        cse_var_2: T.int32 = cse_var_3 + j_inner
                        cse_v

In [16]:

s[C].reorder(xo, yo, yi, xi)

print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (n, n), strides=(stride_2, stride_3), type="auto")
        C = T.allocate([n * n], "float32", "global")
        for i_outer, j_outer, j_inner in T.grid((n + 31) // 32, (n + 31) // 32, 32):
            if T.likely(j_outer * 32 + j_inner < n):
                for i_inner in range(32):
                    if T.likely(i_outer * 32 + i_inner < n):
                        cse_var_3: T.int32 = j_outer * 32
                        cse_var_2: T.int32 = cse_var_3 + j_inner
                        cse_va

## Loop Fusion (循環融合)

Loop Fusion是將相鄰或緊密間隔的循環融合在一起，減少循環開銷和增加計算密度可改善軟體流水線，數據結構的cache局部性增加



In [19]:
n = tvm.te.var("n")
A = tvm.te.placeholder((n,), name='A')
k = tvm.te.reduce_axis((0, n), name='k')

B = tvm.te.compute((1,), lambda i: tvm.te.sum(A[k], axis=k), name='B')

s = tvm.te.create_schedule(B.op)

ko, ki = s[B].split(B.op.reduce_axis[0], factor=32)

print(tvm.lower(s, [A, B], simple_mode=True))
print("---------Loop Fusion---------")



# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.Buffer((1,), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        B_1 = T.Buffer((1,), data=B.data)
        B_1[0] = T.float32(0)
        for k_outer, k_inner in T.grid((n + 31) // 32, 32):
            if T.likely(k_outer * 32 + k_inner < n):
                A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                B_1[0] = B_1[0] + A_2[(k_outer * 32 + k_inner) * stride]
---------Loop Fusion---------


In [20]:
s[B].fuse(ko, ki)

print(tvm.lower(s, [A, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.Buffer((1,), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        B_1 = T.Buffer((1,), data=B.data)
        B_1[0] = T.float32(0)
        for k_outer_k_inner_fused in range((n + 31) // 32 * 32):
            if T.likely(k_outer_k_inner_fused < n):
                A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                B_1[0] = B_1[0] + A_2[k_outer_k_inner_fused * stride]


## Loop Split (循環拆分)

In [None]:
Loop Split主要是將循環分成多個循環，可以在有條件的循環中使用，分為無條件循環和含條件循環。

In [None]:
# 範例：上面程式碼是將條件判斷放在循環中，這樣執行循環時，每次都要判斷一次條件，所以可以將條件判斷和計算兩者拆開成兩個循環，這樣執行速度會比較快。

In [22]:
import tvm

n = tvm.te.var("n")
A = tvm.te.placeholder((n,), name='A')
k = tvm.te.reduce_axis((0, n), name='k')

B = tvm.te.compute((1,), lambda i: tvm.te.sum(A[k], axis=k), name='B')

s = tvm.te.create_schedule(B.op)

print(tvm.lower(s, [A, B], simple_mode=True))

print("---------Loop Split ---------")

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.Buffer((1,), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        B_1 = T.Buffer((1,), data=B.data)
        B_1[0] = T.float32(0)
        for k in range(n):
            A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
            B_1[0] = B_1[0] + A_2[k * stride]
---------Loop Split ---------


In [23]:
ko, ki = s[B].split(B.op.reduce_axis[0], factor=32)

print(tvm.lower(s, [A, B], simple_mode=True))



# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.Buffer((1,), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        stride = T.int32()
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        B_1 = T.Buffer((1,), data=B.data)
        B_1[0] = T.float32(0)
        for k_outer, k_inner in T.grid((n + 31) // 32, 32):
            if T.likely(k_outer * 32 + k_inner < n):
                A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                B_1[0] = B_1[0] + A_2[(k_outer * 32 + k_inner) * stride]


# Instructions Optimization

## Vectorization (向量化)

向量化（Vectorization）
向量化是一種數據級並行優化。向量化即「批量操作」，在計算機中常見執行模型是單指令多數據（SIMD，Single Instruction Multiple Data）。通過對批量數據同時進行相同計算以提高效率。向量體系結構獲取在存儲器中散布的數據集，將多個數據元素放在大型的順序寄存器堆疊(stack)即向量寄存器中，對整個寄存器進行操作從而同時計算了多個數據元素。向量本身可以容納不同大小數據，因此如果一個向量寄存器可以容納64個64 bit元素，那麼也可以容納128個32 bit元素或者512個8 bit元素。憑借這種硬體上的多樣性，向量化特別適合用於多媒體應用和科學計算。


傳統的執行方式為單指令單數據（SISD，Single Instruction Single Data），硬體不支持並行計算。現代CPU幾乎都支持SIMD指令集，如Intel的SSE和AVX系列指令集。


In [25]:
import tvm
import numpy
import timeit

M = tvm.te.var("m")
N = tvm.te.var("n")
A = tvm.te.placeholder((M, N), name='A')
B = tvm.te.placeholder((M, N), name='B')
C = tvm.te.compute(
           (M, N),
           lambda x, y: A[x, y] + B[x, y],
           name='C')

s = tvm.te.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], 32, 32)

print(tvm.lower(s, [A, B, C], simple_mode=True))
print("---------Vectorization---------")


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        m = T.int32()
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (m, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (m, n), strides=(stride_2, stride_3), type="auto")
        stride_4 = T.int32()
        stride_5 = T.int32()
        C_1 = T.match_buffer(C, (m, n), strides=(stride_4, stride_5), type="auto")
        for x_outer, y_outer, x_inner in T.grid((m + 31) // 32, (n + 31) // 32, 32):
            if T.likely(x_outer * 32 + x_inner < m):
                for y_inner in range(32):
                    if T.likely(y_outer * 32 + y_inner < n):
                        cse_va

In [26]:

s[C].vectorize(yi)

print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, C: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        m = T.int32()
        n = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (m, n), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        stride_3 = T.int32()
        B_1 = T.match_buffer(B, (m, n), strides=(stride_2, stride_3), type="auto")
        stride_4 = T.int32()
        stride_5 = T.int32()
        C_1 = T.match_buffer(C, (m, n), strides=(stride_4, stride_5), type="auto")
        for x_outer, y_outer, x_inner in T.grid((m + 31) // 32, (n + 31) // 32, 32):
            if T.likely(x_outer * 32 + x_inner < m):
                for y_inner_s in range(32):
                    if T.likely(y_outer * 32 + y_inner_s < n):
                        cs

## Tensorization (張量化）

主流 CPU/GPU 硬件廠商都提供了專門角於張量化計算的張量指令，如英偉達的張量核指令、英特爾的VN。利用張量指令的一種方法是調用硬件廠商提供的算子庫，如英偉達的 cuBLAS 和 cDNN， 以及英特爾的 oneDNN 等。

然而，當模型中出現新的算子或需要進—步提高性能時，這種方法的局限性便顯露無遺。


In [108]:
import tvm
from tvm import te
import tvm.testing
import numpy as np

In [109]:
N, M, L = 1024, 512, 64
A = te.placeholder((N, L), name="A")
B = te.placeholder((M, L), name="B")
k = te.reduce_axis((0, L), name="k")
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C")
s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        for i, j in T.grid(1024, 512):
            C_1 = T.Buffer((524288,), data=C.data)
            C_1[i * 512 + j] = T.float32(0)
            for k in range(64):
                cse_var_1: T.int32 = i * 512 + j
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((32768,), data=B.data)
                C_1[cse_var_1] = C_1[cse_var_1] + A_1[i * 64 + k] * B_1[j * 64 + k]


In [110]:
factor = 16
x, y = C.op.axis
(z,) = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        for i, j_outer, j_inner in T.grid(1024, 32, 16):
            C_1 = T.Buffer((524288,), data=C.data)
            C_1[i * 512 + j_outer * 16 + j_inner] = T.float32(0)
            for k in range(64):
                cse_var_1: T.int32 = i * 512 + j_outer * 16 + j_inner
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((32768,), data=B.data)
                C_1[cse_var_1] = C_1[cse_var_1] + A_1[i * 64 + k] * B_1[j_outer * 1024 + j_inner * 64 + k]


In [111]:
def intrin_gemv(m, l):
    a = te.placeholder((l,), name="a")
    b = te.placeholder((m, l), name="b")
    k = te.reduce_axis((0, l), name="k")
    c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1"), 1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.tir.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(
            tvm.tir.call_extern(
                "int32",
                "gemv_update",
                cc.access_ptr("w"),
                aa.access_ptr("r"),
                bb.access_ptr("r"),
                m,
                l,
                bb.strides[0],
            )
        )
        return ib.get()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})

In [112]:
print("----Tensorization----")
gemv = intrin_gemv(factor, L)
s[C].tensorize(yi, gemv)
print(tvm.lower(s, [A, B, C], simple_mode=True))

----Tensorization----
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        for i, j_outer in T.grid(1024, 32):
            T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B.data, j_outer * 1024, 1024, 1), 16, 64, 64)


# parallel

In [113]:
import tvm
n = 1024
m = 1024
n = tvm.te.var("n")
m = tvm.te.var("m")

A = tvm.te.placeholder((n, m), name='A')
l = tvm.te.reduce_axis((0, m), name = 'l')

B = tvm.te.compute((n,), lambda i: tvm.te.sum(A[i, l], axis=l), name='B')

s = tvm.te.create_schedule(B.op)

print(tvm.lower(s, [A, B], simple_mode=True))


# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        m = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, m), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        B_1 = T.match_buffer(B, (n,), strides=(stride_2,), type="auto")
        for i in range(n):
            B_2 = T.Buffer((stride_2 * n,), data=B_1.data, type="auto")
            B_2[i * stride_2] = T.float32(0)
            for l in range(m):
                A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                B_2[i * stride_2] = B_2[i * stride_2] + A_2[i * stride + l * stride_1]


In [114]:
print("---------parallel---------")

s[B].parallel(B.op.reduce_axis[0])
print(tvm.lower(s, [A, B], simple_mode=True))

---------parallel---------
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        n = T.int32()
        m = T.int32()
        stride = T.int32()
        stride_1 = T.int32()
        A_1 = T.match_buffer(A, (n, m), strides=(stride, stride_1), type="auto")
        stride_2 = T.int32()
        B_1 = T.match_buffer(B, (n,), strides=(stride_2,), type="auto")
        for i in range(n):
            B_2 = T.Buffer((stride_2 * n,), data=B_1.data, type="auto")
            B_2[i * stride_2] = T.float32(0)
            for l in T.parallel(m):
                A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
                B_2[i * stride_2] = B_2[i * stride_2] + A_2[i * stride + l * stride_1]
