<a href="https://colab.research.google.com/github/swillenson/Developing-and-Designing-Interactive-Devices/blob/master/1-conv1d_cpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1D Convolution on CPU

## 1. Set-up 

In [1]:
# Mount Google Drive at /content/gdrive; later cells read the GitHub
# credentials from it and clone the assignment repository into it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Credentials are kept in text files on Drive rather than in the notebook, so
# they can never be pushed to the repository by accident.
# Never share your token with anyone, it is basically your github password!
def _read_first_line(path):
    """Return the first line of the file at ``path`` with surrounding whitespace removed."""
    with open(path) as fh:
        return fh.readline().strip()

# Personal access token.
token = _read_first_line('/content/gdrive/MyDrive/ece5545/token.txt')
# GitHub username, stored in a separate file.
handle = _read_first_line('/content/gdrive/MyDrive/ece5545/git_username.txt')

In [3]:
# Clone your github repo
YOUR_TOKEN = token
YOUR_HANDLE = handle
BRANCH = "main"

%mkdir /content/gdrive/MyDrive/ece5545
%cd /content/gdrive/MyDrive/ece5545
!git clone https://{YOUR_TOKEN}@github.com/ML-HW-SYS/a3-{YOUR_HANDLE}.git
%cd /content/gdrive/MyDrive/ece5545/a3-{YOUR_HANDLE}
!git checkout {BRANCH}
!git pull
%cd /content/gdrive/MyDrive/ece5545

PROJECT_ROOT = f"/content/gdrive/MyDrive/ece5545/a3-{YOUR_HANDLE}"

mkdir: cannot create directory ‘/content/gdrive/MyDrive/ece5545’: File exists
/content/gdrive/MyDrive/ece5545
fatal: destination path 'a3-swillenson' already exists and is not an empty directory.
/content/gdrive/MyDrive/ece5545/a3-swillenson
M	src/ops.py
Already on 'main'
Your branch is up to date with 'origin/main'.
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 6 (delta 4), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), 1.79 KiB | 4.00 KiB/s, done.
From https://github.com/ML-HW-SYS/a3-swillenson
   494d2e4..b7d7662  main       -> origin/main
Updating 494d2e4..b7d7662
Fast-forward
 2-conv1d_gpu.ipynb | 167 [32m+++++++++++++++++++++++++++++[m[31m------------------------[m
 1 file changed, 90 insertions(+), 77 deletions(-)
/content/gdrive/MyDrive/ece5545


In [4]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to files such as src/ops.py take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

Verify the following cell prints your github repository.

In [5]:
# List the top-level contents of the cloned repository.
!ls {PROJECT_ROOT}

1-conv1d_cpu.ipynb   4-gemm_gpu.ipynb	    README.md
2-conv1d_gpu.ipynb   5-conv2d_dw_gpu.ipynb  src
3-conv1d_fpga.ipynb  leaderboard_id.txt     tests


## 2. Install TVM

In [6]:
!pip install tlcpack-nightly-cu102 -f https://tlcpack.ai/wheels

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://tlcpack.ai/wheels
Collecting tlcpack-nightly-cu102
  Downloading https://github.com/tlc-pack/tlcpack/releases/download/v0.12.dev/tlcpack_nightly_cu102-0.12.dev505%2Bga84a2cbe0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (408.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m408.0/408.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tlcpack-nightly-cu102
Successfully installed tlcpack-nightly-cu102-0.12.dev505+ga84a2cbe0


## 3. Implement the `make_conv1d_cpu_scheduler` function in `src.ops`

In that function, you are required to implement 1D convolution and use TVM to optimize it.
Let $x \in \mathbb{R}^m$ and $y \in \mathbb{R}^n$, then 
$$
\operatorname{conv1d}(x, y)_i = \sum_{j=-\infty}^{\infty} x[j]y[i-j], \forall i \in \{0, 1, \dots, m + n - 2\}
$$

Please use zero padding and unit stride. Please see the numpy convolution function for more detail: [link](https://numpy.org/doc/stable/reference/generated/numpy.convolve.html).

The `make_conv1d_cpu_scheduler` function takes $m$ and $n$, which are the sizes of the two 1D input arrays. 
You should return both the TVM schedule and the TVM operator for 
1. Input $x$
2. Input $y$
3. Output $out$

The schedule should be usable to build a function with signature $func(x, y, out)$. 
Please see the following cells for the usage.

In [7]:
import tvm
import numpy as np
import sys
# Adding assignment 3 to the system path
# Make sure this matches your git directory
sys.path.insert(0, PROJECT_ROOT)
from src.ops import make_conv1d_cpu_scheduler

# Problem size: signal length M and kernel length N.
M = 4096
N = 128
dtype = 'float32'
a_np = np.random.rand(M).astype(dtype)
w_np = np.random.rand(N).astype(dtype)
# NumPy reference result (length M + N - 1).
b_np = np.convolve(a_np, w_np)

# Build the schedule returned by src.ops for the CPU (LLVM) target.
s, A, W, B = make_conv1d_cpu_scheduler(M, N)
func = tvm.build(s, [A, W, B], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
b = tvm.nd.array(np.zeros((M+N-1), dtype), dev)
func(a, w, b)
# The prints below elide the middle of both arrays, so compare every element
# explicitly instead of relying on a visual check.
np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-4)
evaluator = func.time_evaluator(func.entry_name, dev, number=1, repeat=1)


print("Answer:", b_np)
print("Output:", b)
print(f"1D conv TVM runtime: {evaluator(a, w, b).mean * 1e3:f} ms")

Answer: [0.52809936 0.5587445  0.87160236 ... 1.1231958  0.337116   0.24206445]
Output: [0.52809936 0.5587445  0.87160236 ... 1.1231958  0.337116   0.24206445]
1D conv TVM runtime: 13.114928 ms


Optimization 1

In [8]:
import tvm
import numpy as np
import sys
# Adding assignment 3 to the system path
# Make sure this matches your git directory
sys.path.insert(0, PROJECT_ROOT)
from src.ops import make_conv1d_cpu_scheduler

# Problem size (signal length, kernel length) and element type.
M, N = 4096, 128
dtype = 'float32'

# Random inputs plus the NumPy reference convolution of length M + N - 1.
a_np = np.random.rand(M).astype(dtype)
w_np = np.random.rand(N).astype(dtype)
b_np = np.convolve(a_np, w_np)

# Fresh schedule and tensors from src.ops; the optimization below is applied on top.
s, A, W, B = make_conv1d_cpu_scheduler(M, N)

def schedule_optimization_1(s, B):
    """Block the output axis of ``B`` by 8 and mark the block loop as parallel.

    Args:
        s: Schedule containing the stage for ``B``; it is modified in place.
        B: Output tensor whose first axis is split.

    Returns:
        The same schedule object ``s``.
    """
    out_axis = B.op.axis[0]
    stage = s[B]

    # Blocking: the outer loop walks the blocks, the inner loop walks inside one block.
    block, lane = stage.split(out_axis, factor=8)
    stage.reorder(block, lane)

    # Run the block loop in parallel.
    stage.parallel(block)
    return s

s = schedule_optimization_1(s, B)

func = tvm.build(s, [A, W, B], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
b = tvm.nd.array(np.zeros((M+N-1), dtype), dev)
func(a, w, b)
# The prints below elide the middle of both arrays, so compare every element
# explicitly to make sure the schedule change did not break the result.
np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-4)
evaluator = func.time_evaluator(func.entry_name, dev, number=1, repeat=1)


print("Answer:", b_np)
print("Output:", b)
print(f"1D conv TVM runtime: {evaluator(a, w, b).mean * 1e3:f} ms")

Answer: [0.15239523 0.08954454 0.2154056  ... 0.87708545 0.85651064 0.44574815]
Output: [0.15239523 0.08954454 0.2154056  ... 0.87708545 0.85651064 0.44574815]
1D conv TVM runtime: 24.576561 ms


Optimization 2

In [9]:
import tvm
import numpy as np
import sys
# Adding assignment 3 to the system path
# Make sure this matches your git directory
sys.path.insert(0, PROJECT_ROOT)
from src.ops import make_conv1d_cpu_scheduler

# Problem size (signal length, kernel length) and element type.
M, N = 4096, 128
dtype = 'float32'

# Random inputs plus the NumPy reference convolution of length M + N - 1.
a_np = np.random.rand(M).astype(dtype)
w_np = np.random.rand(N).astype(dtype)
b_np = np.convolve(a_np, w_np)

# Fresh schedule and tensors from src.ops; the optimization below is applied on top.
s, A, W, B = make_conv1d_cpu_scheduler(M, N)

def schedule_optimization_2(s, B):
    """Block the output axis of ``B`` by 8, parallelize the blocks and unroll each block.

    Args:
        s: Schedule containing the stage for ``B``; it is modified in place.
        B: Output tensor whose first axis is split.

    Returns:
        The same schedule object ``s``.
    """
    out_axis = B.op.axis[0]
    stage = s[B]

    # Blocking: the outer loop walks the blocks, the inner loop walks inside one block.
    block, lane = stage.split(out_axis, factor=8)
    stage.reorder(block, lane)

    # Parallel over blocks, fully unrolled inside a block.
    stage.parallel(block)
    stage.unroll(lane)
    return s

s = schedule_optimization_2(s, B)

func = tvm.build(s, [A, W, B], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
b = tvm.nd.array(np.zeros((M+N-1), dtype), dev)
func(a, w, b)
# The prints below elide the middle of both arrays, so compare every element
# explicitly to make sure the schedule change did not break the result.
np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-4)
evaluator = func.time_evaluator(func.entry_name, dev, number=1, repeat=1)


print("Answer:", b_np)
print("Output:", b)
print(f"1D conv TVM runtime: {evaluator(a, w, b).mean * 1e3:f} ms")

Answer: [0.08665892 0.07472762 0.14315361 ... 0.8658664  0.36473423 0.19977169]
Output: [0.08665892 0.07472762 0.14315361 ... 0.8658664  0.36473423 0.19977169]
1D conv TVM runtime: 26.647831 ms


In [12]:
import tvm
import numpy as np
import sys
# Adding assignment 3 to the system path
# Make sure this matches your git directory
sys.path.insert(0, PROJECT_ROOT)
from src.ops import make_conv1d_cpu_scheduler

# Problem size (signal length, kernel length) and element type.
M, N = 4096, 128
dtype = 'float32'

# Random inputs plus the NumPy reference convolution of length M + N - 1.
a_np = np.random.rand(M).astype(dtype)
w_np = np.random.rand(N).astype(dtype)
b_np = np.convolve(a_np, w_np)

# Fresh schedule and tensors from src.ops; the optimization below is applied on top.
s, A, W, B = make_conv1d_cpu_scheduler(M, N)

def schedule_optimization_3(s, B):
    """Block the output axis of ``B`` by 8, then unroll and vectorize inside each block.

    The first axis is split into blocks of 8, and each block is split again
    into groups of 4. The block loop is marked parallel, the group loop is
    unrolled and the innermost loop is vectorized.

    Args:
        s: Schedule containing the stage for ``B``; it is modified in place.
        B: Output tensor whose first axis is split.

    Returns:
        The same schedule object ``s``.
    """
    out_axis = B.op.axis[0]
    stage = s[B]

    # Two-level blocking: 8 elements per block, 4 elements per vector group.
    block, within_block = stage.split(out_axis, factor=8)
    group, lane = stage.split(within_block, factor=4)
    stage.reorder(block, group, lane)

    stage.parallel(block)
    stage.unroll(group)
    stage.vectorize(lane)
    return s

s = schedule_optimization_3(s, B)

func = tvm.build(s, [A, W, B], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
b = tvm.nd.array(np.zeros((M+N-1), dtype), dev)
func(a, w, b)
# The prints below elide the middle of both arrays, so compare every element
# explicitly to make sure the schedule change did not break the result.
np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-4)
evaluator = func.time_evaluator(func.entry_name, dev, number=1, repeat=1)


print("Answer:", b_np)
print("Output:", b)
print(f"1D conv TVM runtime: {evaluator(a, w, b).mean * 1e3:f} ms")

Answer: [0.00233093 0.02083708 0.05971455 ... 0.09911914 0.4058059  0.0085637 ]
Output: [0.00233093 0.02083708 0.05971455 ... 0.09911914 0.4058059  0.0085637 ]
1D conv TVM runtime: 13.134852 ms


In [13]:
# Print the lowered IR for the current schedule `s` to inspect the loop nest it produces.
print(tvm.lower(s, [A, W, B], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((4096,), "float32"), W: T.Buffer((128,), "float32"), B: T.Buffer((4223,), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        for n_outer in T.parallel(528):
            B_1 = T.Buffer((4223,), data=B.data)
            for n_inner_inner_s in range(4):
                B_1[n_outer * 8 + n_inner_inner_s] = T.float32(0)
            A_1 = T.Buffer((4096,), data=A.data)
            W_1 = T.Buffer((128,), data=W.data)
            for k, n_inner_inner_s in T.grid(4223, 4):
                cse_var_2: T.int32 = n_outer * 8 + n_inner_inner_s
                cse_var_1: T.int32 = cse_var_2 - k
                B_1[cse_var_2] = B_1[cse_var_2] + T.if_then_else(4096 <= k or cse_var_1 < 0 or 128 <= cse_var_1, T.float32(0), A_1[k] * W_1[cse_var_1])
            for n_inner_inner_s in ra

In [12]:
# Run the CPU conv1d test-suite from the repository root.
%cd {PROJECT_ROOT}
!python -m pytest tests/test_1dconv_cpu.py

/content/gdrive/MyDrive/ece5545/a3-swillenson
platform linux -- Python 3.9.16, pytest-7.2.2, pluggy-1.0.0
rootdir: /content/gdrive/MyDrive/ece5545/a3-swillenson
plugins: anyio-3.6.2
collected 15 items                                                             [0m

tests/test_1dconv_cpu.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                 [100%][0m



Submitted Final

In [6]:
def make_conv1d_cpu_scheduler(M, N):
    """Build a TVM schedule for the full 1D convolution of two float32 vectors.

    Computes ``B[n] = sum_k A[k] * W[n - k]`` for ``n`` in ``[0, M + N - 1)``,
    where taps that fall outside ``W`` contribute zero. The output axis is
    split by 4; the outer loop is marked parallel and the inner loop vectorized.

    Args:
        M: Length of the input signal ``A``.
        N: Length of the kernel ``W``.

    Returns:
        ``(s, A, W, B)``: the schedule, the two input placeholders and the
        output tensor, ready for ``tvm.build(s, [A, W, B], "llvm")``.
    """
    # Imported locally so this cell also works on a fresh kernel, where `te`
    # has not been brought into scope by any earlier cell.
    import tvm
    from tvm import te

    A = te.placeholder((M,), name="A")
    W = te.placeholder((N,), name="W")

    # A has exactly M elements, so the reduction only needs k in [0, M): the
    # former range [0, M + N - 1) merely added zeros for k >= M, and `k < 0`
    # could never be true. Only the index into W still needs a bounds check.
    # NOTE(review): reducing over the N kernel taps instead (A[n - k] * W[k])
    # would cut the work per output further when N < M, but changes the
    # floating-point summation order -- confirm against the test tolerance.
    k = te.reduce_axis((0, M), "k")
    B = te.compute(
        (M + N - 1,),
        lambda n: te.sum(
            tvm.tir.if_then_else(
                tvm.tir.any(n - k < 0, n - k >= N),
                tvm.tir.const(0.0, "float32"),
                A[k] * W[n - k],
            ),
            axis=k,
        ),
        name="B",
    )

    s = te.create_schedule(B.op)
    outer, inner = s[B].split(B.op.axis[0], factor=4)
    s[B].parallel(outer)
    s[B].vectorize(inner)

    # The caller builds the function from the returned schedule, so nothing is
    # compiled here (the previous throw-away tvm.build only cost time).
    return s, A, W, B


In [None]:
def make_conv1d_cpu_scheduler(M, N):
    """Unscheduled baseline for the full 1D convolution of ``A`` and ``W``.

    Computes ``B[n] = sum_k A[k] * W[n - k]`` for ``n`` in ``[0, M + N - 1)``
    with TVM's default schedule (no split, parallel or vectorize).

    NOTE: this cell redefines a name that is already defined earlier in the
    notebook; running it replaces that definition in the kernel until
    ``src.ops`` is imported again.

    Args:
        M: Length of the input signal ``A``.
        N: Length of the kernel ``W``.

    Returns:
        ``(s, A, W, B)``: the schedule, the two input placeholders and the
        output tensor of shape ``(M + N - 1,)``.
    """
    # Imported locally so the cell works on a fresh kernel, where `te` has not
    # been brought into scope by any earlier cell.
    import tvm
    from tvm import te

    A = te.placeholder((M,), name="A")
    W = te.placeholder((N,), name="W")

    # The previous body computed A[i] + W[i] over A.shape, which is an
    # element-wise add (and indexes W past its end when N < M), not a convolution.
    k = te.reduce_axis((0, M), "k")
    B = te.compute(
        (M + N - 1,),
        lambda n: te.sum(
            tvm.tir.if_then_else(
                # Taps that fall outside W contribute zero.
                tvm.tir.any(n - k < 0, n - k >= N),
                tvm.tir.const(0.0, "float32"),
                A[k] * W[n - k],
            ),
            axis=k,
        ),
        name="B",
    )
    s = te.create_schedule(B.op)
    return s, A, W, B


In [11]:
import tvm
import numpy as np
import sys
# Adding assignment 3 to the system path
# Make sure this matches your git directory
sys.path.insert(0, PROJECT_ROOT)
from src.ops import make_conv1d_cpu_scheduler

# Problem size: signal length M and kernel length N.
M = 4096
N = 128
dtype = 'float32'
a_np = np.random.rand(M).astype(dtype)
w_np = np.random.rand(N).astype(dtype)
# NumPy reference result (length M + N - 1).
b_np = np.convolve(a_np, w_np)

# Build the schedule returned by src.ops for the CPU (LLVM) target.
s, A, W, B = make_conv1d_cpu_scheduler(M, N)
func = tvm.build(s, [A, W, B], "llvm")

dev = tvm.cpu()
a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
b = tvm.nd.array(np.zeros((M+N-1), dtype), dev)
func(a, w, b)
# The prints below elide the middle of both arrays, so compare every element
# explicitly instead of relying on a visual check.
np.testing.assert_allclose(b.numpy(), b_np, rtol=1e-4)
evaluator = func.time_evaluator(func.entry_name, dev, number=1, repeat=1)


print("Answer:", b_np)
print("Output:", b)
print(f"1D conv TVM runtime: {evaluator(a, w, b).mean * 1e3:f} ms")

Answer: [0.00457172 0.01427385 0.02212985 ... 0.1306706  0.02902382 0.00925419]
Output: [0.00457172 0.01427385 0.02212985 ... 0.1306706  0.02902382 0.00925419]
1D conv TVM runtime: 13.436247 ms
