In [None]:
import numpy as np
from XS3VPU import XS3VPU

import tensorflow as tf
# Report whether TensorFlow is running eagerly; the reference-convolution cell
# further down calls `.numpy()` on a tensor.
print("Executing eagerly: {}".format(tf.executing_eagerly()))

# Dense matrix multiplication

In [None]:
def compute_chunk(vpu, W_XS3, W_start, N, x_XS3, x_chunk_start):
    """Feed one input chunk and ``vpu.acc_period`` weight-row chunks to the VPU.

    Issues a single ``VLDC`` followed by ``vpu.acc_period`` ``VLMACCR`` calls,
    i.e. ``vpu.acc_period + 1`` instructions by the original author's count.

    Args:
        vpu: Simulator object exposing ``ve``, ``acc_period``, ``VLDC`` and
            ``VLMACCR``.
        W_XS3: Flat weight buffer; consecutive rows are ``N`` elements apart.
        W_start: Index in ``W_XS3`` of the first weight slice to use.
        N: Distance (in elements) between successive weight slices.
        x_XS3: Flat input buffer.
        x_chunk_start: Index in ``x_XS3`` of the input slice to load.
    """
    lanes = vpu.ve

    # Load `lanes` input elements once; they are reused by every VLMACCR below.
    vpu.VLDC(x_XS3[x_chunk_start:x_chunk_start + lanes])

    # One VLMACCR per weight row (this loop is meant to be unrolled in asm).
    for row in range(vpu.acc_period):
        offset = W_start + row * N
        vpu.VLMACCR(W_XS3[offset:offset + lanes])
    
def compute_strip(vpu, W_XS3, strip_start, N, x_XS3, y_XS3, y_ind):
    """Compute ``vpu.acc_period`` consecutive outputs and store them in ``y_XS3``.

    Clears the accumulators with ``VCLRDR``, runs ``compute_chunk`` once per
    ``vpu.ve``-wide chunk of the input (``N // vpu.ve`` chunks), then writes
    the value returned by ``vpu._combine_vD_vR()`` into
    ``y_XS3[y_ind:y_ind + vpu.acc_period]``.

    Estimated instruction count (original author's figure):
    ``N_chunks * (vpu.acc_period + 3) + 1``.

    Args:
        vpu: Simulator object.
        W_XS3: Flat weight buffer with rows ``N`` elements apart.
        strip_start: Index in ``W_XS3`` where this strip of rows begins.
        N: Row length of the weight matrix / length of the input vector.
        x_XS3: Flat input buffer.
        y_XS3: Output buffer, modified in place.
        y_ind: Index in ``y_XS3`` of the first output of this strip.
    """
    vpu.VCLRDR()

    lanes = vpu.ve
    for chunk in range(N // lanes):
        # The same column offset applies to the weights and to the input.
        offset = chunk * lanes
        compute_chunk(vpu, W_XS3, strip_start + offset, N, x_XS3, offset)

    # Original note: corresponds to VLSAT + VST* (2 instructions) on hardware.
    y_XS3[y_ind:y_ind + vpu.acc_period] = vpu._combine_vD_vR()

def XS3_matmul(vpu, W_XS3, N, M, x_XS3, y_XS3):
    """Matrix-vector product on the simulated VPU, one strip of rows at a time.

    Processes ``M // vpu.acc_period`` strips; each strip covers
    ``vpu.acc_period`` rows of the flat, row-major weight buffer and is handled
    by ``compute_strip``, which writes its outputs into ``y_XS3``.

    Estimated instruction count (original author's figure):
    ``M_chunks * (N_chunks * (vpu.acc_period + 3) + 3) + 6``, including about
    5 instructions for the function call itself.

    Args:
        vpu: Simulator object.
        W_XS3: Flat weight buffer holding an ``M x N`` matrix row by row.
        N: Number of columns (input length).
        M: Number of rows (output length).
        x_XS3: Flat input buffer.
        y_XS3: Output buffer, modified in place.
    """
    rows_per_strip = vpu.acc_period
    for strip in range(M // rows_per_strip):
        first_row = strip * rows_per_strip
        # Row index doubles as the output index; the weight offset is row * N.
        compute_strip(vpu, W_XS3, first_row * N, N, x_XS3, y_XS3, first_row)

In [None]:
# Simulated VPU configured with bpe=8 (presumably "bits per element" -- confirm
# against the XS3VPU class).
vpu = XS3VPU(bpe=8)
# Bounds reported by the simulator for `vpu._single`; they are used below as
# the value range for the random test data.
# NOTE(review): this relies on underscore-prefixed (private) members of XS3VPU.
int8min, int8max = vpu._sat_bounds(vpu._single)

In [None]:
# Problem size, expressed as a number of VPU-sized chunks per dimension.
N_chunks, M_chunks = 3, 2
N, M = N_chunks * vpu._ve, M_chunks * (vpu._ve // 2)

# Random int8 coefficients (M x N) and input data (N,).
# NOTE(review): np.random.randint excludes `high`, so int8max itself is never
# drawn, and no seed is set, so the data differs on every run.
W = np.random.randint(low=int8min, high=int8max, size=(M, N), dtype=np.int8)
x = np.random.randint(low=int8min, high=int8max, size=(N,), dtype=np.int8)

# Reference output: both operands are widened to int32 before multiplying.
y = W.astype(np.int32) @ x.astype(np.int32)

In [None]:
# Mimic the XS3 memory layout: flat buffers, with W stored row after row.
W_XS3 = W.reshape(-1).copy()
x_XS3 = x.copy()
# Output buffer with the same shape and dtype as the reference result.
y_XS3 = np.zeros(shape=y.shape, dtype=y.dtype)

In [None]:
# Run the simulated kernel; the result is written into y_XS3 in place.
XS3_matmul(vpu, W_XS3, N, M, x_XS3, y_XS3)

# Element-wise comparison against the NumPy reference.
print("Output matches reference: {}".format((y_XS3 == y).all()))
# Instruction counting is not wired up (`cnt` is not defined in the visible cells):
#print("Estimated vector instruction count: {}".format(cnt))

# 1x1 Convolution

In [None]:
def compute_chunk_of_in_pixel(vpu, K_XS3, C_in, K_in_chunk_start, D_XS3, p_chunk_start):
    """Feed one chunk of an input pixel and the matching kernel slices to the VPU.

    Issues one ``VLDC`` with ``vpu.ve`` elements of ``D_XS3`` starting at
    ``p_chunk_start``, then ``vpu.acc_period`` ``VLMACCR`` calls whose kernel
    slices start at ``K_in_chunk_start`` and are ``C_in`` elements apart.

    Args:
        vpu: Simulator object exposing ``ve``, ``acc_period``, ``VLDC`` and
            ``VLMACCR``.
        K_XS3: Flat kernel buffer; consecutive kernel rows are ``C_in`` apart.
        C_in: Distance (in elements) between successive kernel slices.
        K_in_chunk_start: Index in ``K_XS3`` of the first kernel slice.
        D_XS3: Flat input-data buffer.
        p_chunk_start: Index in ``D_XS3`` of the pixel chunk to load.
    """
    lanes = vpu.ve

    # The pixel chunk is loaded once and reused by every VLMACCR below.
    vpu.VLDC(D_XS3[p_chunk_start:p_chunk_start + lanes])

    for row in range(vpu.acc_period):
        offset = K_in_chunk_start + row * C_in
        vpu.VLMACCR(K_XS3[offset:offset + lanes])

def compute_chunk_of_out_pixel(vpu, K_XS3, K_out_chunk_start, C_in, D_XS3, in_pixel_start, Y_XS3, Y_start):
    """Compute ``vpu.acc_period`` output channels of one pixel.

    Clears the accumulators with ``VCLRDR``, walks the ``C_in // vpu.ve``
    input-channel chunks of the pixel through ``compute_chunk_of_in_pixel``,
    then stores the value returned by ``vpu._combine_vD_vR()`` into
    ``Y_XS3[Y_start:Y_start + vpu.acc_period]``.

    Args:
        vpu: Simulator object.
        K_XS3: Flat kernel buffer with rows ``C_in`` elements apart.
        K_out_chunk_start: Index in ``K_XS3`` of the first kernel row of this
            output-channel chunk.
        C_in: Number of input channels.
        D_XS3: Flat input-data buffer.
        in_pixel_start: Index in ``D_XS3`` of the first channel of the pixel.
        Y_XS3: Flat output buffer, modified in place.
        Y_start: Index in ``Y_XS3`` where the results are written.
    """
    vpu.VCLRDR()

    lanes = vpu.ve
    for chunk in range(C_in // lanes):
        # Kernel and pixel advance together, one vector width per chunk.
        offset = chunk * lanes
        compute_chunk_of_in_pixel(
            vpu, K_XS3, C_in,
            K_in_chunk_start=K_out_chunk_start + offset,
            D_XS3=D_XS3,
            p_chunk_start=in_pixel_start + offset,
        )

    Y_XS3[Y_start:Y_start + vpu.acc_period] = vpu._combine_vD_vR()
        
def compute_chunk_of_out_image(vpu, K_XS3, out_chunk_start, C_in, C_out, D_XS3, M, N, Y_XS3):
    """Compute one chunk of output channels for every pixel of the image.

    Visits the ``M * N`` pixels in order and calls
    ``compute_chunk_of_out_pixel`` for each, always with the same kernel rows
    (those starting at output channel ``out_chunk_start``).

    The per-pixel strides are taken directly from the ``C_in`` and ``C_out``
    arguments. The earlier version read a notebook-level ``C_in_chunks``
    variable, so it failed with ``NameError`` unless a later cell had been
    run first, and used the wrong stride if that global disagreed with
    ``C_in``. For channel counts that are multiples of ``vpu.ve`` /
    ``vpu.acc_period`` the strides are unchanged.

    Args:
        vpu: Simulator object.
        K_XS3: Flat kernel buffer; the row for output channel ``c`` starts at
            ``c * C_in``.
        out_chunk_start: Index of the first output channel of this chunk.
        C_in: Number of input channels (stride between pixels in ``D_XS3``).
        C_out: Number of output channels (stride between pixels in ``Y_XS3``).
        D_XS3: Flat input-data buffer, ``C_in`` values per pixel.
        M: First spatial dimension of the image (outer loop).
        N: Second spatial dimension of the image (inner loop).
        Y_XS3: Flat output buffer, ``C_out`` values per pixel; modified in
            place.
    """
    # The kernel rows for this channel chunk are the same for every pixel.
    kernel_start = out_chunk_start * C_in

    rx = 0                # start of the current pixel in D_XS3
    ry = out_chunk_start  # where this chunk's outputs go for the current pixel
    for _ in range(M):
        for _ in range(N):
            compute_chunk_of_out_pixel(vpu, K_XS3, K_out_chunk_start=kernel_start,
                                       C_in=C_in, D_XS3=D_XS3, in_pixel_start=rx, Y_XS3=Y_XS3, Y_start=ry)
            rx += C_in
            ry += C_out
    
def XS3_conv2d_1x1(vpu, K_XS3, C_in, C_out, D_XS3, M, N, Y_XS3):
    """1x1 convolution on the simulated VPU, one output-channel chunk at a time.

    Splits the ``C_out`` output channels into ``C_out // vpu.acc_period``
    chunks and runs ``compute_chunk_of_out_image`` for each, which fills the
    corresponding channels of every pixel in ``Y_XS3``.

    The chunk count is derived from the ``C_out`` argument. The earlier
    version read a notebook-level ``C_out_chunks`` variable, so the function
    raised ``NameError`` (or silently used a stale value) unless the
    data-generation cell had been executed first.

    Args:
        vpu: Simulator object.
        K_XS3: Flat kernel buffer, ``C_in`` values per output channel.
        C_in: Number of input channels.
        C_out: Number of output channels.
        D_XS3: Flat input-data buffer, ``C_in`` values per pixel.
        M: First spatial dimension of the image.
        N: Second spatial dimension of the image.
        Y_XS3: Flat output buffer, ``C_out`` values per pixel; modified in
            place.
    """
    channels_per_chunk = vpu.acc_period
    C_out_chunks = C_out // channels_per_chunk
    for chunk in range(C_out_chunks):
        out_chunk_start = chunk * channels_per_chunk
        compute_chunk_of_out_image(vpu, K_XS3, out_chunk_start, C_in, C_out, D_XS3, M, N, Y_XS3)

In [None]:
# Create a new simulator object for the convolution section, again with bpe=8
# (presumably "bits per element" -- confirm against the XS3VPU class).
vpu = XS3VPU(bpe=8)
# Bounds reported by the simulator for `vpu._single`; they are used below as
# the value range for the random kernels and data.
# NOTE(review): this relies on underscore-prefixed (private) members of XS3VPU.
int8min, int8max = vpu._sat_bounds(vpu._single)

In [None]:
# Channel counts, expressed as a number of VPU-sized chunks.
C_in_chunks, C_out_chunks = 7, 5
C_in, C_out = C_in_chunks * vpu._ve, C_out_chunks * (vpu._ve // 2)

# Spatial size of the image: D has shape (M, N, C_in).
# NOTE(review): N and M are rebound here; in the matmul section the same names
# held the matrix dimensions.
N, M = 3, 4

# Random int8 kernels (C_out x C_in) and input image (M x N x C_in).
# NOTE(review): np.random.randint excludes `high`, so int8max itself is never
# drawn, and no seed is set, so the data differs on every run.
K = np.random.randint(low=int8min, high=int8max, size=(C_out, C_in), dtype=np.int8)
D = np.random.randint(low=int8min, high=int8max, size=(M, N, C_in), dtype=np.int8)

In [None]:
# tf.nn.conv2d cannot handle integer tensors, so both operands become tf.float64.
# Input: add a leading axis of size 1 -> (1, M, N, C_in).
D_tf = tf.convert_to_tensor(D[np.newaxis, ...], dtype=tf.float64)
# Kernel: transpose, then add two leading axes of size 1 -> (1, 1, C_in, C_out).
K_tf = tf.convert_to_tensor(K.T[np.newaxis, np.newaxis, :, :], dtype=tf.float64)

# Reference output, cast back to int32 and stripped of size-1 axes.
Y_tf = tf.cast(
    tf.nn.conv2d(D_tf, K_tf, strides=1, padding="VALID", data_format='NHWC'),
    tf.int32,
)
Y = np.squeeze(Y_tf.numpy())

In [None]:
# Mimic the XS3 memory layout: every array becomes a flat, row-major buffer.
K_XS3 = K.reshape(-1).copy()
D_XS3 = D.reshape(-1).copy()
# One int32 slot per (row, column, output channel).
Y_XS3 = np.zeros(M * N * C_out, dtype=np.int32)

In [None]:
# Run the simulated 1x1 convolution; the result is written into Y_XS3 in place.
XS3_conv2d_1x1(vpu, K_XS3, C_in, C_out, D_XS3, M, N, Y_XS3)

# Element-wise comparison against the flattened TensorFlow reference.
print("Output matches reference: {}".format((Y_XS3 == Y.flatten()).all()))