In [None]:
import numpy as np
from XS3VPU import XS3VPU

import tensorflow as tf
# Report whether TensorFlow runs eagerly; a later cell calls `.numpy()` on a
# tensor returned by tf.nn.conv2d.
print("Executing eagerly: {}".format(tf.executing_eagerly()))

# VPU simulator instance used only to query the bounds below.
# NOTE(review): bpe=8 presumably selects 8-bit elements — confirm in XS3VPU.
vpu = XS3VPU(bpe=8)
# Lower/upper bounds reused by the following cells as the range of the random
# int8 test data.
int8min, int8max = vpu._sat_bounds(vpu._single)

# Chunk

A chunk is an `acc_period` $\times$ `ve` submatrix of the weights that is multiplied with a single `ve`-element vector of data.

In [None]:
def compute_chunk(vpu, W, x, W_start, W_step, x_start):
    """Process one chunk: a single data vector against `vpu.acc_period` weight rows.

    Passes `vpu.ve` elements of `x`, starting at `x_start`, to `vpu.VLDC`, then
    calls `vpu.VLMACCR` once per weight row with a `vpu.ve`-element slice of `W`.

    Args:
        vpu: object providing `ve`, `acc_period`, `VLDC` and `VLMACCR`.
        W: flat, sliceable weight buffer.
        x: flat, sliceable data buffer.
        W_start: index in `W` of the first weight row of this chunk.
        W_step: index distance in `W` between consecutive weight rows.
        x_start: index in `x` of the first data element of this chunk.
    """
    lanes = vpu.ve
    vpu.VLDC(x[x_start:x_start + lanes])
    # One VLMACCR per weight row; this loop is meant to be unrolled in asm.
    for row in range(vpu.acc_period):
        offset = W_start + row * W_step
        vpu.VLMACCR(W[offset:offset + lanes])

In [None]:
# NOTE(review): no VCLRDR is issued in this cell, so the check relies on a newly
# constructed XS3VPU starting with cleared accumulators — confirm.
vpu = XS3VPU(bpe=8)

# Generate coefficients and data for the test.  np.random.randint excludes its
# upper bound, so pass int8max + 1 to make int8max itself a possible value.
W = np.random.randint(int8min, int(int8max) + 1, size=(vpu.acc_period, vpu.ve), dtype=np.int8)
x = np.random.randint(int8min, int(int8max) + 1, size=(vpu.ve,), dtype=np.int8)

# Reference output, computed on int32 copies of the int8 operands.
y = np.matmul(np.int32(W), np.int32(x))

# Mimic XS3 layout in memory: weights flattened row by row, data unchanged.
W_XS3 = np.copy(W.flatten())
x_XS3 = np.copy(x)

compute_chunk(vpu, W_XS3, x_XS3, W_start=0, W_step=vpu.ve, x_start=0)
print("Result matches reference: {}".format(np.all(vpu._combine_vD_vR() == y)))

# Tile

A tile is a sequence of chunks producing a single vector of outputs, while operating on contiguously laid out data.

In [None]:
def compute_tile(vpu, W, x, N_chunks, W_start, W_step, W_chunk_step, x_start):
    """Process a tile: `N_chunks` chunks over back-to-back data vectors.

    Chunk number `c` reads its data at `x_start + c * vpu.ve` and its weights
    starting at `W_start + c * W_chunk_step`.

    Args:
        vpu: VPU object forwarded to `compute_chunk`; must provide `ve`.
        W: flat weight buffer.
        x: flat data buffer.
        N_chunks: number of chunks in the tile.
        W_start: index in `W` of the first chunk's first weight row.
        W_step: index distance in `W` between consecutive weight rows.
        W_chunk_step: index distance in `W` between consecutive chunks.
        x_start: index in `x` of the first chunk's data.
    """
    for chunk in range(N_chunks):
        compute_chunk(
            vpu, W, x,
            W_start=W_start + chunk * W_chunk_step,
            W_step=W_step,
            x_start=x_start + chunk * vpu.ve,
        )

In [None]:
# NOTE(review): no VCLRDR is issued in this cell, so the check relies on a newly
# constructed XS3VPU starting with cleared accumulators — confirm.
vpu = XS3VPU(bpe=8)
N_chunks = 3
N = vpu.ve * N_chunks

# Generate coefficients and data for the test.  np.random.randint excludes its
# upper bound, so pass int8max + 1 to make int8max itself a possible value.
W = np.random.randint(int8min, int(int8max) + 1, size=(vpu.acc_period, N), dtype=np.int8)
x = np.random.randint(int8min, int(int8max) + 1, size=(N,), dtype=np.int8)

# Reference output, computed on int32 copies of the int8 operands.
y = np.matmul(np.int32(W), np.int32(x))

# Mimic XS3 layout in memory: weights flattened row by row, data unchanged.
W_XS3 = np.copy(W.flatten())
x_XS3 = np.copy(x)

# Rows of W are N elements apart; consecutive chunks are vpu.ve elements apart.
compute_tile(vpu, W_XS3, x_XS3, N_chunks=N_chunks,
             W_start=0, W_step=N, W_chunk_step=vpu.ve,
             x_start=0)
print("Result matches reference: {}".format(np.all(vpu._combine_vD_vR() == y)))

# Band

A band is a sequence of tiles producing a single vector of outputs, operating on a sequence of equally spaced, contiguously laid out data vectors.

In [None]:
def compute_band(vpu, W, x, N_chunks, N_tiles, W_start, W_step, W_chunk_step, W_tile_step, x_start, x_tile_step):
    """Process a band: `N_tiles` tiles whose data blocks are equally spaced in `x`.

    Tile number `t` reads its data starting at `x_start + t * x_tile_step` and
    its weights starting at `W_start + t * W_tile_step`.

    Args:
        vpu: VPU object forwarded to `compute_tile`.
        W: flat weight buffer.
        x: flat data buffer.
        N_chunks: number of chunks per tile.
        N_tiles: number of tiles in the band.
        W_start: index in `W` of the first tile's first weight row.
        W_step: index distance in `W` between consecutive weight rows.
        W_chunk_step: index distance in `W` between consecutive chunks.
        W_tile_step: index distance in `W` between consecutive tiles.
        x_start: index in `x` of the first tile's data.
        x_tile_step: index distance in `x` between consecutive tiles.
    """
    for tile in range(N_tiles):
        compute_tile(
            vpu, W, x, N_chunks,
            W_start=W_start + tile * W_tile_step,
            W_step=W_step,
            W_chunk_step=W_chunk_step,
            x_start=x_start + tile * x_tile_step,
        )

In [None]:
# NOTE(review): no VCLRDR is issued in this cell, so the check relies on a newly
# constructed XS3VPU starting with cleared accumulators — confirm.
vpu = XS3VPU(bpe=8)
N_chunks, N_tiles = 3, 2
N = vpu.ve * N_chunks * N_tiles

# Generate coefficients and data for the test.  np.random.randint excludes its
# upper bound, so pass int8max + 1 to make int8max itself a possible value.
W = np.random.randint(int8min, int(int8max) + 1, size=(vpu.acc_period, N), dtype=np.int8)
x = np.random.randint(int8min, int(int8max) + 1, size=(N,), dtype=np.int8)

# Reference output, computed on int32 copies of the int8 operands.
y = np.matmul(np.int32(W), np.int32(x))

# Mimic XS3 layout in memory: weights flattened row by row, data unchanged.
W_XS3 = np.copy(W.flatten())
x_XS3 = np.copy(x)

# Each tile covers N_chunks * vpu.ve consecutive elements of both a weight row
# and the data vector, hence the identical tile steps for W and x.
compute_band(vpu, W_XS3, x_XS3, N_chunks=N_chunks, N_tiles=N_tiles,
             W_start=0, W_step=N, W_chunk_step=vpu.ve, W_tile_step=N_chunks*vpu.ve,
             x_start=0, x_tile_step=N_chunks*vpu.ve)
print("Result matches reference: {}".format(np.all(vpu._combine_vD_vR() == y)))

# Dense matrix multiplication

In [None]:
def XS3_matmul(vpu, W, x, y, N_bands, N_chunks):
    """Matrix-vector product written into `y`, one band of outputs at a time.

    For each band the VPU is cleared with `VCLRDR`, one tile is computed over
    the data starting at index 0 of `x`, and the value of
    `vpu._combine_vD_vR()` is stored into the next `vpu.acc_period` entries
    of `y`.

    Args:
        vpu: VPU object providing `ve`, `acc_period`, `VCLRDR` and
            `_combine_vD_vR`.
        W: flat weight buffer; each weight row holds `N_chunks * vpu.ve`
            elements and rows are stored back to back.
        x: flat data buffer.
        y: output buffer, modified in place.
        N_bands: number of output bands (`vpu.acc_period` outputs each).
        N_chunks: number of `vpu.ve`-element chunks per weight row.
    """
    row_len = N_chunks * vpu.ve
    band_outputs = vpu.acc_period
    band_len = band_outputs * row_len  # weight elements consumed per band
    for band in range(N_bands):
        vpu.VCLRDR()
        compute_tile(
            vpu, W, x, N_chunks,
            W_start=band * band_len,
            W_step=row_len,
            W_chunk_step=vpu.ve,
            x_start=0,
        )
        out = band * band_outputs
        y[out:out + band_outputs] = vpu._combine_vD_vR()

In [None]:
vpu = XS3VPU(bpe=8)
N_chunks, N_bands = 3, 2
N = N_chunks * vpu.ve
M = N_bands * vpu.acc_period

# Generate coefficients and data.  np.random.randint excludes its upper bound,
# so pass int8max + 1 to make int8max itself a possible value.
W = np.random.randint(int8min, int(int8max) + 1, size=(M, N), dtype=np.int8)
x = np.random.randint(int8min, int(int8max) + 1, size=(N,), dtype=np.int8)

# Reference output, computed on int32 copies of the int8 operands.
y = np.matmul(np.int32(W), np.int32(x))

# Mimic XS3 layout in memory: weights flattened row by row, data unchanged,
# output buffer with the same shape and dtype as the reference.
W_XS3 = np.copy(W.flatten())
x_XS3 = np.copy(x)
y_XS3 = np.zeros(y.shape, dtype=y.dtype)

XS3_matmul(vpu, W_XS3, x_XS3, y_XS3, N_bands, N_chunks)
print("Output matches reference: {}".format(np.all(y_XS3 == y)))

# 1x1 Convolution

In [None]:
def XS3_conv2d_1x1(vpu, K, x, y, height, width, C_out_bands, C_in_chunks):
    """1x1 convolution: one `XS3_matmul` per pixel of a `height` x `width` image.

    Pixel number `p` reads `x` from index `p * C_in_chunks * vpu.ve` and writes
    `y` from index `p * C_out_bands * vpu.acc_period`.

    Args:
        vpu: VPU object forwarded to `XS3_matmul`; must provide `ve` and
            `acc_period`.
        K: flat kernel (weight) buffer, passed unchanged to every matmul.
        x: flat input buffer.
        y: flat output buffer. Results are written through the slice `y[...:]`,
            so they only reach `y` if slicing yields a view (as with NumPy
            arrays).
        height: image height in pixels.
        width: image width in pixels.
        C_out_bands: number of output bands per pixel.
        C_in_chunks: number of input chunks per pixel.
    """
    in_stride = C_in_chunks * vpu.ve
    out_stride = C_out_bands * vpu.acc_period
    for pixel in range(height * width):
        XS3_matmul(
            vpu, K,
            x[pixel * in_stride:],
            y[pixel * out_stride:],
            C_out_bands, C_in_chunks,
        )

In [None]:
vpu = XS3VPU(bpe=8)
C_in_chunks, C_out_bands = 2, 3
C_in = C_in_chunks * vpu.ve
C_out = C_out_bands * vpu.acc_period
width = 4
height = 6

# Generate kernels and data.  np.random.randint excludes its upper bound, so
# pass int8max + 1 to make int8max itself a possible value.
K = np.random.randint(int8min, int(int8max) + 1, size=(C_out, C_in), dtype=np.int8)
D = np.random.randint(int8min, int(int8max) + 1, size=(height, width, C_in), dtype=np.int8)

# convert to tf.float64, because tf.nn.conv2d cannot handle integer tensors
D_tf = tf.convert_to_tensor(D, dtype=tf.float64)
D_tf = tf.expand_dims(D_tf, axis=0)  # add the batch dimension (NHWC)
# K is (C_out, C_in); transpose and add two leading axes to get (1, 1, C_in, C_out).
K_tf = tf.convert_to_tensor(K.T, dtype=tf.float64)
K_tf = tf.expand_dims(tf.expand_dims(K_tf, axis=0), axis=0)

# reference output
Y_tf = tf.nn.conv2d(D_tf, K_tf, strides=1, padding="VALID", data_format='NHWC')
Y_tf = tf.cast(Y_tf, tf.int32)
Y = Y_tf.numpy()

# mimic XS3 layout in memory: all buffers flattened
K_XS3 = np.copy(K.flatten())
D_XS3 = np.copy(D.flatten())
Y_XS3 = np.zeros((height, width, C_out), dtype=np.int32).flatten()

XS3_conv2d_1x1(vpu, K_XS3, D_XS3, Y_XS3, height, width, C_out_bands, C_in_chunks)
print("Output matches reference: {}".format(np.all(Y_XS3 == Y.flatten())))

# General $K_h \times K_w$ Convolution

In [None]:
def XS3_conv2d(vpu, K, x, y, height, width, K_h, K_w, C_out_bands, C_in_chunks):
    """Stride-1 2D convolution without padding, written into `y`.

    For every output position and every output band the VPU is cleared with
    `VCLRDR`, one band of `K_h` tiles (one per kernel row) is computed, and
    the value of `vpu._combine_vD_vR()` is stored into `vpu.acc_period`
    consecutive entries of `y`.

    Args:
        vpu: VPU object providing `ve`, `acc_period`, `VCLRDR` and
            `_combine_vD_vR`.
        K: flat kernel buffer; each output channel occupies
            `K_h * K_w * C_in_chunks * vpu.ve` consecutive elements.
        x: flat input buffer with `C_in_chunks * vpu.ve` elements per pixel
            and `width` pixels per image row.
        y: flat output buffer, modified in place.
        height: input height in pixels.
        width: input width in pixels.
        K_h: kernel height.
        K_w: kernel width.
        C_out_bands: number of output bands per output position.
        C_in_chunks: number of input chunks per pixel.
    """
    lanes = vpu.ve
    band_outputs = vpu.acc_period

    pixel_len = C_in_chunks * lanes        # elements per input pixel
    kernel_row_len = K_w * pixel_len       # elements per kernel row
    kernel_len = K_h * kernel_row_len      # elements per output channel
    image_row_len = width * pixel_len      # elements per input image row

    out_height = height - K_h + 1
    out_width = width - K_w + 1

    for hi in range(out_height):
        for wi in range(out_width):
            x_base = (hi * width + wi) * pixel_len
            y_base = (hi * out_width + wi) * C_out_bands * band_outputs
            for band in range(C_out_bands):
                vpu.VCLRDR()
                compute_band(
                    vpu, K, x,
                    N_chunks=K_w * C_in_chunks, N_tiles=K_h,
                    W_start=band * band_outputs * kernel_len,
                    W_step=kernel_len,
                    W_chunk_step=lanes,
                    W_tile_step=kernel_row_len,
                    x_start=x_base,
                    x_tile_step=image_row_len,
                )
                out = y_base + band * band_outputs
                y[out:out + band_outputs] = vpu._combine_vD_vR()

In [None]:
vpu = XS3VPU(bpe=8)
C_in_chunks, C_out_bands = 2, 3
C_in = C_in_chunks * vpu.ve
C_out = C_out_bands * vpu.acc_period
height, width = 7, 5
K_h, K_w = 3, 4

# Generate kernels and data.  np.random.randint excludes its upper bound, so
# pass int8max + 1 to make int8max itself a possible value.
K = np.random.randint(int8min, int(int8max) + 1, size=(C_out, K_h, K_w, C_in), dtype=np.int8)
D = np.random.randint(int8min, int(int8max) + 1, size=(height, width, C_in), dtype=np.int8)

# convert to tf.float64, because tf.nn.conv2d cannot handle integer tensors
D_tf = tf.convert_to_tensor(D, dtype=tf.float64)
D_tf = tf.expand_dims(D_tf, axis=0)  # add the batch dimension (NHWC)
K_tf = tf.convert_to_tensor(K, dtype=tf.float64)
# Reorder the kernel from (C_out, K_h, K_w, C_in) to (K_h, K_w, C_in, C_out).
K_tf = tf.transpose(K_tf, perm=[1, 2, 3, 0])

# reference output
Y_tf = tf.nn.conv2d(D_tf, K_tf, strides=1, padding="VALID", data_format='NHWC')
Y_tf = tf.cast(Y_tf, tf.int32)
Y = Y_tf.numpy()

# mimic XS3 layout in memory: all buffers flattened
K_XS3 = np.copy(K.flatten())
D_XS3 = np.copy(D.flatten())
Y_XS3 = np.zeros((height - K_h + 1, width - K_w + 1, C_out), dtype=np.int32).flatten()

XS3_conv2d(vpu, K_XS3, D_XS3, Y_XS3, height, width, K_h, K_w, C_out_bands, C_in_chunks)
print("Output matches reference: {}".format(np.all(Y_XS3 == Y.flatten())))