In [10]:
import numpy as np
import warnings
import tensorflow as tf
warnings.filterwarnings(action='ignore')
import tensorflow as tf
warnings.filterwarnings(action='default')
import os

In [2]:
# Integer range descriptors for the three signed widths used by the reference models.
iinfo_s8, iinfo_s16, iinfo_s32 = (
    np.iinfo(np.int8),
    np.iinfo(np.int16),
    np.iinfo(np.int32),
)

# Bit width looked up by an array's `.dtype`. The keys are dtype instances
# (not the scalar classes such as np.int8) because, per the original note,
# using the scalar class directly as the key does not work for this lookup.
bitdepth_map = {
    np.dtype(scalar_type): width
    for scalar_type, width in ((np.int8, 8), (np.int16, 16), (np.int32, 32))
}

# Directory (relative to this notebook) under which the generated test data is written.
unit_test_dir = "../../../xcore/operator_book/test/nn_operators/"

# Helper Functions

In [3]:
def rand_tensor_s8(*dims):
    """Return a random int8 array of shape `dims`, drawn uniformly from [-128, 127]."""
    shape = tuple(dims)
    lowest, past_highest = -128, 128  # randint treats the upper bound as exclusive
    return np.random.randint(lowest, past_highest, size=shape, dtype=np.int8)
def rand_tensor_s32(*dims):
    """Return a random int32 array of shape `dims` covering the full signed 32-bit range."""
    lowest = iinfo_s32.min
    # The exclusive upper bound is one past INT32_MAX, so it is formed as an int64.
    past_highest = np.int64(iinfo_s32.max) + 1
    return np.random.randint(lowest, past_highest, size=tuple(dims), dtype=np.int32)

def vlsat_s16(x, shift):
    """Rounding, saturating shift of accumulators down to signed 16 bits.

    For every element along the last axis of `x`:
      * shift > 0: add half of the shifted-out range, then arithmetic
        right-shift by `shift` (round half up);
      * shift < 0: left-shift by `-shift`;
      * shift == 0: value is passed through.
    The result is clipped to the symmetric range [-32767, 32767].

    Args:
        x: integer array of accumulators. It is NOT modified.
        shift: integer array of per-element shifts. Either its shape matches
            the trailing axes of `x` (e.g. one shift per channel) or it has
            the same shape as `x`.

    Returns:
        np.int16 array with the same shape as `x`.
    """
    s16 = np.iinfo(np.int16)

    # Work on a private 64-bit copy: the caller's array stays untouched and
    # neither the rounding add nor a left shift can wrap before saturation.
    acc = np.array(x, dtype=np.int64)

    # Widen the shifts too. With int16 shifts, `1 << (shift - 1)` is evaluated
    # in int16 and wraps for shift >= 16, corrupting the rounding constant.
    shift = np.asarray(shift).astype(np.int64)

    psh = shift > 0
    nsh = shift < 0

    acc[..., psh] += np.int64(1) << (shift[psh] - 1)
    acc[..., psh] = acc[..., psh] >> shift[psh]
    acc[..., nsh] = acc[..., nsh] << -shift[nsh]

    return np.clip(acc, a_min=s16.min + 1, a_max=s16.max).astype(np.int16)
    

def vlmul_s16(x, scales):
    """Multiply 16-bit values by per-channel 16-bit scales, returning 16-bit results.

    `scales` is matched against the last axis of `x`. The 32-bit products are
    passed through vlsat_s16() with a shift of 14 on every element.
    """
    assert x.shape[-1] == scales.shape[-1]

    # Prepend singleton axes so `scales` has at least as many dimensions as `x`.
    missing_axes = x.ndim - scales.ndim
    if missing_axes > 0:
        scales = scales.reshape((1,) * missing_axes + scales.shape)

    product = np.multiply(x.astype(np.int32), scales)
    assert product.dtype == np.int32

    # vlmul for s16 has an implicit built-in vlsat on the 32-bit result,
    # where all shifts are 14
    fixed_shift = np.full(product.shape, 14, dtype=np.int16)
    return vlsat_s16(product, fixed_shift)

def vdepth8(x):
    """Reduce an int16/int32 array to int8 with rounding and symmetric saturation.

    The number of bits dropped is the input's bit width (looked up in
    `bitdepth_map`) minus 8. Half of the dropped range is added before the
    arithmetic right shift, and the result is clipped to [-127, 127].
    """
    drop_bits = bitdepth_map[x.dtype] - 8
    widened = x.astype(np.int64)
    assert drop_bits > 0

    half = 1 << (drop_bits - 1)
    reduced = (widened + half) >> drop_bits
    return np.clip(reduced, a_min=iinfo_s8.min + 1, a_max=iinfo_s8.max).astype(np.int8)
    

def conv2d_s8(K, X, zero_point=0):
    """Same-size 2D convolution of an 8-bit image with 32-bit accumulation.

    Args:
        K: kernel tensor of shape (C_out, K_h, K_w, C_in); K_h and K_w must be odd.
        X: input image of shape (height, width, C_in).
        zero_point: value used for the padding around the image.

    Returns:
        np.int32 array of shape (height, width, C_out).
    """
    C_out, K_h, K_w, C_in = K.shape
    height, width, C_inx = X.shape
    assert K_h % 2 == 1
    assert K_w % 2 == 1
    assert C_in == C_inx

    pad_rows, pad_cols = K_h // 2, K_w // 2

    # Border filled with zero_point, image copied into the centre.
    padded_shape = (height + 2 * pad_rows, width + 2 * pad_cols, C_in)
    X_padded = (np.ones(shape=padded_shape, dtype=np.int8) * zero_point).astype(np.int8)
    X_padded[pad_rows:pad_rows + height, pad_cols:pad_cols + width, :] = X

    Y = np.zeros(shape=(height, width, C_out), dtype=np.int32)

    for r, c in np.ndindex(height, width):
        window = X_padded[r:r + K_h, c:c + K_w, :].astype(np.int32)
        # One dot product per output channel: reduce over (K_h, K_w, C_in).
        Y[r, c, :] = np.sum(K * window, axis=(1, 2, 3))

    return Y


def inflate_BTensor(B, height, width):
    """Expand a kernel-sized bias tensor to one bias vector per output pixel.

    Args:
        B: array of shape (K_h, K_w, C_out).
        height, width: spatial size of the output image.

    Returns:
        Array of shape (height, width, C_out) with B's dtype. Each pixel takes
        the B entry selected from its distance to the image edges: pixels far
        from every edge take the centre entry, pixels near an edge take an
        entry offset towards that edge.
    """
    K_h, K_w, C_out = B.shape
    half_h, half_w = K_h // 2, K_w // 2

    def source_index(pos, half, extent):
        # Amount by which the kernel overhangs the leading / trailing edge at `pos`.
        lead_overhang = max(half - pos, 0)
        trail_overhang = max(pos + half - extent + 1, 0)
        return trail_overhang - lead_overhang + half

    res = np.zeros(shape=(height, width, C_out), dtype=B.dtype)

    for row in range(height):
        b_row = source_index(row, half_h, height)
        for col in range(width):
            res[row, col] = B[b_row, source_index(col, half_w, width)]

    return res


In [4]:
# Visual sanity check for inflate_BTensor(): each entry of B is an arrow naming
# its position in a 3x3 kernel, so the printed grids show which entry every
# output pixel picks up. Disabled by default.
if False:
    B = np.array([['🡔','🡑','🡕'],['🡐','O','🡒'],['🡗','🡓','🡖']])[:,:,np.newaxis]

    print("B:\n",B[:,:,0], "\n\n===")
    # Inflate to a few image sizes, printing each grid as soon as it is built.
    for res in (inflate_BTensor(B, rows, cols) for rows, cols in ((3, 3), (6, 10), (4, 7))):
        print(res[:,:,0])

# maxpool2d_deep()

In [5]:

def maxpool2d_deep(tensor_in):
    """Reference 2x2, stride-2 max pooling of a single 3-D tensor.

    TensorFlow's max_pool2d is used as the reference. A leading batch axis of
    size 1 is added for the call and removed from the result.

    Args:
        tensor_in: 3-D array; it becomes the last three axes of the 4-D
            tensor handed to tf.nn.max_pool2d.

    Returns:
        NumPy array holding the pooled tensor without the batch axis.
    """
    assert(tensor_in.ndim == 3)
    # The original spelled this `td.expand_dims`, an undefined name (NameError).
    batched = tf.expand_dims(tensor_in, axis=0)
    pooled = tf.nn.max_pool2d(batched, ksize=2, strides=2, padding='VALID')
    return pooled[0,:,:,:].numpy()


def test_case_maxpool2d_deep(width, height, chans, writefile=None):
    """Produce one random maxpool2d_deep() test case.

    Args:
        width, height, chans: sizes of the three axes of the random input, in
            that order (the input array has shape (width, height, chans)).
            NOTE(review): the convolution generators in this notebook build
            their images as (height, width, channels) - confirm this order is
            intended.
        writefile: optional path; when truthy the input followed by the pooled
            output is written to it as raw binary.

    Returns:
        Tuple (A, A_out): the random int8 input and its pooled output. Returned
        so the case is usable without a file, like the other test_case_*
        generators; previously the function returned None.
    """
    A = rand_tensor_s8(width, height, chans)

    A_out = maxpool2d_deep(A)
    if writefile:
        with open(writefile, "wb+") as file:
            A.tofile(file)
            A_out.tofile(file)

    return A, A_out
        



# fc_deepin_shallowout_16()

In [6]:


def fc_deepin_shallowout_16(W, B, X, shifts, scales):
    """Reference fully-connected layer with 16-bit outputs.

    Computes W.X + B with 32-bit accumulation, then applies the per-output
    saturating shift (vlsat_s16) followed by the per-output scale (vlmul_s16).
    """
    acc = W.astype(np.int32) @ X.astype(np.int32) + B
    assert acc.shape == shifts.shape
    return vlmul_s16(vlsat_s16(acc, shifts), scales)


def test_case_fc_deepin_shallowout_16(C_in, C_out, writefile=None):
    """Produce one random fc_deepin_shallowout_16() test case.

    Weights, inputs, biases, shifts and scales are drawn at random and then
    adjusted so the vector exercises the operator meaningfully:
      * biases are halved until at most 10% of them exceed the magnitude of
        the corresponding dot product;
      * shifts are lowered by 2 until at most 10% of the shifted outputs are
        zero, then raised by 2 until at most 10% of them are saturated;
      * scales are reduced by 10% until at most 10% of the final outputs are
        saturated.

    Args:
        C_in: number of input channels (length of X, columns of W).
        C_out: number of output channels (rows of W; length of B, shifts, scales).
        writefile: optional path; when truthy W, X, B, shifts, scales and Y
            are written to it, in that order, as raw binary.

    Returns:
        Tuple (W, X, B, shifts, scales, Y).
    """
    # The order of these random draws determines the data produced for a given seed.
    W = rand_tensor_s8(C_out, C_in)
    X = rand_tensor_s8(C_in)
    B = rand_tensor_s32(C_out)

    shifts = np.random.randint(0, 16, size=C_out, dtype=np.int16)
    scales = np.random.randint(0x4000, 0x8000, size=C_out, dtype=np.int16)

    # W and X are never modified below, so the raw dot products are computed once.
    dot = np.matmul(W.astype(np.int32), X.astype(np.int32))
    max_outliers = C_out / 10

    def shifted():
        # `dot + B` is a fresh array on every call; vlsat_s16 may modify its
        # argument in place, so `dot` itself must never be handed to it.
        return vlsat_s16(dot + B, shifts)

    def saturated(values):
        return np.logical_or(values == iinfo_s16.max, values == (iinfo_s16.min+1))

    #shouldn't let bias dominate the pre-activation value
    while True:
        too_biased = np.abs(dot) < np.abs(B)
        if np.sum(too_biased) <= max_outliers:
            break
        B[too_biased] = (B[too_biased] * 0.5).astype(np.int32)

    #going to iterate to make sure our shifts are reasonable
    #only allow <10% of values to be 0
    while True:
        zeros = (shifted() == 0)
        if np.sum(zeros) <= max_outliers:
            break
        shifts[zeros] = shifts[zeros] - 2

    #only allow <10% of values to saturate
    while True:
        sats = saturated(shifted())
        if np.sum(sats) <= max_outliers:
            break
        shifts[sats] = shifts[sats] + 2

    #going to iterate to make sure our scales are reasonable
    #only allow <10% of values to saturate
    while True:
        sats = saturated(vlmul_s16(shifted(), scales))
        if np.sum(sats) <= max_outliers:
            break
        scales[sats] = (scales[sats] * 0.9).astype(np.int16)

    Y = fc_deepin_shallowout_16(W, B, X, shifts, scales)

    if writefile:
        with open(writefile, "wb+") as file:
            # Element sizes are part of the file format, so check them before writing.
            assert(W.dtype == np.int8 and X.dtype == np.int8 and B.dtype == np.int32
                    and shifts.dtype == np.int16 and scales.dtype == np.int16
                    and Y.dtype == np.int16)
            W.tofile(file)
            X.tofile(file)
            B.tofile(file)
            shifts.tofile(file)
            scales.tofile(file)
            Y.tofile(file)

    return W,X,B,shifts,scales,Y


#Generate test vectors for fc_deepin_shallowout_16()
# Disabled by default; flip the guard to regenerate the data files.
if False:
    thing = [ #caseNum, C_in, C_out, numVectors
        (2, 32, 4, 10),
        (3, 96, 15, 10),
    ]
    for (caseNum, C_in, C_out, vecs) in thing:

        # Zero-length stacks; one vector is appended per generated case.
        Ws      = np.zeros(shape=(0, C_out, C_in), dtype=np.int8)
        Xs      = np.zeros(shape=(0, C_in), dtype=np.int8)
        Bs      = np.zeros(shape=(0, C_out), dtype=np.int32)
        shiftss = np.zeros(shape=(0, C_out), dtype=np.int16)
        scaless = np.zeros(shape=(0, C_out), dtype=np.int16)
        Ys      = np.zeros(shape=(0, C_out), dtype=np.int16)

        for i in range(vecs):
            # The data file must carry the same "_16" stem as the header
            # written below. It previously used the "_8" stem, which would
            # overwrite the fc_deepin_shallowout_8 vectors and never create
            # the 16-bit ones.
            W,X,B,shifts,scales,Y = test_case_fc_deepin_shallowout_16(C_in, C_out,
                    writefile=os.path.join(unit_test_dir,
                        "test_data/fc_deepin_shallowout_16_case{0}.{1}.dat".format(caseNum, i)))

            Ws      = np.append(Ws,      W[np.newaxis,...],      axis=0)
            Xs      = np.append(Xs,      X[np.newaxis,...],      axis=0)
            Bs      = np.append(Bs,      B[np.newaxis,...],      axis=0)
            shiftss = np.append(shiftss, shifts[np.newaxis,...], axis=0)
            scaless = np.append(scaless, scales[np.newaxis,...], axis=0)
            Ys      = np.append(Ys,      Y[np.newaxis,...],      axis=0)


        #These will be used to ensure we're not passing the test because files weren't
        # correctly loaded
        # (Y_CHECK lists the first output of every vector in this case.)
        ychk_str = ("#undef Y_CHECK\n#define Y_CHECK  "
                    +  ",".join([str(x) for x in Ys[:,0]]) + "\n")

        with open(os.path.join(unit_test_dir,
                               "test_data/fc_deepin_shallowout_16_case{0}.h".format(caseNum)), "w+"
                 ) as file:
            file.write(ychk_str)

# fc_deepin_shallowout_8()

In [14]:

def fc_deepin_shallowout_8(W, B, X, shifts, scales):
    """Reference fully-connected layer with 8-bit outputs.

    Computes W.X + B with 32-bit accumulation, applies the per-output
    saturating shift (vlsat_s16) and scale (vlmul_s16), then reduces the
    16-bit result to 8 bits with vdepth8.
    """
    acc = W.astype(np.int32) @ X.astype(np.int32) + B
    assert acc.shape == shifts.shape
    return vdepth8(vlmul_s16(vlsat_s16(acc, shifts), scales))


def test_case_fc_deepin_shallowout_8(C_in, C_out, writefile=None):
    """Produce one random fc_deepin_shallowout_8() test case.

    Weights, inputs, biases, shifts and scales are drawn at random and then
    adjusted so the vector exercises the operator meaningfully:
      * biases are halved until at most 10% of them exceed the magnitude of
        the corresponding dot product;
      * shifts are lowered by 2 until at most 10% of the shifted outputs are
        zero, then raised by 2 until at most 10% of them are saturated;
      * scales are reduced by 10% until at most 10% of the scaled 16-bit
        outputs are saturated.

    Args:
        C_in: number of input channels (length of X, columns of W).
        C_out: number of output channels (rows of W; length of B, shifts, scales).
        writefile: optional path; when truthy W, X, B, shifts, scales and Y
            are written to it, in that order, as raw binary.

    Returns:
        Tuple (W, X, B, shifts, scales, Y).
    """
    # The order of these random draws determines the data produced for a given seed.
    W = rand_tensor_s8(C_out, C_in)
    X = rand_tensor_s8(C_in)
    B = rand_tensor_s32(C_out)

    shifts = np.random.randint(0, 16, size=C_out, dtype=np.int16)
    scales = np.random.randint(0x4000, 0x8000, size=C_out, dtype=np.int16)

    # W and X are never modified below, so the raw dot products are computed once.
    dot = np.matmul(W.astype(np.int32), X.astype(np.int32))
    max_outliers = C_out / 10

    def shifted():
        # `dot + B` is a fresh array on every call; vlsat_s16 may modify its
        # argument in place, so `dot` itself must never be handed to it.
        return vlsat_s16(dot + B, shifts)

    def saturated(values):
        return np.logical_or(values == iinfo_s16.max, values == (iinfo_s16.min+1))

    #shouldn't let bias dominate the pre-activation value
    while True:
        too_biased = np.abs(dot) < np.abs(B)
        if np.sum(too_biased) <= max_outliers:
            break
        B[too_biased] = (B[too_biased] * 0.5).astype(np.int32)

    #going to iterate to make sure our shifts are reasonable
    #only allow <10% of values to be 0
    while True:
        zeros = (shifted() == 0)
        if np.sum(zeros) <= max_outliers:
            break
        shifts[zeros] = shifts[zeros] - 2

    #only allow <10% of values to saturate
    while True:
        sats = saturated(shifted())
        if np.sum(sats) <= max_outliers:
            break
        shifts[sats] = shifts[sats] + 2

    #going to iterate to make sure our scales are reasonable
    #only allow <10% of values to saturate
    while True:
        sats = saturated(vlmul_s16(shifted(), scales))
        if np.sum(sats) <= max_outliers:
            break
        scales[sats] = (scales[sats] * 0.9).astype(np.int16)

    Y = fc_deepin_shallowout_8(W, B, X, shifts, scales)

    if writefile:
        with open(writefile, "wb+") as file:
            # Element sizes are part of the file format, so check them before writing.
            assert(W.dtype == np.int8 and X.dtype == np.int8 and B.dtype == np.int32
                    and shifts.dtype == np.int16 and scales.dtype == np.int16
                    and Y.dtype == np.int8)
            W.tofile(file)
            X.tofile(file)
            B.tofile(file)
            shifts.tofile(file)
            scales.tofile(file)
            Y.tofile(file)

    return W,X,B,shifts,scales,Y


# Emit the fc_deepin_shallowout_8() test vectors and their check header.
# This cell is enabled, so running the notebook rewrites the files.
if True:
    # each row: caseNum, C_in, C_out, number of vectors
    thing = [
        (2, 32, 4, 10),
        (3, 96, 15, 10),
    ]
    for (caseNum, C_in, C_out, vecs) in thing:

        # Empty stacks with the per-vector shape; each grows by one entry per vector.
        Ws      = np.empty((0, C_out, C_in), dtype=np.int8)
        Xs      = np.empty((0, C_in), dtype=np.int8)
        Bs      = np.empty((0, C_out), dtype=np.int32)
        shiftss = np.empty((0, C_out), dtype=np.int16)
        scaless = np.empty((0, C_out), dtype=np.int16)
        Ys      = np.empty((0, C_out), dtype=np.int8)

        for i in range(vecs):
            W, X, B, shifts, scales, Y = test_case_fc_deepin_shallowout_8(
                C_in,
                C_out,
                writefile=os.path.join(
                    unit_test_dir,
                    "test_data/fc_deepin_shallowout_8_case{0}.{1}.dat".format(caseNum, i),
                ),
            )

            Ws      = np.concatenate((Ws,      W[np.newaxis]),      axis=0)
            Xs      = np.concatenate((Xs,      X[np.newaxis]),      axis=0)
            Bs      = np.concatenate((Bs,      B[np.newaxis]),      axis=0)
            shiftss = np.concatenate((shiftss, shifts[np.newaxis]), axis=0)
            scaless = np.concatenate((scaless, scales[np.newaxis]), axis=0)
            Ys      = np.concatenate((Ys,      Y[np.newaxis]),      axis=0)

        # Y_CHECK holds the first output of every vector, so the test can tell
        # whether the data files were actually loaded correctly.
        ychk_str = ("#undef Y_CHECK\n#define Y_CHECK  "
                    + ",".join(str(x) for x in Ys[:, 0]) + "\n")

        with open(
            os.path.join(unit_test_dir,
                         "test_data/fc_deepin_shallowout_8_case{0}.h".format(caseNum)),
            "w+",
        ) as file:
            file.write(ychk_str)

# conv2d_deepin_deepout_relu()

In [8]:
def conv2d_deepin_deepout_relu(K, B_tensor, X, shifts, scales, debug=False):
    """Reference model for the deep-in / deep-out 2D convolution.

    Pipeline: 32-bit convolution (zero padding) -> add the position-dependent
    bias from inflate_BTensor -> saturating shift to 16 bits -> per-channel
    scale -> reduction to 8 bits. `debug` is accepted but unused.

    Returns:
        np.int8 array with the spatial size of X and one channel per kernel in K.
    """
    acc = conv2d_s8(K, X)
    acc = acc + inflate_BTensor(B_tensor, *X.shape[:2])
    assert acc.dtype == np.int32

    shifted = vlsat_s16(acc, shifts)
    assert shifted.dtype == np.int16

    scaled = vlmul_s16(shifted, scales)
    assert scaled.dtype == np.int16

    out = vdepth8(scaled)
    assert out.dtype == np.int8

    return out


def test_case_conv2d_deepin_deepout_relu(height, width, K_h, K_w, C_in, C_out, writefile=None):
    """Produce one random conv2d_deepin_deepout_relu() test case.

    Args:
        height, width: spatial size of the random input image.
        K_h, K_w: kernel height and width.
        C_in, C_out: input and output channel counts.
        writefile: optional path; when truthy the case is written as raw
            binary in this order: transformed K, transformed bias tensor, X,
            shifts, scales, Y, 32-bit accumulators.

    Returns:
        Tuple (K, B_tensor, X, shifts, scales, Y, acc32s). K and B_tensor are
        returned in their untransformed layouts.
    """
    
    def formBTensor(B, K, zero_point=0):
        """Build the (K_h, K_w, C_out) bias tensor for a given padding value.

        Each entry is the per-channel bias B plus the accumulation produced by
        convolving K over an all-zero K_h x K_w image padded with `zero_point`,
        i.e. the contribution of the padding alone at that position.
        """
        B_tensor = B.reshape((1,1,C_out))
        B_tensor = np.repeat(B_tensor, K_w, axis=1)
        B_tensor = np.repeat(B_tensor, K_h, axis=0)
        assert(B_tensor.shape == (K_h, K_w, C_out))
        K_h_half, K_w_half = K_h//2, K_w//2
        
        # All-zero image: after conv2d_s8 only the zero_point padding contributes.
        img = np.zeros(shape=(K_h, K_w, C_in), dtype=np.int8)
        
        img = conv2d_s8(K, img, zero_point)
        assert(img.dtype==np.int32)
        
        # kh/kw run over kernel offsets relative to the centre tap.
        for kh in range(-K_h_half, K_h_half+1):
            for kw in range(-K_w_half, K_w_half+1):
                B_tensor[kh+K_h_half,kw+K_w_half,:] = B + img[kh+K_h_half,kw+K_w_half,:]
        
        return B_tensor

    def transformK(K):
        """Rearrange K into the flat layout written to the test file.

        Output channels are split into groups of 16 and input channels into
        groups of 32. Within each group of 16 the output-channel order is
        reversed, and the axes are reordered to (C_out group, K_h, K_w,
        C_in group, output channel in group, input channel in group) before
        flattening.
        """
        C_out,K_h,K_w,C_in = K.shape
        assert(C_out % 16 == 0)
        assert(C_in % 32 == 0)
        assert(K_h % 2 == 1)
        assert(K_w % 2 == 1)

        Q = K.reshape((K.shape[0]//16, 16, K.shape[1], K.shape[2], K.shape[3]//32, 32))
        Q = np.transpose(np.flip(Q, axis=1), axes=(0, 2, 3, 4, 1, 5)).flatten()
        
        return Q
    
    def transformB(B):
        """Split the int32 bias tensor into two uint16 planes per kernel position.

        Plane 1 receives `B >> 16`; plane 0 receives B stored into uint16.
        NOTE(review): plane 0 presumably keeps the low 16 bits through NumPy's
        cast on assignment - confirm.
        """
        assert(B.dtype == np.int32)
        assert(B.shape[:2] == (K_h, K_w))
        
        C_out = B.shape[2]
                
        B_out = np.zeros(shape=(K_h, K_w, 2, C_out), dtype=np.uint16)
        
        B_out[:,:,1,:] = B >> 16
        B_out[:,:,0,:] = B
        return B_out

    K = rand_tensor_s8(C_out, K_h, K_w, C_in)
    X = rand_tensor_s8(height, width, C_in)
    zero_point = np.random.randint(-64, 64)
    
    # Bias per channel = negated mean of the un-biased convolution over all pixels.
    tmp = conv2d_s8(K, X)
    assert(tmp.dtype == np.int32)
    B = -(np.mean(tmp, axis=(0,1))).astype(np.int32)
    
    B_tensor = formBTensor(B, K, zero_point)
    
    tmp = tmp + inflate_BTensor(B_tensor, height, width)
    assert(tmp.dtype == np.int32)
    
    # Copy of the 32-bit accumulators, returned and written alongside the output.
    acc32s = np.array(tmp)
    
    # Per-channel shift derived from the largest accumulator magnitude
    # (presumably sized so the shifted values fit 16 bits - TODO confirm).
    tmp_min = np.min(tmp, axis=(0,1))
    tmp_max = np.max(tmp, axis=(0,1))
    tmp_max = np.max((tmp_max, np.abs(tmp_min)), axis=0)
    shifts  = np.ceil(np.log2(tmp_max)).astype(np.int16) - 15
    
    # Every channel uses the same scale value, 0x4000.
    scales = np.ones(C_out, dtype=np.int16) * 0x4000
    
    Y = conv2d_deepin_deepout_relu(K, B_tensor, X, shifts, scales)
    
    if writefile:
        with open(writefile, "wb+") as file:
            assert(K.dtype==np.int8);       assert(X.dtype==np.int8);  
            assert(shifts.dtype==np.int16); assert(scales.dtype==np.int16); assert(Y.dtype==np.int8)
                
#             tK = transformK(K)
#             for i in range(16):
#                 assert(np.all(K[i,:,:,:] == tK[15-i,:,:,:]))
            
            # Write order defines the file format.
            transformK(K).tofile(file)
            transformB(B_tensor).tofile(file)
            X.tofile(file)
            shifts.tofile(file)
            scales.tofile(file)
            Y.tofile(file)
            acc32s.tofile(file)
            
    return K,B_tensor,X,shifts,scales,Y,acc32s

# Emit the conv2d_deepin_deepout_relu() test vectors and their check header.
# FNAME is always assigned; the generation itself is disabled by default.
FNAME = "conv2d_deepin_deepout_relu"
if False:
    # each row: caseNum, C_in, C_out, K_h, K_w, height, width, numVectors
    thing = [
        (2, 32, 16, 1, 1, 4, 4, 10),
        (3, 32, 16, 3, 3, 4, 4, 10),
        (4, 64, 32, 3, 3, 8, 8, 4),
    ]
    for (caseNum, C_in, C_out, K_h, K_w, height, width, vecs) in thing:

        # Empty stacks with the per-vector shape; each grows by one entry per vector.
        Ks      = np.empty((0, C_out, K_h, K_w, C_in), dtype=np.int8)
        Xs      = np.empty((0, height, width, C_in), dtype=np.int8)
        Bs      = np.empty((0, K_h, K_w, C_out), dtype=np.int32)
        shiftss = np.empty((0, C_out), dtype=np.int16)
        scaless = np.empty((0, C_out), dtype=np.int16)
        Ys      = np.empty((0, height, width, C_out), dtype=np.int8)
        acc32s  = np.empty((0, height, width, C_out), dtype=np.int32)

        for i in range(vecs):
            K, B, X, shifts, scales, Y, acc32 = test_case_conv2d_deepin_deepout_relu(
                height, width, K_h, K_w, C_in, C_out,
                writefile=os.path.join(
                    unit_test_dir,
                    "test_data/{0}_case{1}.{2}.dat".format(FNAME, caseNum, i),
                ),
            )
            Ks      = np.concatenate((Ks,      K[np.newaxis]),      axis=0)
            Xs      = np.concatenate((Xs,      X[np.newaxis]),      axis=0)
            Bs      = np.concatenate((Bs,      B[np.newaxis]),      axis=0)
            shiftss = np.concatenate((shiftss, shifts[np.newaxis]), axis=0)
            scaless = np.concatenate((scaless, scales[np.newaxis]), axis=0)
            Ys      = np.concatenate((Ys,      Y[np.newaxis]),      axis=0)
            acc32s  = np.concatenate((acc32s,  acc32[np.newaxis]),  axis=0)

        # The header records the vector count and the first output element of
        # every vector, so the test can tell whether the data files were
        # actually loaded correctly.
        y_vec_count_str = "#undef TEST_VECTOR_COUNT\n#define TEST_VECTOR_COUNT    ({0})\n".format(vecs)
        ychk_str = ("#undef Y_CHECK\n#define Y_CHECK  "
                    + ",".join(str(x) for x in Ys[:, 0, 0, 0]) + "\n")

        with open(
            os.path.join(unit_test_dir, "test_data/{0}_case{1}.h".format(FNAME, caseNum)),
            "w+",
        ) as file:
            file.write(y_vec_count_str)
            file.write(ychk_str)

    print("Done.")

# conv2d_shallowin_deepout_relu()

In [9]:


def conv2d_shallowin_deepout_relu(K, B_tensor, X, shifts, scales):
    """Reference model for the shallow-in / deep-out 2D convolution.

    Pipeline: 32-bit convolution (zero padding) -> add the position-dependent
    bias from inflate_BTensor -> saturating shift to 16 bits -> per-channel
    scale -> reduction to 8 bits.

    Args:
        K: kernel tensor of shape (C_out, K_h, K_w, C_in). C_out must be a
            multiple of 16, C_in must be 4, K_h must be odd and K_w must be
            odd and less than 8.
        B_tensor: bias tensor of shape (K_h, K_w, C_out).
        X: input image of shape (height, width, C_in).
        shifts, scales: per-output-channel shift and scale values.

    Returns:
        np.int8 array with the spatial size of X and C_out channels.

    Raises:
        AssertionError: if K violates the shape constraints above.
    """
    # Take the dimensions from K itself. The original line had no `= K.shape`,
    # so these names were resolved as notebook globals (or raised NameError).
    C_out, K_h, K_w, C_in = K.shape
    assert(C_out % 16 == 0)
    assert(C_in == 4)
    assert(K_h % 2 == 1)
    assert((K_w < 8) and (K_w % 2 == 1))
    y = conv2d_s8(K, X)
    y += inflate_BTensor(B_tensor, X.shape[0], X.shape[1])

    y = vlsat_s16(y, shifts)
    assert(y.dtype == np.int16)
    y = vlmul_s16(y, scales)
    y = vdepth8(y)
    assert(y.dtype == np.int8)

    return y
    



def test_case_conv2d_shallowin_deepout_relu(height, width, K_h, K_w, C_in, C_out, writefile=None):
    """Produce one random conv2d_shallowin_deepout_relu() test case.

    Args:
        height, width: spatial size of the random input image.
        K_h, K_w: kernel height and width.
        C_in: input channel count; must be 4.
        C_out: output channel count.
        writefile: optional path; when truthy the case is written as raw
            binary in this order: transformed K, transformed bias tensor, X,
            shifts, scales, Y, 32-bit accumulators.

    Returns:
        Tuple (K, B_tensor, X, shifts, scales, Y, acc32s). K and B_tensor are
        returned in their untransformed layouts.
    """
    assert(C_in == 4)
    
    def formBTensor(B, K, zero_point=0):
        """Build the (K_h, K_w, C_out) bias tensor for a given padding value.

        Each entry is the per-channel bias B plus the accumulation produced by
        convolving K over an all-zero K_h x K_w image padded with `zero_point`,
        i.e. the contribution of the padding alone at that position.
        """
        B_tensor = B.reshape((1,1,C_out))
        B_tensor = np.repeat(B_tensor, K_w, axis=1)
        B_tensor = np.repeat(B_tensor, K_h, axis=0)
        assert(B_tensor.shape == (K_h, K_w, C_out))
        K_h_half, K_w_half = K_h//2, K_w//2
        
        # All-zero image: after conv2d_s8 only the zero_point padding contributes.
        img = np.zeros(shape=(K_h, K_w, C_in), dtype=np.int8)
        
        img = conv2d_s8(K, img, zero_point)
        assert(img.dtype==np.int32)
        
        # kh/kw run over kernel offsets relative to the centre tap.
        for kh in range(-K_h_half, K_h_half+1):
            for kw in range(-K_w_half, K_w_half+1):
                B_tensor[kh+K_h_half,kw+K_w_half,:] = B + img[kh+K_h_half,kw+K_w_half,:]
        
        return B_tensor
    
    def transformK(K):
        """Rearrange K into the layout written to the test file.

        The kernel width is zero-padded to 8 and the input channels to 4.
        Output channels are split into groups of 16 whose internal order is
        reversed. The resulting axis order is:
            - C_out group (i.e. C_out // 16)
            - Row
            - C_out index within the group (reversed)
            - Col (padded to 8)
            - C_in (padded to 4)
        """
        cou, hei, wid, cin = K.shape
        assert(cou % 16 == 0); 
        assert(wid <= 8);
        assert(cin <= 4)
        
        cout_groups = cou // 16
        
        # Pad width to 8 and cin to 4
        K = np.concatenate((K, np.zeros(shape=(cou, hei, 8-wid, cin), dtype=np.int8)),axis=2)
        K = np.concatenate((K, np.zeros(shape=(cou, hei, 8, 4-cin), dtype=np.int8)), axis=3)
        
        K = K.reshape((cout_groups, 16, hei, 8, 4))
        K = np.flip(K, axis=1).transpose(0,2,1,3,4)
        
        return K
    
    def transformB(B, K_h, K_w):
        """Split the int32 bias tensor into two uint16 planes per kernel position.

        Plane 1 receives `B >> 16`; plane 0 receives B stored into uint16.
        NOTE(review): plane 0 presumably keeps the low 16 bits through NumPy's
        cast on assignment - confirm.
        """
        assert(B.dtype == np.int32)
        assert(B.shape[:2] == (K_h, K_w))
        C_out = B.shape[2]
        B_out = np.zeros(shape=(K_h, K_w, 2, C_out), dtype=np.uint16)
        B_out[:,:,1,:] = B >> 16
        B_out[:,:,0,:] = B
        return B_out
        
    
    K = rand_tensor_s8(C_out, K_h, K_w, C_in)
    X = rand_tensor_s8(height, width, C_in)
    zero_point = np.random.randint(-64, 64)
    
    # Bias per channel = negated mean of the un-biased convolution over all pixels.
    tmp = conv2d_s8(K, X)
    B = -(np.mean(tmp, axis=(0,1))).astype(np.int32)
    
    B_tensor = formBTensor(B, K, zero_point)
    
    tmp = tmp + inflate_BTensor(B_tensor, height, width)
    assert(tmp.dtype == np.int32)
    
    # Copy of the 32-bit accumulators, returned and written alongside the output.
    acc32s = np.array(tmp)
    
    # Per-channel shift derived from the largest accumulator magnitude
    # (presumably sized so the shifted values fit 16 bits - TODO confirm),
    # then clamped so no shift is negative.
    tmp_min = np.min(tmp, axis=(0,1))
    tmp_max = np.max(tmp, axis=(0,1))
    tmp_max = np.max((tmp_max, np.abs(tmp_min)), axis=0)
    shifts  = np.ceil(np.log2(tmp_max)).astype(np.int16) - 15
    shifts  = np.clip(shifts, a_min=0, a_max=None)
    
    # Every channel uses the same scale value, 0x4000.
    scales = np.ones(C_out, dtype=np.int16) * 0x4000
    
    Y = conv2d_shallowin_deepout_relu(K, B_tensor, X, shifts, scales)
    
    
    if writefile:
        with open(writefile, "wb+") as file:
            assert(K.dtype==np.int8);       assert(X.dtype==np.int8);  
            assert(shifts.dtype==np.int16); assert(scales.dtype==np.int16); assert(Y.dtype==np.int8)
            # Write order defines the file format.
            transformK(K).tofile(file)
            transformB(B_tensor, K_h, K_w).tofile(file)
            X.tofile(file)
            shifts.tofile(file)
            scales.tofile(file)
            Y.tofile(file)
            acc32s.tofile(file)
            
    return K,B_tensor,X,shifts,scales,Y,acc32s


# Emit the conv2d_shallowin_deepout_relu() test vectors and their check header.
# FNAME is always assigned; the generation itself is disabled by default.
FNAME = "conv2d_shallowin_deepout_relu"
if False:
    # each row: caseNum, C_in, C_out, K_h, K_w, height, width, numVectors
    thing = [
        (2, 4, 16, 1, 1, 2, 2, 10),
        (3, 4, 16, 3, 3, 4, 4, 10),
        (4, 4, 32, 3, 3, 8, 8, 4),
    ]
    for (caseNum, C_in, C_out, K_h, K_w, height, width, vecs) in thing:

        # Empty stacks with the per-vector shape; each grows by one entry per vector.
        Ks      = np.empty((0, C_out, K_h, K_w, C_in), dtype=np.int8)
        Xs      = np.empty((0, height, width, C_in), dtype=np.int8)
        Bs      = np.empty((0, K_h, K_w, C_out), dtype=np.int32)
        shiftss = np.empty((0, C_out), dtype=np.int16)
        scaless = np.empty((0, C_out), dtype=np.int16)
        Ys      = np.empty((0, height, width, C_out), dtype=np.int8)
        acc32s  = np.empty((0, height, width, C_out), dtype=np.int32)

        for i in range(vecs):
            K, B, X, shifts, scales, Y, acc32 = test_case_conv2d_shallowin_deepout_relu(
                height, width, K_h, K_w, C_in, C_out,
                writefile=os.path.join(
                    unit_test_dir,
                    "test_data/{0}_case{1}.{2}.dat".format(FNAME, caseNum, i),
                ),
            )
            Ks      = np.concatenate((Ks,      K[np.newaxis]),      axis=0)
            Xs      = np.concatenate((Xs,      X[np.newaxis]),      axis=0)
            Bs      = np.concatenate((Bs,      B[np.newaxis]),      axis=0)
            shiftss = np.concatenate((shiftss, shifts[np.newaxis]), axis=0)
            scaless = np.concatenate((scaless, scales[np.newaxis]), axis=0)
            Ys      = np.concatenate((Ys,      Y[np.newaxis]),      axis=0)
            acc32s  = np.concatenate((acc32s,  acc32[np.newaxis]),  axis=0)

        # The header records the vector count and the first output element of
        # every vector, so the test can tell whether the data files were
        # actually loaded correctly.
        y_vec_count_str = "#undef TEST_VECTOR_COUNT\n#define TEST_VECTOR_COUNT    ({0})\n".format(vecs)
        ychk_str = ("#undef Y_CHECK\n#define Y_CHECK  "
                    + ",".join(str(x) for x in Ys[:, 0, 0, 0]) + "\n")

        with open(
            os.path.join(unit_test_dir, "test_data/{0}_case{1}.h".format(FNAME, caseNum)),
            "w+",
        ) as file:
            file.write(y_vec_count_str)
            file.write(ychk_str)

    print("Done.")