# mmult unit test

In [1]:
import numpy as np
import cffi

from pynq import Overlay

# Load the base overlay onto the board before the project overlays are imported.
Overlay("/home/xilinx/pynq/bitstream/base.bit").download()

import sys
# Put the parent directory on the import path; presumably that is where the
# pynq_chainer package lives in this checkout.
sys.path.append("..")

from pynq_chainer import overlays

PYNQ True
pcsim is not run on pynq


In [2]:
from pynq_chainer import utils

In [3]:
from pynq.drivers import xlnk
# Handle on the xlnk driver; test() below uses it to release CMA buffers
# through mmu.cma_free.
mmu = xlnk.xlnk()
# Reset the driver state up front; presumably this drops buffers left over
# from earlier runs -- confirm against the pynq xlnk driver.
mmu.xlnk_reset()

In [4]:
# Callable accelerator wrapper; the cells below invoke it as
# mmult(x_cdata, w_cdata, y_cdata, x_nrows, w_nrows, x_ncols).
# Presumably a binarised matrix multiply, judging by the class name -- confirm.
mmult = overlays.BinMmult()
# NOTE(review): this notebook-level FFI instance is not referenced by any
# visible cell.
ffi = cffi.FFI()

load Overlay


In [5]:
def debug_cdata(name, cdata, show=True, count=5):
    """Print a label followed by the leading elements of an indexable buffer.

    Args:
        name: label printed on its own line before the values.
        cdata: any object supporting integer indexing, e.g. a cffi pointer.
        show: when false, return immediately without printing anything.
        count: how many leading elements to print. Defaults to 5, the value
            that used to be hard-coded.

    No bounds check is performed: the caller must make sure the buffer holds
    at least ``count`` elements (a raw cffi pointer carries no length).
    """
    if not show:
        return
    print(name)
    for i in range(count):
        print(cdata[i])

In [6]:
# Element type shared by every buffer in this notebook, spelled twice:
# the NumPy scalar type for ndarray conversion and the matching C type name
# that is passed to the pynq_chainer.utils CMA helpers.
npdtype = np.uint32
dtype = "unsigned int"

In [9]:
def test(debug=False):
    """Compare the accelerator result against a NumPy reference product.

    Two random integer matrices are binarised by sign. The hardware operands
    use a 0/1 encoding, the software reference uses -1/+1. The accelerator is
    run on CMA copies of the hardware operands and its output is compared with
    ``x_sw.dot(w_sw.T)``; "OK" is printed when they agree within tolerance and
    "NG" otherwise.

    Args:
        debug: when true, print the buffer addresses, the leading elements of
            each buffer and both result matrices. When false only the
            OK/NG verdict (plus whatever the helpers print) is emitted.

    The CMA buffers are always released, even when the accelerator call or
    the comparison raises.
    """
    x_size = (1, 32)
    w_size = (16, 32)

    x = np.random.randint(-255, 255, x_size)
    w = np.random.randint(-255, 255, w_size)

    # HW operands: 1 for non-negative entries, 0 for negative ones. The weight
    # matrix is handed over transposed and copied so that it is contiguous.
    x_hw = np.where(x >= 0, 1, 0).astype(npdtype, copy=True)
    w_hw = np.where(w >= 0, 1, 0).astype(npdtype, copy=True).T.copy()

    # SW reference operands: +1 / -1 as float32.
    x_sw = np.where(x >= 0, 1, -1).astype(np.float32, copy=True)
    w_sw = np.where(w >= 0, 1, -1).astype(np.float32, copy=True)

    x_nrows, x_ncols = x.shape
    w_nrows = w.shape[0]

    # Track every successful allocation so the finally-clause frees exactly
    # what was obtained, even if a later allocation fails.
    cma_buffers = []
    try:
        y_hw, y_cdata = utils.malloc_cma_ndarray((w_nrows, x_nrows), dtype, npdtype)
        cma_buffers.append(y_cdata)
        x_hw, x_cdata = utils.copy_cma_ndarray(x_hw, dtype)
        cma_buffers.append(x_cdata)
        w_hw, w_cdata = utils.copy_cma_ndarray(w_hw, dtype)
        cma_buffers.append(w_cdata)

        if debug:
            print('x', x_cdata)
            print('w', w_cdata)
            print('y', y_cdata)
        debug_cdata("x", x_cdata, show=debug)
        debug_cdata("w", w_cdata, show=debug)
        debug_cdata("y", y_cdata, show=debug)

        mmult(x_cdata, w_cdata, y_cdata, x_nrows, w_nrows, x_ncols)

        debug_cdata("y", y_cdata, show=debug)

        y_sw = x_sw.dot(w_sw.T)
        # The output buffer is laid out (w_nrows, x_nrows); transpose it to
        # match the (x_nrows, w_nrows) shape of the reference product.
        y_hw = y_hw.T

        if debug:
            print("Actual(C):")
            print(y_hw)
            print("Expected(NumPy):")
            print(y_sw)

        # Compare before the buffers are released: y_hw is the array that was
        # returned together with y_cdata.
        if np.allclose(y_hw, y_sw, rtol=1e-04, atol=1e-04):
            print("OK")
        else:
            print("NG")
    finally:
        for cma_buffer in cma_buffers:
            mmu.cma_free(cma_buffer)

In [10]:
# Run the comparison once with verbose output. Each call draws new random
# operands, so a larger range repeats the check on different data.
for i in range(1):
    test(debug=True)

cma alloc
cma alloc
cma copy 
cma alloc
cma copy 
x <cdata 'unsigned int *' 0x36961000>
w <cdata 'unsigned int *' 0x36960000>
y <cdata 'unsigned int *' 0x36962000>
x
0
1
0
0
0
w
1
1
1
1
0
y
0
0
0
0
0
y
32
32
32
32
32
y
32
32
32
32
32
Actual(C):
[[32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32]]
Expected(NumPy):
[[  6.  10.  -2.   4.  -6.  12.   6.  -4.   8.  14.   2.  -4.   2.   4.
    0.   6.]]
NG


In [11]:
# Benchmark setup: operand shapes and CMA buffers used by the timing cells below.
# NOTE(review): L and M are not referenced by any visible cell.
L = 784
M = 32
x_size = (1, 32)
w_size = (32, 32)
x = np.ones(x_size).astype(npdtype)
w = np.ones(w_size).astype(npdtype)

x_nrows, x_ncols = x.shape
w_nrows, w_ncols = w.shape
# Output buffer with one row per input row and one column per weight row.
y, y_cdata = utils.malloc_cma_ndarray((x_nrows, w_nrows), dtype, npdtype)
    
# Earlier variant that copied the all-ones operands into CMA memory:
# x, x_cdata = utils.copy_cma_ndarray(x, dtype)
# w_, w_cdata = utils.copy_cma_ndarray(w, dtype)

# NOTE(review): x is rebound here to the array returned by malloc_cma_ndarray,
# discarding the all-ones array built above, while w stays the plain NumPy
# array (the CMA-side weights are bound to w_). Nothing in this cell writes
# to the new buffers -- confirm that timing on unset data is intended.
# NOTE(review): none of the visible cells releases these three buffers with
# mmu.cma_free.
x, x_cdata = utils.malloc_cma_ndarray(x_size, dtype, npdtype)
w_, w_cdata = utils.malloc_cma_ndarray(w_size, dtype, npdtype)

cma alloc
cma alloc
cma alloc


In [12]:
# Software baseline: -n 1 runs the expression once per timing loop and
# -o returns the TimeitResult object shown below the printed summary.
%timeit -n 1 -o x.dot(w.T)

1 loop, best of 3: 66.5 µs per loop


<TimeitResult : 1 loop, best of 3: 66.5 µs per loop>

In [13]:
# Accelerator timing with the same %timeit flags as the software baseline.
# The measured time covers the whole Python-level mmult call.
%timeit -n 1 -o mmult(x_cdata, w_cdata, y_cdata, x_nrows, w_nrows, x_ncols)

1 loop, best of 3: 31.5 ms per loop


<TimeitResult : 1 loop, best of 3: 31.5 ms per loop>