###  mmult on PYNQ
##### Source : https://github.com/tkat0/pynqmmult

* SDSoC付属の行列積(mmult)サンプルをPYNQで動かしてみた
* ドライバは共有ライブラリとしてビルドし、CFFIを利用してPythonから呼び出す
* CMAでアロケートした連続領域をnumpyのndarrayとして扱えるようにした
* 開発環境は、SDSoC 2015.4

In [4]:
import numpy as np
import cffi
from pynq import Overlay
# load Base Overlay
Overlay("/home/xilinx/pynq/bitstream/base.bit").download()

In [5]:
from pynq import xlnk
mmu = xlnk.Xlnk()

In [6]:
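# Inspect the shared library built by SDSoC to find the name of the hardware-accelerated function.
# The software function is named mmult_accel; the hardware stub is exported as _p0_mmult_accel_1_noasync.
# C++ name mangling turns that into _Z25_p0_mmult_accel_1_noasyncPfS_S_, which is the symbol loaded through CFFI below.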
!nm -C ../src/libpynqmmult.so | grep mmult_accel
!echo ---
!nm -D ../src/libpynqmmult.so | grep mmult_accel

00012670 B _p0_mmult_accel_1_noasync_num_C
000125f8 D _p0_swinst_mmult_accel_1
000125c8 D _p0_swinst_mmult_accel_1_cmd_mmult_accel_info
000120cc D _p0_swinst_mmult_accel_1_cmd_mmult_accel_sg_list
000120d0 D _sds__p0_mmult_accel_1
00001018 T mmult_accel(float*, float*, float*)
00001284 T _p0_mmult_accel_1_noasync(float*, float*, float*)
---
00012670 B _p0_mmult_accel_1_noasync_num_C
000125f8 D _p0_swinst_mmult_accel_1
000125c8 D _p0_swinst_mmult_accel_1_cmd_mmult_accel_info
000120cc D _p0_swinst_mmult_accel_1_cmd_mmult_accel_sg_list
000120d0 D _sds__p0_mmult_accel_1
00001018 T _Z11mmult_accelPfS_S_
00001284 T _Z25_p0_mmult_accel_1_noasyncPfS_S_


### Call Accelerator

In [7]:
class Mmult():
    """Callable wrapper around the SDSoC-generated mmult accelerator stub.

    Construction declares the stub to CFFI, opens the shared library and then
    downloads the bitstream, in that order. Calling the instance forwards
    three ``float *`` buffers to the stub; in this notebook the result written
    to the third buffer is checked against ``np.dot`` for 32x32 inputs.
    """

    # Mangled C++ symbol of the hardware stub. It must match what
    # `nm -D` reports for _p0_mmult_accel_1_noasync in the shared library.
    _SYMBOL = "_Z25_p0_mmult_accel_1_noasyncPfS_S_"

    def __init__(self, bitfile="./pynqmmult.bit", libfile="../src/libpynqmmult.so"):
        """Load the driver library and program the FPGA.

        Parameters
        ----------
        bitfile : str
            Path of the bitstream to download. Defaults to the path the
            notebook originally hard-coded.
        libfile : str
            Path of the shared library that exports the accelerator stub.
            Defaults to the path the notebook originally hard-coded.
        """
        self.bitfile = bitfile
        self.libfile = libfile
        self.ffi = cffi.FFI()
        # NOTE(review): the return type is declared as `int` as before; the
        # nm listing does not show return types, so confirm it against the
        # SDSoC-generated source.
        self.ffi.cdef("int %s(float*, float*, float*);" % self._SYMBOL)
        self.lib = self.ffi.dlopen(self.libfile)
        Overlay(self.bitfile).download()

    def __call__(self, a, b, c):
        """Invoke the accelerator stub on three buffers.

        Parameters
        ----------
        a, b, c : cdata 'float *'
            CFFI pointers passed straight through to the stub. The notebook
            passes the pointers returned alongside the CMA-backed arrays.

        Returns
        -------
        None
            The stub's return value is discarded, as it was originally.
        """
        getattr(self.lib, self._SYMBOL)(a, b, c)


In [8]:
mmult = Mmult()

In [9]:
# SDSoCにより高位合成したHWにDMA(not SG)経由で読み書きするメモリ領域は連続領域である必要がある
# 連続領域は、CMAのAPIにより、確保する
# 連続領域をndarrayとして扱う

ffi = cffi.FFI()

 # The requested shape and element type are honoured; the defaults reproduce the original 32x32 float case.
def init_contiguous_ndarray(size=(32,32), dtype="float"):
    """Allocate a physically contiguous CMA buffer and expose it as an ndarray.

    Parameters
    ----------
    size : int or sequence of int
        Shape of the returned array. Every dimension must be positive.
    dtype : str
        C type name of one element. Supported: "float", "double", "int",
        "unsigned int".

    Returns
    -------
    (numpy.ndarray, cdata)
        The array is a view over the allocated buffer (no copy), and the
        second item is the pointer returned by ``mmu.cma_alloc``, which is
        what the accelerator call expects.
        NOTE(review): the array presumably becomes invalid once the buffer
        is released; confirm against the Xlnk documentation.

    Raises
    ------
    ValueError
        If ``dtype`` is not supported, its C size does not match the NumPy
        type on this platform, or a dimension is not positive.
    """
    # C element type -> NumPy dtype used to interpret the raw bytes.
    ctype_to_numpy = {
        "float": np.float32,
        "double": np.float64,
        "int": np.int32,
        "unsigned int": np.uint32,
    }
    if dtype not in ctype_to_numpy:
        raise ValueError("unsupported dtype %r; expected one of %s"
                         % (dtype, sorted(ctype_to_numpy)))
    np_dtype = np.dtype(ctype_to_numpy[dtype])

    # Normalise the shape and count elements with Python ints (no overflow).
    dims = (int(size),) if np.ndim(size) == 0 else tuple(int(d) for d in size)
    if any(d <= 0 for d in dims):
        raise ValueError("all dimensions must be positive, got %r" % (size,))
    n_elems = 1
    for d in dims:
        n_elems *= d

    elem_bytes = ffi.sizeof(dtype)
    if elem_bytes != np_dtype.itemsize:
        raise ValueError("C type %r is %d bytes here but NumPy %s is %d bytes"
                         % (dtype, elem_bytes, np_dtype, np_dtype.itemsize))

    # cma_alloc takes an element count: the recorded cma_stats show 4096 bytes
    # per 32*32 float buffer.
    buf = mmu.cma_alloc(n_elems, data_type=dtype)
    cbuf = ffi.buffer(buf, n_elems * elem_bytes)
    return np.frombuffer(cbuf, dtype=np_dtype).reshape(dims), buf

In [10]:
# Allocate the two operands (a, b) and the result (c). Each call returns an
# (ndarray, pointer) pair; the pointers pa/pb/pc are what is later passed to
# the accelerator, while a/b/c are used to fill and inspect the data.
a, pa = init_contiguous_ndarray()
b, pb = init_contiguous_ndarray()
c, pc = init_contiguous_ndarray()

In [11]:
# Fill the operands in place so the buffers behind pa/pb are what gets updated.
# Slice assignment (rather than `+=`) keeps this cell idempotent when re-run
# and does not depend on whatever the freshly allocated buffers contained.
a[:] = 1
b[:] = 2

print("A", a.shape, type(a))
print(a,pa)
print("B", b.shape, type(b))
print(b,pb)
print("C", c.shape, type(c))
print(c,pc)

A (32, 32) <class 'numpy.ndarray'>
[[ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 ..., 
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]] <cdata 'float *' 0x36f64000>
B (32, 32) <class 'numpy.ndarray'>
[[ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]
 ..., 
 [ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]] <cdata 'float *' 0x36f4b000>
C (32, 32) <class 'numpy.ndarray'>
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]] <cdata 'float *' 0x36f4a000>


In [12]:
mmu.cma_stats()

{'Buffer Count': 3,
 'CMA Memory Available': 90435584,
 'CMA Memory Usage': 12288}

In [13]:
mmult(pa, pb, pc)

In [14]:
print("C", c.shape, type(c))
print(c)

C (32, 32) <class 'numpy.ndarray'>
[[ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]
 ..., 
 [ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]]


### Test

In [15]:
# For comparison
np.dot(a,b)

array([[ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       ..., 
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.]], dtype=float32)

In [10]:
# Confirm that every element of the hardware result equals the software result.
# np.array_equal keeps the exact element-wise comparison and replaces
# np.alltrue, which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
if np.array_equal(c, np.dot(a, b)):
    print("OK")
else:
    print("NG")

OK


### Benchmarks

In [11]:
# HW
t_hw = %timeit -n 100 -o mmult(pa, pb, pc)

100 loops, best of 3: 33.6 µs per loop


In [12]:
# SW
t_sw = %timeit -n 100 -o np.dot(a, b)

100 loops, best of 3: 7.36 ms per loop
