In [5]:
import numpy as np
import cffi
from pynq import Overlay
# Program the FPGA fabric with the stock Base Overlay bitstream.
BASE_BITSTREAM = "/home/xilinx/pynq/bitstream/base.bit"
base_overlay = Overlay(BASE_BITSTREAM)
base_overlay.download()

from pynq.drivers import xlnk
import chainer
from chainer import links as L
from chainer import functions as F

%matplotlib inline
import matplotlib.pyplot as plt
import sys

In [6]:
# Fetch MNIST through Chainer and keep the first training sample as the
# reference input for both the CPU and the FPGA model.
train, test = chainer.datasets.get_mnist()

first_sample = train[0]
x = first_sample[0]
label = first_sample[1]

# 28x28 view of the flat pixel vector (handy for plotting).
img = np.reshape(x, (28, 28))

In [7]:
# Add a leading batch axis, force float32 and wrap the result in a
# chainer.Variable so it can be fed straight into the model.
batched_pixels = x[np.newaxis, :].astype(np.float32)
input_image = chainer.Variable(batched_pixels)

# CPU 

In [8]:
class MLP(chainer.Chain):
    """Three-layer perceptron built from Chainer ``Linear`` links.

    Args:
        n_units: Output width of the two hidden layers.
        n_out: Output width of the final layer.
    """

    def __init__(self, n_units, n_out):
        # ``None`` as the input size lets each link infer it from the first
        # batch it sees.
        layers = dict(
            l1=L.Linear(None, n_units),  # n_in -> n_units
            l2=L.Linear(None, n_units),  # n_units -> n_units
            l3=L.Linear(None, n_out),    # n_units -> n_out
        )
        super(MLP, self).__init__(**layers)

    def __call__(self, x):
        """Run the forward pass and return the output of ``l3`` (no softmax)."""
        hidden = x
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.l3(hidden)

In [9]:
# Build the CPU reference model and restore the trained weights.
mlp_cpu = MLP(n_units=32, n_out=10)
resume = "./mnist_iter_12000.npz"
chainer.serializers.load_npz(resume, mlp_cpu)

# Forward pass -> class probabilities -> index of the most likely digit.
cpu_probs = F.softmax(mlp_cpu(input_image))
y = F.argmax(cpu_probs)

# "seikai" = the correct answer (ground-truth label).
print("result", y.data)
print("seikai", label)

result 5
seikai 5


# FPGA

In [11]:
# In practice these artifacts are produced on the fly.
bitfile = "./bitstream.bit"
libfile = "./src/libaccel.so"

# Itanium-ABI mangled name of
# `_p0_mmult_accel1_0(float*, float*, float*, int, int, int)`.
ACCEL_CDEF = "void _Z18_p0_mmult_accel1_0PfS_S_iii(float*, float*, float*, int, int, int);"

ffi = cffi.FFI()
ffi.cdef(ACCEL_CDEF)
lib = ffi.dlopen(libfile)

# Replace the currently loaded overlay with the accelerator bitstream.
accel_overlay = Overlay(bitfile)
accel_overlay.download()

In [19]:
memmanager = xlnk.xlnk()

# cffi C type name -> NumPy dtype used to view the allocated buffer.
_CTYPE_TO_NUMPY = {
    "float": np.float32,
    "double": np.float64,
    "int": np.int32,
    "unsigned int": np.uint32,
}


def init_contiguous_ndarray(size=(32,32), dtype="float"):
    """Allocate a buffer via ``memmanager.cma_alloc`` and view it as an ndarray.

    Args:
        size: Shape of the array to create. Any number of dimensions is
            accepted; the element count is the product of all of them.
        dtype: C type name of one element (as understood by cffi). It drives
            both the allocation size and the NumPy dtype of the view.

    Returns:
        ``(array, buffer)`` where ``array`` is a NumPy view over the allocated
        memory and ``buffer`` is the ``ffi.buffer`` object backing it.

    Raises:
        ValueError: If ``dtype`` has no known NumPy equivalent, or its C size
            differs from the NumPy item size on this platform.
    """
    if dtype not in _CTYPE_TO_NUMPY:
        raise ValueError("unsupported C element type: %r" % (dtype,))
    np_dtype = np.dtype(_CTYPE_TO_NUMPY[dtype])
    item_size = ffi.sizeof(dtype)
    if item_size != np_dtype.itemsize:
        # Guard against platforms where e.g. sizeof(int) is not 4.
        raise ValueError(
            "size mismatch between C type %r (%d bytes) and NumPy dtype %s"
            % (dtype, item_size, np_dtype)
        )

    shape = tuple(int(dim) for dim in size)
    n_elems = 1
    for dim in shape:
        n_elems *= dim

    # NOTE(review): the raw pointer returned by cma_alloc is not kept, so the
    # allocation cannot be released later by callers -- confirm whether
    # xlnk tracks/frees it.
    buf = memmanager.cma_alloc(n_elems, data_type=dtype)
    v_cdata = ffi.buffer(buf, n_elems * item_size)
    v = np.frombuffer(v_cdata, dtype=np_dtype).reshape(shape)
    return v, v_cdata

In [20]:
class _Linear(L.Linear):
    """``L.Linear`` variant whose matrix product is delegated to the accelerator.

    The weight/bias parameters are the ones inherited from ``L.Linear``; only
    the forward computation is replaced. The result is wrapped in a fresh
    ``chainer.Variable`` that is not connected to any computational graph, so
    this link is inference-only.
    """

    # Symbol declared through ``ffi.cdef`` when the shared library is loaded.
    _ACCEL_SYMBOL = "_Z18_p0_mmult_accel1_0PfS_S_iii"

    def __call__(self, x):
        """Apply the layer to ``x`` (a ``chainer.Variable`` or a raw ndarray)."""
        # Raw arrays (e.g. one returned by init_contiguous_ndarray) have no
        # Variable wrapper to unwrap.
        x_data = x.data if isinstance(x, chainer.Variable) else x

        if self.has_uninitialized_params:
            self._initialize_params(x_data.size // x_data.shape[0])

        y = self.linear(x_data, self.W.data)

        # The accelerator only computes the matrix product; add the bias here
        # so the layer matches L.Linear (and therefore the CPU model).
        bias = getattr(self, "b", None)
        if bias is not None:
            y = y + bias.data

        return chainer.Variable(y)

    def linear(self, x, w):
        """Multiply ``x`` by ``w`` on the accelerator and return the product.

        Args:
            x: 2-D input array, ``(x_nrows, x_ncols)``.
            w: 2-D weight array, ``(w_nrows, w_ncols)``; ``w_ncols`` must equal
                ``x_ncols``.

        Returns:
            A ``(x_nrows, w_nrows)`` float32 array backed by a buffer obtained
            from ``init_contiguous_ndarray``.

        Raises:
            ValueError: If the inner dimensions of ``x`` and ``w`` differ.
        """
        # The C entry point takes plain float pointers, so the data must be
        # C-contiguous float32 (no copy is made when it already is).
        # NOTE(review): x and w are not moved into CMA memory here -- confirm
        # whether the accelerator requires that for its inputs.
        x = np.ascontiguousarray(x, dtype=np.float32)
        w = np.ascontiguousarray(w, dtype=np.float32)

        x_nrows, x_ncols = x.shape
        w_nrows, w_ncols = w.shape
        if x_ncols != w_ncols:
            raise ValueError(
                "shape mismatch: x is %r but w is %r" % (x.shape, w.shape)
            )

        # NOTE(review): a new output buffer is allocated on every call and
        # never released; repeated calls (e.g. under %timeit) keep consuming
        # CMA memory.
        y, _ = init_contiguous_ndarray((x_nrows, w_nrows))

        # Keep the from_buffer objects alive in locals for the duration of the
        # call; the casted pointers do not own the memory.
        x_buf = ffi.from_buffer(x)
        w_buf = ffi.from_buffer(w)
        y_buf = ffi.from_buffer(y)

        # NOTE(review): argument order (rows of x, rows of w, cols of x) is
        # kept from the original call; presumably y = x . w^T -- confirm
        # against the accelerator source.
        accel = getattr(lib, self._ACCEL_SYMBOL)
        accel(
            ffi.cast("float *", x_buf),
            ffi.cast("float *", w_buf),
            ffi.cast("float *", y_buf),
            x_nrows,
            w_nrows,
            x_ncols,
        )
        return y


In [21]:
class MLP_FPGA(chainer.Chain):
    """Same three-layer perceptron as ``MLP`` but built from ``_Linear`` links.

    Args:
        n_units: Output width of the two hidden layers.
        n_out: Output width of the final layer.
        verbose: When true, print the shapes and values of every activation
            on each forward pass. Off by default so that the output is not
            flooded and timing runs are not dominated by printing.
    """

    def __init__(self, n_units, n_out, verbose=False):
        super(MLP_FPGA, self).__init__(
            # the size of the inputs to each layer will be inferred
            l1=_Linear(None, n_units),  # n_in -> n_units
            l2=_Linear(None, n_units),  # n_units -> n_units
            l3=_Linear(None, n_out),  # n_units -> n_out
        )
        self.verbose = verbose

    def __call__(self, x):
        """Run the forward pass and return the output of ``l3`` (no softmax)."""
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        h3 = self.l3(h2)
        if self.verbose:
            self._dump_activations(x, h1, h2, h3)
        return h3

    @staticmethod
    def _dump_activations(x, h1, h2, h3):
        """Print the input shape and the shape/values of each layer output."""
        print("call")
        print(x.shape)
        for activation in (h1, h2, h3):
            print(activation.shape)
            print(activation.data)
        print("call done")

In [22]:
mlp_fpga = MLP_FPGA(n_units=32, n_out=10)

resume = "./mnist_iter_12000.npz"
chainer.serializers.load_npz(resume, mlp_fpga)

# Move the input image into a CMA-backed array.
# The freshly allocated buffer holds arbitrary data, so the pixels of the
# sample loaded earlier (`x`) must be copied in explicitly; otherwise the
# model (and the benchmarks below) would run on uninitialised memory.
input_image, _ = init_contiguous_ndarray((1, 784))
input_image[...] = np.asarray(x, dtype=np.float32).reshape(1, 784)

In [29]:
# Convert the Variables inside the model into CMA-backed arrays

def cma_from_array(array):
    """Return a copy of ``array`` stored in a buffer from ``init_contiguous_ndarray``.

    Args:
        array: Source array of any dimensionality.

    Returns:
        ``(dstarray, cbuf)``: ``dstarray`` has the same shape and contents as
        ``array``; ``cbuf`` is the second value returned by
        ``init_contiguous_ndarray`` for the underlying allocation.
    """
    src = np.asarray(array)
    # Allocate as a single row and reshape afterwards so that inputs that are
    # not 2-D (e.g. bias vectors) are handled too.
    flat, cbuf = init_contiguous_ndarray((1, src.size))
    dstarray = flat.reshape(src.shape)
    # copy
    dstarray[...] = src
    return dstarray, cbuf

# Show every (path, parameter) pair registered on the FPGA model.
for param_path, param in mlp_fpga.namedparams():
    print((param_path, param))


('/l2/b', <variable b>)
('/l2/W', <variable W>)
('/l1/b', <variable b>)
('/l1/W', <variable W>)
('/l3/b', <variable b>)
('/l3/W', <variable W>)


In [32]:
# Allocate a buffer with the same shape as the second layer's weights.
l2_weight = mlp_fpga.l2.W.data
shape = l2_weight.shape
ndarray, cbuf = init_contiguous_ndarray(shape)


(32, 32)

In [None]:
# Forward pass through the FPGA-backed model, then normalise with softmax.
fpga_logits = mlp_fpga(input_image)
h = F.softmax(fpga_logits)
print(h.data)

# Predicted class next to the ground-truth label ("seikai" = correct answer).
y = F.argmax(h)
print("result", y.data)
print("seikai", label)

# Benchmark 

In [None]:
 # Time the CPU model's forward pass: -n 10 runs the statement 10 times per
 # timing loop, -o returns the TimeitResult as the cell output.
 # At this point `input_image` is the array allocated with
 # init_contiguous_ndarray in the FPGA set-up cell, not the chainer.Variable
 # created earlier.
 %timeit -n 10 -o mlp_cpu(input_image)

In [None]:
 # Time the FPGA-backed model's forward pass with the same settings as the
 # CPU benchmark (-n 10 loops per timing run, -o returns the TimeitResult).
 %timeit -n 10 -o mlp_fpga(input_image)