In [1]:
import numpy as np
import cffi
from pynq import Overlay
# load Base Overlay
Overlay("/home/xilinx/pynq/bitstream/base.bit").download()

from pynq.drivers import xlnk
import chainer
from chainer import links as L
from chainer import functions as F
from chainer import Variable

%matplotlib inline
import matplotlib.pyplot as plt
import sys

In [2]:
train, test = chainer.datasets.get_mnist()

x, label = train[0]

img = x.reshape(28,28)

In [3]:
input_image = x[np.newaxis,:]
input_image = input_image.astype(np.float32)
input_image = chainer.Variable(input_image)

# CPU 

In [4]:
class MLP(chainer.Chain):

    def __init__(self, n_units, n_out):
        super(MLP, self).__init__(
            # the size of the inputs to each layer will be inferred
            l1=L.Linear(None, n_units),  # n_in -> n_units
            l2=L.Linear(None, n_units),  # n_units -> n_units
            l3=L.Linear(None, n_out),  # n_units -> n_out
        )

    def __call__(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        h3 = self.l3(h2)
        return h3

In [5]:
mlp_cpu = MLP(n_units=32, n_out=10)

resume = "../examples/mnist/mnist_iter_12000.npz"
chainer.serializers.load_npz(resume, mlp_cpu)

h = F.softmax(mlp_cpu(input_image))
y = F.argmax(h)

print(h.data)
print("result", y.data)
print("seikai", label)

[[  1.72933567e-13   1.29591760e-08   4.09196055e-08   2.45058932e-03
    3.96322804e-23   9.97549355e-01   1.99584192e-15   1.14123750e-12
    1.81231957e-11   1.20258378e-11]]
result 5
seikai 5


# FPGA

In [6]:
# 実際はon-the-fly
bitfile = "../pynq_chainer/HLS/bitstream.bit"
libfile = "../pynq_chainer/HLS/src/libaccel.so"
ffi = cffi.FFI()
ffi.cdef("int _Z18_p0_mmult_accel1_0PfS_S_iii(float*, float*, float*, int, int, int);")
lib = ffi.dlopen(libfile)
accFn = lib._Z18_p0_mmult_accel1_0PfS_S_iii
Overlay(bitfile).download()

In [7]:
memmanager = xlnk.xlnk()

def init_contiguous_ndarray(size=(32,32), dtype="float"):
    buf_size = size[0]*size[1]
    buf = memmanager.cma_alloc(buf_size, data_type=dtype)
    v_cdata = ffi.buffer(buf,  buf_size * ffi.sizeof(dtype))
    v = np.frombuffer(v_cdata, dtype=np.float32).reshape(size)
    #print(v, buf, v_cdata)
    print("cma alloc")
    return v, buf

In [8]:
x_nrows, x_ncols = 32, 32
w_nrows, w_ncols = 32, 32
y, y_cdata = init_contiguous_ndarray((x_nrows, w_nrows))
x, x_cdata = init_contiguous_ndarray((x_nrows, x_ncols))
w, w_cdata = init_contiguous_ndarray((w_nrows, w_ncols))
accFn(x_cdata, w_cdata, y_cdata, x_nrows, w_nrows, x_ncols)

cma alloc
cma alloc
cma alloc


0

In [9]:
def copy_cma(array):
    x, cdata = init_contiguous_ndarray(array.shape)
    np.copyto(x, array)
    return x, cdata

In [17]:
memmanager.xlnk_reset()
for i in range(5):
    x_nrows, x_ncols = 3, 2
    w_nrows, w_ncols = 2, 2
    
    x = np.random.uniform(-1,1,(x_nrows, x_ncols))
    w = np.random.uniform(-1,1,(w_nrows, w_ncols))

    x, x_cdata = copy_cma(x)
    w, w_cdata = copy_cma(w)
    #y, y_cdata = init_contiguous_ndarray((x_nrows, w_nrows))
    y, y_cdata = init_contiguous_ndarray((w_nrows, x_nrows))
    
    
    #y = np.asfortranarray(y)
    accFn(x_cdata, w_cdata, y_cdata, x_nrows, w_nrows, x_ncols)
    y = y.T
    #y = np.asfortranarray(y)
    y_ = x.dot(w.T)
    
    if np.allclose(y, y_, rtol=1e-04, atol=1e-04):
        print('OK')
    else:
        print("NG")
        print(y.shape)
        print(y_)
        print(y)
        print(y.flags)
    

cma alloc
cma alloc
cma alloc
NG
(3, 2)
[[-1.29007959 -0.35883552]
 [ 0.32835293 -0.15911043]
 [-0.47456002 -0.64056128]]
[[ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]]
  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False
cma alloc
cma alloc
cma alloc
NG
(3, 2)
[[-1.04454648 -0.74541163]
 [-0.02386647  0.3104001 ]
 [-0.40054369 -0.04571752]]
[[ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]]
  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False
cma alloc
cma alloc
cma alloc
NG
(3, 2)
[[-0.64585561  0.78773791]
 [-0.40645465  0.73415315]
 [-0.42389965  0.18171066]]
[[-0.64585561  0.78773791]
 [-0.40645465  0.73415315]
 [-0.42389965  0.18171066]]
  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False
cma alloc
cma alloc
cma alloc
NG
(3, 2)
[[-0.6050719   0.51083308]
 [ 0.68736738  1.21400881]
 [ 0.39497739 -0.1391260

In [10]:
class _Linear(L.Linear):
    def __call__(self, x):
        
        if self.has_uninitialized_params:
            print('init')
            self._initialize_params(x.size // x.shape[0])
            
        if not hasattr(x, "cdata"):
            # copy
            #print("input x is not CMA. copy...")
            x_, x_cdata = init_contiguous_ndarray(x.shape)
            x_ = Variable(x_)
            x_.cdata = x_cdata
            np.copyto(x_.data, x.data)
            #print(x.data)
            x = x_
            
            #print(x_.data)

        #y = Linear()(x, self.W)
        #print(type(x), type(self.W))
        #y = self.linear(x.data, self.W.data)
        y = self.linear(x, self.W)

        return y #chainer.Variable(y)

    def linear(self, x, w):
        #import cffi
        self.ffi = ffi #cffi.FFI()
        
        if False:
            import overlay, cffi
            self.accel_fun =lib.mmult_accel1
        else:
            self.accel_fun = accFn #lib.__Z12mmult_accel1PfS_S_iii
        
        x_nrows, x_ncols = x.shape
        w_nrows, w_ncols = w.shape
        #y = np.zeros((x_nrows, w_nrows)).astype(np.float32) #XXX

        #x_cdata = self.ffi.from_buffer(x.data)
        #w_cdata = self.ffi.from_buffer(w.data) # not contigunous ?
        x_cdata = x.cdata
        w_cdata = w.cdata
        
        # tukaimawasitai
        y, y_cdata = init_contiguous_ndarray((x_nrows, w_nrows))
        y = Variable(y)
        y.cdata = y_cdata
        
        self.accel_fun(x_cdata, w_cdata, y_cdata, x_nrows, w_nrows, x_ncols)
        #print(y.cdata)
        return y
    
    
    def add_param(self, name, shape, dtype=np.float32, initializer=None):
        """Registers a parameter to the link.
        """
        d = self.__dict__
        if name in d:
            raise AttributeError(
                'cannot register a new parameter %s: attribute exists'
                % name)
        if initializer is not None:
            raise AttributeError('initializer is not supported')
        
        #data = self.xp.full(shape, numpy.nan, dtype=dtype)
        
        data, cdata = init_contiguous_ndarray(shape)
        var = Variable(data, volatile='auto', name=name)
        var.cdata = cdata
        print('init model cma array', name, cdata, var)
        
        self._params.append(name)
        d[name] = var
        if name in self._uninitialized_params:
            del self._uninitialized_params[name]



In [11]:
class MLP_FPGA(chainer.Chain):

    def __init__(self, n_units, n_out):
        super(MLP_FPGA, self).__init__(
            # the size of the inputs to each layer will be inferred
            l1=_Linear(None, n_units, nobias=True),  # n_in -> n_units
            l2=_Linear(None, n_units, nobias=True),  # n_units -> n_units
            l3=_Linear(None, n_out, nobias=True),  # n_units -> n_out
        )

    def __call__(self, x):
        if False:
            h1 = self.l1(x)
            print(h1.data)
            h2 = self.l2(h1)
            h3 = self.l3(h2)
            return h3
        
        h = self.l1(x)
        print(h)
        h1 = F.relu(h)
        print("h1 done")
        h2 = F.relu(self.l2(h1))
        print("h2 done")
        h3 = self.l3(h2)
        print("h3 done")
        print(x.shape)
        print(h1.shape)
        print(h1.data)
        print(h2.shape)
        print(h2.data)
        print(h3.shape)
        print(h3.data)
        print("call done")
        return h3

In [12]:
memmanager.xlnk_reset()
mlp_fpga = MLP_FPGA(n_units=32, n_out=10)

resume = "../examples/mnist/mnist_iter_12000.npz"
chainer.serializers.load_npz(resume, mlp_fpga)

cma alloc
init model cma array W <cdata 'float *' 0x30218000> W
cma alloc
init model cma array W <cdata 'float *' 0x36964000> W
cma alloc
init model cma array W <cdata 'float *' 0x36963000> W


In [13]:
# 入力画像をCMA-arrayに変換
if False:
    in_fpga, cdata = init_contiguous_ndarray((1, 784))
    in_fpga = Variable(in_fpga)
    in_fpga.cdata = cdata
    print('init input cma array', cdata)
    np.copyto(in_fpga.data, input_image.data)
else:
    in_fpga = input_image


In [14]:
memmanager.cma_stats()

{'Buffer Count': 3,
 'CMA Memory Available': 21188608,
 'CMA Memory Usage': 105728}

In [15]:
h = mlp_fpga(in_fpga)
#print(h.data)
h = F.softmax(h)
print(h.data)
y = F.argmax(h)
print("result", y.data)
print("seikai", label)

cma alloc
cma alloc
<var@302c4910>
h1 done
cma alloc
cma alloc
h2 done
cma alloc
cma alloc
h3 done
(1, 784)
(1, 32)
[[ 3.05095577  0.          2.55680013  4.07168865  1.99023378  1.09072948
   0.          4.49700737  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        ]]
(1, 32)
[[ 1.10642385  0.          0.08190703  0.          0.          2.12742639
   4.6531868   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        ]]
(1, 10)
[[-1.36190712  3.29412651 -0.30608779  1.43432307 -2.61085033 -0.78231484
  -3.06695724 -0.66578758 -0.97282892  1.06473207]

In [16]:
[[ 3.05095577 -2.61691952  2.55680013  4.07168865  1.99023378  1.09072948
  -1.82831872  4.49700737  5.57241726 -4.70780516 -2.25808644 -4.83468103
   3.18601227 -3.28089046 -0.62773103 -0.6149773  -0.7455532   2.0017643
   0.73404318  3.90470552  3.48775458  3.08954358 -0.47251615 -0.90851831
  -0.327535    3.60635543  3.32248259 -3.12803578  3.87468791  6.82556391
  -0.1521184   4.46334553]]
cma alloc

SyntaxError: invalid syntax (<ipython-input-16-94e6e27230f3>, line 1)

In [None]:
mlp_fpga.l1.W.data

# Benchmark 

In [None]:
 %timeit -n 2 -o mlp_cpu(input_image)

In [None]:
 %timeit -n 2 -o mlp_fpga(in_fpga)

In [None]:
memmanager.xlnk_reset()