In [1]:
from pynq import Overlay
from pynq import allocate
from pynq import MMIO
import numpy as np
import time
import copy

In [2]:
hw = Overlay('RBM.bit')

In [3]:
# DMA init
mm2s_control_in = hw.dma_control_in.sendchannel
mm2s_sigmoid_switch_in = hw.dma_sigmoid_switch_in.sendchannel
mm2s_vector_in_len = hw.dma_vector_in_len_in.sendchannel
mm2s_vector_out_len = hw.dma_vector_out_len_in.sendchannel
mm2s_vector_in = hw.dma_vector_inout.sendchannel
s2mm_vector_out = hw.dma_vector_inout.recvchannel
mm2s_weight_in = hw.dma_weight_in.sendchannel
mm2s_bias_in = hw.dma_bias_in.sendchannel

In [4]:
# control signal value
CTRL_IDLE = np.uint8(0)
CTRL_SIZE_IN = np.uint8(1)
CTRL_WEIGHT_IN = np.uint8(2)
CTRL_BIAS_IN = np.uint8(3)
CTRL_START = np.uint8(4)

In [5]:
# RBM parameters and test data
weight_folder_path = './parameters/'
w_name_list = [
    ['w_128_16.npy', 128, 16, '128_16'],
    ['w_128_32.npy', 128, 32, '128_32'],
    ['w_128_64.npy', 128, 64, '128_64'],
    ['w_256_16.npy', 256, 16, '256_16'],
    ['w_256_32.npy', 256, 32, '256_32'],
    ['w_256_64.npy', 256, 64, '256_64'],
    ['w_256_128.npy', 256, 128, '256_128'],
    ['w_512_16.npy', 512, 16, '512_16'],
    ['w_512_32.npy', 512, 32, '512_32'],
    ['w_512_64.npy', 512, 64, '512_64'],
    ['w_512_128.npy', 512, 128, '512_128'],
    ['w_512_256.npy', 512, 256, '512_256']
]
hidden_bias_folder_path = './parameters/'
hb_name_list = [
    ['hb_128_16.npy', 128, 16, '128_16'],
    ['hb_128_32.npy', 128, 32, '128_32'],
    ['hb_128_64.npy', 128, 64, '128_64'],
    ['hb_256_16.npy', 256, 16, '256_16'],
    ['hb_256_32.npy', 256, 32, '256_32'],
    ['hb_256_64.npy', 256, 64, '256_64'],
    ['hb_256_128.npy', 256, 128, '256_128'],
    ['hb_512_16.npy', 512, 16, '512_16'],
    ['hb_512_32.npy', 512, 32, '512_32'],
    ['hb_512_64.npy', 512, 64, '512_64'],
    ['hb_512_128.npy', 512, 128, '512_128'],
    ['hb_512_256.npy', 512, 256, '512_256']
]
visible_bias_folder_path = './parameters/'
vb_name_list = [
    ['vb_128_16.npy', 128, 16, '128_16'],
    ['vb_128_32.npy', 128, 32, '128_32'],
    ['vb_128_64.npy', 128, 64, '128_64'],
    ['vb_256_16.npy', 256, 16, '256_16'],
    ['vb_256_32.npy', 256, 32, '256_32'],
    ['vb_256_64.npy', 256, 64, '256_64'],
    ['vb_256_128.npy', 256, 128, '256_128'],
    ['vb_512_16.npy', 512, 16, '512_16'],
    ['vb_512_32.npy', 512, 32, '512_32'],
    ['vb_512_64.npy', 512, 64, '512_64'],
    ['vb_512_128.npy', 512, 128, '512_128'],
    ['vb_512_256.npy', 512, 256, '512_256']
]
data_folder_path = './input data/'
data_name_list = [
    ['real_64.npy', 64, 'real_64'],
    ['real_128.npy', 128, 'real_128'],
    ['real_256.npy', 256, 'real_256'],
    ['real_512.npy', 512, 'real_512'],
    ['imaginary_64.npy', 64, 'imag_64'],
    ['imaginary_128.npy', 128, 'imag_128'],
    ['imaginary_256.npy', 256, 'imag_256'],
    ['imaginary_512.npy', 512, 'imag_512'],
]
def weight_read(weight_tag):
    weight_file_name = []
    weight = []
    for index1, element in enumerate(w_name_list):
        if weight_tag == element[3]:
            weight_file_name = element[0]
            break
    if weight_file_name == []:
        pass
    else:
        path_weight_file = weight_folder_path + weight_file_name
        weight = np.load(path_weight_file)
    return weight

def hidden_bias_read(hidden_bias_tag):
    hidden_bias_file_name = []
    hidden_bias = []
    for index1, element in enumerate(hb_name_list):
        if hidden_bias_tag == element[3]:
            hidden_bias_file_name = element[0]
            break
    if hidden_bias_file_name == []:
        pass
    else:
        path_hidden_bias_file = hidden_bias_folder_path + hidden_bias_file_name
        hidden_bias = np.load(path_hidden_bias_file)
    return hidden_bias

def visible_bias_read(visible_bias_tag):
    visible_bias_file_name = []
    visible_bias = []
    for index1, element in enumerate(vb_name_list):
        if visible_bias_tag == element[3]:
            visible_bias_file_name = element[0]
            break
    if visible_bias_file_name == []:
        pass
    else:
        path_visible_bias_file = visible_bias_folder_path + visible_bias_file_name
        with open(path_visible_bias_file, encoding='utf-8') as file:
            visible_bias = file.read()
        visible_bias = visible_bias.split()
        visible_bias = list(map(lambda vb: float(vb), visible_bias))
        visible_bias = np.array(visible_bias)
        visible_bias = visible_bias.reshape([element[1], 1])
    return visible_bias

def visible_bias_read(visible_bias_tag):
    visible_bias_file_name = []
    visible_bias = []
    for index1, element in enumerate(vb_name_list):
        if visible_bias_tag == element[3]:
            visible_bias_file_name = element[0]
            break
    if visible_bias_file_name == []:
        pass
    else:
        path_visible_bias_file = visible_bias_folder_path + visible_bias_file_name
        visible_bias = np.load(path_visible_bias_file)
    return visible_bias

def data_read(data_tag):
    data_file_name = []
    data = []
    for index1, element in enumerate(data_name_list):
        if data_tag == element[2]:
            data_file_name = element[0]
            break
    if data_file_name == []:
        pass
    else:
        path_data_file = data_folder_path + data_file_name
        data = np.load(path_data_file)
    return data

In [6]:
def RBM_init(vector_in_len, vector_out_len, sigmoid_switch, weight, bias):
    # memory allocate
    control_in_Buf = allocate(shape=(2,), dtype = np.uint8)
    sigmoid_switch_Buf = allocate(shape=(2,), dtype = np.uint8)
    vector_in_len_Buf = allocate(shape=(2,), dtype = np.uint16)
    vector_out_len_Buf = allocate(shape=(2,), dtype = np.uint16)
    weight_in_Buf = allocate(shape=(vector_in_len * vector_out_len), dtype = np.float32)
    bias_in_Buf = allocate(shape=(vector_out_len), dtype = np.float32)
    
    # sigmoid switch dma input
    sigmoid_switch_Buf[:] = np.uint8(sigmoid_switch)
    mm2s_sigmoid_switch_in.transfer(sigmoid_switch_Buf)
    mm2s_sigmoid_switch_in.wait()
    
    # RBM size dma input
    control_in_Buf[:] =  np.uint8(CTRL_SIZE_IN)  # CTRL_SIZE_IN
    mm2s_control_in.transfer(control_in_Buf)
    mm2s_control_in.wait()
    
    vector_in_len_Buf[:] = np.uint16(vector_in_len)
    mm2s_vector_in_len.transfer(vector_in_len_Buf)
    mm2s_vector_in_len.wait()
    
    vector_out_len_Buf[:] = np.uint16(vector_out_len)
    mm2s_vector_out_len.transfer(vector_out_len_Buf)
    mm2s_vector_out_len.wait()
    
    # weight dma input
    control_in_Buf[:] = np.uint8(CTRL_WEIGHT_IN)
    mm2s_control_in.transfer(control_in_Buf)
    mm2s_control_in.wait()
    
    weight_in_Buf[:] = np.float32(weight.reshape(-1,)[:])
    mm2s_weight_in.transfer(weight_in_Buf)
    mm2s_weight_in.wait()
    
    # bias dma input
    control_in_Buf[:] = np.uint8(CTRL_BIAS_IN)
    mm2s_control_in.transfer(control_in_Buf)
    mm2s_control_in.wait()
    
    bias_in_Buf[:] = np.float32(bias.reshape(-1)[:])
    mm2s_bias_in.transfer(bias_in_Buf)
    mm2s_bias_in.wait()
    
    # IDLE input
    control_in_Buf[:] = np.uint8(CTRL_IDLE)
    mm2s_control_in.transfer(control_in_Buf)
    mm2s_control_in.wait()

In [7]:
def sigmoid(vector_in):
    vector_out = 1/(1+np.exp(-vector_in))
    return vector_out

def RBM_computing_numpy(vector_in, weight, bias, vector_in_len, vector_out_len, N, sigmoid_en):
    vector_out = np.zeros([N * vector_out_len, 1])
    time_start = time.time()
    for i1 in np.arange(N):
        vector_out[i1*vector_out_len:(i1+1)*vector_out_len] = np.matmul(vector_in[i1*vector_in_len:(i1+1)*vector_in_len].T, weight[:, :]).T
        vector_out[i1*vector_out_len:(i1+1)*vector_out_len] = vector_out[i1*vector_out_len:(i1+1)*vector_out_len] + bias[:]
        if sigmoid_en != 0:
            vector_out[i1*vector_out_len:(i1+1)*vector_out_len] = sigmoid(vector_out[i1*vector_out_len:(i1+1)*vector_out_len])
    time_end = time.time()
    time_cost = time_end - time_start
    return vector_out, time_cost

def RBM_computing_fpga(vector_in, control_in_Buf, vector_in_Buf, vector_out_Buf):
    control_in_Buf[:] =  np.uint8(CTRL_START) 
    vector_in_Buf[:] = np.int16(vector_in.reshape(-1)[:])
    mm2s_control_in.wait()
    time_start = time.time()
    mm2s_control_in.transfer(control_in_Buf)
    mm2s_vector_in.transfer(vector_in_Buf)
    s2mm_vector_out.transfer(vector_out_Buf)
    mm2s_vector_in.wait()
    s2mm_vector_out.wait()
    time_end = time.time()
    time_cost = time_end - time_start
    vector_out = np.array(vector_out_Buf)
    return vector_out, time_cost

In [8]:
def numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch):
    N = int(32768)
    M = int(32768 / N)
    vector_in_len = 128
    vector_out_len = 16
    sigmoid_switch = 1

    parameters_tag = str(vector_in_len) + '_' + str(vector_out_len)
    weight = weight_read(parameters_tag)
    bias = hidden_bias_read(parameters_tag)
    data_tag = 'real_' + str(vector_in_len)
    data = data_read(data_tag)
    RBM_init(vector_in_len, vector_out_len, sigmoid_switch, weight, bias)

    RBM_np_out = np.zeros([32768, vector_out_len])
    # np_start_time = time.time()
    np_time_cost = 0
    for index1 in np.arange(M):
        RBM_in = data[index1*N:(index1+1)*N, :]
        vector_in = RBM_in.reshape(-1, 1)[:]
        vector_out, time_cost = RBM_computing_numpy(vector_in, weight, bias, vector_in_len, vector_out_len, N, sigmoid_switch)
        RBM_np_out[index1*N:(index1+1)*N, :] = vector_out.reshape(-1, vector_out_len)
        np_time_cost = np_time_cost + time_cost
    # np_end_time = time.time()
    # np_time_cost = np_end_time - np_start_time

    RBM_fpga_out = np.zeros([32768, vector_out_len])
    control_in_Buf = allocate(shape=(2,), dtype = np.uint8)
    vector_in_Buf = allocate(shape=(vector_in_len*N,), dtype = np.int16)
    vector_out_Buf = allocate(shape=(vector_out_len*N,), dtype = np.float32)
    # fpga_start_time = time.time()
    fpga_time_cost = 0
    for index1 in np.arange(M):
        RBM_in = data[index1*N:(index1+1)*N, :]
        vector_in = RBM_in.reshape(-1, 1)[:]
        vector_out, time_cost = RBM_computing_fpga(vector_in, control_in_Buf, vector_in_Buf, vector_out_Buf)
        RBM_fpga_out[index1*N:(index1+1)*N, :] = vector_out.reshape(-1, vector_out_len)
        fpga_time_cost = fpga_time_cost + time_cost
    # fpga_end_time = time.time()
    # fpga_time_cost = fpga_end_time - fpga_start_time
    absolute_error = np.sum(abs(RBM_np_out - RBM_fpga_out))/np.sum(abs(RBM_np_out))
    print(f'Numpy time cost = {np_time_cost}s')
    print(f'FPGA time cost = {fpga_time_cost}s')
    print(f'Numpy throughput = {vector_in_len*32768/np_time_cost/1e6}M')
    print(f'FPGA throughput cost = {vector_in_len*32768/fpga_time_cost/1e6}M')
    print(f'acceleration ratio = {np_time_cost/fpga_time_cost}')
    print(f'error = {absolute_error}')

In [9]:
vector_in_len = 128
vector_out_len = 16
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 12.284239530563354s
FPGA time cost = 0.08551740646362305s
Numpy throughput = 0.341437822794363M
FPGA throughput cost = 49.04620209435551M
acceleration ratio = 143.64607205187824
error = 4.593223112589297e-08


In [10]:
vector_in_len = 128
vector_out_len = 32
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 11.976669549942017s
FPGA time cost = 0.08833551406860352s
Numpy throughput = 0.3502062056993387M
FPGA throughput cost = 47.48151458928061M
acceleration ratio = 135.58159112133137
error = 4.593223112589297e-08


In [11]:
vector_in_len = 128
vector_out_len = 64
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 11.556419849395752s
FPGA time cost = 0.0882720947265625s
Numpy throughput = 0.36294146930109217M
FPGA throughput cost = 47.51562782091616M
acceleration ratio = 130.91815579083837
error = 4.593223112589297e-08


In [12]:
vector_in_len = 256
vector_out_len = 16
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 11.684956073760986s
FPGA time cost = 0.08829855918884277s
Numpy throughput = 0.3589490601011731M
FPGA throughput cost = 47.5013866424446M
acceleration ratio = 132.33461770050573
error = 4.593223112589297e-08


In [13]:
vector_in_len = 256
vector_out_len = 32
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 12.23525357246399s
FPGA time cost = 0.08833789825439453s
Numpy throughput = 0.34280482829056175M
FPGA throughput cost = 47.48023309227132M
acceleration ratio = 138.50514687624826
error = 4.593223112589297e-08


In [14]:
vector_in_len = 256
vector_out_len = 64
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 12.025245904922485s
FPGA time cost = 0.08817458152770996s
Numpy throughput = 0.3487915368352741M
FPGA throughput cost = 47.56817585441999M
acceleration ratio = 136.37996003580014
error = 4.593223112589297e-08


In [15]:
vector_in_len = 512
vector_out_len = 16
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 12.260785818099976s
FPGA time cost = 0.0884552001953125s
Numpy throughput = 0.3420909607447968M
FPGA throughput cost = 47.41726875004312M
acceleration ratio = 138.6101189192686
error = 4.593223112589297e-08


In [16]:
vector_in_len = 512
vector_out_len = 32
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 12.25828218460083s
FPGA time cost = 0.08831477165222168s
Numpy throughput = 0.34216082945691956M
FPGA throughput cost = 47.49266653280744M
acceleration ratio = 138.8021726747278
error = 4.593223112589297e-08


In [17]:
vector_in_len = 512
vector_out_len = 64
sigmoid_switch = 1
numpy_fpga_test(vector_in_len, vector_out_len, sigmoid_switch)

  vector_out = 1/(1+np.exp(-vector_in))


Numpy time cost = 11.975296974182129s
FPGA time cost = 0.08850908279418945s
Numpy throughput = 0.3502463453760366M
FPGA throughput cost = 47.388402044036916M
acceleration ratio = 135.300204183884
error = 4.593223112589297e-08
