# SAA GEMM测试
## 1. 加载Overlay

In [1]:
import time
import random
from pynq import Overlay
import numpy as np
from pynq import Xlnk
        
# Load the overlay from its bitstream file.
BITSTREAM_PATH = "saa.bit"
overlay = Overlay(BITSTREAM_PATH)
print("Overlay downloaded successfully!")



Overlay downloaded successfully!


## 2. 定义IP寄存器映射驱动

In [2]:
# Allocator object; later cells call xlnk.cma_array(...) on it to obtain
# the buffers whose physical addresses are handed to the IP.
xlnk = Xlnk()
# Handle to the systolic-array IP exposed by the loaded overlay.
systolic_array_ip = overlay.systolic_array_0

# Systolic-array driver function
def RunSystolic(array, din_a, din_b, bias, out, timeout=None):
    """Program the systolic-array IP registers, start it, and wait until it is done.

    Args:
        array: Register-mapped IP handle providing ``write(offset, value)``
            and ``read(offset)``.
        din_a: Left operand buffer. ``shape[0]`` and ``shape[1]`` are written
            to offsets 0x10 and 0x18, ``physical_address`` to 0x28.
        din_b: Right operand buffer. ``shape[1]`` is written to offset 0x20,
            ``physical_address`` to 0x30. Its ``shape[0]`` is never sent to
            the IP (presumably it must equal ``din_a.shape[1]`` -- confirm
            against the IP design).
        bias: Bias buffer; ``physical_address`` is written to offset 0x38.
        out: Output buffer; ``physical_address`` is written to offset 0x40.
        timeout: Optional maximum number of seconds to wait for the done bit.
            ``None`` (the default) waits indefinitely, which is the original
            behaviour.

    Raises:
        TimeoutError: If ``timeout`` is given and the done bit is still clear
            when it expires (e.g. the IP stalled on a bad address).
    """
    # Matrix dimensions.
    array.write(0x10, din_a.shape[0])
    array.write(0x18, din_a.shape[1])
    array.write(0x20, din_b.shape[1])
    # Physical addresses of the operand, bias and result buffers.
    array.write(0x28, din_a.physical_address)
    array.write(0x30, din_b.physical_address)
    array.write(0x38, bias.physical_address)
    array.write(0x40, out.physical_address)

    # Set bit 0 of the control register while preserving bit 7.
    # NOTE(review): this looks like the Vivado HLS ap_start / auto_restart
    # convention -- confirm against the generated driver header.
    array.write(0, (array.read(0) & 0x80) | 0x01)

    deadline = None if timeout is None else time.monotonic() + timeout

    # Poll bit 1 of the control register (presumably ap_done) until it is set.
    while True:
        status = array.read(0)
        if (status >> 1) & 0x1:
            return
        # Without a bound, a stalled IP would spin here forever.
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                "systolic array did not signal completion within %s s" % timeout
            )

# 脉动阵列驱动函数



## 3. 申请连续缓存并生成测试数据

In [3]:
# Problem size: buf_a is (row x col), buf_b is (col x col1),
# and the result is (row x col1).
row, col, col1 = 128, 16, 200

# Holds the software-computed reference result.
ref = np.zeros((row, col1), dtype=np.float32)

# Allocate the IP's input, output and bias buffers through xlnk.
buf_a = xlnk.cma_array(shape=(row, col), cacheable=0, dtype=np.float32)
buf_b = xlnk.cma_array(shape=(col, col1), cacheable=0, dtype=np.float32)
buf_c = xlnk.cma_array(shape=(row, col1), cacheable=0, dtype=np.float32)
bias = xlnk.cma_array(shape=(row,), cacheable=0, dtype=np.float32)

# Fill the operands with random test data, drawn in row-major order:
# buf_a from [-20, 20], buf_b from [-5, 5]. The bias is all zeros.
buf_a[:] = [[random.uniform(-20, 20) for _ in range(col)] for _ in range(row)]
bias[:] = 0.0
buf_b[:] = [[random.uniform(-5, 5) for _ in range(col1)] for _ in range(col)]

print("ready for test")

ready for test


## 4. 运行测试
### 4.1 软件矩阵乘法

In [4]:
# Software baseline: triple-loop matrix multiply buf_a x buf_b into ref.
# time.perf_counter() replaces time.clock(), which was deprecated in
# Python 3.3 and removed in 3.8, and which on Unix reported CPU time
# rather than elapsed time.
pt0 = time.perf_counter()

for r in range(row):
    for c1 in range(col1):
        # Dot product of row r of buf_a with column c1 of buf_b.
        tmp = 0.0
        for c in range(col):
            tmp += buf_a[r][c] * buf_b[c][c1]
        ref[r][c1] = tmp

pt1 = time.perf_counter()
time_sw = pt1 - pt0

print("pure software: %fs" % time_sw)

pure software: 117.487336s


### 4.2 硬件矩阵乘法

In [5]:
# Time one run of the systolic-array IP on the same operands.
# time.perf_counter() replaces time.clock(), which was deprecated in
# Python 3.3 and removed in 3.8, and which on Unix reported CPU time
# rather than elapsed time.
pt0 = time.perf_counter()

RunSystolic(systolic_array_ip, buf_a, buf_b, bias, buf_c)

pt1 = time.perf_counter()
time_hw = pt1 - pt0

print("hardware-accelerated: %fs" % time_hw)
# time_sw is the software timing measured in the software-GEMM cell.
print("speedup: %.2f" % (time_sw/time_hw))

hardware-accelerated: 0.491987s
speedup: 238.80


### 4.3 校验结果，计算加速比

In [6]:
def relative_err(ref, val):
    """Return the relative error of ``val`` with respect to ``ref``.

    Args:
        ref: Reference (expected) value.
        val: Value under test.

    Returns:
        ``|val - ref| / |ref|`` when ``ref`` is non-zero, otherwise the
        absolute error ``|val - ref|``. The result is never negative.
    """
    err = abs(val - ref)
    # Divide by the magnitude of ref: dividing by a negative ref would give a
    # negative "error" that can never exceed a positive tolerance.
    return err / abs(ref) if ref != 0 else err

# Scan the result in row-major order and stop at the first element whose
# relative error against the software reference exceeds 0.01.
first_bad = next(
    (
        (r, c)
        for r in range(row)
        for c in range(col1)
        if relative_err(ref[r][c], buf_c[r][c]) > 0.01
    ),
    None,
)
# True when no element exceeded the tolerance.
flag = first_bad is None

if flag:
    print("Test Passed!")
else:
    print("Test failed at (%d, %d)" % first_bad)

print("\nreference result: ")
print(ref)
print("\narray output:")
print(buf_c)

Test Passed!

reference result: 
[[-157.63024902 -204.63713074  -12.40508175 ..., -128.27319336
   249.55308533 -206.39657593]
 [ 182.0716095   -82.07845306 -174.98095703 ..., -134.14312744
    72.68138885 -209.85731506]
 [  73.51898193 -231.73474121 -141.91290283 ...,   95.98344421
   105.55813599  -79.91520691]
 ..., 
 [  63.57735062   22.40550995 -137.68212891 ...,  106.6142807    18.72444916
    18.9588356 ]
 [  18.03356743   15.57955837   84.51091766 ...,   -0.42201138
    78.21536255    3.26641178]
 [-131.02407837  145.26586914  218.48310852 ...,  -68.29304504 -236.0149231
    46.97134018]]

array output:
[[-157.63024902 -204.63711548  -12.40507889 ..., -128.27319336
   249.55307007 -206.39657593]
 [ 182.0716095   -82.07844543 -174.98097229 ..., -134.14312744
    72.68139648 -209.85728455]
 [  73.51898193 -231.73477173 -141.91290283 ...,   95.98344421
   105.55813599  -79.91521454]
 ..., 
 [  63.57734299   22.40550232 -137.68212891 ...,  106.6142807    18.72445297
    18.95882797