# SAA_top测试
## 1. 加载Overlay

In [15]:
import numpy as np
from pynq import allocate
import random
import time
import saa_top_driver
from saa_insn_driver import * 
# 创建 SaaDriver 实例
saa_driver = saa_top_driver.SaaDriver("saa_top.bit")

saa_top Overlay downloaded successfully!


## 2. 数据类型定义

In [16]:
# 定义指令的数据类型
Instruct_DataType = np.dtype([('low', np.uint64), ('high', np.uint64)]) # 定义一个由两个64位整数组成的128位数据类型
Input_DataType =  np.int8
Weight_DataType = np.int8
Output_DataType = np.int32

## 3.连续缓存申请

In [17]:
# 定义指令缓冲区大小
insn_count = 4000 # 最多能容纳2000条指令

# 定义buffer大小,这是执行一个批量的大小
row = 4
col = 4
col1 = 4

# 定义PS端缓冲区,不使用cache，数据类型注意
instruct_buffer = allocate(shape = (insn_count), cacheable = 0, dtype = Instruct_DataType)
input_buffer = allocate(shape = (row, col), cacheable = 0, dtype = Input_DataType)
weight_buffer = allocate(shape = (col, col1), cacheable = 0, dtype = Weight_DataType)
output_buffer  = allocate(shape = (row,col1), cacheable = 0, dtype = Input_DataType)

## 4.测试数据生成

In [18]:
# 随机生成矩阵并存储到相应的数据缓冲区中
np.random.seed(2)  # 设置随机种子以确保生成的随机数相同
input_buffer[:] = np.random.randint(0, 100, size=(row, col), dtype=np.int8)
weight_buffer[:] = np.random.randint(0, 100, size=(col, col1), dtype=np.int8)

# 将输入矩阵转换为np.int32类型，以避免溢出
input_buffer_int32 = input_buffer.astype(np.int32)
weight_buffer_int32 = weight_buffer.astype(np.int32)

# 打印生成的随机矩阵
print("Randomly generated input buffer:")
print(input_buffer)

print("\nRandomly generated weight buffer:")
print(weight_buffer)

# 定义input_buffer和weight_buffer的矩阵乘法结果的结果矩阵
pt0 = time.perf_counter()
result_matrix = np.dot(input_buffer_int32, weight_buffer_int32)
pt1 = time.perf_counter()
time_sw = pt1 - pt0
print("pure software: %fs" % time_sw)


# 打印矩阵乘法结果
print("Matrix multiplication result:")
print(result_matrix)


Randomly generated input buffer:
[[40 92 29 15]
 [10 97 47 25]
 [35  6 72 22]
 [46 54 12 43]]

Randomly generated weight buffer:
[[82 73 75  4]
 [24 57 29  7]
 [45 14 82 34]
 [82 16 84 49]]
pure software: 0.000515s
Matrix multiplication result:
[[ 8023  8810  9306  2525]
 [ 7313  7317  9517  3542]
 [ 8058  4257 10551  3708]
 [ 9134  7292  9612  3077]]


## 5.指令生成

In [21]:
# 定义本次执行的指令数量
now_insn_count = 6
instructions = [] #临时存储指令
# 加载矩阵A
load_A_insn = create_load_instruction(
    opcode=OPCODE_LOAD,
    buffer_id=INPUT_BUFFER_ID,  # 假设的buffer ID
    dram_addr=0,  # 假设的DRAM地址
    buffer_addr=0,  # 假设的buffer地址
    y_size=MATRIX_WIDTH,  # 假设的传输尺寸y
    x_size=MATRIX_WIDTH,  # 假设的传输尺寸x
    x_stride=MATRIX_WIDTH  # 假设的传输步进
)
instructions.append(load_A_insn)
# 加载矩阵B
load_B_insn = create_load_instruction(
    opcode=OPCODE_LOAD,
    buffer_id=WEIGHT_BUFFER_ID,  # 假设的buffer ID
    dram_addr=0,  # 假设的DRAM地址
    buffer_addr=0,  # 假设的buffer地址
    y_size=MATRIX_WIDTH,  # 假设的传输尺寸y
    x_size=MATRIX_WIDTH,  # 假设的传输尺寸x
    x_stride=MATRIX_WIDTH  # 假设的传输步进
)
instructions.append(load_B_insn)
# 计算A*B = C
# 预加载权重
compute_preload_insn = create_compute_instruction(
    OPCODE_COMPUTE,  # 操作码
    WEIGHT_PRELOAD,  # 计算类型
    0,  # 权重地址
    0,  # 输入地址
    0,  # 输出地址
    0,  # 权重切换
    0,  # 计算切换
    0  # 不累加
)
instructions.append(compute_preload_insn)
# 计算
compute_insn = create_compute_instruction(
    OPCODE_COMPUTE,  # 操作码
    COMPUTE,  # 计算类型
    0,  # 权重地址
    0,  # 输入地址
    0,  # 输出地址
    0,  # 权重切换
    0,  # 计算切换
    0  # 不累加
)
instructions.append(compute_insn)
# 缓存矩阵C
store_C_insn = create_load_instruction(
    opcode=OPCODE_STORE,
    buffer_id=INPUT_BUFFER_ID,  # 假设的buffer ID
    dram_addr=0,#设的DRAM地址
    buffer_addr=0,  # 假设的buffer地址
    y_size=1,#的传输尺寸y
    x_size=2*MATRIX_WIDTH,#设的传输尺寸x,
    x_stride=MATRIX_WIDTH  # 假设的传输步进
)
instructions.append(store_C_insn)
# 计算完成指令，使得软件运算结束
done_insn = create_load_instruction(
    opcode=OPCODE_DONE,
    buffer_id=0,  # 假设的buffer ID
    dram_addr=0,  # 假设的DRAM地址
    buffer_addr=0,  # 假设的buffer地址
    y_size=0,# 假设的传输尺寸y
    x_size=0,  # 假设的传输尺寸x
    x_stride=0  # 假设的传输步进
)
instructions.append(done_insn)
print(done_insn)

# 将生成的指令批量存入缓冲区，然后启动saa让其读取指令
for i, instruction in enumerate(instructions):
    instruct_buffer[i] = np.frombuffer(instruction, dtype=Instruct_DataType)
    print(instruct_buffer[i])
    print_binary(instruct_buffer[i]) # 输出指令的二进制表示


b'\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
(8, 17180131332)
00000000000000000000000000000100000000000000010000000000000001000000000000000000000000000000000000000000000000000000000000001000
(0, 17180131332)
00000000000000000000000000000100000000000000010000000000000001000000000000000000000000000000000000000000000000000000000000000000
(1, 0)
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001
(9, 0)
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001001
(10, 17180655617)
00000000000000000000000000000100000000000000110000000000000000010000000000000000000000000000000000000000000000000000000000001010
(3, 0)
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011


## 6.运行saa进行测试

In [22]:
# 测试写入done信号并且读取done信号
done_flag = saa_driver.read_ip_register(0x48) # 从computeIP的done寄存器读取本次指令是否执行完毕
print(done_flag)

# 测试运行saa硬件
wait_cycles = 100000 # 定义一次最多等待周期为1000万周期
pt0 = time.perf_counter()
saa_driver.run_saa(now_insn_count,
       instruct_buffer.physical_address,
       input_buffer.physical_address,
       weight_buffer.physical_address,
       output_buffer.physical_address,
       wait_cycles)
pt1 = time.perf_counter()
time_sw = pt1 - pt0
print("pure software: %fs" % time_sw)

# 打印
done_flag = saa_driver.read_ip_register(0x48) # 从computeIP的done寄存器读取本次指令是否执行完毕
print(done_flag)

print(instruct_buffer.physical_address)
print(input_buffer.physical_address)
print(weight_buffer.physical_address)
print(output_buffer.physical_address)
print(instruct_buffer)
print(input_buffer)
print(weight_buffer)
print(output_buffer)


1
done: 0
pure software: 0.002208s
1
907804672
1999147008
2000134144
1999179776
[(8, 17180131332) (0, 17180131332) (1,           0) ... (0,           0)
 (0,           0) (0,           0)]
[[40 92 29 15]
 [10 97 47 25]
 [35  6 72 22]
 [46 54 12 43]]
[[82 73 75  4]
 [24 57 29  7]
 [45 14 82 34]
 [82 16 84 49]]
[[40 92 29 15]
 [10 97 47 25]
 [35  6 72 22]
 [46 54 12 43]]


## 7. 回收缓冲区

In [30]:
# 查看完成后清空缓冲区
del output_buffer
del instruct_buffer