# SAA_top测试
## 0. 矩阵分块打包测试（pack_matrix_to_buffer）

In [12]:
import numpy as np
from pynq import allocate

# Demo dimensions: 11 is deliberately not a multiple of the block size, so
# zero padding is required.
matrix_rows = matrix_cols = 11
block_size = 4  # edge length of one square block (4x4 elements)

# Sample matrix holding 0 .. rows*cols-1 in row-major order.
matrix = np.arange(matrix_rows * matrix_cols).reshape(matrix_rows, matrix_cols)
print("Original matrix:", matrix, sep="\n")

# Round each dimension up to the next multiple of block_size.
new_rows = -(-matrix_rows // block_size) * block_size
new_cols = -(-matrix_cols // block_size) * block_size

# Zero-pad on the bottom/right edges up to the rounded size.
padded_matrix = np.pad(
    matrix,
    ((0, new_rows - matrix_rows), (0, new_cols - matrix_cols)),
    mode="constant",
)
print("Padded matrix:", padded_matrix, sep="\n")

# Contiguous buffer with one int32 slot per element of the padded matrix.
buffer_size = padded_matrix.size
buffer = allocate(shape=(buffer_size,), dtype=np.int32)

# 矩阵按块大小分割并打包到缓冲区
def pack_matrix_to_buffer(matrix, block_size, buffer):
    """Pack a 2-D matrix into a flat buffer, one square block at a time.

    The matrix is cut into ``block_size`` x ``block_size`` tiles. Tiles are
    visited in row-major order (all tiles of the first block row, then the
    second, ...) and the elements of each tile are written row-major, so
    every tile occupies ``block_size * block_size`` consecutive buffer slots.
    Tiles that overhang the bottom/right edge of the matrix are zero-padded.

    Args:
        matrix: 2-D array-like to pack.
        block_size: Positive edge length of a tile.
        buffer: 1-D writable array with at least
            ``ceil(rows / block_size) * ceil(cols / block_size) * block_size**2``
            elements. Slots beyond the packed data are left unchanged.

    Raises:
        ValueError: If ``matrix`` is not 2-D, ``block_size`` is not positive,
            or ``buffer`` is too small. All checks run before anything is
            written, so a failed call never leaves a half-written buffer.
    """
    matrix = np.asarray(matrix)
    if matrix.ndim != 2:
        raise ValueError("matrix must be 2-D, got %d dimension(s)" % matrix.ndim)
    if block_size <= 0:
        raise ValueError("block_size must be positive, got %r" % (block_size,))

    rows, cols = matrix.shape
    # Number of tiles along each axis (ceiling division).
    tile_rows = -(-rows // block_size)
    tile_cols = -(-cols // block_size)
    packed_size = tile_rows * tile_cols * block_size * block_size
    if len(buffer) < packed_size:
        raise ValueError(
            "buffer too small: need %d elements, got %d" % (packed_size, len(buffer))
        )

    # Zero-pad once to a whole number of tiles instead of padding every
    # edge tile separately.
    padded = np.zeros((tile_rows * block_size, tile_cols * block_size), dtype=matrix.dtype)
    padded[:rows, :cols] = matrix

    # Axes after reshape: (tile_row, row_in_tile, tile_col, col_in_tile).
    # Moving tile_col ahead of row_in_tile makes each tile contiguous once the
    # array is flattened.
    tiles = padded.reshape(tile_rows, block_size, tile_cols, block_size)
    buffer[:packed_size] = tiles.transpose(0, 2, 1, 3).reshape(-1)

# Pack the padded matrix into the contiguous buffer, block by block.
pack_matrix_to_buffer(padded_matrix, block_size, buffer)

# Dump the flat buffer so the block ordering can be checked by eye.
print("Buffer content (flattened):", buffer, sep="\n")

Original matrix:
[[  0   1   2   3   4   5   6   7   8   9  10]
 [ 11  12  13  14  15  16  17  18  19  20  21]
 [ 22  23  24  25  26  27  28  29  30  31  32]
 [ 33  34  35  36  37  38  39  40  41  42  43]
 [ 44  45  46  47  48  49  50  51  52  53  54]
 [ 55  56  57  58  59  60  61  62  63  64  65]
 [ 66  67  68  69  70  71  72  73  74  75  76]
 [ 77  78  79  80  81  82  83  84  85  86  87]
 [ 88  89  90  91  92  93  94  95  96  97  98]
 [ 99 100 101 102 103 104 105 106 107 108 109]
 [110 111 112 113 114 115 116 117 118 119 120]]
Padded matrix:
[[  0   1   2   3   4   5   6   7   8   9  10   0]
 [ 11  12  13  14  15  16  17  18  19  20  21   0]
 [ 22  23  24  25  26  27  28  29  30  31  32   0]
 [ 33  34  35  36  37  38  39  40  41  42  43   0]
 [ 44  45  46  47  48  49  50  51  52  53  54   0]
 [ 55  56  57  58  59  60  61  62  63  64  65   0]
 [ 66  67  68  69  70  71  72  73  74  75  76   0]
 [ 77  78  79  80  81  82  83  84  85  86  87   0]
 [ 88  89  90  91  92  93  94  95  96  97 

## 1. 加载Overlay

In [1]:
import numpy as np
from pynq import allocate
import random
import time
import saa_top_driver
from saa_insn_driver import * 
from saa_utils import * 
# Create the SaaDriver instance for the "saa_top.bit" bitstream (the recorded
# output below shows the overlay being downloaded at this point).
saa_driver = saa_top_driver.SaaDriver("saa_top.bit")

saa_top Overlay downloaded successfully!


## 3.连续缓存申请

In [5]:
# Instruction-buffer capacity; only the commented-out instruct_buffer
# allocation below refers to it.
# NOTE(review): the original comment said "at most 2000 instructions" while
# the value is 4000 - confirm which one is intended.
insn_count = 4000
# Blocking factor later passed to blocked_gemm_test: twice MATRIX_WIDTH.
block_size = MATRIX_WIDTH * 2
# Problem size of one batch: input is row x col, weight is col x col1,
# bias and output are row x col1.
row = col = col1 = MATRIX_WIDTH * 6

# PS-side contiguous buffers, allocated with cacheable=0; mind the dtype of
# each buffer.
# instruct_buffer = allocate(shape = (insn_count), cacheable = 0, dtype = Instruct_DataType)
input_buffer = allocate(shape=row * col, dtype=Input_DataType, cacheable=0)
weight_buffer = allocate(shape=col * col1, dtype=Weight_DataType, cacheable=0)
bias_buffer = allocate(shape=row * col1, dtype=Output_DataType, cacheable=0)
output_buffer = allocate(shape=row * col1, dtype=Output_DataType, cacheable=0)

## 4.测试数据生成

In [6]:
# Draw the random test matrices; the fixed seed makes every run produce the
# same data. The draw order (input, weight, bias) determines the values.
np.random.seed(2)
input_matrix = np.random.randint(low=0, high=100, size=(row, col), dtype=Input_DataType)
weight_matrix = np.random.randint(low=0, high=100, size=(col, col1), dtype=Weight_DataType)
bias_matrix = np.random.randint(low=0, high=100, size=(row, col1), dtype=Output_DataType)
print("Randomly generated input buffer:", input_matrix, sep="\n")
print("\nRandomly generated weight buffer:", weight_matrix, sep="\n")
print("\nRandomly generated bias buffer:", bias_matrix, sep="\n")

# Pack every matrix into its contiguous buffer in MATRIX_WIDTH-sized blocks.
pack_matrix_to_buffer(input_matrix, MATRIX_WIDTH, input_buffer)
pack_matrix_to_buffer(weight_matrix, MATRIX_WIDTH, weight_buffer)
pack_matrix_to_buffer(bias_matrix, MATRIX_WIDTH, bias_buffer)
print("Packed input buffer:", input_buffer, sep="\n")
print("Packed weight buffer:", weight_buffer, sep="\n")
print("Packed bias buffer:", bias_buffer, sep="\n")

# Widen the operands to int32 so the software reference cannot overflow.
input_matrix_int32 = input_matrix.astype(np.int32)
weight_matrix_int32 = weight_matrix.astype(np.int32)
bias_matrix_int32 = bias_matrix.astype(np.int32)
# Software reference: input x weight + bias, timed with perf_counter.
pt0 = time.perf_counter()
result_matrix = input_matrix_int32 @ weight_matrix_int32 + bias_matrix_int32
pt1 = time.perf_counter()
time_sw = pt1 - pt0
print("pure software: %fs" % time_sw)
# Show the reference result.
print("Matrix multiplication result:", result_matrix, sep="\n")

Randomly generated input buffer:
[[40 92 29 15 10 97 47 25]
 [35  6 72 22 46 54 12 43]
 [30 82 73 75  4 24 57 29]
 [ 7 45 14 82 34 82 16 84]
 [49  1  8 39 95 90 99 52]
 [75 42 50 85 10  8 30 47]
 [20 53 30 63 43 54 76 31]
 [52 90 74 78 68 20 25 33]]

Randomly generated weight buffer:
[[32  2 31 78 81 37 91  7]
 [39 74 90 46 67 84 34 52]
 [33  4 91  3 42 83 56 22]
 [55 47 51 80 65 56 57 11]
 [73 38 44 66 31 30 90 33]
 [62 84 11 58 78  6 83 69]
 [64 56 88 67 93 78 45 69]
 [37 99 20 88 84 57 68 78]]

Randomly generated bias buffer:
[[46 70 95 83 31 66 80 52]
 [76 50  4 90 63 79 49 39]
 [46  8 50 15  8 17 22 73]
 [57 90 62 83 96 43 32 26]
 [ 8 76 10 40 34 60  9 70]
 [86 70 19 56 82  1 68 40]
 [81 61 70 97 18 84 90 87]
 [22 43 52 74 72 90 99 91]]
Packed input buffer:
[40 92 29 15 35  6 72 22 30 82 73 75  7 45 14 82 10 97 47 25 46 54 12 43
  4 24 57 29 34 82 16 84 49  1  8 39 75 42 50 85 20 53 30 63 52 90 74 78
 95 90 99 52 10  8 30 47 43 54 76 31 68 20 25 33]
Packed weight buffer:
[32  2 31

## 分块矩阵乘法

In [7]:
# Run the blocked GEMM on the accelerator; the final argument (1) is reported
# as bias_use in the recorded log.
blocked_gemm_test(
    saa_driver,
    row, col1, col,
    input_buffer, weight_buffer, bias_buffer, output_buffer,
    block_size,
    1,
)
# Show the raw output buffer, then unpack it into a (row, col1) matrix for
# inspection.
print("output_buffer result:", output_buffer, sep="\n")
output_matrix = np.zeros(shape=(row, col1), dtype=Output_DataType)
unpack_buffer_to_matrix(output_matrix, MATRIX_WIDTH, output_buffer)
print("un_pack result:", output_matrix, sep="\n")
# Zero the output buffer.
output_buffer[:] = 0
# del output_buffer

INFO - Blocked GEMM test: dim_I=8, dim_J=8, dim_K=8, block=8, bias_use=1
insn_idx: 7
done: 3
INFO - Saa run time: 0.001419s
INFO - Synchronization time: 1.418724ms
INFO - Throughput: 0.001083GOPs/s
output_buffer result:
[17373 21414 19162 20357 14081 13099 13837 15828 17239 18184 24864 20373
 18706 24736 15271 24765 25975 18494 22093 18135 18120 14177 20810 11848
 25606 24637 20792 14961 25590 17384 23955 18061 24799 23975 19258 29751
 15334 14908 19117 22058 19741 20599 21197 23015 20953 19700 26548 24959
 30481 18881 31184 21302 24268 20581 22076 11402 25761 20603 22863 17143
 27257 26075 27630 15544]
un_pack result:
[[17373 21414 19162 20357 25975 18494 22093 18135]
 [14081 13099 13837 15828 18120 14177 20810 11848]
 [17239 18184 24864 20373 25606 24637 20792 14961]
 [18706 24736 15271 24765 25590 17384 23955 18061]
 [24799 23975 19258 29751 30481 18881 31184 21302]
 [15334 14908 19117 22058 24268 20581 22076 11402]
 [19741 20599 21197 23015 25761 20603 22863 17143]
 [20953 19700 26

In [9]:
# Draw a second set of random test matrices; the fixed seed makes every run
# produce the same data. No new bias matrix is generated in this cell.
np.random.seed(3)
input_matrix = np.random.randint(0, 100, size=(row, col), dtype=Input_DataType)
weight_matrix = np.random.randint(0, 100, size=(col, col1), dtype=Weight_DataType)
print("Randomly generated input buffer:")
print(input_matrix)
print("\nRandomly generated weight buffer:")
print(weight_matrix)

# Pack both matrices into their contiguous buffers in MATRIX_WIDTH-sized blocks.
pack_matrix_to_buffer(input_matrix, MATRIX_WIDTH, input_buffer)
pack_matrix_to_buffer(weight_matrix, MATRIX_WIDTH, weight_buffer)
print("Packed input buffer:")
print(input_buffer)
# This label previously read "Packed input buffer:" although the weight
# buffer is what gets printed.
print("Packed weight buffer:")
print(weight_buffer)

# Widen the operands to int32 so the software reference cannot overflow.
input_matrix_int32 = input_matrix.astype(np.int32)
weight_matrix_int32 = weight_matrix.astype(np.int32)
# Software reference: input x weight (no bias term), timed with perf_counter.
pt0 = time.perf_counter()
result_matrix = np.dot(input_matrix_int32, weight_matrix_int32)
pt1 = time.perf_counter()
time_sw = pt1 - pt0
print("pure software: %fs" % time_sw)
# Show the reference result.
print("Matrix multiplication result:")
print(result_matrix)

Randomly generated input buffer:
[[23  1 13 24  6 27 18 44 73 53  3  6 87 56 59 74]
 [72 99 15 31  0 25 69  2 21 99 62 17 43 24 19 74]
 [99 74 43 76  4 91 38 32 41 70 10 10 94 13 53 21]
 [27 63 44 13 38 23 96 88 20 95 49 44 37  7 93 54]
 [37 39  5 14 25  3 77 46 38 26 14 48 55 81 74 71]
 [90  5 25 90 22 10 31 45 66 99 29  2 73 66 23 63]
 [31 78 60 45 35  6  1 50 13 68 51 15 17 15 90  3]
 [65 69 94 94 66 27  1 18 68 97  2 13 29 25 21 72]
 [38 78 24 49 57 44 22 62 19 65  7 42 97 70 40 40]
 [43 44 97 33 30 11 89 65 79 53 66 71 48 80 76 81]
 [37 87 78 20 28 65 94 59 82 56 49 31 21 21 82 99]
 [ 1 55 78 86 54 70  5 28 86 90 31  9 36 54 86 33]
 [44  0 68 71 83 64 34 32 21 79 18 33  3 88 69 36]
 [63 55 68 96 37 62 57 56  8 27 23 56 71 69 39 37]
 [ 1 73 38 78 14 46 62 44 54 33  2 15 92  8  5  1]
 [44 29 57 85 31 86 80 48 17 51 17 16 74 12 62  7]]

Randomly generated weight buffer:
[[31 68 75 16 28 99 50 74 91 10 15 52 16 42 89 84]
 [39 80 75 92 32 78 37 41 16 43 20 22 19 94 18 41]
 [18 72 75 31

In [10]:
# Run the blocked GEMM again; the final argument is 0 this time (the recorded
# log reports it as bias_use=0).
blocked_gemm_test(
    saa_driver,
    row, col1, col,
    input_buffer, weight_buffer, bias_buffer, output_buffer,
    block_size,
    0,
)
# Inspect the output buffer.
print(output_buffer)

INFO - Blocked GEMM test: dim_I=16, dim_J=16, dim_K=16, block=8, bias_use=0
compute_insn_count: 65
insn_size: 130
insn_idx: 90
done: 2
INFO - Saa run time: 0.000494s
INFO - Synchronization time: 0.494095ms
INFO - Throughput: 0.024870GOPs/s
[[24987 29480 25807 28555 27965 32766 27694 13375 23178 19190 22626 30452
  23071 30627 28060 22068]
 [34062 35328 40731 29885 20304 39995 27828 24688 25476 26291 25811 38416
  26828 35212 33943 36255]
 [35518 42848 45164 34488 28638 45081 38295 22877 35412 31074 27533 38223
  30317 48155 39227 34637]
 [38598 39159 42572 33418 33310 42301 37854 29117 32001 31627 36325 44065
  28905 46872 37960 40761]
 [30109 33346 28580 29936 33739 37535 32295 21067 25283 24686 25860 33029
  24698 37175 32950 29071]
 [39235 36115 40943 32164 29017 44383 35203 23209 31738 24161 32013 44321
  31642 35284 34913 35455]
 [28696 28542 35681 30552 24798 31369 32104 18886 23136 22893 28179 28754
  20099 35092 22651 26708]
 [38936 39875 47241 38118 24559 45479 35422 18750 267

## 7. 回收缓冲区

In [5]:
# Release the contiguous buffers once the results have been inspected.
# Previously only output_buffer was dropped, which left the input, weight and
# bias buffers allocated; all four come from the same allocation cell, so
# they are released together here.
del input_buffer, weight_buffer, bias_buffer, output_buffer

In [None]:
# instruct_buffer = allocate(shape = (1), cacheable = 0, dtype = Instruct_DataType)
# instructions = [] #临时存储指令
# # insn_test = getWeightPreloadComputeInsn(
# #     1,
# #     1,
# #     1,
# #     1,
# #     1,
# #     1)

# # insn_test = getWeightPreloadInsn(1, 1)

# # insn_test = get2DLoadStoreInsn(
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1)

# insn_test = getComputeInsn(1, 
#                         1, 
#                         1, 
#                         1)
    
# instructions.append(insn_test)
# # 将生成的指令批量存入缓冲区，然后启动saa让其读取指令
# for i, instruction in enumerate(instructions):
#     instruct_buffer[i] = np.frombuffer(instruction, dtype=Instruct_DataType)
#     print(instruct_buffer[i])
#     print_binary(instruct_buffer[i]) # 输出指令的二进制表示


# from pynq import allocate
# import time
# import numpy as np
# wait_cycles = 100000 # 定义一次最多等待周期为1000万周期
# def blocked_gemm_test(saa_driver,
#               dim_I, 
#               dim_J, 
#               dim_K, 
#               input, 
#               weight,
#               bias, 
#               output, 
#               block, 
#               bias_use):
    
#     print("=====================================================================================")
#     print(f"INFO - Blocked GEMM test: dim_I={dim_I}, dim_J={dim_J}, dim_K={dim_K}, block={block}, bias_use={bias_use}")
    
#     # 计算分块
#     dim_I_block = dim_I // MATRIX_WIDTH
#     dim_J_block = dim_J // MATRIX_WIDTH
#     dim_K_block = dim_K // MATRIX_WIDTH

#     # 计算指令数量
#     insn_load_size = (dim_I_block * dim_K_block) + (dim_J_block * dim_K_block)
#     insn_compute_size = 2 * dim_I_block * dim_K_block * dim_J_block  # 不使用权重复用
# #     insn_compute_size = (dim_I_block + 1) * dim_K_block * dim_J_block  # 使用权重复用
# #     insn_compute_size = dim_I_block * dim_K_block * dim_J_block + 1  # 使用权重复用和双缓冲
#     insn_store_size = dim_I_block * dim_J_block
#     insn_size = insn_load_size + insn_store_size + insn_compute_size + 1

#     # 初始化指令队列
#     insn_buf = allocate(shape = (insn_size), cacheable = 0, dtype = Instruct_DataType)
#     insn_idx = 0
    
#     # 生成加载Input指令
#     for i in range(dim_I_block):
#         for k in range(dim_K_block):
#             buffer_start = 0
#             dram_start = 0
#             A_block = i*dim_K_block+k
#             buffer_offset = buffer_start + A_block * MATRIX_WIDTH
#             dram_offset = dram_start + i * dim_K_block * MATRIX_WIDTH * MATRIX_WIDTH + k * MATRIX_WIDTH
#             insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_LOAD, 
#                                       INPUT_BUFFER_ID, 
#                                       buffer_offset, 
#                                       dram_offset, 
#                                       MATRIX_WIDTH, 
#                                       MATRIX_WIDTH, 
#                                       dim_K)
#             insn_idx += 1

#     # 生成加载weight指令
#     for k in range(dim_K_block):
#         for j in range(dim_J_block):
#             buffer_start = 0
#             dram_start = 0
#             A_block = k * dim_J_block + j
#             buffer_offset = buffer_start + A_block * MATRIX_WIDTH
#             dram_offset = dram_start + k * dim_J_block * MATRIX_WIDTH * MATRIX_WIDTH + j * MATRIX_WIDTH 
#             insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_LOAD, 
#                                       WEIGHT_BUFFER_ID, 
#                                       buffer_offset, 
#                                       dram_offset, 
#                                       MATRIX_WIDTH, 
#                                       MATRIX_WIDTH, 
#                                       dim_J)
#             insn_idx += 1
    
#     # 生成计算指令
#     # 用于切换权重寄存器，最先使用 weight1
#     pingpang = 0
#     wb_start_addr = 0
#     input_start_addr = 0
#     output_start_addr = 0
#     weight_offset = 0
#     output_offset = 0
#     input_offset = 0
#     accumulate = 0
    
#     # 初始化指令计数
#     compute_count = insn_idx

#     # 迭代公共维度块和输出列块
#     for k in range(dim_K_block):
#         for j in range(dim_J_block):
#             # 计算权重偏移
#             weight_offset = wb_start_addr + (k * dim_J_block + j) * MATRIX_WIDTH
#             accumulate = 0 if k == 0 else 1

#             # 第一次加载权重，使用初始寄存器，无法双缓冲
#             if k == 0 and j == 0:
#                 insn_buf[insn_idx] = getWeightPreloadInsn(weight_offset, pingpang)
#                 insn_idx += 1
#             else:
#                 # 剩下的权重加载可以进行双缓冲
# #                 insn_buf[insn_idx] = getWeightPreloadComputeInsn(
# #                     input_offset,
# #                     weight_offset,
# #                     output_offset,
# #                     pingpang,
# #                     pingpang,
# #                     accumulate)
# #                 insn_idx += 1
    
# #                 insn_buf[insn_idx] = getComputeInsn(input_offset, 
# #                                         output_offset, 
# #                                         pingpang, 
# #                                         accumulate)
# #                 insn_idx += 1
#                 insn_buf[insn_idx] = getComputeInsn(input_offset, 
#                                         output_offset, 
#                                         pingpang, 
#                                         accumulate)
#                 insn_idx += 1
#                 insn_buf[insn_idx] = getWeightPreloadInsn(weight_offset, not pingpang)
#                 insn_idx += 1
                
#                 pingpang = not pingpang  # 切换加载寄存器和计算寄存器


#             # 迭代输出行块
#             for i in range(dim_I_block):
#                 output_offset = output_start_addr + (i * dim_J_block + j) * MATRIX_WIDTH
#                 input_offset = input_start_addr + (i * dim_K_block + k) * MATRIX_WIDTH

#                 # 如果不是最后一个计算，使用 getComputeInsn 计算
#                 if i != dim_I_block - 1:
#                     insn_buf[insn_idx] = getComputeInsn(input_offset, 
#                                             output_offset, 
#                                             pingpang, 
#                                             accumulate)
#                     insn_idx += 1
#                 # 如果是最后一个权重块，使用当前寄存器进行计算
#                 if i == dim_I_block - 1 and j == dim_J_block - 1 and k == dim_K_block - 1:
#                     insn_buf[insn_idx] = getComputeInsn(input_offset, 
#                                             output_offset, 
#                                             pingpang, 
#                                             accumulate)
#                     insn_idx += 1
# #             if k != 0 and j != 0:
# #             insn_buf[insn_idx] = getComputeInsn(input_offset, 
# #                                     output_offset, 
# #                                     pingpang, 
# #                                     accumulate)
#             insn_idx += 1
    
    
#     # 更新计算指令的数量
#     compute_count = insn_idx - compute_count
#     print(f"compute_count: {compute_count}")    

    
#     # 生成存储指令
#     for i in range(dim_I_block):
#         for j in range(dim_J_block):
#             buffer_start = 0
#             dram_start = 0
#             A_block = i * dim_J_block + j
#             buffer_offset = buffer_start + A_block * MATRIX_WIDTH
#             dram_offset = dram_start + i * dim_J_block * MATRIX_WIDTH * MATRIX_WIDTH + j * MATRIX_WIDTH
#             insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_STORE, 
#                                        OUTPUT_BUFFER_ID, 
#                                        buffer_offset, 
#                                        dram_offset, 
#                                        MATRIX_WIDTH, 
#                                        MATRIX_WIDTH, 
#                                        dim_J)
#             insn_idx += 1
            
#     # 生成结束指令
#     insn_buf[insn_idx] = getFinishInsn()
#     insn_idx += 1

#     print("insn_size",insn_size)
#     print("insn_idx",insn_idx)
#     print("insn",insn_idx)
#     for i in range(insn_idx):
#         if i>=insn_load_size and i<insn_load_size+compute_count:
#             print_binary(insn_buf[i])
            
#     # 运行SAA硬件
#     pt0 = time.perf_counter()
#     saa_driver.run_saa(insn_idx,
#            insn_buf.physical_address,
#            input.physical_address,
#            weight.physical_address,
#            output.physical_address,
#            wait_cycles)
#     pt1 = time.perf_counter()
#     time_sw = pt1 - pt0
#     print("saa run time: %fs" % time_sw)     
    
#     # 计算吞吐量
    
    
#     return 0 
