# SAA_top测试
## 1. 加载Overlay

Original matrix:
[[  0   1   2   3   4   5   6   7   8   9  10]
 [ 11  12  13  14  15  16  17  18  19  20  21]
 [ 22  23  24  25  26  27  28  29  30  31  32]
 [ 33  34  35  36  37  38  39  40  41  42  43]
 [ 44  45  46  47  48  49  50  51  52  53  54]
 [ 55  56  57  58  59  60  61  62  63  64  65]
 [ 66  67  68  69  70  71  72  73  74  75  76]
 [ 77  78  79  80  81  82  83  84  85  86  87]
 [ 88  89  90  91  92  93  94  95  96  97  98]
 [ 99 100 101 102 103 104 105 106 107 108 109]
 [110 111 112 113 114 115 116 117 118 119 120]]
Padded matrix:
[[  0   1   2   3   4   5   6   7   8   9  10   0]
 [ 11  12  13  14  15  16  17  18  19  20  21   0]
 [ 22  23  24  25  26  27  28  29  30  31  32   0]
 [ 33  34  35  36  37  38  39  40  41  42  43   0]
 [ 44  45  46  47  48  49  50  51  52  53  54   0]
 [ 55  56  57  58  59  60  61  62  63  64  65   0]
 [ 66  67  68  69  70  71  72  73  74  75  76   0]
 [ 77  78  79  80  81  82  83  84  85  86  87   0]
 [ 88  89  90  91  92  93  94  95  96  97 

## 1. 加载Overlay

In [11]:
import numpy as np
from pynq import allocate
import random
import time
from saa_insn_driver_last import * 
from saa_utils import * 
from pynq import Overlay

# Load the FPGA bitstream "saa1.bit" through the PYNQ Overlay class. The
# resulting `overlay` object is where the IP handles used later are taken from.
overlay = Overlay("saa1.bit")
print("saa Overlay downloaded successfully!")

# Helper that writes a single register of an IP core at a given offset.
def write_ip_register(ip, offset, value):
    """Write ``value`` to the register located at ``offset`` of ``ip``.

    Args:
        ip: IP core handle; it must expose a ``write(offset, value)`` method.
        offset: Register offset, forwarded unchanged to ``ip.write``.
        value: Value to store, forwarded unchanged to ``ip.write``.

    Returns:
        None. Anything ``ip.write`` returns is discarded.
    """
    register_writer = ip.write
    register_writer(offset, value)

def read_ip_register(ip, offset):
    """Read the register located at ``offset`` of the IP core ``ip``.

    Args:
        ip: IP core handle; it must expose a ``read(offset)`` method.
        offset: Register offset, forwarded unchanged to ``ip.read``.

    Returns:
        Whatever ``ip.read(offset)`` returns, i.e. the register content.
    """
    # The access goes through the handle's own read() method.
    register_value = ip.read(offset)
    return register_value

# Named register offsets used when programming the accelerator IP cores.
class RegisterOffset:
    """Register offsets of the fetch / load / compute / store IP cores.

    All members are plain integer class attributes; the class is used as a
    namespace and is never instantiated in this notebook.
    """

    # Spacing between registers as stated by the original author (8 bytes).
    # NOTE(review): LOAD_WGT_ADDR_OFFSET and COMPUTE_BIAS_OFFSET are 0xC past
    # the address register before them, so this spacing does not hold for
    # every pair of registers - confirm against the generated register map.
    REGISTER_OFFSET = 0x08

    # --- fetch module ---
    # Register holding the number of instructions to fetch.
    FETCH_INSN_COUNT_OFFSET = 0x10
    # Register holding the address of the instruction buffer.
    FETCH_INSN_ADDR_OFFSET = 0x18

    # --- load module ---
    # Input buffer address (original note: 8 bytes / 64 bit).
    LOAD_INP_ADDR_OFFSET = 0x10
    # Weight buffer address (original note: 8 bytes / 64 bit).
    LOAD_WGT_ADDR_OFFSET = 0x1C

    # --- compute module ---
    # Done flag; RunSaa polls this register until it reads 1.
    COMPUTE_DONE_OFFSET = 0x10
    # Companion register of the done flag.
    # NOTE(review): presumably its valid/control word - confirm.
    COMPUTE_DONE_CTRL_OFFSET = 0x14
    # Micro-op (uop) buffer address; RunSaa writes uop_phy_addr here.
    COMPUTE_UOP_OFFSET = 0x20
    # Bias buffer address; RunSaa writes bias_phy_addr here.
    COMPUTE_BIAS_OFFSET = 0x2C

    # --- store module ---
    # Output buffer address (original note: 8 bytes / 64 bit).
    STORE_OUT_ADDR_OFFSET = 0x10

# Take the handle of each IP core from the overlay by attribute name.
fetch_ip = overlay.fetch_0
load_ip = overlay.load_0
compute_ip = overlay.compute_0
store_ip = overlay.store_0

# Look up the register map of each IP. These are bare expressions: a notebook
# only echoes the last expression of a cell, so nothing is displayed while
# more statements follow them in the same cell.
fetch_ip.register_map
load_ip.register_map
compute_ip.register_map
store_ip.register_map


# Configure the four IP cores through write_ip_register and run one batch.
# Author's note: unlike VTA, the buffer address registers here carry real
# physical start addresses. The hardware copies with memcpy and the dram_base
# field of an instruction is an index into the DRAM buffer (in units of the
# stored element size), not a byte address. Instructions must therefore carry
# indices, and the physical addresses given here are the true array bases.
def RunSaa(insn_count,
           insn_phy_addr,
           uop_phy_addr,
           input_phy_addr,
           weight_phy_addr,
           bias_phy_addr,
           output_phy_addr,
           wait_cycles):
    """Submit one batch of instructions to the SAA and wait for completion.

    The buffer addresses and the instruction count are written to the IP
    registers, the four IPs are started, and the done register of the compute
    IP is then polled at most ``wait_cycles`` times.

    Args:
        insn_count: Number of instructions in this batch.
        insn_phy_addr: Physical start address of the instruction buffer.
        uop_phy_addr: Physical start address of the micro-op buffer.
        input_phy_addr: Physical start address of the input buffer.
        weight_phy_addr: Physical start address of the weight buffer.
        bias_phy_addr: Physical start address of the bias buffer.
        output_phy_addr: Physical start address of the output buffer.
        wait_cycles: Maximum number of polls of the done register before
            giving up; may be set very large.

    Returns:
        0 if the done flag was read as 1 within ``wait_cycles`` polls,
        1 on timeout (this includes ``wait_cycles <= 0``, where no poll is
        made at all).
    """
    # --- register configuration ---
    # fetch: instruction count and instruction buffer base address
    write_ip_register(fetch_ip, RegisterOffset.FETCH_INSN_COUNT_OFFSET, insn_count)
    write_ip_register(fetch_ip, RegisterOffset.FETCH_INSN_ADDR_OFFSET, insn_phy_addr)
    # load: input and weight buffer base addresses
    write_ip_register(load_ip, RegisterOffset.LOAD_INP_ADDR_OFFSET, input_phy_addr)
    write_ip_register(load_ip, RegisterOffset.LOAD_WGT_ADDR_OFFSET, weight_phy_addr)
    # compute: micro-op and bias buffer base addresses
    write_ip_register(compute_ip, RegisterOffset.COMPUTE_UOP_OFFSET, uop_phy_addr)
    write_ip_register(compute_ip, RegisterOffset.COMPUTE_BIAS_OFFSET, bias_phy_addr)
    # store: output buffer base address
    write_ip_register(store_ip, RegisterOffset.STORE_OUT_ADDR_OFFSET, output_phy_addr)

    # --- start the IPs through their control register at offset 0x0 ---
    # fetch gets 0x1 (author's note: start the module once for this batch);
    # load/compute/store get 0x81 (author's note: lets the module restart
    # automatically so it can process successive instructions).
    write_ip_register(fetch_ip, 0x0, 0x1)
    write_ip_register(load_ip, 0x0, 0x81)
    write_ip_register(compute_ip, 0x0, 0x81)
    write_ip_register(store_ip, 0x0, 0x81)

    # Yield the CPU for a nominal 1 microsecond so the device can respond.
    time.sleep(0.000001)

    # Poll the done register of the compute IP. Author's note: it reads 1 once
    # the FINISH instruction has executed, i.e. the whole batch is complete.
    for t in range(wait_cycles):
        done_flag = read_ip_register(compute_ip, RegisterOffset.COMPUTE_DONE_OFFSET)
        if done_flag == 0x1:
            print("done：",t)
            return 0
        # Nominal 0.1 microsecond back-off between polls (the original
        # comment said 1 microsecond, the value has always been 1e-7 s).
        time.sleep(0.0000001)

    # The loop ran out without seeing the done flag. The previous
    # `t < wait_cycles` test was always true after the loop, so a timeout was
    # reported as success; it also raised NameError for wait_cycles <= 0.
    return 1

# # 测试写入done信号并且读取done信号
# done_flag = read_ip_register(compute_ip,0x14) #读取done寄存器
# print(done_flag)
# write_ip_register(compute_ip,RegisterOffset.COMPUTE_DONE_OFFSET,0x1) # 写入1 
# done_flag = read_ip_register(compute_ip,0x14) #读取done寄存器
# print(done_flag)

 

saa Overlay downloaded successfully!


## 3. 连续缓存申请

In [12]:
# Size intended for the instruction buffer.
# NOTE(review): the original comment said "at most 2000 instructions" while the
# value is 4000, and its only visible use is the commented-out allocation
# below - confirm which figure is intended.
insn_count = 4000
# Block (tile) size, twice MATRIX_WIDTH. Original note: the tiling is derived
# from the systolic array size.
block_size = 2*MATRIX_WIDTH
# Matrix dimensions handled in one batch; each is a multiple of MATRIX_WIDTH.
row = 200*MATRIX_WIDTH
col = 200*MATRIX_WIDTH
col1 =200*MATRIX_WIDTH

# Buffers on the PS side, allocated with cacheable = 0 (original note: do not
# use the cache, and mind the dtype of each buffer).
# instruct_buffer = allocate(shape = (insn_count), cacheable = 0, dtype = Instruct_DataType)
input_buffer = allocate(shape = (row,col), cacheable = 0, dtype = Input_DataType)  # (row, col)
weight_buffer = allocate(shape = (col,col1), cacheable = 0, dtype = Weight_DataType)  # (col, col1)
output_buffer  = allocate(shape = (row,col1), cacheable = 0, dtype = Output_DataType)  # (row, col1)
bias_buffer  = allocate(shape = (row,col1), cacheable = 0, dtype = Output_DataType)  # same shape and dtype as the output

## 4. 测试数据生成

In [13]:
# Fill the data buffers with random integers drawn from [0, 100).
np.random.seed(2)  # fixed seed so every run produces the same values
# The three draws consume the seeded generator in this order; reordering them
# would change the generated contents.
input_buffer[:] = np.random.randint(0, 100, size=(row, col), dtype=Input_DataType)
weight_buffer[:] = np.random.randint(0, 100, size=(col, col1), dtype=Weight_DataType)
bias_buffer[:] = np.random.randint(0, 100, size=(row, col1), dtype=Output_DataType)
# Print the generated buffers for visual inspection.
print("Randomly generated input buffer:")
print(input_buffer)
print("\nRandomly generated weight buffer:")
print(weight_buffer)
print("\nRandomly generated bias buffer:")
print(bias_buffer)


Randomly generated input buffer:
[[40 92 29 ... 77 22 88]
 [40 84 87 ... 82 85 28]
 [58 12 67 ... 59 71 62]
 ...
 [69 97 27 ... 78  0 53]
 [36 41 94 ... 16 63 47]
 [52 36 67 ... 62 47 43]]

Randomly generated weight buffer:
[[15  0 34 ... 15 62 85]
 [61 64 47 ... 50 12 11]
 [32 85 93 ... 61 83 67]
 ...
 [44 53 24 ... 41 11 49]
 [85 66 12 ... 86  6 97]
 [18  4  3 ... 59 13 35]]

Randomly generated bias buffer:
[[21 25 19 ... 49 63 49]
 [51 91 25 ... 61 98 20]
 [53 56 34 ... 34 59 27]
 ...
 [56 21  7 ...  7 31 34]
 [ 0 83 40 ... 43 59 29]
 [16 29 91 ... 53 25 17]]


## 加载存储测试

In [14]:
from pynq import allocate
from saa_utils import *
import time
import numpy as np
# Instruction queue with room for insn_size entries (two are emitted below).
insn_size = 3
insn_buf = allocate(shape = (insn_size), cacheable = 0, dtype = Instruct_DataType)
insn_idx = 0
# Micro-op buffer; nothing is written into it in this test, it only provides
# an address to hand to RunSaa.
uop_buf = allocate(shape = (insn_size), cacheable = 0, dtype = Uop_DataType)

# # Load instruction (currently disabled)
# insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_LOAD, 
#                                         0,
#                                         0,
#                                         0,
#                                         1,
#                                       OUTPUT_BUFFER_ID, 
#                                       0, 
#                                       0, 
#                                       row//MATRIX_WIDTH, 
#                                       col1//MATRIX_WIDTH, 
#                                       col1//MATRIX_WIDTH)# transfer the whole matrix in one go
# insn_idx += 1

# Store instruction for OUTPUT_BUFFER_ID covering the whole matrix, with sizes
# given in units of MATRIX_WIDTH.
# NOTE(review): the four flags after the opcode are presumably dependency
# flags - confirm against get2DLoadStoreInsn.
insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_STORE,
                                        0,
                                        0,
                                        1,
                                        0,
                                        OUTPUT_BUFFER_ID,
                                        0,
                                        0,
                                        row//MATRIX_WIDTH,
                                        col1//MATRIX_WIDTH,
                                        col1//MATRIX_WIDTH)
insn_idx += 1

# Finish instruction that terminates the batch.
insn_buf[insn_idx] = getFinishInsn(0,1)
insn_idx += 1
print(f"insn_idx: {insn_idx}")

# Run the store test on the hardware.
# Upper bound on the number of done-register polls: 100000. (The original
# comment said 10 million, which never matched the value.)
wait_cycles = 100000
pt0 = time.perf_counter()
# Keep the status instead of discarding it: RunSaa documents 0 for completion
# and 1 for timeout, and a timed-out run must not be mistaken for a valid
# measurement.
status = RunSaa(insn_idx,
                insn_buf.physical_address,
                uop_buf.physical_address,
                input_buffer.physical_address,
                weight_buffer.physical_address,
                bias_buffer.physical_address,
                output_buffer.physical_address,
                wait_cycles)
pt1 = time.perf_counter()
t_fpga = pt1 - pt0
if status != 0:
    print(f"WARNING - Saa did not report done within {wait_cycles} polls; "
          "the timing and bandwidth below are not valid")
print("INFO - Saa run time: %fs" % t_fpga)
# Transfer bandwidth in Gbit/s.
# NOTE(review): the formula assumes 32-bit elements and counts the matrix
# twice (presumably load + store, although only the store is enabled above) -
# confirm the factor 2.
data_size = row * col1 * 32 * 2
bps = data_size/(t_fpga*1E9)
print(f"INFO - 带宽: {bps:.6f}Gbps")

# Inspect all buffers after the run.
print("Randomly generated input buffer:")
print(input_buffer)
print("Randomly generated weight buffer:")
print(weight_buffer)
print("\nRandomly generated bias buffer:")
print(bias_buffer)
print("Randomly generated output buffer:")
print(output_buffer)
# Clear the output buffer so the next test starts from zeros.
output_buffer[:]=0
# del output_buffer

insn_idx: 2
done： 17234
INFO - Saa run time: 1.394785s
INFO - 带宽: 0.469865Gbps
Randomly generated input buffer:
[[40 92 29 ... 77 22 88]
 [40 84 87 ... 82 85 28]
 [58 12 67 ... 59 71 62]
 ...
 [69 97 27 ... 78  0 53]
 [36 41 94 ... 16 63 47]
 [52 36 67 ... 62 47 43]]
Randomly generated weight buffer:
[[15  0 34 ... 15 62 85]
 [61 64 47 ... 50 12 11]
 [32 85 93 ... 61 83 67]
 ...
 [44 53 24 ... 41 11 49]
 [85 66 12 ... 86  6 97]
 [18  4  3 ... 59 13 35]]

Randomly generated bias buffer:
[[21 25 19 ... 49 63 49]
 [51 91 25 ... 61 98 20]
 [53 56 34 ... 34 59 27]
 ...
 [56 21  7 ...  7 31 34]
 [ 0 83 40 ... 43 59 29]
 [16 29 91 ... 53 25 17]]
Randomly generated output buffer:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [35]:
# Generate random source matrices with values drawn from [0, 100).
np.random.seed(3)  # fixed seed so every run produces the same values
input_matrix = np.random.randint(0, 100, size=(row, col), dtype=Input_DataType)
weight_matrix = np.random.randint(0, 100, size=(col, col1), dtype=Weight_DataType)
print("Randomly generated input buffer:")
print(input_matrix)
print("\nRandomly generated weight buffer:")
print(weight_matrix)

# Pack the matrices into the hardware buffers.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: could not broadcast input array from shape (256,) into shape
# (160,160)" after the matrices were printed and before the packed buffers
# were, i.e. apparently inside one of these two calls - confirm that the
# buffer shapes match what pack_matrix_to_buffer expects.
pack_matrix_to_buffer(input_matrix, MATRIX_WIDTH, input_buffer)
pack_matrix_to_buffer(weight_matrix, MATRIX_WIDTH, weight_buffer)
print("Packed input buffer:")
print(input_buffer)
# This label used to read "Packed input buffer:" although weight_buffer is
# what gets printed.
print("Packed weight buffer:")
print(weight_buffer)

# Widen both operands to int32 before multiplying (original note: to avoid
# overflow).
input_matrix_int32 = input_matrix.astype(np.int32)
weight_matrix_int32 = weight_matrix.astype(np.int32)
# Software reference result, timed for comparison with the accelerator.
pt0 = time.perf_counter()
result_matrix = np.dot(input_matrix_int32, weight_matrix_int32)
pt1 = time.perf_counter()
time_sw = pt1 - pt0
print("pure software: %fs" % time_sw)
# Show the reference product.
print("Matrix multiplication result:")
print(result_matrix)

Randomly generated input buffer:
[[23  1 13 ... 80 76 81]
 [37 87 78 ... 66 48 14]
 [13 51 61 ... 65 23 50]
 ...
 [70 25 61 ... 94 12 92]
 [32 75 40 ... 25 86 78]
 [16 15  5 ... 54 35 91]]

Randomly generated weight buffer:
[[56  4 28 ... 41 60 59]
 [98 88 62 ... 31 30 66]
 [ 0 12 40 ... 90 49  6]
 ...
 [24 81 37 ... 86 49  2]
 [52 77 31 ... 24 70 87]
 [94 46 55 ... 26 79 79]]


ValueError: could not broadcast input array from shape (256,) into shape (160,160)

In [28]:
# Run the blocked (tiled) matrix multiplication test.
# NOTE(review): `saa_driver` is not defined in any cell shown in this file; it
# presumably comes from a cell that was run separately - confirm before
# re-running.
# NOTE(review): per the commented-out definition later in this file the
# parameters after the driver are dim_I, dim_J, dim_K, input, weight, bias,
# output, block, bias_use - confirm against the imported implementation.
blocked_gemm_test(saa_driver,
              row, 
              col1, 
              col, 
              input_buffer, 
              weight_buffer,
              bias_buffer, 
              output_buffer, 
              block_size, 
              0)
# Inspect the result written to the output buffer.
print(output_buffer)

INFO - Blocked GEMM test: dim_I=16, dim_J=16, dim_K=16, block=8, bias_use=0
insn_idx: 6
done: 0
INFO - Saa run time: 0.000355s
INFO - Synchronization time: 0.355283ms
INFO - Throughput: 0.034587GOPs/s
[[24987 29480 25807 28555 34062 35328 40731 29885 35518 42848 45164 34488
  38598 39159 42572 33418]
 [27965 32766 27694 13375 20304 39995 27828 24688 28638 45081 38295 22877
  33310 42301 37854 29117]
 [23178 19190 22626 30452 25476 26291 25811 38416 35412 31074 27533 38223
  32001 31627 36325 44065]
 [23071 30627 28060 22068 26828 35212 33943 36255 30317 48155 39227 34637
  28905 46872 37960 40761]
 [30109 33346 28580 29936 39235 36115 40943 32164 28696 28542 35681 30552
  38936 39875 47241 38118]
 [33739 37535 32295 21067 29017 44383 35203 23209 24798 31369 32104 18886
  24559 45479 35422 18750]
 [25283 24686 25860 33029 31738 24161 32013 44321 23136 22893 28179 28754
  26734 29830 38431 43551]
 [24698 37175 32950 29071 31642 35284 34913 35455 20099 35092 22651 26708
  35876 45817 3781

## 7. 回收缓冲区

In [5]:
# Drop the reference to the output buffer once the results have been checked.
# NOTE(review): only output_buffer is released here; the other allocated
# buffers stay referenced - confirm whether they should be released as well.
del output_buffer

In [None]:
# instruct_buffer = allocate(shape = (1), cacheable = 0, dtype = Instruct_DataType)
# instructions = [] #临时存储指令
# # insn_test = getWeightPreloadComputeInsn(
# #     1,
# #     1,
# #     1,
# #     1,
# #     1,
# #     1)

# # insn_test = getWeightPreloadInsn(1, 1)

# # insn_test = get2DLoadStoreInsn(
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1, 
# #                           1)

# insn_test = getComputeInsn(1, 
#                         1, 
#                         1, 
#                         1)
    
# instructions.append(insn_test)
# # 将生成的指令批量存入缓冲区，然后启动saa让其读取指令
# for i, instruction in enumerate(instructions):
#     instruct_buffer[i] = np.frombuffer(instruction, dtype=Instruct_DataType)
#     print(instruct_buffer[i])
#     print_binary(instruct_buffer[i]) # 输出指令的二进制表示


# from pynq import allocate
# import time
# import numpy as np
# wait_cycles = 100000 # 定义一次最多等待周期为10万周期
# def blocked_gemm_test(saa_driver,
#               dim_I, 
#               dim_J, 
#               dim_K, 
#               input, 
#               weight,
#               bias, 
#               output, 
#               block, 
#               bias_use):
    
#     print("=====================================================================================")
#     print(f"INFO - Blocked GEMM test: dim_I={dim_I}, dim_J={dim_J}, dim_K={dim_K}, block={block}, bias_use={bias_use}")
    
#     # 计算分块
#     dim_I_block = dim_I // MATRIX_WIDTH
#     dim_J_block = dim_J // MATRIX_WIDTH
#     dim_K_block = dim_K // MATRIX_WIDTH

#     # 计算指令数量
#     insn_load_size = (dim_I_block * dim_K_block) + (dim_J_block * dim_K_block)
#     insn_compute_size = 2 * dim_I_block * dim_K_block * dim_J_block  # 不使用权重复用
# #     insn_compute_size = (dim_I_block + 1) * dim_K_block * dim_J_block  # 使用权重复用
# #     insn_compute_size = dim_I_block * dim_K_block * dim_J_block + 1  # 使用权重复用和双缓冲
#     insn_store_size = dim_I_block * dim_J_block
#     insn_size = insn_load_size + insn_store_size + insn_compute_size + 1

#     # 初始化指令队列
#     insn_buf = allocate(shape = (insn_size), cacheable = 0, dtype = Instruct_DataType)
#     insn_idx = 0
    
#     # 生成加载Input指令
#     for i in range(dim_I_block):
#         for k in range(dim_K_block):
#             buffer_start = 0
#             dram_start = 0
#             A_block = i*dim_K_block+k
#             buffer_offset = buffer_start + A_block * MATRIX_WIDTH
#             dram_offset = dram_start + i * dim_K_block * MATRIX_WIDTH * MATRIX_WIDTH + k * MATRIX_WIDTH
#             insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_LOAD, 
#                                       INPUT_BUFFER_ID, 
#                                       buffer_offset, 
#                                       dram_offset, 
#                                       MATRIX_WIDTH, 
#                                       MATRIX_WIDTH, 
#                                       dim_K)
#             insn_idx += 1

#     # 生成加载weight指令
#     for k in range(dim_K_block):
#         for j in range(dim_J_block):
#             buffer_start = 0
#             dram_start = 0
#             A_block = k * dim_J_block + j
#             buffer_offset = buffer_start + A_block * MATRIX_WIDTH
#             dram_offset = dram_start + k * dim_J_block * MATRIX_WIDTH * MATRIX_WIDTH + j * MATRIX_WIDTH 
#             insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_LOAD, 
#                                       WEIGHT_BUFFER_ID, 
#                                       buffer_offset, 
#                                       dram_offset, 
#                                       MATRIX_WIDTH, 
#                                       MATRIX_WIDTH, 
#                                       dim_J)
#             insn_idx += 1
    
#     # 生成计算指令
#     # 用于切换权重寄存器，最先使用 weight1
#     pingpang = 0
#     wb_start_addr = 0
#     input_start_addr = 0
#     output_start_addr = 0
#     weight_offset = 0
#     output_offset = 0
#     input_offset = 0
#     accumulate = 0
    
#     # 初始化指令计数
#     compute_count = insn_idx

#     # 迭代公共维度块和输出列块
#     for k in range(dim_K_block):
#         for j in range(dim_J_block):
#             # 计算权重偏移
#             weight_offset = wb_start_addr + (k * dim_J_block + j) * MATRIX_WIDTH
#             accumulate = 0 if k == 0 else 1

#             # 第一次加载权重，使用初始寄存器，无法双缓冲
#             if k == 0 and j == 0:
#                 insn_buf[insn_idx] = getWeightPreloadInsn(weight_offset, pingpang)
#                 insn_idx += 1
#             else:
#                 # 剩下的权重加载可以进行双缓冲
# #                 insn_buf[insn_idx] = getWeightPreloadComputeInsn(
# #                     input_offset,
# #                     weight_offset,
# #                     output_offset,
# #                     pingpang,
# #                     pingpang,
# #                     accumulate)
# #                 insn_idx += 1
    
# #                 insn_buf[insn_idx] = getComputeInsn(input_offset, 
# #                                         output_offset, 
# #                                         pingpang, 
# #                                         accumulate)
# #                 insn_idx += 1
#                 insn_buf[insn_idx] = getComputeInsn(input_offset, 
#                                         output_offset, 
#                                         pingpang, 
#                                         accumulate)
#                 insn_idx += 1
#                 insn_buf[insn_idx] = getWeightPreloadInsn(weight_offset, not pingpang)
#                 insn_idx += 1
                
#                 pingpang = not pingpang  # 切换加载寄存器和计算寄存器


#             # 迭代输出行块
#             for i in range(dim_I_block):
#                 output_offset = output_start_addr + (i * dim_J_block + j) * MATRIX_WIDTH
#                 input_offset = input_start_addr + (i * dim_K_block + k) * MATRIX_WIDTH

#                 # 如果不是最后一个计算，使用 getComputeInsn 计算
#                 if i != dim_I_block - 1:
#                     insn_buf[insn_idx] = getComputeInsn(input_offset, 
#                                             output_offset, 
#                                             pingpang, 
#                                             accumulate)
#                     insn_idx += 1
#                 # 如果是最后一个权重块，使用当前寄存器进行计算
#                 if i == dim_I_block - 1 and j == dim_J_block - 1 and k == dim_K_block - 1:
#                     insn_buf[insn_idx] = getComputeInsn(input_offset, 
#                                             output_offset, 
#                                             pingpang, 
#                                             accumulate)
#                     insn_idx += 1
# #             if k != 0 and j != 0:
# #             insn_buf[insn_idx] = getComputeInsn(input_offset, 
# #                                     output_offset, 
# #                                     pingpang, 
# #                                     accumulate)
#             insn_idx += 1
    
    
#     # 更新计算指令的数量
#     compute_count = insn_idx - compute_count
#     print(f"compute_count: {compute_count}")    

    
#     # 生成存储指令
#     for i in range(dim_I_block):
#         for j in range(dim_J_block):
#             buffer_start = 0
#             dram_start = 0
#             A_block = i * dim_J_block + j
#             buffer_offset = buffer_start + A_block * MATRIX_WIDTH
#             dram_offset = dram_start + i * dim_J_block * MATRIX_WIDTH * MATRIX_WIDTH + j * MATRIX_WIDTH
#             insn_buf[insn_idx] = get2DLoadStoreInsn(OPCODE_STORE, 
#                                        OUTPUT_BUFFER_ID, 
#                                        buffer_offset, 
#                                        dram_offset, 
#                                        MATRIX_WIDTH, 
#                                        MATRIX_WIDTH, 
#                                        dim_J)
#             insn_idx += 1
            
#     # 生成结束指令
#     insn_buf[insn_idx] = getFinishInsn()
#     insn_idx += 1

#     print("insn_size",insn_size)
#     print("insn_idx",insn_idx)
#     print("insn",insn_idx)
#     for i in range(insn_idx):
#         if i>=insn_load_size and i<insn_load_size+compute_count:
#             print_binary(insn_buf[i])
            
#     # 运行SAA硬件
#     pt0 = time.perf_counter()
#     saa_driver.run_saa(insn_idx,
#            insn_buf.physical_address,
#            input.physical_address,
#            weight.physical_address,
#            output.physical_address,
#            wait_cycles)
#     pt1 = time.perf_counter()
#     time_sw = pt1 - pt0
#     print("saa run time: %fs" % time_sw)     
    
#     # 计算吞吐量
    
    
#     return 0 
