In [1]:
import os
import logging
import config

import numpy as np
import torch
from torchinfo import summary

from brevitas.export import export_qonnx

Run folder created in: experiments_pynq-z1/353_pynq-z1__700FPS__workspace__AIMET_Balanced__BIPOLAR__w4W2a4__full_build/


# Logging

In [2]:
# Per-run log file: everything sent to the "GonLogger" logger is written to
# <RUN_FOLDER>/logfile.log.
log_path = config.RUN_FOLDER

logger = logging.getLogger("GonLogger")
logger.propagate = False  # do not forward these records to ancestor loggers
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same object for the same name, so re-running
# this cell in a live kernel would stack one more FileHandler per execution and
# every message would be written several times. Remove (and close) the file
# handlers left over from a previous execution before attaching a fresh one.
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

# os.path.join works whether or not RUN_FOLDER ends with a path separator.
file_handler = logging.FileHandler(os.path.join(log_path, 'logfile.log'))
# formatter = logging.Formatter('%(message)s')
# file_handler.setFormatter(formatter)

# add file handler to logger
logger.addHandler(file_handler)

# FINN Folders setup

In [3]:
# finn_root_dir = os.environ["FINN_ROOT"]
# nb_dir = finn_root_dir + "/notebooks/uav_finn/classification/qonnx_to_finn_driver/"
# # Leave all build files inside experiments folder
# os.environ["FINN_BUILD_DIR"] = nb_dir + config.BUILD_FOLDER
# os.environ["FINN_HOST_BUILD_DIR"] = nb_dir + config.TMP_FOLDER

# models_folder = config.MODELS_FOLDER

# Original QONNX Model

In [4]:
#brevitas_cpu = './models/BED_classifier__best_mean_F1__BIPOLAR_Out__QONNX.onnx'
#brevitas_cpu = './models/Best_F1_AIMET__Bipolar.onnx'
# brevitas_cpu = './aimet_onnx/BED_classifier__best_mean_F1__BIPOLAR_Out__QONNX.onnx'
# brevitas_cpu = './models/aimet_onnx/BED_classifier__best_mean_F1__AIMET_NoPadding__BIPOLAR_Out__QONNX.onnx'
# brevitas_cpu = './models/aimet_onnx/BED_classifier__best_mean_F1__AIMET_NoPading_QIdy__BIPOLAR_Out__QONNX.onnx'
#brevitas_cpu = './models/aimet_onnx/BED_classifier__best_mean_F1__AIMET_NoPad_QIdy__Balanced__NoTrain__QONNX.onnx'

# Active QONNX export used as input to the FINN build; the commented-out
# assignments around this line are alternative exports kept for reference.
brevitas_cpu = './models/aimet_onnx/BED_classifier__best_mean_F1__AIMET_Balanced__BIPOLAR_Out__QONNX.onnx'


# brevitas_cpu = './models/aimet_onnx/BED_classifier__w4a4__best_mean_F1__BIPOLAR_Out__QONNX.onnx'
# brevitas_cpu = './models/aimet_onnx/BED_classifier__w4a4__best_mean_F1__QINT_Out__QONNX.onnx'

# FINN IMPORTS

In [5]:
from finn.util.visualization import showSrc, showInNetron

In [6]:
# showInNetron(brevitas_cpu)

# FINN Build IMPORTS

In [7]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

# Custom step: Preprocess

In [8]:
from finn.util.pytorch import ToTensor
from qonnx.transformation.merge_onnx_models import MergeONNXModels
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.core.datatype import DataType

def custom_step_add_pre_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):
    """Build step: merge a ``ToTensor`` pre-processing graph into ``model``.

    The pre-processing module is exported to ``preproc.onnx`` (relative to the
    current working directory) using the shape of the model's first graph
    input, its input tensor is annotated as UINT8, and the result is merged
    with ``model`` through ``MergeONNXModels``.

    Args:
        model: network being built.
        cfg: build configuration; not read by this step.

    Returns:
        The merged model.
    """
    input_name = model.graph.input[0].name
    input_shape = model.get_tensor_shape(input_name)

    # Export the pre-processing module on its own so it can be loaded as a graph.
    to_tensor = ToTensor()
    export_qonnx(to_tensor, torch.randn(input_shape), "preproc.onnx", opset_version=11)
    preproc_model = ModelWrapper("preproc.onnx")

    # The pre-processing input carries the FINN datatype UINT8.
    preproc_input_name = preproc_model.graph.input[0].name
    preproc_model.set_tensor_datatype(preproc_input_name, DataType["UINT8"])

    # Merge the pre-processing model with the network passed in as argument.
    return model.transform(MergeONNXModels(preproc_model))

# Custom step: Streamline

In [9]:
from qonnx.core.datatype import DataType

from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
#from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

from finn.transformation.streamline import Streamline
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from finn.transformation.streamline.reorder import MoveTransposePastScalarMul


def custom_step_streamline(model: ModelWrapper, cfg: build.DataflowBuildConfig):
    """Build step: run the streamlining transformation sequence on ``model``.

    Args:
        model: network being built.
        cfg: build configuration; not read by this step.

    Returns:
        The model after all transformations below have been applied in order.
    """
    # Applied strictly in the listed order.
    pipeline = (
        MoveScalarLinearPastInvariants(),
        Streamline(),
        LowerConvsToMatMul(),
        MakeMaxPoolNHWC(),
        ChangeDataLayoutQuantAvgPool2d(),
        absorb.AbsorbTransposeIntoMultiThreshold(),
        # RoundAndClipThresholds() used to sit here; it is disabled.
        Streamline(),
        InferDataLayouts(),
        RemoveUnusedTensors(),
    )
    for transformation in pipeline:
        model = model.transform(transformation)

    return model

# Custom step: Convert to HW

In [10]:
import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.transformation.general import GiveUniqueNodeNames

from qonnx.custom_op.registry import getCustomOp


def custom_step_convert_to_hw(model: ModelWrapper, cfg: build.DataflowBuildConfig):
    """Build step: patch MultiThreshold nodes, then convert the graph to HW layers.

    Two fix-ups are applied to every ``MultiThreshold`` node before conversion:

    1. If the tensor at ``input[1]`` is annotated FLOAT32 it is re-annotated as
       INT32. Author's note: this happens with convolutions split by Spatial
       SVD; the first conv, k=(3,1), is the one impacted, while the second,
       k=(1,3), followed by BN and ReLU, is annotated properly.
    2. If the node's ``out_dtype`` is BIPOLAR it is switched to BINARY with
       ``out_scale`` 1.0 and ``out_bias`` 0.0, and the first graph output is
       annotated BINARY.
       NOTE(review): the graph output is re-annotated for *any* BIPOLAR
       MultiThreshold, so this assumes the only BIPOLAR one is the final
       node - TODO confirm.

    Args:
        model: network being built.
        cfg: build configuration; ``cfg.standalone_thresholds`` decides whether
            thresholding layers are inferred before the MVAUs.

    Returns:
        The converted model.
    """
    threshold_nodes = model.get_nodes_by_op_type("MultiThreshold")

    # Fix-up 1: FLOAT32 annotation on the second input -> INT32.
    for mt_node in threshold_nodes:
        thresholds_name = mt_node.input[1]
        annotated_float = model.get_tensor_datatype(thresholds_name) == "FLOAT32"
        if not annotated_float:
            continue
        print(f'{mt_node.name}: node with Float32 annotation')
        model.set_tensor_datatype(thresholds_name, DataType["INT32"])
        print(f'{mt_node.name}: changed to datatype {model.get_tensor_datatype(thresholds_name)}')

    # Fix-up 2: BIPOLAR output -> BINARY.
    for mt_node in threshold_nodes:
        mt_op = getCustomOp(mt_node)
        if mt_op.get_nodeattr("out_dtype") != "BIPOLAR":
            continue
        mt_op.set_nodeattr("out_dtype", "BINARY")
        mt_op.set_nodeattr("out_scale", 1.0)
        mt_op.set_nodeattr("out_bias", 0.0)
        print(f'Node changed from BIPOLAR to BINARY, to fulfill standalone MultiThreshold requirement\n{mt_node}')
        print("Set Output to BINARY Datatype")
        model.set_tensor_datatype(model.graph.output[0].name, DataType["BINARY"])

    hw_passes = []
    if cfg.standalone_thresholds:
        # Doing this first causes all threshold layers to be standalone.
        # Author's note: it allows MVAU_rtl and optimization with DSP.
        hw_passes.append(to_hw.InferThresholdingLayer())
    hw_passes.extend([
        to_hw.InferQuantizedMatrixVectorActivation(),
        # input quantization (if any) to standalone thresholding
        to_hw.InferThresholdingLayer(),
        to_hw.InferPool(),
        to_hw.InferStreamingMaxPool(),
        to_hw.InferConvInpGen(),
        # get rid of the Reshape(-1, 1) operation between hw nodes
        RemoveCNVtoFCFlatten(),
        # get rid of Transpose -> Transpose identity sequences
        absorb.AbsorbConsecutiveTransposes(),
        # infer tensor data layouts
        InferDataLayouts(),
        GiveUniqueNodeNames(),
        # A final Streamline() was considered here and judged probably unnecessary.
    ])
    for hw_pass in hw_passes:
        model = model.transform(hw_pass)

    return model

# Custom step: Specialize Layers -> force non-listed MVAUs to HLS and all ConvolutionInputGenerators to BRAM

In [11]:
# Positions (in the list returned by model.get_nodes_by_op_type("MVAU")) of the
# MVAU nodes that custom_step_specialize_layers leaves unchanged; every other
# MVAU gets preferred_impl_style="hls" there.
MVAU_list = [0, 1, 2, 14, 17, 18]

In [12]:
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.general import GiveReadableTensorNames

def custom_step_specialize_layers(model: ModelWrapper, cfg: build.DataflowBuildConfig):
    """Build step: set per-node implementation hints, then specialize the layers.

    * MVAU nodes whose position is in the module-level ``MVAU_list`` are left
      unchanged (kept like RTL, per the author); all other MVAUs get
      ``preferred_impl_style="hls"``, ``ram_style="block"``, ``resType="lut"``.
    * Every ``ConvolutionInputGenerator`` gets ``ram_style="block"`` (BRAM, to
      save LUTs).
    * ``SpecializeLayers`` is run for the FPGA part resolved from ``cfg``,
      followed by shape/datatype inference and renaming passes.

    Notes from earlier experiments (kept for reference):
      - Forcing every FMPadding node to "hls" (Folding does not support this
        layer as RTL) is currently disabled.
      - Forcing every MVAU to "rtl" does not work; SpecializeLayers warns:
        "There is no RTL variant for MVAU_17. The node will automatically be
        set to HLS variant. Please check the bit-widths to be <= 8 and ensure
        the thresholds are implemented as standalone layer". RTL selection
        works automatically with standalone thresholds.
      - A specialize-layers config template would be the cleaner way to do
        this, but it is harder to define because layer names change from
        execution to execution.
      - An earlier variant forced only the listed MVAUs to HLS, using
        [0, 1, 2, 5, 6, 7, 14] for 500FPS.
      - An earlier variant moved only the last ConvolutionInputGenerator to
        BRAM.

    Args:
        model: network being built.
        cfg: build configuration; used to resolve the FPGA part.

    Returns:
        The specialized model.
    """
    # MVAUs in the list are kept as they are; all others are switched to HLS.
    for idx, mvau_node in enumerate(model.get_nodes_by_op_type("MVAU")):
        if idx in MVAU_list:
            print(f'MVAU {idx} left unchanged')
            continue
        mvau_op = getCustomOp(mvau_node)
        mvau_op.set_nodeattr("preferred_impl_style", "hls")
        mvau_op.set_nodeattr("ram_style", "block")
        mvau_op.set_nodeattr("resType", "lut")
        print(f'Node MVAU {idx} changed to hls, block, lut: \n{mvau_node}')

    # All ConvInputGen to BRAM to save LUTs.
    for swg_node in model.get_nodes_by_op_type("ConvolutionInputGenerator"):
        getCustomOp(swg_node).set_nodeattr("ram_style", "block")
        print(f'Node ConvInGen {swg_node.name} changed to BRAM: \n{swg_node}')

    # Specialize, then refresh shapes, datatypes and names.
    target_part = cfg._resolve_fpga_part()
    post_hint_passes = (
        SpecializeLayers(target_part),
        InferShapes(),
        InferDataTypes(),
        GiveUniqueNodeNames(),
        GiveReadableTensorNames(),
    )
    for graph_pass in post_hint_passes:
        model = model.transform(graph_pass)

    return model

# FPGA Part

In [13]:
from finn.util.basic import pynq_part_map
import pandas as pd

In [14]:
# Tabulate FINN's board -> FPGA part mapping so the supported boards can be looked up at a glance.
fpga_df = pd.DataFrame(
    list(pynq_part_map.items()),
    columns=['Board', 'FPGA Part'],
)
fpga_df

Unnamed: 0,Board,FPGA Part
0,Ultra96,xczu3eg-sbva484-1-e
1,Ultra96-V2,xczu3eg-sbva484-1-i
2,Pynq-Z1,xc7z020clg400-1
3,Pynq-Z2,xc7z020clg400-1
4,ZCU102,xczu9eg-ffvb1156-2-e
5,ZCU104,xczu7ev-ffvc1156-2-e
6,ZCU111,xczu28dr-ffvg1517-2-e
7,RFSoC2x2,xczu28dr-ffvg1517-2-e
8,RFSoC4x2,xczu48dr-ffvg1517-2-e
9,KV260_SOM,xck26-sfvc784-2LV-c


In [15]:
# change this if you have a different PYNQ board, see list above
# change this if you have a different PYNQ board, see list above
pynq_board = "Pynq-Z1"
# FPGA part string for the selected board, looked up in FINN's pynq_part_map.
fpga_part = pynq_part_map[pynq_board]

In [16]:
# Sanity check: show the FPGA part resolved for the selected board.
print(fpga_part)

xc7z020clg400-1


In [17]:
# Model handed to the build command; echoed so the notebook output records which export was used.
model_file = brevitas_cpu
print(model_file)

./models/aimet_onnx/BED_classifier__best_mean_F1__AIMET_Balanced__BIPOLAR_Out__QONNX.onnx


# Parameters

In [18]:
# Build knobs shared by the DataflowBuildConfig cells of this notebook.
my_target_fps = 700                         # passed as target_fps
my_mvau_wwidth_max = 10_000                 # earlier values tried: 1000, 80; 36 is the default
my_default_swg_exception = True             # True optimizes ConvGenerators, removing FIFOs
my_standalone_thresholds = True             # passed as standalone_thresholds
my_auto_fifo_depths = True                  # passed as auto_fifo_depths
my_auto_fifo_strategy = "largefifo_rtlsim"  # the other option, "characterize", takes too long
my_split_large_fifos = True                 # True saves resources

# Folding / FIFO JSON produced by an earlier experiment folder; only the
# "full_build_json_folding" configuration passes it to FINN.
my_folding_config_file = (
    "./experiments_pynq-z1"
    "/4073_pynq-z1__750FPS__workspace__AIMET_Balanced__BIPOLAR__w4W2a4__full_build_json_allFIFObram"
    "/final_hw_config_edited_all_fifo_bram.json"
)

# Not passed to any configuration in this notebook; it is only written to the run log.
my_specialize_layers_config_file = None

# Build estimate reports

In [19]:
import shutil

In [20]:
# Configuration for the quick "estimates" flow: the step list stops right
# after the estimate reports are generated.
estimates_output_dir = config.RUN_FOLDER + "output_estimates_only"

# Start from an empty output folder so results of a previous run are not mixed in.
if not os.path.exists(estimates_output_dir):
    print("Folder does not exist and it will be created")
else:
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")

my_estimate_steps = [
    # model preparation
    custom_step_add_pre_proc,
    "step_qonnx_to_finn",
    "step_tidy_up",
    custom_step_streamline,
    # hardware mapping
    custom_step_convert_to_hw,
    "step_create_dataflow_partition",
    custom_step_specialize_layers,
    # folding and reports
    "step_target_fps_parallelization",
    "step_apply_folding_config",
    "step_minimize_bit_width",
    "step_generate_estimate_reports",
]

cfg_estimates = build.DataflowBuildConfig(
    # what to run and where to write it
    output_dir=estimates_output_dir,
    steps=my_estimate_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
    # target platform
    board=pynq_board,
    fpga_part=fpga_part,
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    synth_clk_period_ns=10.0,
    # performance / resource knobs (see the Parameters cell)
    target_fps=my_target_fps,
    mvau_wwidth_max=my_mvau_wwidth_max,
    standalone_thresholds=my_standalone_thresholds,
    default_swg_exception=my_default_swg_exception,  # True optimizes ConvGenerators, removing FIFOs
    auto_fifo_depths=my_auto_fifo_depths,
    auto_fifo_strategy=my_auto_fifo_strategy,  # "characterize" is the other option, takes too long
    split_large_fifos=my_split_large_fifos,  # True saves resources
    # specialize_layers_config_file=my_specialize_layers_config_file,  (disabled)
)

Folder does not exist and it will be created


# Build FULL Flow

In [21]:
# Configuration for the complete flow: from the QONNX model down to bitfile,
# PYNQ driver and deployment package, with folding derived from target_fps.
full_build_output_dir = config.RUN_FOLDER + "output_full_build"

# Start from an empty output folder so results of a previous run are not mixed in.
if not os.path.exists(full_build_output_dir):
    print("Folder does not exist and it will be created")
else:
    shutil.rmtree(full_build_output_dir)
    print("Previous run results deleted!")

my_build_steps = [
    # model preparation
    custom_step_add_pre_proc,
    "step_qonnx_to_finn",
    "step_tidy_up",
    custom_step_streamline,
    # hardware mapping
    custom_step_convert_to_hw,
    "step_create_dataflow_partition",
    custom_step_specialize_layers,
    # folding and reports
    "step_target_fps_parallelization",
    "step_apply_folding_config",
    "step_minimize_bit_width",
    "step_generate_estimate_reports",
    # code / IP generation
    "step_hw_codegen",
    "step_hw_ipgen",
    "step_set_fifo_depths",
    "step_create_stitched_ip",
    # simulation, synthesis and packaging
    "step_measure_rtlsim_performance",
    "step_out_of_context_synthesis",
    "step_synthesize_bitfile",
    "step_make_pynq_driver",
    "step_deployment_package",
]

cfg_full_build = build.DataflowBuildConfig(
    # what to run and where to write it
    output_dir=full_build_output_dir,
    steps=my_build_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ],
    # target platform
    board=pynq_board,
    fpga_part=fpga_part,
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    synth_clk_period_ns=10.0,
    # performance / resource knobs (see the Parameters cell)
    target_fps=my_target_fps,
    mvau_wwidth_max=my_mvau_wwidth_max,
    standalone_thresholds=my_standalone_thresholds,
    default_swg_exception=my_default_swg_exception,  # True optimizes ConvGenerators, removing FIFOs
    auto_fifo_depths=my_auto_fifo_depths,
    auto_fifo_strategy=my_auto_fifo_strategy,  # "characterize" is the other option, takes too long
    split_large_fifos=my_split_large_fifos,  # True saves resources
    # specialize_layers_config_file=my_specialize_layers_config_file,  (disabled)
)

Folder does not exist and it will be created


# Build using JSON file for Folding or FIFO sizes

In [22]:
# Configuration for the complete flow driven by a folding / FIFO JSON file
# instead of target_fps and mvau_wwidth_max (neither is passed here).
# NOTE(review): this reuses the "output_full_build" folder of the plain full
# build, so running this cell also wipes that folder - confirm this is intended.
full_build_output_dir = config.RUN_FOLDER + "output_full_build"

# Start from an empty output folder so results of a previous run are not mixed in.
if not os.path.exists(full_build_output_dir):
    print("Folder does not exist and it will be created")
else:
    shutil.rmtree(full_build_output_dir)
    print("Previous run results deleted!")

my_build_json_steps = [
    # model preparation
    custom_step_add_pre_proc,
    "step_qonnx_to_finn",
    "step_tidy_up",
    custom_step_streamline,
    # hardware mapping
    custom_step_convert_to_hw,
    "step_create_dataflow_partition",
    custom_step_specialize_layers,
    # folding and reports
    "step_target_fps_parallelization",
    "step_apply_folding_config",
    "step_minimize_bit_width",
    "step_generate_estimate_reports",
    # code / IP generation
    "step_hw_codegen",
    "step_hw_ipgen",
    "step_set_fifo_depths",
    "step_create_stitched_ip",
    # simulation, synthesis and packaging
    "step_measure_rtlsim_performance",
    "step_out_of_context_synthesis",
    "step_synthesize_bitfile",
    "step_make_pynq_driver",
    "step_deployment_package",
]

cfg_full_build_json_folding = build.DataflowBuildConfig(
    # what to run and where to write it
    output_dir=full_build_output_dir,
    steps=my_build_json_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ],
    # target platform
    board=pynq_board,
    fpga_part=fpga_part,
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    synth_clk_period_ns=10.0,
    # folding comes from the JSON file
    folding_config_file=my_folding_config_file,
    # mvau_wwidth_max=my_mvau_wwidth_max,  (disabled; 36 noted as alternative)
    # target_fps=my_target_fps,            (disabled)
    # remaining knobs (see the Parameters cell)
    standalone_thresholds=my_standalone_thresholds,
    default_swg_exception=my_default_swg_exception,  # True optimizes ConvGenerators, removing FIFOs
    auto_fifo_depths=my_auto_fifo_depths,
    auto_fifo_strategy=my_auto_fifo_strategy,  # "characterize" is the other option, takes too long
    split_large_fifos=my_split_large_fifos,  # True saves resources
)

Folder does not exist and it will be created


# Choose type of build: estimate, full build or build with json

In [23]:
# Select which prepared configuration the build command will run.
# Accepted values: "estimates", "full_build", "full_build_json_folding".
flow_config = "full_build"

# Reject unknown names up front, then pick the matching configuration.
if flow_config not in ("estimates", "full_build", "full_build_json_folding"):
    raise ValueError("Wrong config")

current_build_config = (
    cfg_estimates if flow_config == "estimates"
    else cfg_full_build if flow_config == "full_build"
    else cfg_full_build_json_folding
)

print(f'Perform: {flow_config}')

Perform: full_build


# Logging before build

In [24]:
# Record every experiment knob in the run log before the build starts, so the
# log file alone tells how this run was configured.
# Changes vs. the previous message: the "Target fps" line no longer repeats the
# variable name ("my_target_fps:"), and the standalone-thresholds setting, which
# is passed to every build configuration, is now recorded too.
logger.info(
    "\n".join([
        f'PYNQ board: {pynq_board}',
        f'\tModel used: {model_file}',
        f'\tTarget fps: {my_target_fps}.',
        f'\tmvau_wwidth_max: {my_mvau_wwidth_max}.',
        f'\tMVAU list: {MVAU_list}.',
        f'\tStandalone thresholds: {my_standalone_thresholds}.',
        f'\tDefault sliding window exception: {my_default_swg_exception}.',
        f'\tAuto FIFO depth: {my_auto_fifo_depths}.',
        f'\tAuto FIFO strategy: {my_auto_fifo_strategy}.',
        f'\tSplit large FIFOs: {my_split_large_fifos}.',
        f'\tFolding JSON file: {my_folding_config_file}.',
        f'\tSpecialize JSON file: {my_specialize_layers_config_file}.',
        f'\tFlow config: {flow_config}.',
        '',  # keeps the trailing newline the message always ended with
    ])
)

# Build command

In [25]:
%%time
# Launch the FINN build for the selected configuration on the chosen model file.
build.build_dataflow_cfg(model_file, current_build_config)

Building dataflow accelerator from ./models/aimet_onnx/BED_classifier__best_mean_F1__AIMET_Balanced__BIPOLAR_Out__QONNX.onnx
Intermediate outputs will be generated in /home/gmoreno/workspace
Final outputs will be generated in experiments_pynq-z1/353_pynq-z1__700FPS__workspace__AIMET_Balanced__BIPOLAR__w4W2a4__full_build/output_full_build
Build log is at experiments_pynq-z1/353_pynq-z1__700FPS__workspace__AIMET_Balanced__BIPOLAR__w4W2a4__full_build/output_full_build/build_dataflow.log
Running step: custom_step_add_pre_proc [1/20]
Running step: step_qonnx_to_finn [2/20]
Running step: step_tidy_up [3/20]
Running step: custom_step_streamline [4/20]
Running step: custom_step_convert_to_hw [5/20]
Running step: step_create_dataflow_partition [6/20]
Running step: custom_step_specialize_layers [7/20]
Running step: step_target_fps_parallelization [8/20]
Running step: step_apply_folding_config [9/20]
Running step: step_minimize_bit_width [10/20]
Running step: step_generate_estimate_reports [11/20

0