In [1]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil
import json

In [2]:
# Locations of the model and of the two build output folders.
# model_dir keeps its original value (including the trailing '/'); the derived
# paths are built with os.path.join so they no longer contain a doubled '//'.
model_dir = os.environ['FINN_ROOT'] + '/notebooks/projects/'
model_file = os.path.join(model_dir, 'model.onnx')
estimates_output_dir = os.path.join(model_dir, 'output_estimates_only')
rtlsim_output_dir = os.path.join(model_dir, 'output_ipstitch_ooc_rtlsim')

# Remove the results of any previous run so the checks further down cannot
# pass on stale files.
for _label, _stale_dir in (
    ('estimates', estimates_output_dir),
    ('rtlsim', rtlsim_output_dir),
):
    if os.path.exists(_stale_dir):
        shutil.rmtree(_stale_dir)
        print(_label + ': Previous run results deleted')


estimates: Previous run results deleted
rtlsim: Previous run results deleted


In [3]:
# Build configuration for the estimate-only flow: uses the estimate-only step
# list and requests only the estimate reports, written to estimates_output_dir.
cfg_estimates = build.DataflowBuildConfig(
    # What to run and where to put the results.
    output_dir=estimates_output_dir,
    steps=build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
    # Target device. Alternative kept for reference:
    #   fpga_part = 'xczu5ev-sfvc784-1-e', board = 'kv260'
    fpga_part='xc7s25csga225-1',
    # Performance targets and folding limits.
    target_fps=20,
    synth_clk_period_ns=10.0,
    mvau_wwidth_max=4,
    # FIFO handling.
    auto_fifo_depths=True,
    split_large_fifos=True,
)

In [4]:
%%time
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from /home/petertso/Documents/finn/notebooks/projects//model.onnx
Intermediate outputs will be generated in /tmp/finn_dev_root
Final outputs will be generated in /home/petertso/Documents/finn/notebooks/projects//output_estimates_only
Build log is at /home/petertso/Documents/finn/notebooks/projects//output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/10]
Running step: step_tidy_up [2/10]
Running step: step_streamline [3/10]
Running step: step_convert_to_hw [4/10]
Running step: step_create_dataflow_partition [5/10]
Running step: step_specialize_layers [6/10]
Running step: step_target_fps_parallelization [7/10]
Running step: step_apply_folding_config [8/10]
Running step: step_minimize_bit_width [9/10]
Running step: step_generate_estimate_reports [10/10]
Completed successfully
CPU times: user 1.14 s, sys: 3.05 ms, total: 1.14 s
Wall time: 1.14 s


0

In [5]:
# Sanity check: the estimate-only build must have written the network
# performance report.
assert os.path.exists(
    estimates_output_dir + '/report/estimate_network_performance.json'
)

In [6]:
def read_json_dict(filename):
    """Load a JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a dict for the build
        reports read in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default text encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Display the per-layer cycle estimate report.
read_json_dict(
    estimates_output_dir + '/report/estimate_layer_cycles.json'
)

{'ConvolutionInputGenerator_rtl_0': 22761,
 'MVAU_hls_0': 270000,
 'ConvolutionInputGenerator_rtl_1': 52200,
 'MVAU_hls_1': 912600,
 'ConvolutionInputGenerator_rtl_2': 12276,
 'MVAU_hls_2': 270000,
 'ConvolutionInputGenerator_rtl_3': 2256,
 'MVAU_hls_3': 62208,
 'ConvolutionInputGenerator_rtl_4': 576,
 'MVAU_hls_4': 9216,
 'MVAU_hls_5': 1600,
 'MVAU_hls_6': 1250,
 'MVAU_hls_7': 125,
 'MVAU_rtl_0': 5}

In [16]:
# Display the network-level performance estimate report.
read_json_dict(
    estimates_output_dir + '/report/estimate_network_performance.json'
)

{'critical_path_cycles': 1617073,
 'max_cycles': 912600,
 'max_cycles_node_name': 'MVAU_hls_1',
 'estimated_throughput_fps': 109.57703265395573,
 'estimated_latency_ns': 16170730.0}

In [7]:
# Display the per-layer resource estimate report.
read_json_dict(
    estimates_output_dir + '/report/estimate_layer_resources.json'
)

{'ConvolutionInputGenerator_rtl_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 364,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MVAU_hls_0': {'BRAM_18K': 1,
  'BRAM_efficiency': 0.06510416666666667,
  'LUT': 370,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator_rtl_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 468,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MVAU_hls_1': {'BRAM_18K': 2,
  'BRAM_efficiency': 0.5859375,
  'LUT': 351,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator_rtl_2': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 408,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MVAU_hls_2': {'BRAM_18K': 3,
  'BRAM_efficiency': 0.78125,
  'LUT': 351,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator_rtl_3': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 336,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MVAU_hls_3': {'BRAM_18K': 2,
  'BRAM_efficiency':

In [8]:
# Build configuration for the stitched-IP flow: requests the stitched IP, an
# RTL-simulation performance report and out-of-context synthesis, written to
# rtlsim_output_dir. No explicit step list is given here.
cfg_stitched_ip = build.DataflowBuildConfig(
    # What to produce and where to put the results.
    output_dir=rtlsim_output_dir,
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
    ],
    # Target device. Alternative kept for reference:
    #   fpga_part = 'xczu5ev-sfvc784-1-e'
    fpga_part='xc7s25csga225-1',
    # Performance targets and folding limits.
    # NOTE(review): synth_clk_period_ns (12.0) and mvau_wwidth_max (8) differ
    # from cfg_estimates (10.0 and 4) - confirm this is intentional.
    target_fps=20,
    synth_clk_period_ns=12.0,
    mvau_wwidth_max=8,
    # FIFO handling.
    auto_fifo_depths=True,
    split_large_fifos=True,
)

In [9]:
%%time
build.build_dataflow_cfg(model_file, cfg_stitched_ip)

Building dataflow accelerator from /home/petertso/Documents/finn/notebooks/projects//model.onnx
Intermediate outputs will be generated in /tmp/finn_dev_root
Final outputs will be generated in /home/petertso/Documents/finn/notebooks/projects//output_ipstitch_ooc_rtlsim
Build log is at /home/petertso/Documents/finn/notebooks/projects//output_ipstitch_ooc_rtlsim/build_dataflow.log
Running step: step_qonnx_to_finn [1/19]
Running step: step_tidy_up [2/19]
Running step: step_streamline [3/19]
Running step: step_convert_to_hw [4/19]
Running step: step_create_dataflow_partition [5/19]
Running step: step_specialize_layers [6/19]
Running step: step_target_fps_parallelization [7/19]
Running step: step_apply_folding_config [8/19]
Running step: step_minimize_bit_width [9/19]
Running step: step_generate_estimate_reports [10/19]
Running step: step_hw_codegen [11/19]
Running step: step_hw_ipgen [12/19]
Running step: step_set_fifo_depths [13/19]
Running step: step_create_stitched_ip [14/19]
Running ste

0

In [10]:
# Files the stitched-IP build is expected to leave under rtlsim_output_dir.
# Collect every missing one so a failure names all of them in one message
# instead of stopping silently at the first.
expected_rtlsim_outputs = [
    "/report/ooc_synth_and_timing.json",
    "/report/rtlsim_performance.json",
    "/final_hw_config.json",
]
missing_rtlsim_outputs = [
    rel_path
    for rel_path in expected_rtlsim_outputs
    if not os.path.exists(rtlsim_output_dir + rel_path)
]
assert not missing_rtlsim_outputs, (
    "Missing build outputs under " + rtlsim_output_dir + ": "
    + ", ".join(missing_rtlsim_outputs)
)

In [11]:
! ls {rtlsim_output_dir}/stitched_ip

all_verilog_srcs.txt		       finn_vivado_stitch_proj.xpr
data				       ip
finn_vivado_stitch_proj.cache	       make_project.sh
finn_vivado_stitch_proj.gen	       make_project.tcl
finn_vivado_stitch_proj.hw	       vivado.jou
finn_vivado_stitch_proj.ip_user_files  vivado.log
finn_vivado_stitch_proj.srcs


In [12]:
! ls {rtlsim_output_dir}/report

estimate_layer_resources_hls.json  rtlsim_performance.json
ooc_synth_and_timing.json


In [13]:
! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json

{
  "vivado_proj_folder": "/tmp/finn_dev_root/synth_out_of_context_08tcfpg6/results_finn_design_wrapper",
  "LUT": 11016.0,
  "LUTRAM": 1620.0,
  "FF": 10441.0,
  "DSP": 0.0,
  "BRAM": 28.0,
  "BRAM_18K": 8.0,
  "BRAM_36K": 24.0,
  "URAM": 0.0,
  "Carry": 724.0,
  "WNS": 2.256,
  "Delay": 2.256,
  "vivado_version": 2024.2,
  "vivado_build_no": 5239630.0,
  "": 0,
  "fmax_mhz": 102.6272577996716,
  "estimated_throughput_fps": 112.45590379100547
}

In [14]:
! cat {rtlsim_output_dir}/report/rtlsim_performance.json

{
  "N_IN_TXNS": 4096,
  "N_OUT_TXNS": 1,
  "cycles": 978993,
  "N": 1,
  "latency_cycles": 978993,
  "runtime[ms]": 11.747916,
  "throughput[images/s]": 85.1214802693516,
  "fclk[mhz]": 83.33333333333333,
  "stable_throughput[images/s]": 85.1214802693516
}

In [15]:
! cat {rtlsim_output_dir}/final_hw_config.json

{
  "Defaults": {},
  "StreamingFIFO_rtl_0": {
    "ram_style": "auto",
    "depth": 4096,
    "impl_style": "rtl",
    "inFIFODepths": [
      0
    ],
    "outFIFODepths": [
      0
    ]
  },
  "ConvolutionInputGenerator_rtl_0": {
    "SIMD": 1,
    "parallel_window": 0,
    "ram_style": "distributed",
    "inFIFODepths": [
      4096
    ],
    "outFIFODepths": [
      22500
    ]
  },
  "StreamingFIFO_rtl_1": {
    "ram_style": "auto",
    "depth": 22500,
    "impl_style": "vivado",
    "inFIFODepths": [
      0
    ],
    "outFIFODepths": [
      0
    ]
  },
  "MVAU_hls_0": {
    "PE": 1,
    "SIMD": 1,
    "ram_style": "auto",
    "resType": "auto",
    "mem_mode": "internal_decoupled",
    "runtime_writeable_weights": 0,
    "inFIFODepths": [
      22500
    ],
    "outFIFODepths": [
      4125
    ]
  },
  "StreamingFIFO_rtl_2": {
    "ram_style": "auto",
    "depth": 4125,
    "impl_style": "vivado",
    "inFIFODepths": [
      0
    ],
    "outFIFODepths": [
      0
    ]
 