In [1]:
import argparse
import json
import logging
import sys
from os.path import join as pjoin

import h5py
import onnx
import torch
import torch.nn.functional as F

#from braindecode.models.deep4 import Deep4Net
from quantized_deep4 import QuantDeep4Net
from braindecode.torch_ext.optimizers import AdamW
from braindecode.torch_ext.util import set_random_seeds

In [2]:
# Path to the preprocessed HDF5 file, relative to the notebook's working directory.
datapath = "./processed_data/KU_mi_smt.h5"
# Opened read-only and kept open at module level: get_data() reads from this
# handle, and nothing in the visible cells closes it.
dfile = h5py.File(datapath, 'r')
# Subject whose recordings are used for the rest of the notebook.
subj = 6
# Select CUDA device 0 for subsequent CUDA work.
torch.cuda.set_device(0)
# Fix the random seed through braindecode's helper (cuda=True is passed so the
# CUDA side is covered as well) before any model is created.
set_random_seeds(seed=20200205, cuda=True)

In [3]:
def get_data(subj):
    """Return the complete ``X`` and ``Y`` datasets of one subject.

    Data is read from the module-level ``dfile`` handle, where subject ``n``
    lives under the group ``/s<n>`` in datasets named ``X`` and ``Y``.

    The dataset paths are assembled with explicit ``/`` separators. HDF5
    object names always use forward slashes, whereas ``os.path.join`` inserts
    a backslash on Windows and would produce a name that does not exist.

    Args:
        subj: Subject identifier; it is passed through ``str()`` to form the
            group name.

    Returns:
        Tuple ``(X, Y)`` with the full contents of both datasets (each is
        sliced with ``[:]``).

    Raises:
        KeyError: Propagated from the lookup when the subject's group or one
            of its datasets is missing.
    """
    group = '/s' + str(subj)
    # Look up both datasets first, then slice them, so a missing dataset is
    # reported before any data is read.
    X = dfile[group + '/X']
    Y = dfile[group + '/Y']
    return X[:], Y[:]

In [4]:
# Get data for within-subject classification
X, Y = get_data(subj)

# Fixed positional split: first 200 trials for training, the next 100 for
# validation, and everything from index 300 onwards for testing.
X_train, Y_train = X[:200], Y[:200]
X_val, Y_val = X[200:300], Y[200:300]
X_test, Y_test = X[300:], Y[300:]

# Subject tag; not referenced by any of the visible cells.
suffix = 's' + str(subj)
# NOTE(review): a single output class is combined with F.cross_entropy below,
# which is unusual for a classifier -- confirm the intended number of classes.
n_classes = 1
# Channel count is taken from axis 1 of X.
in_chans = X.shape[1]

# final_conv_length is fixed to 1 here (not 'auto'); input_time_length is
# taken from axis 2 of X.
# NOTE(review): confirm this still yields a single output in the time
# dimension for this input length.
model = QuantDeep4Net(in_chans=in_chans, n_classes=n_classes,
                 input_time_length=X.shape[2],
                 final_conv_length=1).cuda()

# these are good values for the deep model
# (learning rate 0.01, weight decay 0.0005)
optimizer = AdamW(model.parameters(), lr=1 * 0.01, weight_decay=0.5*0.001)
model.compile(loss=F.cross_entropy, optimizer=optimizer, iterator_seed=1, )

# Train for 5 epochs with batch size 16 and the 'cosine' scheduler, monitoring
# the validation split.
model.fit(X_train, Y_train, epochs=5, batch_size=16, scheduler='cosine', 
        validation_data=(X_val, Y_val))#, remember_best_column='valid_loss')

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)


<experiment.Experiment at 0x7f0310afce20>

In [5]:
# Evaluate on the held-out test split. Despite the name, the recorded output
# shows a dict of metrics with keys 'loss', 'misclass' and 'runtime'.
test_loss = model.evaluate(X_test, Y_test)
print(test_loss)

{'loss': 0.08196786791086197, 'misclass': 0.0, 'runtime': 0.0005168914794921875}


In [6]:
from finn.util.basic import make_build_dir
from finn.util.visualization import showInNetron
    
# Directory that receives the exported ONNX model and the inference-cost
# outputs in the cells below.
# NOTE(review): absolute path that appears to be the FINN checkout inside the
# container -- adjust when running elsewhere.
build_dir = "/workspace/finn"

In [7]:
import brevitas.onnx as bo

# Export the trained network (moved to CPU) to FINN-ONNX.
# The dummy input is laid out as (1, channels, time, 1). The channel and time
# sizes are taken from X -- the same values the model was constructed with --
# instead of the previously hard-coded 62 and 1000, so the export keeps
# matching the model if the dataset changes.
bo.export_finn_onnx(
    model.network.cpu(),
    export_path=build_dir + "/auto_mai_subj6_export.onnx",
    input_t=torch.randn(1, X.shape[1], X.shape[2], 1),
)

  training = torch.tensor(training, dtype=torch.bool)


ir_version: 6
producer_name: "pytorch"
producer_version: "1.6"
graph {
  node {
    input: "0"
    output: "37"
    name: "Transpose_0"
    op_type: "Transpose"
    attribute {
      name: "perm"
      ints: 0
      ints: 3
      ints: 2
      ints: 1
      type: INTS
    }
  }
  node {
    input: "37"
    input: "38"
    output: "39"
    name: "Conv_2"
    op_type: "Conv"
    attribute {
      name: "dilations"
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: "group"
      i: 1
      type: INT
    }
    attribute {
      name: "kernel_shape"
      ints: 9
      ints: 1
      type: INTS
    }
    attribute {
      name: "pads"
      ints: 0
      ints: 0
      ints: 0
      ints: 0
      type: INTS
    }
    attribute {
      name: "strides"
      ints: 1
      ints: 1
      type: INTS
    }
  }
  node {
    input: "39"
    input: "40"
    output: "41"
    name: "Mul_4"
    op_type: "Mul"
  }
  node {
    input: "41"
    input: "42"
    output: "43"
    na

In [8]:
# Inspect the model written by the export cell. The file name must match the
# export path ("auto_mai_subj6_export.onnx"); the previous name
# "mai_subj6_export.onnx" pointed at a file this notebook never produces.
showInNetron(build_dir + "/auto_mai_subj6_export.onnx")

Serving '/workspace/finn/mai_subj6_export.onnx' at http://0.0.0.0:8081


In [9]:
from finn.util.inference_cost import inference_cost
import json

# Destination of the JSON cost report for the exported model.
cost_dict_path = build_dir + "/auto_mai_subj6_inference_cost.json"

# Run FINN's inference-cost analysis on the exported model; output_json and
# output_onnx name the two files passed as outputs (report and ONNX model).
inference_cost(
    build_dir + "/auto_mai_subj6_export.onnx",
    output_json=cost_dict_path,
    output_onnx=build_dir + "/auto_mai_subj6_inference_cost.onnx",
    preprocess=True,
    discount_sparsity=True,
)

Inference cost for /workspace/finn/auto_mai_subj6_export.onnx
{
  "discount_sparsity": true,
  "mem_o_FLOAT32": 1580601.0,
  "mem_w_INT8": 273088.0,
  "op_mac_FLOAT32_INT8": 57903765.0,
  "total_bops": 14823363840.0,
  "total_mem_o_bits": 50579232.0,
  "total_mem_w_bits": 2184704.0,
  "unsupported": "{'MultiThreshold'}"
}


In [10]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# Exported model used as input to the FINN build, and the output directory
# for the estimate-only run.
model_file = build_dir + "/auto_mai_subj6_export.onnx"
estimates_output_dir = "output_estimates_only"

# Start from a clean output directory so results of an earlier run cannot be
# mistaken for fresh ones.
if os.path.exists(estimates_output_dir):
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")

# Estimate-only build: runs the analytical estimation steps and emits the
# estimate reports. No board is set; only the FPGA part is given.
cfg_estimates = build.DataflowBuildConfig(
    output_dir=estimates_output_dir,
    mvau_wwidth_max=80,
    target_fps=1000,
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",
    steps=build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
)

Previous run results deleted!


In [11]:
%%time
# Run the estimate-only build configured above. The recorded run printed
# "Completed successfully" and the call returned 0.
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from /workspace/finn/auto_mai_subj6_export.onnx
Intermediate outputs will be generated in /tmp/finn_dev_floodd1@ad.mee.tcd.ie
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/8]
Running step: step_tidy_up [2/8]
Running step: step_streamline [3/8]
Running step: step_convert_to_hls [4/8]
Running step: step_create_dataflow_partition [5/8]
Running step: step_target_fps_parallelization [6/8]
Running step: step_apply_folding_config [7/8]
Running step: step_generate_estimate_reports [8/8]
Completed successfully
CPU times: user 5.18 s, sys: 24.9 ms, total: 5.2 s
Wall time: 5.13 s


0

In [12]:
# List the top-level artifacts written by the estimate-only build.
! ls {estimates_output_dir}

auto_folding_config.json  intermediate_models  time_per_step.json
build_dataflow.log	  report


In [13]:
# List the individual estimate reports.
! ls {estimates_output_dir}/report

estimate_layer_config_alternatives.json  estimate_network_performance.json
estimate_layer_cycles.json		 op_and_param_counts.json
estimate_layer_resources.json


In [14]:
# Print the network-level performance estimate (cycle counts, estimated
# throughput and latency).
! cat {estimates_output_dir}/report/estimate_network_performance.json

{
  "critical_path_cycles": 221030,
  "max_cycles": 90000,
  "max_cycles_node_name": "StreamingFCLayer_Batch_2",
  "estimated_throughput_fps": 1111.111111111111,
  "estimated_latency_ns": 2210300.0
}

In [15]:
import json
def read_json_dict(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [16]:
# Display the per-layer cycle estimates from the estimate-only build.
read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")

{'StreamingMaxPool_Batch_0': 996,
 'Thresholding_Batch_0': 6200,
 'ConvolutionInputGenerator1D_0': 249,
 'StreamingFCLayer_Batch_0': 60000,
 'StreamingMaxPool_Batch_1': 244,
 'Thresholding_Batch_1': 3000,
 'ConvolutionInputGenerator1D_1': 61,
 'StreamingFCLayer_Batch_1': 58500,
 'StreamingMaxPool_Batch_2': 56,
 'Thresholding_Batch_2': 1300,
 'ConvolutionInputGenerator1D_2': 14,
 'StreamingFCLayer_Batch_2': 90000,
 'StreamingMaxPool_Batch_3': 10,
 'Thresholding_Batch_3': 200,
 'StreamingFCLayer_Batch_3': 200}

In [17]:
# Display the per-layer resource estimates (BRAM, LUT, URAM, DSP).
read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")

{'StreamingMaxPool_Batch_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 0,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'Thresholding_Batch_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 16,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator1D_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 8300,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_0': {'BRAM_18K': 10,
  'BRAM_efficiency': 0.48828125,
  'LUT': 73208,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingMaxPool_Batch_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 0,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'Thresholding_Batch_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 16,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator1D_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 4300,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_1': {'BRAM_18K': 36,
  'BRAM_effic

In [18]:
# Open the intermediate model saved after the tidy-up step in Netron.
showInNetron(estimates_output_dir + '/intermediate_models/step_tidy_up.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_tidy_up.onnx' at http://0.0.0.0:8081


In [19]:
# Open the intermediate model saved after the streamline step in Netron.
showInNetron(estimates_output_dir + '/intermediate_models/step_streamline.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_streamline.onnx' at http://0.0.0.0:8081


In [20]:
# Open the intermediate model saved after the convert-to-HLS step in Netron.
showInNetron(estimates_output_dir + '/intermediate_models/step_convert_to_hls.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_convert_to_hls.onnx' at http://0.0.0.0:8081


In [21]:
# Open the intermediate model saved after dataflow partitioning in Netron.
showInNetron(estimates_output_dir + '/intermediate_models/step_create_dataflow_partition.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_create_dataflow_partition.onnx' at http://0.0.0.0:8081


In [22]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# Exported model used as input to the FINN build, and the output directory
# for the stitched-IP / RTL-simulation / out-of-context synthesis run.
model_file = build_dir + "/auto_mai_subj6_export.onnx"
rtlsim_output_dir = "output_ipstitch_ooc_rtlsim"

# Start from a clean output directory so results of an earlier run cannot be
# mistaken for fresh ones.
if os.path.exists(rtlsim_output_dir):
    shutil.rmtree(rtlsim_output_dir)
    print("Previous run results deleted!")

# Same folding/clock/part settings as the estimate-only configuration, but no
# explicit step list is given and three hardware outputs are requested.
# No board is set; only the FPGA part is given.
# NOTE(review): the recorded run of this configuration failed during
# step_out_of_context_synthesis. The estimate report recorded earlier lists
# 73208 LUTs for StreamingFCLayer_Batch_0 alone -- confirm the design fits the
# selected part before requesting OOC_SYNTH.
cfg_stitched_ip = build.DataflowBuildConfig(
    output_dir=rtlsim_output_dir,
    mvau_wwidth_max=80,
    target_fps=1000,
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
    ],
)

Previous run results deleted!


In [23]:
%%time
# Run the stitched-IP build configured above. The recorded run took about two
# hours of wall time, was interrupted after a FileNotFoundError in the
# out-of-context synthesis step, printed "Build failed" and returned -1.
build.build_dataflow_cfg(model_file, cfg_stitched_ip)

Building dataflow accelerator from /workspace/finn/auto_mai_subj6_export.onnx
Intermediate outputs will be generated in /tmp/finn_dev_floodd1@ad.mee.tcd.ie
Final outputs will be generated in output_ipstitch_ooc_rtlsim
Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log
Running step: step_qonnx_to_finn [1/17]
Running step: step_tidy_up [2/17]
Running step: step_streamline [3/17]
Running step: step_convert_to_hls [4/17]
Running step: step_create_dataflow_partition [5/17]
Running step: step_target_fps_parallelization [6/17]
Running step: step_apply_folding_config [7/17]
Running step: step_generate_estimate_reports [8/17]
Running step: step_hls_codegen [9/17]
Running step: step_hls_ipgen [10/17]
Running step: step_set_fifo_depths [11/17]
Running step: step_create_stitched_ip [12/17]
Running step: step_measure_rtlsim_performance [13/17]
Running step: step_out_of_context_synthesis [14/17]


Traceback (most recent call last):
  File "/workspace/finn/src/finn/builder/build_dataflow.py", line 166, in build_dataflow_cfg
    model = transform_step(model, cfg)
  File "/workspace/finn/src/finn/builder/build_dataflow_steps.py", line 590, in step_out_of_context_synthesis
    model = model.transform(
  File "/workspace/finn-base/src/finn/core/modelwrapper.py", line 141, in transform
    (transformed_model, model_was_changed) = transformation.apply(
  File "/workspace/finn/src/finn/transformation/fpgadataflow/synth_ooc.py", line 61, in apply
    ret = out_of_context_synth(
  File "/workspace/finn-base/src/finn/util/vivado.py", line 66, in out_of_context_synth
    with open(res_counts_path, "r") as myfile:
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/finn_dev_floodd1@ad.mee.tcd.ie/synth_out_of_context_fd54belk/results_finn_design_wrapper/res.txt'


> [0;32m/workspace/finn-base/src/finn/util/vivado.py[0m(66)[0;36mout_of_context_synth[0;34m()[0m
[0;32m     64 [0;31m    [0mres_counts_path[0m [0;34m=[0m [0mvivado_proj_folder[0m [0;34m+[0m [0;34m"/res.txt"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     65 [0;31m[0;34m[0m[0m
[0m[0;32m---> 66 [0;31m    [0;32mwith[0m [0mopen[0m[0;34m([0m[0mres_counts_path[0m[0;34m,[0m [0;34m"r"[0m[0;34m)[0m [0;32mas[0m [0mmyfile[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     67 [0;31m        [0mres_data[0m [0;34m=[0m [0mmyfile[0m[0;34m.[0m[0mread[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m"\n"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     68 [0;31m    [0mret[0m [0;34m=[0m [0;34m{[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
Build failed
CPU times: user 5min 53s, sys: 18.3 s, total: 6min 12s
Wall time: 1h 58min 44s


-1

In [24]:
# List the stitched-IP Vivado project files produced by the build.
! ls {rtlsim_output_dir}/stitched_ip

all_verilog_srcs.txt		       ip
finn_vivado_stitch_proj.cache	       make_project.sh
finn_vivado_stitch_proj.hw	       make_project.tcl
finn_vivado_stitch_proj.ip_user_files  vivado.jou
finn_vivado_stitch_proj.srcs	       vivado.log
finn_vivado_stitch_proj.xpr


In [25]:
# List the reports that were written before the build stopped.
! ls {rtlsim_output_dir}/report

estimate_layer_resources_hls.json  rtlsim_performance.json


In [26]:
# Print the out-of-context synthesis report. In the recorded run this file
# does not exist because the build stopped during
# step_out_of_context_synthesis (presumably the step that writes it).
! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json

cat: output_ipstitch_ooc_rtlsim/report/ooc_synth_and_timing.json: No such file or directory


In [27]:
# Print the RTL-simulation performance report (cycles, runtime, throughput).
! cat {rtlsim_output_dir}/report/rtlsim_performance.json

{
  "cycles": 162226,
  "runtime[ms]": 1.6222600000000003,
  "throughput[images/s]": 616.4240010849062,
  "DRAM_in_bandwidth[Mb/s]": 15.287315226905672,
  "DRAM_out_bandwidth[Mb/s]": 0.0018492720032547187,
  "fclk[mhz]": 100.0,
  "N": 1,
  "latency_cycles": 162226
}

In [28]:
# Print the final per-node hardware configuration (PE/SIMD folding, FIFO
# depths, memory styles) chosen by the build.
! cat {rtlsim_output_dir}/final_hw_config.json

{
  "Defaults": {},
  "StreamingFIFO_0": {
    "ram_style": "auto",
    "depth": 2,
    "impl_style": "rtl"
  },
  "StreamingDataWidthConverter_Batch_0": {
    "impl_style": "hls"
  },
  "Thresholding_Batch_0": {
    "PE": 1,
    "ram_style": "distributed",
    "mem_mode": "const",
    "runtime_writeable_weights": 0
  },
  "StreamingDataWidthConverter_Batch_1": {
    "impl_style": "hls"
  },
  "ConvolutionInputGenerator1D_0": {
    "SIMD": 25,
    "ram_style": "distributed"
  },
  "StreamingDataWidthConverter_Batch_2": {
    "impl_style": "hls"
  },
  "StreamingFIFO_6": {
    "ram_style": "auto",
    "depth": 128,
    "impl_style": "rtl"
  },
  "StreamingFCLayer_Batch_0": {
    "PE": 5,
    "SIMD": 9,
    "ram_style": "auto",
    "resType": "lut",
    "mem_mode": "decoupled",
    "runtime_writeable_weights": 0
  },
  "StreamingDataWidthConverter_Batch_3": {
    "impl_style": "hls"
  },
  "StreamingDataWidthConverter_Batch_4": {
    "impl_style