In [1]:
import argparse
import json
import logging
import sys
from os.path import join as pjoin

import h5py
import onnx
import torch
import torch.nn.functional as F

#from braindecode.models.deep4 import Deep4Net
from quantized_deep4 import QuantDeep4Net
from braindecode.torch_ext.optimizers import AdamW
from braindecode.torch_ext.util import set_random_seeds

In [2]:
datapath = "./processed_data/KU_mi_smt.h5"
dfile = h5py.File(datapath, 'r')
subj = 6
torch.cuda.set_device(0)
set_random_seeds(seed=20200205, cuda=True)

In [3]:
def get_data(subj):
    dpath = '/s' + str(subj)
    X = dfile[pjoin(dpath, 'X')]
    Y = dfile[pjoin(dpath, 'Y')]
    return X[:], Y[:]

In [6]:
# Get data for within-subject classification
X, Y = get_data(subj)

X_train, Y_train = X[:200], Y[:200]
X_val, Y_val = X[200:300], Y[200:300]
X_test, Y_test = X[300:], Y[300:]

suffix = 's' + str(subj)
n_classes = 1
in_chans = X.shape[1]

# final_conv_length = auto ensures we only get a single output in the time dimension
model = QuantDeep4Net(in_chans=in_chans, n_classes=n_classes,
                 input_time_length=X.shape[2],
                 final_conv_length=1).cuda()

# these are good values for the deep model
optimizer = AdamW(model.parameters(), lr=1 * 0.01, weight_decay=0.5*0.001)
model.compile(loss=F.cross_entropy, optimizer=optimizer, iterator_seed=1, )

model.fit(X_train, Y_train, epochs=5, batch_size=16, scheduler='cosine', 
        validation_data=(X_val, Y_val))#, remember_best_column='valid_loss')

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)


<experiment.Experiment at 0x7fdd06b5e160>

In [7]:
test_loss = model.evaluate(X_test, Y_test)
print(test_loss)

{'loss': 0.11558190733194351, 'misclass': 0.0, 'runtime': 0.0008082389831542969}


In [8]:
from finn.util.basic import make_build_dir
from finn.util.visualization import showInNetron
    
build_dir = "/workspace/finn"

In [9]:
import brevitas.onnx as bo

bo.export_finn_onnx(model.network.cpu(), export_path=build_dir + "/auto_mai_subj6_export.onnx", input_t=torch.randn(1, 62, 1000, 1))

  training = torch.tensor(training, dtype=torch.bool)


ir_version: 6
producer_name: "pytorch"
producer_version: "1.6"
graph {
  node {
    input: "0"
    output: "37"
    name: "Transpose_0"
    op_type: "Transpose"
    attribute {
      name: "perm"
      ints: 0
      ints: 3
      ints: 2
      ints: 1
      type: INTS
    }
  }
  node {
    input: "37"
    input: "38"
    output: "39"
    name: "Conv_2"
    op_type: "Conv"
    attribute {
      name: "dilations"
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: "group"
      i: 1
      type: INT
    }
    attribute {
      name: "kernel_shape"
      ints: 9
      ints: 1
      type: INTS
    }
    attribute {
      name: "pads"
      ints: 0
      ints: 0
      ints: 0
      ints: 0
      type: INTS
    }
    attribute {
      name: "strides"
      ints: 1
      ints: 1
      type: INTS
    }
  }
  node {
    input: "39"
    input: "40"
    output: "41"
    name: "Mul_4"
    op_type: "Mul"
  }
  node {
    input: "41"
    input: "42"
    output: "43"
    na

In [8]:
showInNetron(build_dir + "/mai_subj6_export.onnx")

Serving '/workspace/finn/mai_subj6_export.onnx' at http://0.0.0.0:8081


In [10]:
from finn.util.inference_cost import inference_cost
import json

cost_dict_path = build_dir + "/auto_mai_subj6_inference_cost.json"

inference_cost(build_dir + "/auto_mai_subj6_export.onnx", output_json=cost_dict_path, 
               output_onnx=build_dir + "/auto_mai_subj6_inference_cost.onnx",
               preprocess=True, discount_sparsity=True)

Inference cost for /workspace/finn/auto_mai_subj6_export.onnx
{
  "discount_sparsity": true,
  "mem_o_FLOAT32": 1580601.0,
  "mem_w_INT8": 272984.0,
  "op_mac_FLOAT32_INT8": 57822856.0,
  "total_bops": 14802651136.0,
  "total_mem_o_bits": 50579232.0,
  "total_mem_w_bits": 2183872.0,
  "unsupported": "{'MultiThreshold'}"
}


In [19]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

model_file = build_dir + "/auto_mai_subj6_export.onnx"

estimates_output_dir = "output_estimates_only"

#Delete previous run results if exist
if os.path.exists(estimates_output_dir):
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")


cfg_estimates = build.DataflowBuildConfig(
    output_dir          = estimates_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = 1000,
    synth_clk_period_ns = 10.0,
    board               = "ZCU102",
#     fpga_part           = "xc7z020clg400-1",
    steps               = build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)

Previous run results deleted!


In [20]:
%%time
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from /workspace/finn/auto_mai_subj6_export.onnx
Intermediate outputs will be generated in /tmp/finn_dev_floodd1@ad.mee.tcd.ie
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/8]
Running step: step_tidy_up [2/8]
Running step: step_streamline [3/8]
Running step: step_convert_to_hls [4/8]
Running step: step_create_dataflow_partition [5/8]
Running step: step_target_fps_parallelization [6/8]
Running step: step_apply_folding_config [7/8]
Running step: step_generate_estimate_reports [8/8]
Completed successfully
CPU times: user 10.4 s, sys: 16.9 ms, total: 10.4 s
Wall time: 10.3 s


0

In [21]:
! ls {estimates_output_dir}

auto_folding_config.json  intermediate_models  report  time_per_step.json


In [22]:
! ls {estimates_output_dir}/report

estimate_layer_config_alternatives.json  estimate_network_performance.json
estimate_layer_cycles.json		 op_and_param_counts.json
estimate_layer_resources.json


In [23]:
! cat {estimates_output_dir}/report/estimate_network_performance.json

{
  "critical_path_cycles": 221030,
  "max_cycles": 90000,
  "max_cycles_node_name": "StreamingFCLayer_Batch_2",
  "estimated_throughput_fps": 1111.111111111111,
  "estimated_latency_ns": 2210300.0
}

In [24]:
import json
def read_json_dict(filename):
    with open(filename, "r") as f:
        ret = json.load(f)
    return ret

In [25]:
read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")

{'StreamingMaxPool_Batch_0': 996,
 'Thresholding_Batch_0': 6200,
 'ConvolutionInputGenerator1D_0': 249,
 'StreamingFCLayer_Batch_0': 60000,
 'StreamingMaxPool_Batch_1': 244,
 'Thresholding_Batch_1': 3000,
 'ConvolutionInputGenerator1D_1': 61,
 'StreamingFCLayer_Batch_1': 58500,
 'StreamingMaxPool_Batch_2': 56,
 'Thresholding_Batch_2': 1300,
 'ConvolutionInputGenerator1D_2': 14,
 'StreamingFCLayer_Batch_2': 90000,
 'StreamingMaxPool_Batch_3': 10,
 'Thresholding_Batch_3': 200,
 'StreamingFCLayer_Batch_3': 200}

In [26]:
read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")

{'StreamingMaxPool_Batch_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 0,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'Thresholding_Batch_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 16,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator1D_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 8300,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_0': {'BRAM_18K': 10,
  'BRAM_efficiency': 0.48828125,
  'LUT': 73208,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingMaxPool_Batch_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 0,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'Thresholding_Batch_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 16,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator1D_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 4300,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_1': {'BRAM_18K': 36,
  'BRAM_effic

In [12]:
showInNetron(estimates_output_dir + '/intermediate_models/step_tidy_up.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_tidy_up.onnx' at http://0.0.0.0:8081


In [16]:
showInNetron(estimates_output_dir + '/intermediate_models/step_streamline.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_streamline.onnx' at http://0.0.0.0:8081


In [17]:
showInNetron(estimates_output_dir + '/intermediate_models/step_convert_to_hls.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_convert_to_hls.onnx' at http://0.0.0.0:8081


In [15]:
showInNetron(estimates_output_dir + '/intermediate_models/step_create_dataflow_partition.onnx')

Stopping http://0.0.0.0:8081
Serving 'output_estimates_only/intermediate_models/step_create_dataflow_partition.onnx' at http://0.0.0.0:8081
