# Part 1: Getting started

## Load Cifar10 dataset from tf.keras

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Download (or read from the local Keras cache) the CIFAR-10 train/test splits.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Preview one raw training image before any preprocessing is applied.
plt.imshow(x_train[21])

# Convert pixel values to float32 and rescale them into the [0, 1] range.
x_train, x_test = (images.astype("float32") / 255 for images in (x_train, x_test))

# One-hot encode the integer labels over the 10 classes.
y_train, y_test = (tf.keras.utils.to_categorical(labels, 10) for labels in (y_train, y_test))

### Let's print some information about the dataset
Print the shapes of the training and test arrays.

In [None]:
# Shapes on one line, in the order: train images, test images, train labels, test labels.
print(*(split.shape for split in (x_train, x_test, y_train, y_test)))

## Construct the model

In [None]:
from tensorflow.keras.layers import Dropout,Flatten, Dense, Activation, BatchNormalization, Conv2D, MaxPooling2D,InputLayer
from tensorflow.keras.models import Sequential

# Small CNN for 32x32 RGB inputs: three 16-filter 3x3 ReLU convolutions (the
# first two each followed by 2x2 max pooling), flattened into a 10-way softmax.
input_shape = (32, 32, 3)
model = Sequential([
    InputLayer(input_shape=input_shape),
    Conv2D(16, kernel_size=(3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(16, kernel_size=(3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(16, kernel_size=(3, 3), activation="relu"),
    Flatten(),
    # Alternative (regularised) variant: insert Dropout(0.5) at this position,
    # between Flatten and Dense.
    Dense(10),
    Activation(activation='softmax'),
])

model.summary()
# NOTE(review): the InputLayer already fixes the input shape, so this call is
# presumably redundant; kept to preserve the original cell's behaviour.
model.build()

## Train the model
We'll use Adam optimizer with categorical crossentropy loss.
The callbacks will decay the learning rate and save the model into the directory 'model_cifar10_cnn'.
The model isn't very complex, so this should just take a few minutes even on the CPU.
If you've restarted the notebook kernel after training once, set `train = False` to load the trained model.

In [None]:
from tensorflow.keras.optimizers import Adam
from callbacks import all_callbacks
import os
# Set to False after a kernel restart to reload the saved model instead of retraining.
train = True


if train:
    # `learning_rate` is the supported keyword. The legacy `lr` alias is
    # deprecated and newer Keras releases warn about, ignore, or reject it,
    # so the intended rate might not be applied.
    adam = Adam(learning_rate=0.0001)
    model.compile(optimizer=adam, loss=['categorical_crossentropy'], metrics=['accuracy'])
    # Callback bundle from the local `callbacks` module; configured here with
    # learning-rate decay settings and an output directory for saved models.
    callbacks = all_callbacks(stop_patience = 1000,
                              lr_factor = 0.5,
                              lr_patience = 10,
                              lr_epsilon = 0.000001,
                              lr_cooldown = 2,
                              lr_minimum = 0.0000001,
                              outputDir = 'model_cifar10_cnn')
    # The test split doubles as the validation set during training.
    model.fit(x_train, y_train, batch_size=128,
              epochs=100, validation_data=(x_test, y_test), shuffle=True,
              callbacks = callbacks.callbacks)
else:
    from tensorflow.keras.models import load_model
    # Presumably the checkpoint written by the callbacks during a previous
    # training run — verify the file exists before relying on this branch.
    model = load_model('model_cifar10_cnn/KERAS_check_best_model.h5')

## Check performance
Check the accuracy and make a ROC curve

In [None]:
import plotting
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
# Human-readable names for the ten label indices (CIFAR-10 order).
cifar10_classes = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

# Class probabilities from the floating-point Keras model on the test images.
y_keras = model.predict(x_test)

# Compare predicted and true class indices (argmax undoes the one-hot encoding).
keras_accuracy = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_keras, axis=1))
print("Accuracy: {}".format(keras_accuracy))

plt.figure(figsize=(9, 9))
_ = plotting.makeRoc(y_test, y_keras, cifar10_classes)

# Convert the model to FPGA firmware with hls4ml
Now we will go through the steps to convert the model we trained to a low-latency optimized FPGA firmware with hls4ml.
First, we will evaluate its classification performance to make sure we haven't lost accuracy using the fixed-point data types. 
Then we will synthesize the model with Vivado HLS and check the metrics of latency and FPGA resource usage.

## Make an hls4ml config & model
The hls4ml Neural Network inference library is controlled through a configuration dictionary.
In this example we'll use the simplest variation; later exercises will look at more advanced configuration.

In [None]:
import hls4ml
from hls4ml.converters.keras_to_hls import keras_to_hls


# Start from the per-layer ('name' granularity) configuration derived from the Keras model.
config = hls4ml.utils.config_from_keras_model(model, granularity='name')

# Project-level settings: backend, output location, target device/board, clock and I/O style.
config.update({
    'Backend': 'VivadoAccelerator',
    'OutputDir': 'cifar10-hls-test',
    'ProjectName': 'myproject_cifar10_cnn',
    'XilinxPart': 'xczu7ev-ffvc1156-2-e',
    'Board': 'zcu104',
    'ClockPeriod': 5,
    'IOType': 'io_stream',
})

# Move the generated 'Model' and 'LayerName' sections under 'HLSConfig',
# removing them from the top level of the dictionary.
config['HLSConfig'] = {
    'Model': config.pop('Model'),
    'LayerName': config.pop('LayerName'),
}

# Accelerator wrapper settings: AXI-stream interface, Python driver, float input/output.
config['AcceleratorConfig'] = {
    'Interface': 'axi_stream',
    'Driver': 'python',
    'Precision': {'Input': 'float', 'Output': 'float'},
}

# The converter receives the in-memory Keras model through the config itself.
config['KerasModel'] = model

print("-----------------------------------")
print("Configuration")
plotting.print_dict(config)
print("-----------------------------------")

hls_model = keras_to_hls(config)


Let's visualise what we created. The model architecture is shown, annotated with the shape and data types

In [None]:
# Render the converted model with shapes and precisions; to_file=None means no file path is given.
plot_options = dict(show_shapes=True, show_precision=True, to_file=None)
hls4ml.utils.plot_model(hls_model, **plot_options)

## Compile, predict
Now we need to check that this model's performance is still good. We compile the hls_model, and then use `hls_model.predict` to execute the FPGA firmware with bit-accurate emulation on the CPU.

In [None]:
# Compile the hls_model so that predict() can emulate the firmware on the CPU.
hls_model.compile()
# Make sure the test images are stored contiguously in memory before prediction
# (presumably required by the compiled library's array interface — TODO confirm).
x_test = np.ascontiguousarray(x_test)
# Predictions from the emulated firmware, to be compared against y_keras.
y_hls4ml = hls_model.predict(x_test)

## Compare
That was easy! Now let's see how the performance compares to Keras:

In [None]:
from matplotlib.legend import Legend
from matplotlib.lines import Line2D

# True class indices, shared by both accuracy computations.
true_classes = np.argmax(y_test, axis=1)
acc_hls4ml = accuracy_score(true_classes, np.argmax(y_hls4ml, axis=1))
acc_keras = accuracy_score(true_classes, np.argmax(y_keras, axis=1))
print('Accuracy hls4ml:     {}'.format(acc_hls4ml))
print('Accuracy keras:      {}'.format(acc_keras))

# Draw both sets of ROC curves on the same axes: Keras first, then hls4ml dashed.
fig, ax = plt.subplots(figsize=(9, 9))
_ = plotting.makeRoc(y_test, y_keras, cifar10_classes)
plt.gca().set_prop_cycle(None)  # restart the colour cycle so each class keeps its colour
_ = plotting.makeRoc(y_test, y_hls4ml, cifar10_classes, linestyle='--')

# Extra legend keyed on line style (solid vs. dashed) to tell the two models apart.
style_handles = [Line2D([0], [0], ls=style) for style in ('-', '--')]
leg = Legend(ax, style_handles, labels=['keras', 'hls4ml'],
             loc='lower right', frameon=False)
ax.add_artist(leg)

## Synthesize
Now we'll actually use Vivado HLS to synthesize the model. We can run the build using a method of our `hls_model` object.
After running this step, we can integrate the generated IP into a workflow to compile for a specific FPGA board.
In this case, we'll just review the reports that Vivado HLS generates, checking the latency and resource usage.

**This can take several minutes.**

While the C-Synthesis is running, we can monitor the progress looking at the log file by opening a terminal from the notebook home, and executing:

`tail -f cifar10-hls-test/vivado_hls.log`

In [None]:
import os

# Location of the Vivado 2019.2 executables on this machine; adjust for your installation.
vivado_bin = '/workspace/home/Xilinx/Vivado/2019.2/bin'

# Put the Vivado tools first on PATH. Skip the update when they are already
# first so that re-running this cell does not keep growing PATH, and tolerate
# an unset PATH instead of raising KeyError.
current_path = os.environ.get('PATH', '')
if current_path.split(os.pathsep)[0] != vivado_bin:
    os.environ['PATH'] = (vivado_bin + os.pathsep + current_path) if current_path else vivado_bin

# Run C-synthesis only: no C simulation and no IP export.
hls_model.build(csim=False,synth=True,export=False)

## Check the reports
Print out the reports generated by Vivado HLS. Pay attention to the Latency and the 'Utilization Estimates' sections

In [None]:
import hls4ml
from hls4ml.converters.keras_to_hls import keras_to_hls

# Project directory to read reports from; must be the 'OutputDir' the hls_model was created with.
report_dir = 'cifar10-hls-test'
hls4ml.report.read_vivado_report(report_dir)