In [2]:

from tensorflow.keras.layers import Input, MaxPooling2D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1
from tensorflow.keras.layers import Activation
import time
import os
# Point the toolchain-related environment variables at the local Xilinx install.
# NOTE(review): Vivado 2020.1 is mixed with Vitis / Vitis HLS 2024.2 here -- confirm this
# version mix is intended. These are machine-specific absolute paths.
os.environ['XILINX_VITIS'] = '/tools/Xilinx/Vitis/2024.2'

# Prepend the tool directories to PATH. Listed in prepend order, so the last entry ends up
# first on PATH (same resulting order as before). Any existing occurrence is removed first,
# so re-running this cell does not keep growing PATH with duplicate entries.
for _tool_bin in ('/tools/Xilinx/Vivado/2020.1/bin', '/tools/Xilinx/Vitis_HLS/2024.2/bin'):
    _current_path = os.environ.get('PATH', '')
    _kept_entries = [p for p in _current_path.split(os.pathsep) if p != _tool_bin] if _current_path else []
    os.environ['PATH'] = os.pathsep.join([_tool_bin] + _kept_entries)
from utils.model_utils import save_model


In [3]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Fetch MNIST as (images, labels) pairs for the train and test splits
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Scale pixel values into [0, 1] as float32, then append a channel axis -> (N, 28, 28, 1)
x_train = (x_train.astype("float32") / 255.0).reshape((-1, 28, 28, 1))
x_test = (x_test.astype("float32") / 255.0).reshape((-1, 28, 28, 1))

# One-hot encode the labels over the 10 digit classes
y_train, y_test = (to_categorical(labels, 10) for labels in (y_train, y_test))

# Hold out 10% of the training samples as a validation set (fixed seed for repeatability)
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

# Batch size shared by the train / validation / test pipelines
batch_size = 1024

# Wrap the arrays in tf.data pipelines; only the training stream is shuffled
# (shuffle buffer of 10000 samples) before batching.
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(10000)
    .batch(batch_size)
)
val_data = (
    tf.data.Dataset.from_tensor_slices((x_val, y_val))
    .batch(batch_size)
)
test_data = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(batch_size)
)

# Number of training epochs used by the fit() calls further down
n_epochs = 10

# Model input shape and number of output classes
input_shape = (28, 28, 1)
n_classes = 10

In [4]:
from tensorflow.keras.layers import Conv2D, Dense, BatchNormalization

# Layer widths: one entry per convolutional block / per hidden dense block
filters_per_conv_layer = [16, 24]
neurons_per_dense_layer = [42]

x = x_in = Input(input_shape)

# Feature extractor: one Conv2D -> BatchNormalization -> ReLU -> 2x2 MaxPooling block per entry.
# The convolutions are bias-free (use_bias=False) and L1-regularized.
for i, f in enumerate(filters_per_conv_layer):
    print(f'Adding convolutional block {i} with N={f} filters')
    x = Conv2D(
        int(f),
        kernel_size=(3, 3),
        strides=(1, 1),
        kernel_initializer='lecun_uniform',
        kernel_regularizer=l1(0.0001),
        use_bias=False,
        name=f'conv_{i}',
    )(x)
    x = BatchNormalization(name=f'bn_conv_{i}')(x)
    x = Activation('relu', name=f'conv_act_{i}')(x)
    x = MaxPooling2D(pool_size=(2, 2), name=f'pool_{i}')(x)
x = Flatten()(x)

# Classifier head: one Dense -> BatchNormalization -> ReLU block per entry (also bias-free, L1-regularized)
for i, n in enumerate(neurons_per_dense_layer):
    print(f'Adding dense block {i} with N={n} neurons')
    x = Dense(
        n,
        kernel_initializer='lecun_uniform',
        kernel_regularizer=l1(0.0001),
        name=f'dense_{i}',
        use_bias=False,
    )(x)
    x = BatchNormalization(name=f'bn_dense_{i}')(x)
    x = Activation('relu', name=f'dense_act_{i}')(x)

# Output layer (default Dense settings, so it keeps its bias) followed by a named softmax
x = Dense(int(n_classes), name='output_dense')(x)
x_out = Activation('softmax', name='output_softmax')(x)

baseline_model = Model(inputs=[x_in], outputs=[x_out], name='keras_baseline')

LOSS = tf.keras.losses.CategoricalCrossentropy()
OPTIMIZER = tf.keras.optimizers.legacy.Adam(learning_rate=3e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=True)

baseline_model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=["accuracy"])

# Structural copy of the baseline. clone_model() builds new layers from the same configuration,
# so the clone does not share weights with baseline_model and is returned uncompiled.
# (The statement used to appear twice; the first clone was built and immediately discarded.)
cloned_model = tf.keras.models.clone_model(baseline_model)


Adding convolutional block 0 with N=16 filters
Adding convolutional block 1 with N=24 filters
Adding dense block 0 with N=42 neurons


In [5]:
from qkeras.autoqkeras import *
from qkeras import *

# Compile the clone with the LOSS / OPTIMIZER objects created next to the baseline model, then fit it
cloned_model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=["accuracy"])
cloned_model.fit(
    train_data,
    validation_data=val_data,
    epochs=n_epochs,
    verbose=2,
)

print_qmodel_summary(cloned_model)

# Score the trained clone on the test pipeline and report both numbers
loss, accuracy = cloned_model.evaluate(test_data, verbose=2)
print("Test loss: {}".format(loss))
print("Test accuracy: {}".format(accuracy))

# Persist the trained clone; model_path and model_dir are reused by later cells.
# NOTE(review): save_model comes from utils.model_utils and presumably appends the ".h5"
# suffix that the viewer cell adds to model_path -- confirm.
model_dir = "models"
model_name = "keras_baseline"
model_path = os.path.join(model_dir, model_name)
save_model(cloned_model, model_path)

Epoch 1/10
53/53 - 1s - loss: 0.5633 - accuracy: 0.8972 - val_loss: 1.0006 - val_accuracy: 0.7878 - 1s/epoch - 28ms/step
Epoch 2/10
53/53 - 1s - loss: 0.2027 - accuracy: 0.9794 - val_loss: 2.6119 - val_accuracy: 0.1885 - 1s/epoch - 20ms/step
Epoch 3/10
53/53 - 1s - loss: 0.1491 - accuracy: 0.9864 - val_loss: 3.2581 - val_accuracy: 0.1368 - 1s/epoch - 21ms/step
Epoch 4/10
53/53 - 1s - loss: 0.1216 - accuracy: 0.9890 - val_loss: 2.7741 - val_accuracy: 0.2388 - 1s/epoch - 20ms/step
Epoch 5/10
53/53 - 1s - loss: 0.1039 - accuracy: 0.9903 - val_loss: 2.1723 - val_accuracy: 0.3450 - 1s/epoch - 21ms/step
Epoch 6/10
53/53 - 1s - loss: 0.0936 - accuracy: 0.9915 - val_loss: 1.4332 - val_accuracy: 0.5597 - 1s/epoch - 21ms/step
Epoch 7/10
53/53 - 1s - loss: 0.0876 - accuracy: 0.9910 - val_loss: 1.0268 - val_accuracy: 0.6535 - 1s/epoch - 20ms/step
Epoch 8/10
53/53 - 1s - loss: 0.0801 - accuracy: 0.9921 - val_loss: 0.4104 - val_accuracy: 0.8777 - 1s/epoch - 21ms/step
Epoch 9/10
53/53 - 1s - loss: 0.

  saving_api.save_model(


In [6]:
from utils.netron_embed import view_model

# Open the saved baseline (model_path plus the ".h5" suffix) in the inline viewer
view_model(f"{model_path}.h5")

Serving 'models/keras_baseline.h5' at http://localhost:35959


In [7]:
from qkeras import print_qstats

# for automatic quantization
import pprint
from qkeras.autoqkeras import *
from qkeras import *
from qkeras.utils import model_quantize

from qkeras.qtools import run_qtools
from qkeras.qtools import settings as qtools_settings
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_wrapper
from qkeras import quantized_bits
from qkeras import QDense, QActivation

# Run QTools over the baseline model: 16-bit floating-point weights and accumulators,
# "horowitz" process, and a quantized_bits(16, 5, 1) source (input) quantizer.
q = run_qtools.QTools(
    baseline_model,
    process="horowitz",
    source_quantizers=[quantized_bits(16, 5, 1)],
    is_inference=True,
    weights_path=None,
    keras_quantizer="fp16",
    keras_accumulator="fp16",
    for_reference=False,
)
q.qtools_stats_print()

# Energy estimate with weights and activations both placed in "fixed" memory
energy_dict = q.pe(
    weights_on_memory="fixed",
    activations_on_memory="fixed",
    min_sram_size=8 * 16 * 1024 * 1024,
    rd_wr_on_io=False,
)

# Per-layer energy breakdown, and the total summed according to the rule in
# qtools_settings.cfg.include_energy
energy_profile = q.extract_energy_profile(qtools_settings.cfg.include_energy, energy_dict)
total_energy = q.extract_energy_sum(qtools_settings.cfg.include_energy, energy_dict)

pprint.pprint(energy_profile)
print()

# The raw total is divided by 1e6 before being reported in uJ
print(f"Total energy: {total_energy / 1000000.0:.6f} uJ")

Instructions for updating:
Use ref() instead.
{
    "source_quantizers": [
        {
            "quantizer_type": "quantized_bits",
            "bits": 16,
            "int_bits": 6,
            "is_signed": true
        }
    ],
    "conv_0": {
        "layer_type": "Conv2D",
        "input_quantizer_list": [
            {
                "quantizer_type": "quantized_bits",
                "bits": 16,
                "int_bits": 6,
                "is_signed": true
            }
        ],
        "weight_quantizer": {
            "quantizer_type": "floating_point",
            "bits": 16,
            "shape": [
                3,
                3,
                1,
                16
            ]
        },
        "multiplier": {
            "quantizer_type": "floating_point",
            "bits": 16,
            "op_type": "mul"
        },
        "accumulator": {
            "quantizer_type": "floating_point",
            "bits": 16,
            "op_type": "add"
        },
    

In [None]:
# Candidate quantizers for the AutoQKeras search, grouped by what they apply to
# (kernel, bias, activation, linear). Each entry maps a quantizer spec string to a
# number that, for every entry below, equals the quantizer's bit width.
quantization_config = {
    "kernel": {
        "quantized_bits(2,0,1,alpha=1.0)": 2,
        "quantized_bits(4,0,1,alpha=1.0)": 4,
        "quantized_bits(6,0,1,alpha=1.0)": 6,
        "quantized_bits(8,0,1,alpha=1.0)": 8,
    },
    "bias": {
        "quantized_bits(2,0,1,alpha=1.0)": 2,
        "quantized_bits(4,0,1,alpha=1.0)": 4,
        "quantized_bits(6,0,1,alpha=1.0)": 6,
        "quantized_bits(8,0,1,alpha=1.0)": 8,
    },
    "activation": {
        "quantized_relu(3,1)": 3,
        "quantized_relu(4,2)": 4,
        "quantized_relu(8,2)": 8,
        "quantized_relu(8,4)": 8,
        "quantized_relu(16,6)": 16,
    },
    "linear": {
        "quantized_bits(2,0,1,alpha=1.0)": 2,
        "quantized_bits(4,0,1,alpha=1.0)": 4,
        "quantized_bits(6,0,1,alpha=1.0)": 6,
        "quantized_bits(8,0,1,alpha=1.0)": 8,
    },
}


# Layer groups that will be quantized, each with a list of bit limits.
# NOTE(review): the keys look like layer-name patterns ("conv", "dense", "act") and the
# numbers like maximum bit widths -- confirm the expected list length/order against the
# AutoQKeras `limit` documentation.
limit = {
    "conv": [8,  16],
    "dense": [8, 16],
    "act": [16]
}


# Use this if you want to minimize the model bit size
# (defined for reference; run_config below selects goal_energy instead)
goal_bits = {
    "type": "bits",
    "params": {
        "delta_p": 2.0,  # We tolerate up to a +2% accuracy change
        "delta_n": 2.0,  # We tolerate down to a -2% accuracy change
        "rate": 2.0,  # We want a x2 times smaller model
        "stress": 1.0,  # Force the reference model size to be smaller by setting stress<1
        "input_bits": 8,
        "output_bits": 8,
        "ref_bits": 8,
        "config": {"default": ["parameters", "activations"]},
    },
}

# Use this if you want to minimize the model energy consumption
# (same delta/rate/stress knobs as goal_bits, here with 8.0 tolerances)
goal_energy = {
    "type": "energy",
    "params": {
        "delta_p": 8.0,
        "delta_n": 8.0,
        "rate": 2.0,
        "stress": 1.0,
        "process": "horowitz",
        "parameters_on_memory": ["sram", "sram"],
        "activations_on_memory": ["sram", "sram"],
        "rd_wr_on_io": [False, False],
        "min_sram_size": [0, 0],
        "source_quantizers": ["fp32"],
        "reference_internal": "int8",
        "reference_accumulator": "int32",
    },
}

# Keyword arguments for the AutoQKeras constructor (unpacked with ** where it is built)
run_config = {
    "goal": goal_energy,  # optimize for energy; swap in goal_bits to optimize for size
    "quantization_config": quantization_config,
    "learning_rate_optimizer": False,
    "transfer_weights": False,  # Randomly initialize weights
    "mode": "bayesian",  # This can be bayesian,random,hyperband
    "seed": 42,
    "limit": limit,
    # Alternative (disabled) setting that would also tune filter counts per layer:
    #"tune_filters": "layer",
    #"tune_filters_exceptions": "^output",
    "tune_filters": "none",
    "tune_filters_exceptions": "",  
    "distribution_strategy": None,
    "max_trials": 5,  # Let's just do 5 trials for this demonstrator, ideally you should do as many as possible
}

In [9]:
from qkeras.autoqkeras import AutoQKeras
# Build the AutoQKeras search object from the untrained baseline model, combining the
# shared run_config with the model and the directory used for the search results.
autoqk_kwargs = {
    **run_config,
    "model": baseline_model,
    "output_dir": "autoqk_results",
}
autoqk = AutoQKeras(**autoqk_kwargs)

Limit configuration:{"conv": [8, 8, 16], "dense": [8, 8, 16], "act": [16]}


TypeError: Could not locate class 'QConv2D'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': 'keras.layers', 'class_name': 'QConv2D', 'config': {'name': 'conv_0', 'trainable': True, 'dtype': 'float32', 'filters': 16, 'kernel_size': [3, 3], 'strides': [1, 1], 'padding': 'valid', 'data_format': 'channels_last', 'dilation_rate': [1, 1], 'groups': 1, 'activation': 'linear', 'use_bias': False, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'LecunUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L1', 'config': {'l1': 9.999999747378752e-05}, 'registered_name': None}, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None, 'kernel_quantizer': 'quantized_bits(2,0,1,alpha=1.0)', 'bias_quantizer': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 28, 28, 1]}, 'name': 'conv_0', 'inbound_nodes': [[['input_1', 0, 0, {}]]]}

In [9]:


# List every hyperparameter registered with the tuner, together with its candidate values
space = autoqk.tuner.oracle.get_space()
print("\n🔍 Registered hyperparameters:")
for hyperparam in space.space:
    print("• {}: {}".format(hyperparam.name, hyperparam.values))


# Launch the search; `epochs` is the training length used for each trial
autoqk.fit(x=train_data, validation_data=val_data, epochs=15)

Trial 5 Complete [00h 00m 16s]
val_score: 0.7193027138710022

Best val_score So Far: 0.8993278741836548
Total elapsed time: 00h 01m 10s


In [10]:
# Take the best model found by the search and list the quantizer chosen for each layer
aqmodel = autoqk.get_best_model()
print_qmodel_summary(aqmodel)

# Callbacks for the full-length training run: early stopping with a patience of 10 epochs,
# and halving the learning rate after 3 epochs without val_loss improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, verbose=1)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    verbose=1,
)
callbacks = [early_stopping, reduce_lr]

learning_rate: 0.003000000026077032
Model: "keras_baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv_0 (QConv2D)            (None, 26, 26, 16)        144       
                                                                 
 bn_conv_0 (BatchNormalizati  (None, 26, 26, 16)       64        
 on)                                                             
                                                                 
 conv_act_0 (QActivation)    (None, 26, 26, 16)        0         
                                                                 
 pool_0 (MaxPooling2D)       (None, 13, 13, 16)        0         
                                                                 
 conv_1 (QConv2D)            (None, 11, 11, 16)        2304      
                



In [11]:
# Train the best AutoQKeras model for n_epochs and report the wall-clock time in minutes
start = time.time()
history = aqmodel.fit(
    train_data,
    validation_data=val_data,
    epochs=n_epochs,
    callbacks=callbacks,
    verbose=1,
)
end = time.time()
print(f'\n It took {(end - start) / 60.0} minutes to train!\n')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.001500000013038516.
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

 It took 0.4949254314104716 minutes to train!



In [13]:
# The tuned model still carries remnants of the optimization procedure, so its layers are
# chained into a fresh functional Model. Weights are written out first and loaded back afterwards.
aqmodel.save_weights("autoqkeras_cnn_weights.h5")

# Start from the first layer's output and apply every remaining layer in order
layers = list(aqmodel.layers)
x = layers[0].output
for layer in layers[1:]:
    x = layer(x)

new_model = Model(inputs=[layers[0].input], outputs=[x])

# Fresh loss / optimizer objects for the rebuilt model (these rebind the notebook-level names)
LOSS = tf.keras.losses.CategoricalCrossentropy()
OPTIMIZER = tf.keras.optimizers.Adam(
    learning_rate=3e-3,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=True,
)
new_model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=["accuracy"])

new_model.summary()
new_model.load_weights("autoqkeras_cnn_weights.h5")

print_qmodel_summary(new_model)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv_0 (QConv2D)            (None, 26, 26, 16)        144       
                                                                 
 bn_conv_0 (BatchNormalizati  (None, 26, 26, 16)       64        
 on)                                                             
                                                                 
 conv_act_0 (QActivation)    (None, 26, 26, 16)        0         
                                                                 
 pool_0 (MaxPooling2D)       (None, 13, 13, 16)        0         
                                                                 
 conv_1 (QConv2D)            (None, 11, 11, 16)        2304      
                                                           

In [14]:
# Store the rebuilt quantized model next to the baseline and open the saved ".h5" file in the viewer
new_model_path = os.path.join(model_dir, "new_model")
save_model(new_model, new_model_path)
view_model(f"{new_model_path}.h5")

✅ Model saved to: models/new_model.h5
Serving 'models/new_model.h5' at http://localhost:49911


In [15]:
# Print the per-layer quantizer summary of the rebuilt model
print_qmodel_summary(new_model)

conv_0               f=16 quantized_bits(6,0,1,alpha=1.0) 
bn_conv_0            is normal keras bn layer
conv_act_0           quantized_relu(8,2)
conv_1               f=16 quantized_bits(6,0,1,alpha=1.0) 
bn_conv_1            is normal keras bn layer
conv_act_1           quantized_relu(8,2)
conv_2               f=24 quantized_bits(6,0,1,alpha=1.0) 
bn_conv_2            is normal keras bn layer
conv_act_2           quantized_relu(8,2)
dense_0              u=42 quantized_bits(2,0,1,alpha=1.0) 
bn_dense_0           is normal keras bn layer
dense_act_0          quantized_relu(4,2)
dense_1              u=64 quantized_bits(2,0,1,alpha=1.0) 
bn_dense_1           is normal keras bn layer
dense_act_1          quantized_relu(4,2)



In [20]:
import hls4ml
import plotting


# Per-layer ('name' granularity) hls4ml configuration derived from the rebuilt quantized model,
# with model-level reuse factor and default precision overridden afterwards.
hls_config_aq = hls4ml.utils.config_from_keras_model(new_model, granularity='name')
hls_config_aq['Model']['ReuseFactor'] = 8
hls_config_aq['Model']['Precision'] = 'ap_fixed<16,6>'
hls_config_aq['LayerName']['output_softmax']['Strategy'] = 'Stable'
plotting.print_dict(hls_config_aq)

# Output directory of the generated HLS project and the FPGA device to target
save_path = os.path.join("Projects", "AutoQKeras")
target_part = 'xczu5ev-sfvc784-1-i'

cfg_aq = hls4ml.converters.create_config(backend='Vitis')
cfg_aq['IOType'] = 'io_stream'  # Must set this if using CNNs!
cfg_aq['HLSConfig'] = hls_config_aq
cfg_aq['KerasModel'] = new_model
cfg_aq['OutputDir'] = save_path
# Current hls4ml releases read the target device from the 'Part' key. Setting only the older
# 'XilinxPart' key leaves the backend's default part in place, so the build would not target
# the device requested here. The legacy key is still written for older hls4ml versions.
cfg_aq['Part'] = target_part
cfg_aq['XilinxPart'] = target_part

hls_model_aq = hls4ml.converters.keras_to_hls(cfg_aq)
hls_model_aq.compile()


Interpreting Model
Topology:
Layer name: input_1, layer type: InputLayer, input shapes: [[None, 28, 28, 1]], output shape: [None, 28, 28, 1]
Layer name: conv_0, layer type: QConv2D, input shapes: [[None, 28, 28, 1]], output shape: [None, 26, 26, 16]
Layer name: bn_conv_0, layer type: BatchNormalization, input shapes: [[None, 26, 26, 16]], output shape: [None, 26, 26, 16]
Layer name: conv_act_0, layer type: Activation, input shapes: [[None, 26, 26, 16]], output shape: [None, 26, 26, 16]
Layer name: pool_0, layer type: MaxPooling2D, input shapes: [[None, 26, 26, 16]], output shape: [None, 13, 13, 16]
Layer name: conv_1, layer type: QConv2D, input shapes: [[None, 13, 13, 16]], output shape: [None, 11, 11, 16]
Layer name: bn_conv_1, layer type: BatchNormalization, input shapes: [[None, 11, 11, 16]], output shape: [None, 11, 11, 16]
Layer name: conv_act_1, layer type: Activation, input shapes: [[None, 11, 11, 16]], output shape: [None, 11, 11, 16]
Layer name: pool_1, layer type: MaxPooling2

In [17]:
from sklearn.metrics import accuracy_score

# Explicit import: nothing in the visible notebook imports numpy under the name `np`, so this
# cell previously relied on the name already being present in the kernel namespace.
import numpy as np

# Integer class labels recovered from the one-hot test targets (computed once, used for both scores)
y_true_labels = np.argmax(y_test, axis=1)

# Predictions from the Keras model and from the compiled hls4ml model on the same test images;
# the hls4ml call is given a C-contiguous copy/view of the input array.
y_predict_aq = aqmodel.predict(x_test)
y_predict_hls4ml_aq = hls_model_aq.predict(np.ascontiguousarray(x_test))


accuracy_keras = float(accuracy_score(y_true_labels, np.argmax(y_predict_aq, axis=1)))
accuracy_hls4ml = float(accuracy_score(y_true_labels, np.argmax(y_predict_hls4ml_aq, axis=1)))

print("Accuracy AutoQ Keras:  {}".format(accuracy_keras))
print("Accuracy AutoQ hls4ml: {}".format(accuracy_hls4ml))

Accuracy AutoQ Keras:  0.9585
Accuracy AutoQ hls4ml: 0.958


In [21]:
# Toggle for launching the HLS build from the notebook; set to False to skip it.
synth = True
if synth:
    # csim is disabled; synth and vsynth are enabled.
    # NOTE(review): the exact tool stages each flag triggers are defined by hls4ml's build() -- confirm.
    hls_model_aq.build(csim=False, synth=True, vsynth=True)  



****** Vitis HLS - High-Level Synthesis from C, C++ and OpenCL v2024.2 (64-bit)
  **** SW Build 5238294 on Nov  8 2024
  **** IP Build 5239520 on Sun Nov 10 16:12:51 MST 2024
  **** SharedData Build 5239561 on Fri Nov 08 14:39:27 MST 2024
  **** Start of session at: Tue Apr  8 17:01:05 2025
    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
    ** Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.

source /tools/Xilinx/Vitis/2024.2/scripts/vitis_hls/hls.tcl -notrace
INFO: [HLS 200-10] For user 'theodoros' on host 'theodoros-MS-7D75' (Linux_x86_64 version 6.8.0-57-generic) on Tue Apr 08 17:01:06 EEST 2025
INFO: [HLS 200-10] On os Ubuntu 22.04.1 LTS
INFO: [HLS 200-10] In directory '/home/theodoros/Documents/AI_ON_FPGA/Projects/AutoQKeras'
Sourcing Tcl script 'build_prj.tcl'
INFO: [HLS 200-1510] Running: open_project myproject_prj 
INFO: [HLS 200-10] Creating and opening project '/home/theodoros/Documents/AI_ON_FPGA/Projects/AutoQKeras/myproject_prj'.
INFO: [