In [42]:
import os, time
import numpy as np
import matplotlib.pyplot as plt

# Import tensorflow
import tensorflow as tf
import tensorflow.keras as keras
# Import tensorflow model optimization, used for quantization-aware training
import tensorflow_model_optimization as tfmot

In [48]:
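# Optional: silence verbose TensorFlow logging (uncomment as needed).
# Note: the absl line also requires `import absl.logging`, and TF_CPP_MIN_LOG_LEVEL
# only takes effect when it is set before TensorFlow is imported.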
# tf.get_logger().setLevel('ERROR')
# absl.logging.set_verbosity(absl.logging.ERROR)
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU
gpu_devices = tf.config.experimental.list_physical_devices(device_type='GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, enable=True)

In [3]:
# Fetch MNIST as (images, labels) pairs for the training and test splits
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
# Rescale both image sets by 1/255 (presumably 8-bit pixel intensities -> [0, 1])
x_train, x_test = x_train / 255.0, x_test / 255.0

In [4]:
# Basic neural network definition: conv -> max-pool -> flatten -> 10 outputs
model = keras.Sequential([
    keras.layers.InputLayer(input_shape=(28, 28)),
    # Append a channel axis so the convolution receives (28, 28, 1) inputs
    keras.layers.Reshape((28, 28, 1)),
    keras.layers.Conv2D(32, (3, 3), activation="relu"),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Flatten(),
    # No activation: the loss used for training is configured with from_logits=True
    keras.layers.Dense(units=10),
])

2022-02-08 15:50:44.216600: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-08 15:50:46.824403: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 11423 MB memory:  -> device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:85:00.0, compute capability: 6.1
2022-02-08 15:50:46.825199: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 11423 MB memory:  -> device: 1, name: NVIDIA TITAN Xp, pci bus id: 0000:89:00.0, compute capability: 6.1
2022-02-08 15:50:46.825841: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 wit

In [5]:
# Configure training: Adam optimizer, sparse categorical cross-entropy on raw logits
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Train for 2 epochs, holding out 10% of the training data for validation
model.fit(x=x_train, y=y_train, epochs=2, validation_split=0.1)

Epoch 1/2


2022-02-08 15:50:48.221405: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8302
2022-02-08 15:50:48.703499: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 2/2


<keras.callbacks.History at 0x7f3c54521a90>

In [6]:
# Score the original (float) model on the test split; displays [loss, accuracy]
model.evaluate(x=x_test, y=y_test)



[0.06934657692909241, 0.9789000153541565]

# Quantization-aware finetuning

In [7]:
# (Optional) Work on a copy of the trained model:
# clone_model rebuilds the architecture, set_weights copies the trained weights over
model_q = keras.models.clone_model(model)
model_q.set_weights(weights=model.get_weights())

In [8]:
# Create the quantization-aware model.
# NOTE(review): this rebinds model_q to the wrapped model, so the cell is meant to be
# run once right after the clone cell; re-running it would wrap the model again.
model_q = tfmot.quantization.keras.quantize_model(model_q)

# Compile the wrapped model with the same settings as the float model
model_q.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)


In [9]:
# Baseline: score the quantization-aware model WITHOUT any retraining.
# Accuracy is expected to be pretty low at this point.
model_q.evaluate(x=x_test, y=y_test)



[4.079493999481201, 0.11349999904632568]

In [10]:
# Quantization-aware fine-tuning: 3 epochs, 10% of the training data held out
model_q.fit(x=x_train, y=y_train, epochs=3, validation_split=0.1)
# Score again AFTER retraining; accuracy should be equivalent to the
# accuracy measured before quantization
model_q.evaluate(x=x_test, y=y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.058014679700136185, 0.9817000031471252]

# TensorFlow Lite

In [11]:
# A function that converts a tensorflow model to a tensorflow lite model
def convert_TF_to_TFLite(model_tf, X_train, TFLite_target_filename):
    """Convert a Keras model to an integer-only (int8) TFLite model and save it.

    Args:
        model_tf: Keras model handed to ``TFLiteConverter.from_keras_model``.
        X_train: Indexable collection of input samples, one sample per index
            and without a batch axis. The first tenth (at least one sample
            when the collection is non-empty) is offered to the converter as
            the representative dataset.
        TFLite_target_filename: Path the serialized model is written to.

    Returns:
        The serialized TFLite model as returned by ``converter.convert()``.
    """
    # Create TFLite model from the original TF model
    converter = tf.lite.TFLiteConverter.from_keras_model(model_tf)
    # Set the optimization flag to use default quantization
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    # Enforce integer only quantization
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    # Also quantize input and output (not mandatory, replace with tf.float32
    # to keep floating representations)
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8

    # Use a tenth of the data for calibration, but never zero samples when
    # fewer than ten inputs are supplied.
    num_samples = len(X_train)
    num_calibration = min(num_samples, max(1, num_samples // 10))

    # Provide a representative dataset to optimize quantization with respect
    # to the expected data distribution
    def generate_representative_dataset():
        for i in range(num_calibration):
            sample = np.asarray(X_train[i], dtype=np.float32)
            # Keep the sample's own shape and only prepend the batch axis, so
            # the tensor matches a model whose input is not flat.
            yield [np.expand_dims(sample, axis=0)]

    # Converter will use the above function to optimize quantization
    converter.representative_dataset = generate_representative_dataset
    # Convert to TFLite
    model_tflite = converter.convert()
    # Context manager guarantees the file is flushed and closed
    with open(TFLite_target_filename, "wb") as tflite_file:
        tflite_file.write(model_tflite)
    # If you want to read the on-disk size (good proxy for on device size):
    # size = os.path.getsize(TFLite_target_filename)
    return model_tflite

In [12]:
# convert our TF model to TFLite
model_tflite = convert_TF_to_TFLite(
    model_tf=model_q,
    X_train=x_train,
    TFLite_target_filename="model.tflite",
)

2022-02-08 15:51:44.682716: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/tmp_jvv02nm/assets


INFO:tensorflow:Assets written to: /tmp/tmp_jvv02nm/assets
2022-02-08 15:51:46.098049: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:357] Ignored output_format.
2022-02-08 15:51:46.098084: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:360] Ignored drop_control_dependency.
2022-02-08 15:51:46.099064: I tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/tmp_jvv02nm
2022-02-08 15:51:46.102142: I tensorflow/cc/saved_model/reader.cc:78] Reading meta graph with tags { serve }
2022-02-08 15:51:46.102162: I tensorflow/cc/saved_model/reader.cc:119] Reading SavedModel debug info (if present) from: /tmp/tmp_jvv02nm
2022-02-08 15:51:46.114338: I tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-02-08 15:51:46.173914: I tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/tmp_jvv02nm
2022-02-08 15:51:46.195365: I tensorflow/cc/saved_model/loader.cc:301] SavedModel

In [47]:
# Report the on-disk footprint of the converted model
size = os.stat("model.tflite").st_size
print(f"Size of the TFLite model on disk: {size/1000} KB")

Size of the TFLite model on disk: 58.8 KB


In [37]:
# Runs the inference on data x_test with a TFLite model
def predict_TFLite(model, X, num_classes=10):
    """Run a serialized TFLite model on every sample of ``X``.

    Args:
        model: Serialized TFLite model (passed as ``model_content`` to the
            interpreter).
        X: Array of input samples, indexed along the first axis. It is not
            modified.
        num_classes: Number of values the model outputs per sample.

    Returns:
        Array of shape ``(len(X), num_classes)``. When the output tensor is
        quantized the values are dequantized to float32; otherwise they keep
        the output tensor's dtype.
    """
    # Initialize the TFLite interpreter
    interpreter = tf.lite.Interpreter(model_content=model)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]
    input_dtype = input_details["dtype"]

    # Quantize the inputs when the model expects quantized data
    input_scale, input_zero_point = input_details["quantization"]
    if (input_scale, input_zero_point) != (0.0, 0):
        # The arithmetic creates a new array, so X itself is left untouched
        x_data = np.asarray(X) / input_scale + input_zero_point
        if np.issubdtype(input_dtype, np.integer):
            # Round to nearest instead of letting astype truncate toward zero,
            # and saturate so out-of-range values do not wrap around.
            limits = np.iinfo(input_dtype)
            x_data = np.clip(np.rint(x_data), limits.min, limits.max)
        x_data = x_data.astype(input_dtype)
    else:
        # Float model: match the interpreter's input dtype (e.g. float64 data
        # produced by `x / 255.0` fed to a float32 input tensor)
        x_data = np.asarray(X).astype(input_dtype)

    # Invoke the interpreter one sample at a time
    predictions = np.empty((x_data.shape[0], num_classes), dtype=output_details["dtype"])
    for i in range(len(x_data)):
        # Slice (rather than index) to keep the leading batch axis of size 1
        interpreter.set_tensor(input_details["index"], x_data[i:i + 1])
        interpreter.invoke()
        predictions[i] = interpreter.get_tensor(output_details["index"])[0]

    # Dequantize output
    output_scale, output_zero_point = output_details["quantization"]
    if (output_scale, output_zero_point) != (0.0, 0):
        predictions = predictions.astype(np.float32)
        predictions = (predictions - output_zero_point) * output_scale
    # TODO (kept from the original): reshape output into array for each exit
    return predictions

def evaluate_TFLite(model, X, Y):
    """Measure the top-1 accuracy (in percent) of a TFLite model.

    Runs ``predict_TFLite`` on ``X``, takes the arg-max over the last axis as
    the predicted label, compares it with ``Y`` and prints the elapsed
    wall-clock time.

    Args:
        model: Serialized TFLite model forwarded to ``predict_TFLite``.
        X: Input samples.
        Y: Reference labels, one per sample.

    Returns:
        Percentage of samples whose predicted label equals the reference.
    """
    started_at = time.time()
    class_scores = predict_TFLite(model, X)
    predicted_labels = np.argmax(class_scores, axis=-1)
    is_correct = predicted_labels.ravel() == Y.ravel()
    accuracy = np.nanmean(is_correct) * 100
    finished_at = time.time()
    print(f"Ellapsed time: {finished_at-started_at:.3f} s for {predicted_labels.shape[0]} samples")
    return accuracy

In [40]:
# Evaluate the TFLite model on the test images
accuracy = evaluate_TFLite(model=model_tflite, X=x_test, Y=y_test)
print(f"Accuracy of the TFLite model: {accuracy}%")

Ellapsed time: 8.063 s for 10000 samples
Accuracy of the TFLite model: 98.17%


# TensorFlow Lite for Microcontrollers

In [56]:
# Create a C++ array of the TFLite model
# The model needs to be saved on disk
def convert_TFLite_to_TFLM(TFLite_filename, TFLM_target_filename):
    """Turn a saved TFLite model into a C source file for TFLite Micro.

    The generated file has the same layout as the output of ``xxd -i``
    (12 bytes per line, lowercase hex) with the array named ``g_model`` and
    its length stored in ``g_model_len``. It is produced in pure Python, so
    no external ``xxd``/``sed`` tools are required and file names containing
    spaces or shell metacharacters are handled safely.

    Args:
        TFLite_filename: Path of the TFLite model on disk.
        TFLM_target_filename: Path of the C source file to write.
    """
    with open(TFLite_filename, "rb") as tflite_file:
        model_bytes = tflite_file.read()

    bytes_per_line = 12  # line width used by `xxd -i`
    rows = [
        "  " + ", ".join(f"0x{byte:02x}" for byte in model_bytes[start:start + bytes_per_line])
        for start in range(0, len(model_bytes), bytes_per_line)
    ]

    # newline="\n" keeps the generated file identical across platforms
    with open(TFLM_target_filename, "w", newline="\n") as tflm_file:
        tflm_file.write("unsigned char g_model[] = {\n")
        if rows:
            tflm_file.write(",\n".join(rows) + "\n")
        tflm_file.write("};\n")
        tflm_file.write(f"unsigned int g_model_len = {len(model_bytes)};\n")

In [58]:
# Save model on disk (the context manager closes the file, so the data is
# fully flushed before the conversion below reads it back)
with open("model.tflite", "wb") as tflite_file:
    tflite_file.write(model_tflite)
# Convert to TFLM
convert_TFLite_to_TFLM("model.tflite", "embedded_model.cc")
# It takes a few seconds to obtain the file