Quantization reduces the precision of a model's weights and, optionally, its activations from floating-point numbers (such as float32) to lower-precision representations, such as int8 or float16. This can significantly reduce the model size and speed up inference, making the model well suited for deployment on resource-constrained devices.

In [1]:
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and
# matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
import numpy as np

# Importing specific modules from keras, which is now part of TensorFlow
# Callbacks are utilities called at certain points during model training. EarlyStopping stops training when a monitored
# metric has stopped improving, and ModelCheckpoint saves the model after every epoch.
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Sequential is a linear stack of layers. (load_model, which loads a saved model, is not imported here.)
from keras.models import Sequential
# Dense is a standard layer type that is used in many neural networks.
from keras.layers import Dense

# TensorFlow Lite provides tools and classes for converting TensorFlow models into a highly optimized format suitable
# for deployment on mobile devices, embedded systems, or other platforms with limited computational capacity. This
# module includes functionalities for model conversion, optimization, and inference. By importing `lite`, you gain
# access to the TFLiteConverter class for model conversion, optimization options like quantization, and utilities for
# running TFLite models on target devices.
from tensorflow import lite

2024-02-27 20:15:18.210714: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-27 20:15:18.228962: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-27 20:15:18.228976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-27 20:15:18.229467: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-27 20:15:18.233291: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Build the network incrementally: start from an empty Sequential container and
# add one Dense (fully connected) layer with a single unit. The layer takes one
# input feature and uses a linear activation.
model = Sequential()
model.add(Dense(1, input_shape=(1,), activation='linear'))

2024-02-27 20:15:18.883576: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21880 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6


In [3]:
# Configure training: mean squared error is the loss (a common choice for
# regression problems) and Stochastic Gradient Descent (SGD) is the optimizer.
model.compile(
    loss='mean_squared_error',
    optimizer='sgd',
)

In [4]:
# Build the toy training set.
# `xs` holds the six input values -1, 0, ..., 4 and `ys` holds the matching targets,
# generated directly from the rule the model is expected to learn: y = 2x - 1.
xs = np.arange(-1, 5, dtype=float)
ys = 2.0 * xs - 1.0

In [5]:
# Fit the model to the toy data for 100 full passes over the dataset (epochs).
# verbose=0 keeps the per-epoch training log out of the output.
model.fit(x=xs, y=ys, epochs=100, verbose=0)

2024-02-27 20:15:31.001603: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f3ce0f352e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-27 20:15:31.001615: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-02-27 20:15:31.007598: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1709082931.034221   46514 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


<keras.src.callbacks.History at 0x7f3ecc197910>

In [6]:
# Persist the trained model to disk as an HDF5 (Hierarchical Data Format
# version 5) file so it can be reloaded later without retraining.
model.save(filepath='../models/exercise_0.h5')

  saving_api.save_model(


In [7]:
# Create a TensorFlow Lite converter bound to the trained Keras model.
# Nothing is converted at this point: this only builds the converter object,
# which is configured and then run in the following steps to produce a model
# suitable for mobile or embedded deployment.
converter = lite.TFLiteConverter.from_keras_model(model=model)

In [8]:
# Request the converter's default optimization set, which enables post-training quantization.
# Quantization lowers numeric precision to shrink the model and potentially speed up inference,
# usually with only a small impact on accuracy.
# NOTE(review): no representative dataset is configured in the visible cells, so this presumably
# selects dynamic-range quantization (weights stored at reduced precision, activations left in
# floating point) rather than full integer quantization -- confirm against the TFLite docs for the
# TensorFlow version in use.
converter.optimizations = [lite.Optimize.DEFAULT]

In [9]:
# Run the conversion with the optimizations configured on the converter.
# The result is kept in `tflite_model`, which is later written to disk in binary mode.
tflite_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmp8vieiiwn/assets


INFO:tensorflow:Assets written to: /tmp/tmp8vieiiwn/assets
2024-02-27 20:15:53.826014: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:378] Ignored output_format.
2024-02-27 20:15:53.826024: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:381] Ignored drop_control_dependency.
2024-02-27 20:15:53.826157: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmp8vieiiwn
2024-02-27 20:15:53.826412: I tensorflow/cc/saved_model/reader.cc:51] Reading meta graph with tags { serve }
2024-02-27 20:15:53.826415: I tensorflow/cc/saved_model/reader.cc:146] Reading SavedModel debug info (if present) from: /tmp/tmp8vieiiwn
2024-02-27 20:15:53.827244: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
2024-02-27 20:15:53.827419: I tensorflow/cc/saved_model/loader.cc:233] Restoring SavedModel bundle.
2024-02-27 20:15:53.837208: I tensorflow/cc/saved_model/loader.cc:217] Running initializatio

In [10]:
# Write the converted (and possibly quantized) TensorFlow Lite model to disk in binary mode.
# The resulting .tflite file is what gets deployed to compatible devices.
with open('../models/exercise_0.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_model)