In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import numpy as np

# --- Experiment configuration (read by the functions defined in the next cell) ---
# Input shape given to MobileNetV2; CIFAR-10 images are resized to its first two dims.
INPUT_SHAPE = (96, 96, 3)
# Number of classes: width of the softmax head and of the one-hot labels.
NUM_CLASSES = 10
# Adam learning rate for the initial phase; fine-tuning uses LEARNING_RATE / 10.
LEARNING_RATE = 0.0001
# Batch size applied to both the training and the test tf.data pipeline.
BATCH_SIZE = 32
# Number of epochs for the initial (frozen base model) training phase.
INITIAL_EPOCHS = 10
# Number of additional epochs for fine-tuning; the epoch counter continues from INITIAL_EPOCHS.
FINE_TUNE_EPOCHS = 10
# Base-model layers with an index below this value stay frozen during fine-tuning.
FINE_TUNE_AT = 100
# Destination of the saved Keras model.
H5_MODEL_PATH = "cifar10_mobilenetv2_finetuned.h5"
# Destination of the converted TFLite model.
TFLITE_MODEL_PATH = "cifar10_mobilenetv2_finetuned.tflite"



2025-08-04 23:10:57.893316: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754349058.085601      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754349058.141031      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def load_and_preprocess_data():
    """Load CIFAR-10 and wrap it in batched, prefetched tf.data pipelines.

    Labels are one-hot encoded with NUM_CLASSES columns. Every image is cast
    to float32, resized to the spatial size in INPUT_SHAPE and passed through
    MobileNetV2's ``preprocess_input``. Only the training pipeline is
    shuffled (buffer of 1024 elements).

    Returns:
        tuple: ``(train_ds, test_ds)``, both batched with BATCH_SIZE.
    """
    target_size = (INPUT_SHAPE[0], INPUT_SHAPE[1])

    def preprocess_image(image, label):
        # Per-example transform applied inside the tf.data pipeline.
        as_float = tf.cast(image, tf.float32)
        resized = tf.image.resize(as_float, target_size)
        prepared = tf.keras.applications.mobilenet_v2.preprocess_input(resized)
        return prepared, label

    (train_images, train_labels), (test_images, test_labels) = cifar10.load_data()
    train_onehot = to_categorical(train_labels, NUM_CLASSES)
    test_onehot = to_categorical(test_labels, NUM_CLASSES)

    # Training pipeline: shuffle raw examples first, then preprocess and batch.
    train_ds = tf.data.Dataset.from_tensor_slices((train_images, train_onehot))
    train_ds = train_ds.shuffle(buffer_size=1024)
    train_ds = train_ds.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    train_ds = train_ds.batch(BATCH_SIZE)
    train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

    # Evaluation pipeline: same transform, original order preserved.
    test_ds = tf.data.Dataset.from_tensor_slices((test_images, test_onehot))
    test_ds = test_ds.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    test_ds = test_ds.batch(BATCH_SIZE)
    test_ds = test_ds.prefetch(tf.data.AUTOTUNE)

    return train_ds, test_ds

def build_model():
    """Create the classifier: an ImageNet MobileNetV2 base plus a new softmax head.

    The base model is created without its top and marked non-trainable. The
    head is global average pooling, dropout (rate 0.2) and a Dense softmax
    layer with NUM_CLASSES units.

    Returns:
        tuple: ``(model, base_model)`` - the full model and the MobileNetV2
        instance embedded in it, so callers can later change its
        ``trainable`` flags.
    """
    backbone = MobileNetV2(
        weights='imagenet',
        include_top=False,
        input_shape=INPUT_SHAPE,
    )
    backbone.trainable = False

    image_input = Input(shape=INPUT_SHAPE)
    # The base model is always called with training=False in this graph.
    features = backbone(image_input, training=False)
    pooled = GlobalAveragePooling2D()(features)
    regularized = Dropout(0.2)(pooled)
    class_probs = Dense(NUM_CLASSES, activation='softmax')(regularized)

    classifier = Model(image_input, class_probs)
    return classifier, backbone

def compile_and_train(model, train_ds, test_ds):
    """Compile ``model`` and run the initial training phase.

    Args:
        model: Keras model to compile and fit.
        train_ds: Dataset used for fitting.
        test_ds: Dataset passed to ``fit`` as ``validation_data``.

    Returns:
        The History object returned by ``model.fit``.
    """
    optimizer = Adam(learning_rate=LEARNING_RATE)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model.fit(
        train_ds,
        validation_data=test_ds,
        epochs=INITIAL_EPOCHS,
    )

def fine_tune_model(model, base_model, train_ds, test_ds):
    """Unfreeze the upper part of ``base_model`` and continue training.

    The base model is marked trainable, then its first FINE_TUNE_AT layers
    are frozen again. The model is recompiled with one tenth of
    LEARNING_RATE and fitted from epoch INITIAL_EPOCHS up to
    INITIAL_EPOCHS + FINE_TUNE_EPOCHS.

    Args:
        model: Full Keras model that contains ``base_model``.
        base_model: Model whose layers are partially unfrozen.
        train_ds: Dataset used for fitting.
        test_ds: Dataset passed to ``fit`` as ``validation_data``.

    Returns:
        The History object returned by ``model.fit``.
    """
    base_model.trainable = True
    frozen_prefix = base_model.layers[:FINE_TUNE_AT]
    for frozen_layer in frozen_prefix:
        frozen_layer.trainable = False

    # Recompile after the trainable flags were changed.
    reduced_rate_optimizer = Adam(learning_rate=LEARNING_RATE / 10)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=reduced_rate_optimizer,
        metrics=['accuracy'],
    )

    last_epoch = INITIAL_EPOCHS + FINE_TUNE_EPOCHS
    return model.fit(
        train_ds,
        validation_data=test_ds,
        initial_epoch=INITIAL_EPOCHS,
        epochs=last_epoch,
    )

def save_models(model):
    """Write ``model`` to H5_MODEL_PATH and a TFLite conversion to TFLITE_MODEL_PATH.

    The TFLite model is produced with ``tf.lite.Optimize.DEFAULT`` enabled.
    A confirmation line is printed after each file is written.

    Args:
        model: Keras model to save and convert.
    """
    def announce(destination):
        # Both artifacts are reported with the same message format.
        print(f"Model saved to {destination}")

    model.save(H5_MODEL_PATH)
    announce(H5_MODEL_PATH)

    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    flatbuffer = tflite_converter.convert()

    with open(TFLITE_MODEL_PATH, 'wb') as tflite_file:
        tflite_file.write(flatbuffer)
    announce(TFLITE_MODEL_PATH)

def test_tflite_accuracy(model_path):
    """Measure top-1 accuracy of a TFLite model on the CIFAR-10 test split.

    Images are fed one at a time: each is resized to the height and width
    read from the interpreter's input details and passed through
    MobileNetV2's ``preprocess_input``. The result is printed, not returned.

    Args:
        model_path: Path of the ``.tflite`` file to evaluate.
    """
    _, (test_images, test_labels) = cifar10.load_data()

    tflite_interpreter = tf.lite.Interpreter(model_path=model_path)
    tflite_interpreter.allocate_tensors()
    input_info = tflite_interpreter.get_input_details()[0]
    output_info = tflite_interpreter.get_output_details()[0]

    # The input shape is indexed as [batch, height, width, ...].
    target_size = [input_info['shape'][1], input_info['shape'][2]]

    total_images = len(test_images)
    correct_predictions = 0
    for raw_image, label_row in zip(test_images, test_labels):
        single_batch = np.expand_dims(raw_image, axis=0)
        single_batch = tf.image.resize(single_batch, target_size)
        single_batch = tf.keras.applications.mobilenet_v2.preprocess_input(single_batch)

        tflite_interpreter.set_tensor(input_info['index'], single_batch)
        tflite_interpreter.invoke()
        scores = tflite_interpreter.get_tensor(output_info['index'])

        # Labels are stored one per row, with the class id in column 0.
        if np.argmax(scores) == label_row[0]:
            correct_predictions += 1

    accuracy = (correct_predictions / total_images) * 100
    print(f"\nTFLite model accuracy: {accuracy:.2f}% ({correct_predictions}/{total_images})")



In [3]:
# Create the tf.data pipelines and the model (with its base model) used by the following cells.
train_ds, test_ds = load_and_preprocess_data()
model, base_model = build_model()


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step


I0000 00:00:1754349086.139378      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_96_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Phase 1: train for INITIAL_EPOCHS while the base model is non-trainable.
print("--- Starting Initial Training (Transfer Learning) ---")
compile_and_train(model, train_ds, test_ds)

# NOTE: test_ds is the same dataset that compile_and_train passes as validation_data.
loss, accuracy = model.evaluate(test_ds)
print(f"\nTest Accuracy (Transfer Learning): {accuracy * 100:.2f}%")

--- Starting Initial Training (Transfer Learning) ---
Epoch 1/10


I0000 00:00:1754349105.599298      97 service.cc:148] XLA service 0x7e18c0004140 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1754349105.599923      97 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1754349106.548376      97 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  18/1563[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15s[0m 10ms/step - accuracy: 0.1338 - loss: 3.1476

I0000 00:00:1754349110.216667      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 17ms/step - accuracy: 0.4847 - loss: 1.5991 - val_accuracy: 0.8065 - val_loss: 0.5770
Epoch 2/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.7896 - loss: 0.6145 - val_accuracy: 0.8339 - val_loss: 0.4879
Epoch 3/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.8207 - loss: 0.5256 - val_accuracy: 0.8445 - val_loss: 0.4542
Epoch 4/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.8332 - loss: 0.4813 - val_accuracy: 0.8500 - val_loss: 0.4357
Epoch 5/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.8428 - loss: 0.4607 - val_accuracy: 0.8538 - val_loss: 0.4236
Epoch 6/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.8493 - loss: 0.4353 - val_accuracy: 0.8571 - val_loss: 0.4143
Epoch 7/10
[1m

In [5]:
# Show the model's layer summary after the initial training phase.
model.summary()

In [6]:
# Phase 2: unfreeze base-model layers from index FINE_TUNE_AT onward and keep training.
print("\n--- Starting Fine-Tuning ---")
fine_tune_model(model, base_model, train_ds, test_ds)

# NOTE: test_ds is the same dataset that fine_tune_model passes as validation_data.
loss, accuracy = model.evaluate(test_ds)
print(f"\nFinal Test Accuracy: {accuracy * 100:.2f}%")




--- Starting Fine-Tuning ---
Epoch 11/20


E0000 00:00:1754349313.928111      99 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1754349314.111639      99 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1754349314.375269      99 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1754349314.579847      99 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m1559/1563[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - accuracy: 0.7789 - loss: 0.6680

E0000 00:00:1754349346.890508      97 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1754349347.072553      97 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1754349347.323905      97 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1754349347.528221      97 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 26ms/step - accuracy: 0.7791 - loss: 0.6676 - val_accuracy: 0.8769 - val_loss: 0.3685
Epoch 12/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.8625 - loss: 0.4067 - val_accuracy: 0.8902 - val_loss: 0.3317
Epoch 13/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.8859 - loss: 0.3315 - val_accuracy: 0.8978 - val_loss: 0.3089
Epoch 14/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.9020 - loss: 0.2823 - val_accuracy: 0.9003 - val_loss: 0.2957
Epoch 15/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.9156 - loss: 0.2430 - val_accuracy: 0.9059 - val_loss: 0.2849
Epoch 16/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 16ms/step - accuracy: 0.9266 - loss: 0.2067 - val_accuracy: 0.9053 - val_loss: 0.2791
Epoch 17/2

In [7]:
# Write the .h5 and .tflite files.
save_models(model)

# Evaluate the TFLite file that was just written on the CIFAR-10 test split.
test_tflite_accuracy(TFLITE_MODEL_PATH)

Model saved to cifar10_mobilenetv2_finetuned.h5
Saved artifact at '/tmp/tmp0ece7ffm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 96, 96, 3), dtype=tf.float32, name='keras_tensor_154')
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  138647661585872: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661586256: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661586832: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661587600: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661586640: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661586064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661587024: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661587984: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661588368: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138647661586448: TensorSpec(shape

W0000 00:00:1754349615.918035      36 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1754349615.918071      36 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
I0000 00:00:1754349616.072327      36 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


Model saved to cifar10_mobilenetv2_finetuned.tflite


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.



TFLite model accuracy: 90.41% (9041/10000)


# TensorFlow Lite INT8

In [8]:
import tensorflow as tf
import numpy as np
import os

# H5_MODEL_PATH and INPUT_SHAPE repeat the values from the first cell
# (presumably so this cell can run without re-running the training cells - confirm).
H5_MODEL_PATH = "cifar10_mobilenetv2_finetuned.h5"
# Destination of the INT8-quantized TFLite model.
TFLITE_INT8_MODEL_PATH = "cifar10_mobilenetv2_finetuned_int8.tflite"
INPUT_SHAPE = (96, 96, 3)

def representative_dataset_gen():
    """Yield calibration samples for the TFLite converter.

    Uses the first 100 CIFAR-10 training images. Each one is converted to
    float32, resized to the spatial size in INPUT_SHAPE, passed through
    MobileNetV2's ``preprocess_input`` and given a leading batch axis.

    Yields:
        list: A single-element list holding one preprocessed image batch.
    """
    (train_images, _), _ = tf.keras.datasets.cifar10.load_data()
    target_size = (INPUT_SHAPE[0], INPUT_SHAPE[1])
    for sample_idx in range(100):
        sample = train_images[sample_idx].astype(np.float32)
        sample = tf.image.resize(sample, target_size)
        sample = tf.keras.applications.mobilenet_v2.preprocess_input(sample)
        yield [np.expand_dims(sample, axis=0)]

# Reload the Keras model from the .h5 file and set up a converter for it.
converter = tf.lite.TFLiteConverter.from_keras_model(tf.keras.models.load_model(H5_MODEL_PATH))
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Calibration samples come from representative_dataset_gen.
converter.representative_dataset = representative_dataset_gen
# Restrict the converter to the INT8 builtin op set and request int8 input/output tensors.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model_quant = converter.convert()

with open(TFLITE_INT8_MODEL_PATH, 'wb') as f:
    f.write(tflite_model_quant)

# "Original model size" is the size of the .h5 file on disk.
print(f"INT8 quantized model saved to: {TFLITE_INT8_MODEL_PATH}")
print(f"Original model size: {os.path.getsize(H5_MODEL_PATH) / 1024:.2f} KB")
print(f"INT8 model size: {os.path.getsize(TFLITE_INT8_MODEL_PATH) / 1024:.2f} KB")

Saved artifact at '/tmp/tmpnlmeyfa6'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 96, 96, 3), dtype=tf.float32, name='input_layer_1')
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  138642454074704: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454076048: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454076432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454076240: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454074896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454077584: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454077968: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454078352: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454078160: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138642454075472: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13864245407950

W0000 00:00:1754349703.949370      36 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1754349703.949406      36 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.


INT8 quantized model saved to: cifar10_mobilenetv2_finetuned_int8.tflite
Original model size: 23915.88 KB
INT8 model size: 2657.98 KB


fully_quantize: 0, inference_type: 6, input_inference_type: INT8, output_inference_type: INT8


In [10]:
import tensorflow as tf
import numpy as np

# Path of the INT8 TFLite model to evaluate (same value as in the conversion cell).
TFLITE_INT8_MODEL_PATH = "cifar10_mobilenetv2_finetuned_int8.tflite"

def test_tflite_int8_accuracy(model_path):
    (_, _), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]

    input_scale, input_zero_point = input_details["quantization"]
    output_scale, output_zero_point = output_details["quantization"]
    
    height = input_details['shape'][1]
    width = input_details['shape'][2]

    correct_predictions = 0
    total_images = len(x_test)

    for i in range(total_images):
        image = x_test[i]
        image_expanded = np.expand_dims(image, axis=0)
        image_resized = tf.image.resize(image_expanded, [height, width])
        image_preprocessed = tf.keras.applications.mobilenet_v2.preprocess_input(image_resized)

        image_quantized = (image_preprocessed / input_scale) + input_zero_point
        image_quantized = tf.cast(image_quantized, dtype=input_details["dtype"])

        interpreter.set_tensor(input_details['index'], image_quantized)
        interpreter.invoke()

        output_data = interpreter.get_tensor(output_details['index'])
        output_data_dequantized = (output_data.astype(np.float32) - output_zero_point) * output_scale
        
        predicted_label = np.argmax(output_data_dequantized)
        true_label = y_test[i][0]

        if predicted_label == true_label:
            correct_predictions += 1
            
    accuracy = (correct_predictions / total_images) * 100
    print(f"\nINT8 TFLite model accuracy: {accuracy:.2f}% ({correct_predictions}/{total_images})")

# Evaluate the INT8 model file on the CIFAR-10 test split and print its accuracy.
test_tflite_int8_accuracy(TFLITE_INT8_MODEL_PATH)


INT8 TFLite model accuracy: 74.48% (7448/10000)
