In [1]:
import tensorflow as tf 
import numpy as np
import time

from tensorflow import keras

# Directories holding the exported MNIST models.
saved_model1 = './model1_mnist/'
saved_model2 = './model2_mnist/'
saved_model2_keras = './model2_keras/'

# Only the Keras save/load round trip works for now; a model written with the
# tf.saved_model API does not, so the Keras-format export is the one reloaded.
reloaded_model = keras.models.load_model(saved_model2_keras)

2021-09-20 11:26:56.000124: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data

In [2]:
# Datasets bundled with tf.keras; only MNIST digits are loaded in this cell.
mnist = tf.keras.datasets.mnist
fash = tf.keras.datasets.fashion_mnist

(
    (train_images, train_labels),
    (test_images, test_labels),
) = mnist.load_data()

# Rescale the pixel values to float32 in the [0, 1] range.
train_images_norm, test_images_norm = (
    split_images.astype(np.float32) / 255.0
    for split_images in (train_images, test_images)
)

## Optional retrain + optional given network

## TensorFlow baseline (unquantized model)

In [3]:
# Baseline inference with the float Keras model.
# Use the [0, 1]-scaled images: the TFLite path is calibrated on
# train_images_norm and evaluated on test_images_norm, so feeding the raw
# 0-255 pixels here would compare the two models on different inputs.
# NOTE(review): assumes the saved model was trained on normalized input - confirm.
pred = reloaded_model.predict(test_images_norm)
predictions = np.argmax(pred, axis=1)

2021-09-20 11:27:04.606348: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


## TFLite

In [4]:
# tflite conv + inference 

# Shared timing log in seconds: conv_int8 appends the conversion time and
# run_inference appends the inference time (so not only inference timings).
inf_time = list()

def representative_data_gen():
    """Yield calibration samples for the TFLite converter.

    Slices ``train_images_norm`` into batches of one image, keeps the first
    1000 batches, and yields each one wrapped in a single-element list.
    """
    calibration_set = tf.data.Dataset.from_tensor_slices(train_images_norm)
    calibration_set = calibration_set.batch(1).take(1000)
    yield from ([sample] for sample in calibration_set)

def conv_int8(model_path):
    """Convert a Keras model into an integer-quantized TFLite model.

    Args:
        model_path: despite the name, this is passed to
            ``TFLiteConverter.from_keras_model`` and therefore has to be an
            in-memory Keras model, not a path on disk.

    Returns:
        The converted model, exactly as returned by ``converter.convert()``.

    Side effects:
        Appends the elapsed wall-clock time (seconds) to the module-level
        ``inf_time`` list. Calibration samples come from
        ``representative_data_gen``.
    """
    t0 = time.time()

    # TODO: let the caller choose between a SavedModel directory and an
    # existing Keras model; for now only the Keras model route is used.
    # converter = tf.lite.TFLiteConverter.from_saved_model(model_path)
    converter = tf.lite.TFLiteConverter.from_keras_model(model_path)

    # Restrict the converter to int8 builtin kernels with uint8 input/output.
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_data_gen
    converter.inference_input_type = converter.inference_output_type = tf.uint8

    quant_model = converter.convert()
    inf_time.append(time.time() - t0)

    return quant_model


def interpret(model, test_set):
    """Build a TFLite interpreter and load the whole ``test_set`` into its input.

    Args:
        model: serialized TFLite model, passed as ``model_content`` to
            ``tf.lite.Interpreter``.
        test_set: array-like of real-valued inputs. The interpreter's first
            input tensor is resized to ``np.shape(test_set)``, so the full set
            is processed by a single ``invoke()``.

    Returns:
        The interpreter with tensors allocated and the input already set; call
        ``invoke()`` on it to run inference.
    """
    interpreter = tf.lite.Interpreter(model_content=model)
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()[0]
    input_index = input_details['index']
    input_type = input_details['dtype']
    # Quantization parameters: real_value = (quantized - zero_point) * scale
    input_scale, input_zero_point = input_details['quantization']

    # Resize the input to hold the whole set, then re-allocate for the new shape.
    test_set = np.asarray(test_set)
    interpreter.resize_tensor_input(input_index=input_index, tensor_size=np.shape(test_set))
    interpreter.allocate_tensors()

    if input_scale:
        # Round to nearest: a bare astype() truncates, so a value such as
        # 254.9999 (float error in x / scale) would become 254 instead of 255.
        test_images_q = np.round(test_set / input_scale + input_zero_point)
        if np.issubdtype(input_type, np.integer):
            # Saturate instead of letting out-of-range values wrap around.
            limits = np.iinfo(input_type)
            test_images_q = np.clip(test_images_q, limits.min, limits.max)
    else:
        # A scale of 0 means there is nothing to quantize (dividing by it
        # would produce inf/nan); pass the values through and only cast.
        test_images_q = test_set
    test_images_q = test_images_q.astype(input_type)

    # Loading into the tensor
    interpreter.set_tensor(input_index, test_images_q)
    return interpreter

def run_inference(interpreter, labels=None):
    """Invoke a prepared interpreter and report classification accuracy.

    Args:
        interpreter: interpreter whose input tensor has already been set.
        labels: ground-truth class ids for the loaded inputs. Defaults to the
            module-level ``test_labels``, which keeps existing one-argument
            calls working while allowing evaluation on another set.

    Returns:
        Array of predicted class ids (argmax over axis 1 of the first output
        tensor).

    Side effects:
        Appends the duration of ``invoke()`` in seconds to the module-level
        ``inf_time`` list and prints the accuracy.
    """
    output_index = interpreter.get_output_details()[0]['index']

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_inf = time.perf_counter()
    interpreter.invoke()
    end_inf = time.perf_counter()

    inference = interpreter.get_tensor(output_index)
    predictions = np.argmax(inference, axis=1)

    inf_time.append(end_inf - start_inf)

    if labels is None:
        labels = test_labels
    print('Quantized model accuracy : ', (predictions == np.asarray(labels)).mean())

    return predictions

In [28]:
# End-to-end TFLite run: quantize the reloaded Keras model, stage the
# normalized test set in an interpreter, then execute it. The predictions
# returned by the last call are the cell's displayed output.
quanted_model = conv_int8(model_path=reloaded_model)
interpreter = interpret(model=quanted_model, test_set=test_images_norm)
run_inference(interpreter=interpreter)

#print('Original model accuracy : {}'.format(round(eval[1],2)))

INFO:tensorflow:Assets written to: /var/folders/pj/_64wh_0d44z5f2y6mk2zml0w0000gn/T/tmpt5o0hec8/assets


INFO:tensorflow:Assets written to: /var/folders/pj/_64wh_0d44z5f2y6mk2zml0w0000gn/T/tmpt5o0hec8/assets


Quantized model accuracy :  0.9795


array([7, 2, 1, ..., 4, 5, 6])

In [29]:
# conv_int8 appends the conversion time and run_inference then appends the
# inference time, so the last two entries belong to the most recent run.
# Indexing [0]/[1] would keep reporting the first run once the cells above are
# re-executed, because inf_time only ever grows.
print('Conversion time : {}, Inference time {}'.format(round(inf_time[-2],2), round(inf_time[-1],2)))

Conversion time : 0.96, Inference time 3.91


In [38]:
from tensorflow.keras.datasets import mnist

# Load the raw MNIST splits again under the x_/y_ naming and flatten the
# ((x_train, y_train), (x_test, y_test)) pairs into four arrays.
x_train, y_train, x_test, y_test = (
    array for split in mnist.load_data() for array in split
)

In [40]:
# Inspect the dtype of the MNIST training images as returned by load_data()
# (no normalization has been applied to x_train).
x_train.dtype

dtype('uint8')