# Quantization on MNIST data set, 2d attempt 

## Working as expected 

#### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

In [2]:
print(tf.__version__)

2.4.0


#### Data

In [3]:
# Using tf MNIST data set this time 
mnist = tf.keras.datasets.mnist
fash = tf.keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the input image so that each pixel value is between 0 to 1.
train_images = train_images.astype(np.float32) / 255.0
test_images = test_images.astype(np.float32) / 255.0

In [4]:
train_images.shape

(60000, 28, 28)

#### Model to be quantized

In [5]:
model2 = tf.keras.Sequential([
  tf.keras.layers.InputLayer(input_shape=(28, 28)),
  tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
  tf.keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
  tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(10)
])

# Train the digit classification model
model2.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(
                  from_logits=True),
              metrics=['accuracy'])
model2.fit(
  train_images,
  train_labels,
  epochs=5,
  validation_data=(test_images, test_labels)
)
eval = model2.evaluate(test_images, test_labels)
print("test loss, test acc:", eval)
tf.saved_model.save(model2, './model2_mnist/')
model2.save('./model2_keras/')

2021-09-13 11:09:55.908224: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-13 11:09:55.908473: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-13 11:09:55.987473: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
test loss, test acc: [0.05982794985175133, 0.9793999791145325]


2021-09-13 11:10:35.598872: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./model2_mnist/assets
INFO:tensorflow:Assets written to: ./model2_keras/assets


## 1. Quantization blueprint

In [30]:
path = './model1_mnist/'
converter = tf.lite.TFLiteConverter.from_saved_model(path)

In [31]:
# Conversion set-up 

converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS  # enable TensorFlow ops.
]

converter.optimizations = [tf.lite.Optimize.DEFAULT]

model = converter.convert()

### No quantization

In [32]:
# Converted model interpreter 

interpreter = tf.lite.Interpreter(model_content=model)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print('Input details : ', input_details)
print('Output details :', output_details)

Input details :  [{'name': 'serving_default_flatten_input:0', 'index': 0, 'shape': array([  1, 784], dtype=int32), 'shape_signature': array([ -1, 784], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
Output details : [{'name': 'StatefulPartitionedCall:0', 'index': 12, 'shape': array([ 1, 10], dtype=int32), 'shape_signature': array([-1, 10], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


Since the input shape is set to 1,784 ( from the original TF model) we resize the input tensor before allocating it again.   
Then we can load the whole test set to produce a production result that we can interpret thanks to argmax.

In [33]:
test_images = test_images.reshape(10000,784)
interpreter.resize_tensor_input(input_index = input_details[0]['index'],tensor_size=(10000,784))
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], test_images)

interpreter.invoke()
tflite_predictions = interpreter.get_tensor(output_details[0]['index'])

Here we get 98% prediction, which would be the precision o the noraml model, but no quantization was actually done : the datatype is till float32

In [34]:
pred = np.argmax(tflite_predictions, axis=1)
print('Accuracy : ', (pred == test_labels).mean() )

Accuracy :  0.9822


## 2. Post-training integer quantization

Now we will reproduce the steps, but with the addition of the quantization steps

In [35]:
def representative_data_gen():
  for input_value in tf.data.Dataset.from_tensor_slices(train_images).batch(1).take(1000):
    yield [input_value]

converter = tf.lite.TFLiteConverter.from_saved_model(path)

converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_INT8#,  
    # tf.lite.OpsSet.TFLITE_BUILTINS,
    # tf.lite.OpsSet.SELECT_TF_OPS  
]
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8

quant_model = converter.convert()

interpreter = tf.lite.Interpreter(model_content=quant_model)



In [36]:
input_type = interpreter.get_input_details()[0]['dtype']
print('input: ', input_type)
output_type = interpreter.get_output_details()[0]['dtype']
print('output: ', output_type)

input:  <class 'numpy.uint8'>
output:  <class 'numpy.uint8'>


In [37]:
# To replace in the interpreter.set_tensor line for single inference : 

# test_images[10] = np.expand_dims(test_images[10], axis=0).astype(input_type)
# test_images[10] = test_images[10] / input_scale + input_zero_point
# dici = test_images[10].astype(input_type)

In [38]:
# Interpreter
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_scale, input_zero_point = input_details[0]["quantization"]

#whole set 
interpreter.resize_tensor_input(input_index = input_details[0]['index'],tensor_size=(10000,784))
interpreter.allocate_tensors()

test_images_q = test_images / input_scale + input_zero_point
test_images_q = np.reshape(test_images_q.astype(input_type), (10000,784))

interpreter.set_tensor(input_details[0]['index'], test_images_q)

In [39]:
# Invoke
interpreter.invoke()
tflite_predictions2 = interpreter.get_tensor(output_details[0]['index'])
#tflite_predictions2

In [14]:
pred2 = np.argmax(tflite_predictions2, axis=1)
print('Accuracy : ', (pred2 == test_labels).mean())
print('Predictions similarity noquant vs. quant : ', (pred2 == pred).mean())

Accuracy :  0.9821
Predictions similarity noquant vs. quant :  0.9993


In [15]:
# import matplotlib.pyplot as plt
# #plt.imshow(np.reshape(test_images_int8[97], (28,28)))
# plt.show()
# plt.imshow(np.reshape(test_images[10], (28,28)))
# type(test_images)

## 3. Automated/Factored quantization

#### Data

In [16]:
mnist = tf.keras.datasets.mnist
fash = tf.keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Very important step
train_images_norm = train_images.astype(np.float32) / 255.0
test_images_norm = test_images.astype(np.float32) / 255.0

#### Model 
Here I can load any presaved model or train a new one

In [17]:
# Careful on which dataset model2 (and 1) are trained
saved_model1 = './model1_mnist/'
saved_model2 = './model2_mnist/'

#### Functions

In [18]:
def representative_data_gen():
    for input_value in tf.data.Dataset.from_tensor_slices(train_images_norm).batch(1).take(1000):
        yield [input_value]

def conv(model_path):
    # TODO set option to read it from saved model or from existing model
    converter1 = tf.lite.TFLiteConverter.from_saved_model(model_path)
    converter1.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter1.optimizations = [tf.lite.Optimize.DEFAULT]
    converter1.representative_dataset = representative_data_gen
    converter1.inference_input_type = tf.uint8
    converter1.inference_output_type = tf.uint8

    quant_model = converter1.convert()
    return(quant_model)


def interpret(model, test_set):
    interpreter = tf.lite.Interpreter(model_content=model)

    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    input_type = interpreter.get_input_details()[0]['dtype']
    # Quantization parameters : 
    input_scale, input_zero_point = input_details[0]["quantization"]

    # whole set, currently only supports test_set as an array
    interpreter.resize_tensor_input(input_index=input_details[0]['index'], tensor_size=np.shape(test_set))
    interpreter.allocate_tensors()

    # 8bit quantization approximation
    test_images_q = test_set / input_scale + input_zero_point
    test_images_q = np.reshape(test_images_q.astype(input_type), np.shape(test_set)) # wordy line

    # Loading into the tensor
    interpreter.set_tensor(input_details[0]['index'], test_images_q)

    return interpreter

def run_inference(interpreter):
    output_details = interpreter.get_output_details()
    interpreter.invoke()
    inference = interpreter.get_tensor(output_details[0]['index'])
    predictions = np.argmax(inference, axis=1)
    print('Quantized model accuracy : ', (predictions == test_labels).mean())

    return predictions

#### Execution 

In [40]:
# possible TODO : make it only in a single function, with (saved_model, test_images) params
# + the dataset as a param
# + 
quanted_model = conv(saved_model2)
interpreter = interpret(quanted_model, test_images_norm)
run_inference(interpreter)

print('Original model accuracy : {}'.format(round(eval[1],2)))



Quantized model accuracy :  0.9795
Original model accuracy : 0.98


### With model2 :
on mnist   : accuracy 0.98
on fashion : accuracy 0.89