<a href="https://colab.research.google.com/github/willismax/ML-in-Production-30-days-sharing/blob/main/notebook/20.%E8%A8%93%E7%B7%B4%E5%BE%8C%E9%87%8F%E5%8C%96_TensorFolw_Lite_Quantization_%E9%90%B5%E4%BA%BA%E8%B3%BD%E7%A4%BA%E7%AF%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TensorFlow Lite Quantization

- 此為鐵人賽系列文示範文件，參考[TensorFlow Lite官方範例](https://www.tensorflow.org/lite/performance/post_training_quantization)修改而成。
- TF Lite 評估函數參考[來源](https://www.tensorflow.org/lite/performance/post_training_integer_quant_16x8)。

In [1]:
# Result registries keyed by model variant; the cells below fill them in.
MODEL_SIZE = dict()
ACCURACY = dict()

In [2]:
import tensorflow as tf
import numpy as np
import os

## 建立基本模型

- 模型採用`tf.keras.datasets.mnist`，用CNN進行建模。

In [3]:
# Fetch the MNIST train/test splits through the Keras dataset helper.
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Divide both image sets by 255 so each pixel value lies between 0 and 1.
train_images, test_images = train_images / 255.0, test_images / 255.0

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [4]:
def model_builder():
  """Build the small, uncompiled CNN classifier used throughout this notebook.

  The network takes 28x28 inputs, reshapes them to 28x28x1, applies one
  3x3 convolution with 12 filters (ReLU), 2x2 max-pooling, flattening and
  a 10-way softmax dense layer.

  Returns:
    A new `tf.keras.Sequential` model. The caller is responsible for
    compiling it.
  """
  layers = tf.keras.layers

  layer_stack = [
      layers.InputLayer(input_shape=(28, 28)),
      layers.Reshape(target_shape=(28, 28, 1)),
      layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
      layers.MaxPooling2D(pool_size=(2, 2)),
      layers.Flatten(),
      layers.Dense(10, activation='softmax'),
  ]

  return tf.keras.Sequential(layer_stack)

In [5]:
# Build and compile the float baseline classifier.
baseline_model = model_builder()
baseline_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

baseline_model.summary()

# One pass over the training split, with per-epoch shuffling disabled.
baseline_model.fit(train_images, train_labels, epochs=1, shuffle=False)

# Save the weights only AFTER training so that 'baseline_weights.h5' holds
# the trained baseline. The quantization-aware section reloads this file as
# its starting point; saving before `fit` would store the random initial
# weights instead.
baseline_model.save_weights('baseline_weights.h5')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 12)        120       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 12)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 2028)              0         
                                                                 
 dense (Dense)               (None, 10)                20290     
                                                                 
Total params: 20,410
Trainable params: 20,410
Non-trainable params: 0
____________________________________________________

<keras.callbacks.History at 0x7f45c2badfd0>

In [6]:
# Persist the un-quantized Keras model (without optimizer state) and
# record the size of the resulting file in bytes.
baseline_model.save('non_quantized.h5', include_optimizer=False)
MODEL_SIZE['baseline h5'] = os.stat('non_quantized.h5').st_size

# Evaluate on the test split; `evaluate` returns [loss, accuracy] for this
# model, and only the accuracy (index 1) is recorded.
ACCURACY['baseline Keras model'] = baseline_model.evaluate(test_images, test_labels)[1]




In [7]:
# Display the accuracies recorded so far.
ACCURACY

{'baseline Keras model': 0.9617999792098999}

In [8]:
# Display the model file sizes (bytes) recorded so far.
MODEL_SIZE

{'baseline h5': 99144}

## 轉為 TF Lite 格式

- 轉為 TF Lite 使用的 `*.tflite`格式。

In [9]:
# Convert the trained Keras model to a TF Lite flatbuffer with the
# converter's default settings (no optimizations requested).
converter = tf.lite.TFLiteConverter.from_keras_model(baseline_model)
tflite_model = converter.convert()

# `convert()` returns the serialized model as bytes; write it out as-is.
with open('non_quantized.tflite', mode='wb') as f:
    f.write(tflite_model)

INFO:tensorflow:Assets written to: /tmp/tmpgr7ifpaq/assets




- 建立評估 TF Lite 模型準確率的函數；模型轉為 tflite 格式後需透過 `tf.lite.Interpreter` 逐筆推論，因此要另外撰寫評估函數，參考[官方範例](https://www.tensorflow.org/lite/performance/post_training_integer_quant_16x8#evaluate_the_models)。

In [10]:
# Helper that scores a TF Lite model file against the notebook's test split.
# Adapted from: https://www.tensorflow.org/lite/performance/post_training_integer_quant_16x8#evaluate_the_models
def evaluate_model(filemane):
  """Return the accuracy of a TF Lite model file on the test dataset.

  Reads the module-level `test_images` and `test_labels`.

  Args:
    filemane: Path of the `.tflite` file to load (converted with `str()`).

  Returns:
    The fraction of test images whose arg-max prediction equals the
    corresponding entry of `test_labels`, as a float.
  """
  # Load the model into an interpreter and allocate its tensors.
  interpreter = tf.lite.Interpreter(model_path=str(filemane))
  interpreter.allocate_tensors()

  input_index = interpreter.get_input_details()[0]["index"]
  output_index = interpreter.get_output_details()[0]["index"]

  def predict_digit(image):
    # Add a leading batch dimension and cast to float32 before feeding
    # the image to the interpreter.
    batch = np.expand_dims(image, axis=0).astype(np.float32)
    interpreter.set_tensor(input_index, batch)
    interpreter.invoke()

    # Drop the batch dimension and pick the highest-scoring class.
    scores = interpreter.tensor(output_index)
    return np.argmax(scores()[0])

  # One prediction per image in the test dataset.
  prediction_digits = [predict_digit(image) for image in test_images]

  # Count the predictions that agree with the ground-truth labels.
  accurate_count = sum(
      1
      for position, digit in enumerate(prediction_digits)
      if digit == test_labels[position]
  )

  return accurate_count * 1.0 / len(prediction_digits)

- 準確率與原始 Keras 模型相同（數值差異僅來自浮點數顯示精度），模型大小略降。

In [11]:
# Score the float TF Lite model and show it next to the Keras baseline.
ACCURACY['non quantized tflite'] = evaluate_model('non_quantized.tflite')
ACCURACY

{'baseline Keras model': 0.9617999792098999, 'non quantized tflite': 0.9618}

In [12]:
# Record the float TF Lite file size in bytes and show the table so far.
MODEL_SIZE['non quantized tflite'] = os.stat('non_quantized.tflite').st_size
MODEL_SIZE

{'baseline h5': 99144, 'non quantized tflite': 84728}

## 訓練後量化 Post-Training Quantization

- 本範例示範訓練後量化之動態範圍量化 Dynamic range quantization 。
- 您也可以嘗試全整數量化（int8）或 float16 量化。

In [13]:
# Dynamic range quantization
converter = tf.lite.TFLiteConverter.from_keras_model(baseline_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open('post_training_quantized.tflite', 'wb') as f:
    f.write(tflite_model)

INFO:tensorflow:Assets written to: /tmp/tmpsro84xmf/assets


INFO:tensorflow:Assets written to: /tmp/tmpsro84xmf/assets


- 模型大小下降許多，準確率則維持不變。

In [14]:
# Score the dynamic-range-quantized TF Lite model and show all accuracies.
ACCURACY['post training quantized tflite'] = evaluate_model('post_training_quantized.tflite')
ACCURACY

{'baseline Keras model': 0.9617999792098999,
 'non quantized tflite': 0.9618,
 'post training quantized tflite': 0.9618}

In [15]:
# Record the quantized TF Lite file size in bytes and show the table so far.
MODEL_SIZE['post training quantized tflite'] = os.stat('post_training_quantized.tflite').st_size
MODEL_SIZE

{'baseline h5': 99144,
 'non quantized tflite': 84728,
 'post training quantized tflite': 24112}

## (選用)量化感知訓練 Quantization Aware Training

- 當訓練後量化導致您的準確率下降多到無法接受，可以考慮在量化模型之前進行[量化感知訓練 Quantization Aware Training](https://www.tensorflow.org/model_optimization/guide/quantization/training)。
- 此方法為在訓練期間在模型中插入假量化節點來模擬精度損失，讓模型學會適應精度損失，以獲得更準確的預測。
- 需使用 `tensorflow_model_optimization` 模組，該模組提供 `quantize_model()` 完成任務。
- 調整後再量化可舒緩準確率下降的問題。

In [16]:
!pip install tensorflow_model_optimization

Collecting tensorflow_model_optimization
  Downloading tensorflow_model_optimization-0.7.2-py2.py3-none-any.whl (237 kB)
[?25l[K     |█▍                              | 10 kB 14.8 MB/s eta 0:00:01[K     |██▊                             | 20 kB 8.4 MB/s eta 0:00:01[K     |████▏                           | 30 kB 5.9 MB/s eta 0:00:01[K     |█████▌                          | 40 kB 5.8 MB/s eta 0:00:01[K     |███████                         | 51 kB 3.4 MB/s eta 0:00:01[K     |████████▎                       | 61 kB 4.0 MB/s eta 0:00:01[K     |█████████▋                      | 71 kB 4.4 MB/s eta 0:00:01[K     |███████████                     | 81 kB 4.9 MB/s eta 0:00:01[K     |████████████▍                   | 92 kB 4.7 MB/s eta 0:00:01[K     |█████████████▉                  | 102 kB 4.2 MB/s eta 0:00:01[K     |███████████████▏                | 112 kB 4.2 MB/s eta 0:00:01[K     |████████████████▌               | 122 kB 4.2 MB/s eta 0:00:01[K     |██████████████████ 

- 使用先前初步訓練的 'baseline_weights.h5' 模型權重進行優化。
- 模型會增加一些假量化節點與量化包裝層（如下方摘要中的 `QuantizeLayer`、`QuantizeWrapperV2`）。

In [28]:
import tensorflow_model_optimization as tfmot

# Keras entry point that wraps a model for quantization-aware training.
quantize_model = tfmot.quantization.keras.quantize_model

# Rebuild the same architecture and restore the weights stored in
# 'baseline_weights.h5'.
model_to_quantize = model_builder()
model_to_quantize.load_weights('baseline_weights.h5')

# Wrap the restored model so that training simulates quantization.
q_aware_model = quantize_model(model_to_quantize)

# The wrapped model is a new model object and has to be compiled again.
q_aware_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

q_aware_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer_1 (QuantizeL  (None, 28, 28)           3         
 ayer)                                                           
                                                                 
 quant_reshape_2 (QuantizeWr  (None, 28, 28, 1)        1         
 apperV2)                                                        
                                                                 
 quant_conv2d_2 (QuantizeWra  (None, 26, 26, 12)       147       
 pperV2)                                                         
                                                                 
 quant_max_pooling2d_2 (Quan  (None, 13, 13, 12)       1         
 tizeWrapperV2)                                                  
                                                                 
 quant_flatten_2 (QuantizeWr  (None, 2028)            

In [29]:
# Save the quantization-aware Keras model (without optimizer state) so its
# file size can be recorded later.
# NOTE(review): in document order this runs before the fine-tuning cell
# below, so the saved file holds the pre-fine-tuning weights.
q_aware_model.save('quantization_aware_non-quantized.h5', include_optimizer=False)

- 訓練經過感知訓練的模型，您可以自行調整 epochs。


In [23]:
# Fine-tune the quantization-aware model on the training split for 10 epochs,
# with per-epoch shuffling disabled.
q_aware_model.fit(train_images, train_labels, epochs=10, shuffle=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f45c29fe850>

In [24]:
# Evaluate silently on the test split; `evaluate` returns [loss, accuracy]
# for this model, and only the accuracy (index 1) is recorded.
ACCURACY['quantization aware non-quantized'] = q_aware_model.evaluate(test_images, test_labels, verbose=0)[1]

In [25]:
# Display all accuracies recorded so far, including the quantization-aware model.
ACCURACY

{'baseline Keras model': 0.9617999792098999,
 'non quantized tflite': 0.9618,
 'post training quantized tflite': 0.9618,
 'quantization aware non-quantized': 0.09839999675750732}

In [30]:
# Record the size in bytes of the saved quantization-aware Keras model and
# show the full size table.
MODEL_SIZE['quantization aware non-quantized'] = os.stat('quantization_aware_non-quantized.h5').st_size
MODEL_SIZE

{'baseline h5': 99144,
 'non quantized tflite': 84728,
 'post training quantized tflite': 24112,
 'quantization aware non-quantized': 116472}