In [None]:
import os
import time

import numpy as np
import tensorflow as tf

In [None]:
def base_conv():
    """Assemble the baseline CNN classifier as a Keras ``Sequential`` model.

    The network is three convolutional stages, each made of
    Conv2D -> DepthwiseConv2D -> Conv2D with 3x3 kernels and ReLU
    activations. The first two stages end in 2x2 max pooling and the last
    one in global average pooling. A Dense(128) + Dropout(0.2) +
    Dense(10, softmax) head follows. The first layer declares an input
    shape of (224, 224, 3).

    Returns:
        The uncompiled ``tf.keras.models.Sequential`` model.
    """
    layers = tf.keras.layers
    kernel = (3, 3)
    pool = (2, 2)

    def conv(filters, **extra):
        # Standard 3x3 ReLU convolution; `extra` carries input_shape for the first layer.
        return layers.Conv2D(filters, kernel, activation='relu', **extra)

    def depthwise():
        # 3x3 ReLU depthwise convolution.
        return layers.DepthwiseConv2D(kernel, activation='relu')

    # Layers are instantiated in the same order as they appear in the network.
    stack = [
        # Stage 1
        conv(32, input_shape=(224, 224, 3)),
        depthwise(),
        conv(64),
        layers.MaxPooling2D(pool),
        # Stage 2
        conv(64),
        depthwise(),
        conv(128),
        layers.MaxPooling2D(pool),
        # Stage 3
        conv(128),
        depthwise(),
        conv(256),
        layers.GlobalAveragePooling2D(),
        # Classification head
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(10, activation='softmax'),
    ]
    return tf.keras.models.Sequential(stack)

def create_model():
    """Build, compile, summarize and save the baseline CNN.

    The model comes from ``base_conv()`` and is compiled with the Adam
    optimizer, sparse categorical cross-entropy loss and an accuracy
    metric. No data is loaded and no training happens here: the saved file
    holds the freshly initialized weights.

    Returns:
        The compiled Keras model.
    """
    model = base_conv()

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # Save the (untrained) model. Create the target directory first so the
    # cell also runs from a fresh checkout where "models/" may not exist yet.
    os.makedirs("models", exist_ok=True)
    model.save("models/mnist_cnn.keras")
    print('saved model')
    return model

# Build, compile and save the baseline model when this cell is executed.
create_model()

In [22]:
def conv_1a(input_size):
    """Build the dense head: Dense(128, relu) -> Dropout(0.2) -> Dense(10, softmax).

    Args:
        input_size: Per-sample input shape without the batch dimension,
            e.g. ``(256,)``.

    Returns:
        The built, uncompiled ``tf.keras.models.Sequential`` model.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=input_size),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    # `build` expects an input *shape* including the batch dimension, not a
    # tensor of sample values. `None` leaves the batch size unspecified.
    model.build((None, *input_size))
    return model

# Build the dense model for a 256-element input vector and save it to disk.
model = conv_1a(( 256,))
model.save("models/GPU_CNN_7.keras")



In [19]:
def conv_1b(input_size):
    """Build the convolutional feature extractor (no classification head).

    The network is three stages of Conv2D -> DepthwiseConv2D -> Conv2D with
    3x3 kernels and ReLU activations. The first two stages end in 2x2 max
    pooling and the last one in global average pooling.

    Args:
        input_size: Per-sample input shape without the batch dimension,
            e.g. ``(224, 224, 3)``.

    Returns:
        The built, uncompiled ``tf.keras.models.Sequential`` model.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_size),
        tf.keras.layers.DepthwiseConv2D((3, 3), activation='relu'),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),

        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.DepthwiseConv2D((3, 3), activation='relu'),
        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),

        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.DepthwiseConv2D((3, 3), activation='relu'),
        tf.keras.layers.Conv2D(256, (3, 3), activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
    ])
    # `build` expects an input *shape* including the batch dimension, not a
    # tensor of sample values. `None` leaves the batch size unspecified.
    model.build((None, *input_size))
    return model

# Convert one Keras model to a quantized TFLite model and write it to disk.
def convert_to_tflite(model, input_size, n):
    """Quantize ``model`` with the TFLite converter and save the result.

    Calibration uses 100 random-normal samples of shape ``[1, *input_size]``.
    The converted model takes and returns ``uint8`` tensors.

    Args:
        model: Keras model to convert.
        input_size: Per-sample input shape without the batch dimension.
        n: Suffix used in the output file name ``models/TPU_CNN_{n}``.

    Returns:
        The serialized TFLite model as returned by the converter.
    """
    def representative_dataset():
        # Random calibration data: the quantization ranges reflect this
        # distribution, not real inputs.
        for _ in range(100):
            yield [tf.random.normal([1, *input_size])]

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
    converter.inference_input_type = tf.uint8  # or tf.int8
    converter.inference_output_type = tf.uint8  # or tf.int8

    tflite_quant_model = converter.convert()

    # Save the TFLite model to a file. `open` does not create missing parent
    # directories, so make sure "models/" exists first.
    os.makedirs("models", exist_ok=True)
    with open(f"models/TPU_CNN_{n}", "wb") as f:
        f.write(tflite_quant_model)
    print("saved tf lite model")
    return tflite_quant_model

# Build the conv_1b feature extractor for 224x224x3 inputs and convert it to TFLite.
input_size = (224,224,3) 
# NOTE: `model_tflite` first holds the Keras model and is then rebound to the
# converted TFLite model returned by convert_to_tflite (saved as models/TPU_CNN_7).
model_tflite = conv_1b(input_size)
model_tflite = convert_to_tflite(model_tflite, input_size, 7)

INFO:tensorflow:Assets written to: /tmp/tmp2twjcsvc/assets


INFO:tensorflow:Assets written to: /tmp/tmp2twjcsvc/assets
2024-04-23 17:27:50.431773: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:378] Ignored output_format.
2024-04-23 17:27:50.431800: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:381] Ignored drop_control_dependency.
2024-04-23 17:27:50.432012: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmp2twjcsvc
2024-04-23 17:27:50.434751: I tensorflow/cc/saved_model/reader.cc:51] Reading meta graph with tags { serve }
2024-04-23 17:27:50.434769: I tensorflow/cc/saved_model/reader.cc:146] Reading SavedModel debug info (if present) from: /tmp/tmp2twjcsvc
2024-04-23 17:27:50.440685: I tensorflow/cc/saved_model/loader.cc:233] Restoring SavedModel bundle.
2024-04-23 17:27:50.492288: I tensorflow/cc/saved_model/loader.cc:217] Running initialization op on SavedModel bundle at path: /tmp/tmp2twjcsvc
2024-04-23 17:27:50.511111: I tensorflow/cc/saved_model/loader.cc:316] SavedModel

saved tf lite model


fully_quantize: 0, inference_type: 6, input_inference_type: UINT8, output_inference_type: UINT8


In [None]:
# Benchmark Keras inference for the saved GPU_CNN_1..4 models and report the
# wall-clock time per sample for each batch size.
batch_sizes = [1,4,16,64,256,1024]
for i in range(1,5):
    model = tf.keras.models.load_model(f"models/GPU_CNN_{i}.keras")
    for batch_size in batch_sizes:
        # Generate the input before starting the clock so random-number
        # generation is not counted as inference time.
        batch = np.random.rand(batch_size, 224, 224, 3)
        start = time.perf_counter()
        # Pass batch_size explicitly: predict() otherwise splits the input
        # into its default batches of 32, regardless of the size being reported.
        outputs = model.predict(batch, batch_size=batch_size, verbose=0)
        end = time.perf_counter()
        print(f"GPU_CNN_{i} batch size {batch_size} time: {(end-start)*1000/batch_size} ms")


In [None]:

def generate_1b_input():
    """Return one random float32 sample of shape (1, 128), uniform in [0, 1)."""
    sample = np.random.rand(1, 128)
    return sample.astype(np.float32)

def tflite_inference(num_trials):
    """Time single-sample inference of the TFLite model and print the mean latency.

    Loads ``mnist_model_batched.tflite`` and runs it on ``num_trials`` random
    inputs from ``generate_1b_input()``. Each timed trial covers input
    quantization, ``invoke()`` and output dequantization.

    Args:
        num_trials: Number of random inputs to run through the interpreter.
    """
    # Load the TFLite model
    interpreter = tf.lite.Interpreter(model_path="mnist_model_batched.tflite")
    interpreter.allocate_tensors()

    # Get input and output details
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_index = input_details[0]['index']
    output_index = output_details[0]['index']
    input_dtype = input_details[0]['dtype']

    # (scale, zero_point) pairs; a scale of 0 means the tensor is not quantized.
    output_scale, output_zero_point = output_details[0]['quantization']
    input_scale, input_zero_point = input_details[0]["quantization"]

    # Prepare input data
    inference_times = []
    inputs = [generate_1b_input() for _ in range(num_trials)]
    for input_data in inputs:
        start = time.perf_counter()
        if input_scale:
            # Quantize: round to the nearest step and clip to the integer
            # range so out-of-range values saturate instead of wrapping.
            limits = np.iinfo(input_dtype)
            quantized = np.round(input_data / input_scale) + input_zero_point
            input_data = np.clip(quantized, limits.min, limits.max)
        interpreter.set_tensor(input_index, input_data.astype(input_dtype))
        interpreter.invoke()
        output_data = interpreter.get_tensor(output_index)
        if output_scale:
            # Widen to float before subtracting the zero point: doing the
            # subtraction in the integer output dtype wraps around for
            # values below the zero point.
            output_data = output_scale * (output_data.astype(np.float32) - output_zero_point)
        end = time.perf_counter()
        inference_times.append((end - start)*1000)

    avg_inference_time = np.mean(inference_times)
    print("average inference time: ", avg_inference_time, "ms")

# Run 10 timed trials and print the average latency.
tflite_inference(10)