# This is designed for inference calculation only

In [1]:
import logging
logging.getLogger("tensorflow").setLevel(logging.DEBUG)

import tensorflow as tf
import numpy as np
from collections import OrderedDict
import netron

# Data preparation mnist

In [2]:
# Load the MNIST dataset shipped with Keras: a training split and an evaluation ("eva") split.
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (eva_images, eva_labels) = mnist.load_data()

# Scale the images to float32 by dividing by 255
# (yields values in [0, 1] assuming the raw pixels are 0..255 -- TODO confirm).
train_images_float32_28x28 = train_images.astype(np.float32) / 255.0
eva_images_float32_28x28 = eva_images.astype(np.float32) / 255.0

# Shift the raw pixel values down by 128 and cast to int8
# (maps 0..255 onto -128..127 under the same assumption).
train_images_int8 = np.int8(train_images.astype(np.float32) - 128.0)
eva_images_int8 = np.int8(eva_images.astype(np.float32) - 128.0)

# Image side length (images are assumed square) and the flattened input length.
image_size = train_images_float32_28x28.shape[1]
input_size = image_size * image_size
print(f'image_size: {image_size}, input_size: {input_size}')

# Flatten every image to a vector of length input_size so it fits the dense models below.
train_images_float32 = np.reshape(train_images_float32_28x28, [-1, input_size])
eva_images_float32 = np.reshape(eva_images_float32_28x28, [-1, input_size])

print(train_images_float32.shape)
print(eva_images_float32.shape)

image_size: 28, input_size: 784
(60000, 784)
(10000, 784)


# Manual calculation of Inference for quantized nn

In [4]:
class Module:
    """Ordered container of sub-modules that are applied one after another."""

    def __init__(self):
        # name -> sub-module, kept in registration order
        self.modules = OrderedDict()

    def add_module(self, module, name:str):
        """Store ``module`` under ``name``; an existing entry with that name is overwritten."""
        self.modules[name] = module

    def forward(self, input) -> np.ndarray:
        """Pass ``input`` through every registered sub-module in registration order.

        Returns the output of the last sub-module, or ``input`` unchanged when
        no sub-module has been registered.
        """
        result = input
        for layer in self.modules.values():
            result = layer.forward(result)
        return result

In [5]:
#------------------------------------------------------------------------------
#   FullyConnected class
#------------------------------------------------------------------------------
class FullyConnected(Module):
    """Fully connected layer evaluated on quantized integers.

    For input row ``i`` and output column ``k`` the layer evaluates

        z_o + m * (sum_j(q_in[i, j] * W[j, k]) - z_i * sum_j(W[j, k]) + b[k])

    with ``m = s_i * s_w / s_o``, rounds the result with ``np.round`` and
    saturates it to ``[min_T, max_T]``.

    Args:
        w: quantized weight matrix, indexed as ``w[input, output]``.
        b: quantized bias, one value per output column.
        s_w, s_i, s_o: scales of the weights, the layer input and the layer output.
        z_i, z_o: zero-points of the layer input and the layer output.
        min_T, max_T: saturation limits applied to the output values.
    """

    def __init__(self, w, b, s_w, s_i, s_o, z_i, z_o, min_T, max_T):
        super(FullyConnected, self).__init__()
        self.W = w
        self.b = b
        self.z_i = z_i
        self.z_o = z_o
        # Combined rescaling factor from the accumulator scale to the output scale.
        self.m = s_i * s_w / s_o
        # Scale of the bias / accumulator.
        self.s_b = s_i * s_w
        self.min_T = min_T
        self.max_T = max_T

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Run the layer on a batch of quantized inputs.

        Args:
            input: 2-D array ``(batch, inputs)`` of quantized activations.

        Returns:
            ``np.int8`` array ``(batch, outputs)`` saturated to ``[min_T, max_T]``.

        Raises:
            ValueError: if the input width does not match the number of weight rows.
        """
        activations = np.asarray(input).astype(np.int32)
        weights = np.asarray(self.W).astype(np.int32)

        if activations.shape[1] != weights.shape[0]:
            raise ValueError(
                f'input width {activations.shape[1]} does not match weight rows {weights.shape[0]}'
            )

        # int32 accumulation of q_in @ W, same arithmetic as the element-wise loops it replaces.
        accumulator = activations @ weights

        # The column sums of W depend only on the weights. They are computed once
        # per call instead of being accumulated again for every batch row, which
        # inflated the zero-point correction for every row after the first.
        weight_col_sums = weights.sum(axis=0, dtype=np.int32)

        rescaled = np.round(
            np.int32(self.z_o)
            + self.m * (-(np.int32(self.z_i) * weight_col_sums) + accumulator + self.b)
        )

        # Saturate to the target integer range before the narrowing cast.
        return np.clip(rescaled, self.min_T, self.max_T).astype(np.int8)

#------------------------------------------------------------------------------
#   Quantize class
#------------------------------------------------------------------------------
class Quantize(Module):
    """Re-centres 8-bit data between the signed and the unsigned representation.

    Args:
        s: scale; stored but not used by ``forward``.
        z_i, z_o: input / output zero-points; stored but not used by ``forward``.
        d_type: dtype of the data fed to ``forward``: ``np.int8`` (converted to
            ``np.uint8``) or ``np.uint8`` (converted to ``np.int8``).
    """

    def __init__(self, s, z_i, z_o, d_type):
        super(Quantize, self).__init__()
        self.z_i = z_i
        self.z_o = z_o
        self.s = s
        self.d_type = d_type

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Shift ``input`` by 128 and cast it to the opposite signedness.

        Raises:
            ValueError: if ``d_type`` is neither ``np.int8`` nor ``np.uint8``.
        """
        # The offset is an explicit int16 scalar so the sum is computed in a wider
        # type. A bare Python ``128`` cannot be represented as int8, which is an
        # error under NumPy 2 promotion rules.
        offset = np.int16(128)

        if self.d_type is np.int8:
            arr_q = (input + offset).astype(np.uint8)
        elif self.d_type is np.uint8:
            arr_q = (input - offset).astype(np.int8)
        else:
            # Report the value that was actually checked (d_type), not the array dtype.
            raise ValueError(f'input type is not supported: {self.d_type}')

        return arr_q


# Calculations

In [6]:
# Calculation of a, b parameters

def calculation_a_b(input_array):
    """Return the smallest and the largest value of ``input_array`` as ``(min, max)``."""
    lowest = np.min(input_array)
    highest = np.max(input_array)
    return lowest, highest

############################################################################################################

# Weight quantization

def weight_scaling_factor(a, b, min_T, max_T):
    """Choose the weight scale and the matching real-valued range.

    Two candidate scales are formed, ``a / min_T`` and ``b / max_T``. The larger
    one is used, and the range limit on the other side is stretched to that scale.

    Returns:
        ``(scale, range_min, range_max)``.
    """
    scale_from_min = a / min_T
    scale_from_max = b / max_T

    if scale_from_min > scale_from_max:
        # The lower limit dictates the scale; stretch the upper limit.
        return scale_from_min, a, max_T * scale_from_min

    # The upper limit dictates the scale (also taken on a tie); stretch the lower limit.
    return scale_from_max, min_T * scale_from_max, b

def clamp(r,a,b):
    """Limit ``r`` to the interval ``[a, b]``."""
    at_least_a = max(r, a)
    return min(at_least_a, b)

def weight_quan(r, a, b, min_T, s):
    """Quantize a single weight.

    Args:
        r: real-valued weight.
        a, b: lower / upper limit of the representable real range; ``r`` is
            clamped to ``[a, b]`` first.
        min_T: quantized value that corresponds to ``a``.
        s: scale (real units per quantization step).

    Returns:
        The quantized value ``round((clamp(r) - a) / s) + min_T``.
    """
    # Same clamping as clamp(r, a, b), written inline. The de-quantized value and
    # the zero-point that used to be computed here were never used or returned.
    clamped = min(max(r, a), b)
    return np.round((clamped - a) / s) + min_T

def weight_arr_quan(input_arr, min_T, max_T):
    """Quantize a 2-D weight matrix element by element.

    Returns:
        ``(scale, 0, quantized)`` where ``quantized`` is an ``np.int8`` array
        holding the quantized weights in TRANSPOSED layout, i.e.
        ``quantized[j][i]`` corresponds to ``input_arr[i][j]``. The middle
        element is the constant zero-point 0.
    """
    lowest, highest = calculation_a_b(input_arr)
    scale, lowest, highest = weight_scaling_factor(lowest, highest, min_T, max_T)

    # Transposed view of a zero buffer: written as [col][row] below.
    quantized_t = np.zeros(input_arr.shape, dtype=np.int8).T

    n_rows = input_arr.shape[0]
    n_cols = input_arr.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            quantized_t[col][row] = weight_quan(input_arr[row][col], lowest, highest, min_T, scale)

    return scale, 0, quantized_t

############################################################################################################

# Activation quantization

def activation_scaling_factor(a, b, n):
    """Return the step size that spreads the range ``[a, b]`` over ``n`` levels."""
    span = b - a
    steps = n - 1
    return span / steps

def activation_scale_zero_point(input_array, n):
    """Derive scale and zero-point of an activation tensor for ``n`` levels.

    The scale is ``(max - min) / (n - 1)`` over all values of ``input_array``.
    The zero-point is the quantized position of the real value 0, shifted by
    ``n / 2``. Both values are printed and returned as ``(s, z)``.
    """
    lowest = np.min(input_array)
    highest = np.max(input_array)
    s = (highest - lowest) / (n - 1)

    # Quantized position of the real value 0.
    zero_position = np.round((0 - lowest) / s) - n/2
    z = zero_position - (0 / s)

    print(f's: {s} \nz: {z}')

    return s, z

def activation_quan(r, a, b, n, s, z):
    """Quantize one activation value and print the result.

    ``r`` is limited to ``[a, b]``, mapped to ``round((r - a) / s) - n / 2`` and
    returned. The de-quantized value ``s * (q - z)`` is printed for inspection only.
    """
    half_levels = n/2
    bounded = min(max(r, a), b)
    q_value = np.round((bounded - a) / s) - half_levels

    print(f'q_value: {q_value}')

    # Reconstruct the real value from the quantized one (printed only).
    r = s * (q_value - z)
    print(f'r: {r}')

    return q_value

############################################################################################################

# Bias quantization

def bias_quan(r, s_w, s_i):
    """Quantize one bias value with the scale ``s_i * s_w``."""
    bias_scale = s_i * s_w
    return np.round(r / bias_scale)

def bias_arr_quan(arr, s_w, s_i):
    """Quantize a 1-D bias vector.

    Returns:
        ``(scale, 0, quantized)`` with ``scale = s_w * s_i``, the constant
        zero-point 0 and the quantized biases as an ``np.int32`` array.
    """
    quantized = []
    for value in arr:
        # Same per-element formula as bias_quan.
        quantized.append(np.round(value / (s_i * s_w)))

    return s_w * s_i, 0, np.array(quantized, dtype=np.int32)

# Create model float32 TF

In [7]:
# Define a simple sequential model
def create_model():
    """Build and compile the float model: Dense(20, relu) followed by Dense(10).

    The input width is read from the module-level ``input_size``. The last
    layer has no activation; the loss is configured with ``from_logits=True``.
    """
    hidden_layer = tf.keras.layers.Dense(20, activation='relu', input_dim=input_size)
    output_layer = tf.keras.layers.Dense(10)
    net = tf.keras.Sequential([hidden_layer, output_layer])

    net.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return net

# Create a basic model instance
# Instantiate the float model and restore previously saved weights from disk.
model = create_model()
model.load_weights(r'models/mnist_float_nn_tf/mnist_float_weights')

# Second model sharing the same input that stops after the first layer,
# used to observe the first layer's output.
intermediate_model = tf.keras.Model(inputs=model.input, outputs=model.layers[0].output)

# First-layer outputs for the whole training set (used below to derive activation scales).
intermediate_output_float32 = intermediate_model.predict(train_images_float32)

# Final model outputs for the whole training set (used below to derive the output scale).
outputs_float32 = model.predict(train_images_float32)



# Create model int8

In [8]:
# Layer 1: weights quantized with limits [-127, 127], input statistics from the
# training images with 256 levels, bias quantized with scale s_w * s_i.
int8_s_w1, int8_z_w1, int8_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -127, 127)
int8_s_i1, int8_z_i1 = activation_scale_zero_point(train_images_float32, 256)
int8_s_b1, int8_z_b1, int8_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int8_s_w1, int8_s_i1)

# Layer 2: same procedure; its input statistics come from the float first-layer outputs.
int8_s_w2, int8_z_w2, int8_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -127, 127)
int8_s_i2, int8_z_i2 = activation_scale_zero_point(intermediate_output_float32, 256)
int8_s_b2, int8_z_b2, int8_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int8_s_w2, int8_s_i2)

# The output of layer 1 is the input of layer 2, so both share scale and zero-point.
int8_s_o1, int8_z_o1 = int8_s_i2, int8_z_i2

# Output scale / zero-point of layer 2 from the float model outputs.
int8_s_o2, int8_z_o2 = activation_scale_zero_point(outputs_float32, 256)


# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-128, 127].
model_int8 = Module()
model_int8.add_module(FullyConnected(int8_w1.T, int8_b1, int8_s_w1, int8_s_i1, int8_s_o1, int8_z_i1, int8_z_o1, -128, 127), 'l1')
model_int8.add_module(FullyConnected(int8_w2.T, int8_b2, int8_s_w2, int8_s_i2, int8_s_o2, int8_z_i2, int8_z_o2, -128, 127), 'l2')

s: 0.00392156862745098 
z: -128.0
s: 0.09114260206035539 
z: -128.0
s: 0.26033223470052086 
z: 8.0


# Create model int4

In [9]:
# Layer 1: weights quantized with limits [-7, 7], input statistics from the
# training images with 16 levels, bias quantized with scale s_w * s_i.
int4_s_w1, int4_z_w1, int4_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -7, 7)
int4_s_i1, int4_z_i1 = activation_scale_zero_point(train_images_float32, 16)
int4_s_b1, int4_z_b1, int4_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int4_s_w1, int4_s_i1)

# Layer 2: same procedure; its input statistics come from the float first-layer outputs.
int4_s_w2, int4_z_w2, int4_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -7, 7)
int4_s_i2, int4_z_i2 = activation_scale_zero_point(intermediate_output_float32, 16)
int4_s_b2, int4_z_b2, int4_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int4_s_w2, int4_s_i2)

# The output of layer 1 is the input of layer 2, so both share scale and zero-point.
int4_s_o1, int4_z_o1 = int4_s_i2, int4_z_i2

# Output scale / zero-point of layer 2 from the float model outputs.
int4_s_o2, int4_z_o2 = activation_scale_zero_point(outputs_float32, 16)


# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-8, 7].
model_int4 = Module()
model_int4.add_module(FullyConnected(int4_w1.T, int4_b1, int4_s_w1, int4_s_i1, int4_s_o1, int4_z_i1, int4_z_o1, -8, 7), 'l1')
model_int4.add_module(FullyConnected(int4_w2.T, int4_b2, int4_s_w2, int4_s_i2, int4_s_o2, int4_z_i2, int4_z_o2, -8, 7), 'l2')

# Sanity check of the weight layout handed to the first layer.
print(int4_w1.T.shape)


s: 0.06666666666666667 
z: -8.0
s: 1.5494242350260417 
z: -8.0
s: 4.425647989908854 
z: 0.0
(784, 20)


# Create model int2

In [10]:
# Layer 1: weights quantized with limits [-1, 1], input statistics from the
# training images with 4 levels, bias quantized with scale s_w * s_i.
int2_s_w1, int2_z_w1, int2_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -1, 1)
int2_s_i1, int2_z_i1 = activation_scale_zero_point(train_images_float32, 4)
int2_s_b1, int2_z_b1, int2_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int2_s_w1, int2_s_i1)

# Layer 2: same procedure; its input statistics come from the float first-layer outputs.
int2_s_w2, int2_z_w2, int2_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -1, 1)
int2_s_i2, int2_z_i2 = activation_scale_zero_point(intermediate_output_float32, 4)
int2_s_b2, int2_z_b2, int2_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int2_s_w2, int2_s_i2)

# The output of layer 1 is the input of layer 2, so both share scale and zero-point.
int2_s_o1, int2_z_o1 = int2_s_i2, int2_z_i2

# Output scale / zero-point of layer 2 from the float model outputs.
int2_s_o2, int2_z_o2 = activation_scale_zero_point(outputs_float32, 4)


# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-2, 1].
model_int2 = Module()
model_int2.add_module(FullyConnected(int2_w1.T, int2_b1, int2_s_w1, int2_s_i1, int2_s_o1, int2_z_i1, int2_z_o1, -2, 1), 'l1')
model_int2.add_module(FullyConnected(int2_w2.T, int2_b2, int2_s_w2, int2_s_i2, int2_s_o2, int2_z_i2, int2_z_o2, -2, 1), 'l2')

s: 0.3333333333333333 
z: -2.0
s: 7.747121175130208 
z: -2.0
s: 22.12823994954427 
z: 0.0


# Compare TFLite_int8, int8, int4 and int2

In [11]:
# Initialize the interpreter
import pathlib

# Build the model path from its components instead of a backslash string:
# '\m' is not a valid escape sequence, and a backslash-separated path only
# resolves on Windows.
tflite_file = pathlib.Path('models') / 'mnist_int8_tflite_model.tflite'

# Load the TFLite model and allocate its tensors so it can be invoked.
interpreter = tf.lite.Interpreter(model_path=str(tflite_file))
interpreter.allocate_tensors()

def run_tflite_model(interpreter, input):
    """Run one float sample through an allocated TFLite interpreter.

    The sample is converted with the scale / zero-point reported for the
    model's first input, given a leading batch axis, cast to the input dtype,
    fed to the interpreter, and the first output tensor is returned.
    """
    input_info = interpreter.get_input_details()[0]
    output_info = interpreter.get_output_details()[0]

    scale, zero_point = input_info["quantization"]
    quantized = input / scale + zero_point

    # Add the batch axis first, then cast to the dtype the model expects.
    batch = np.expand_dims(quantized, axis=0).astype(input_info["dtype"])

    interpreter.set_tensor(input_info["index"], batch)
    interpreter.invoke()

    return interpreter.get_tensor(output_info["index"])

In [12]:
def _quantize_input(image, scale, zero_point):
    """Map a float image onto the integer grid and add a leading batch axis of size 1."""
    # NOTE(review): astype(np.int8) truncates toward zero, whereas activation_quan
    # rounds to nearest -- confirm that truncation is intended for the model inputs.
    return np.expand_dims(image / scale + zero_point, axis=0).astype(np.int8)


def evaluate_models(model_int8, model_int4, model_int2, model_int8_tf_path, eva_images_float32, eva_labels, path_to_save):
    """Compare the hand-written int8/int4/int2 models with a TFLite int8 model.

    Every evaluation image is run through the three manual models and through
    the TFLite model loaded from ``model_int8_tf_path``. The function counts
    correct predictions, records the indices of misclassified images, sums the
    absolute difference between the manual int8 output and the TFLite output,
    saves these arrays as ``.npy`` files under ``path_to_save`` and prints a
    summary.

    The input scales / zero-points (``int8_s_i1``, ``int8_z_i1``, ``int4_*``,
    ``int2_*``) and the float ``model`` are read from module-level globals.

    Args:
        model_int8, model_int4, model_int2: objects exposing ``forward(batch)``.
        model_int8_tf_path: path of the ``.tflite`` file.
        eva_images_float32: float images, one flattened image per row.
        eva_labels: integer labels aligned with the images.
        path_to_save: existing directory (string) for the ``.npy`` result files.
    """
    num_images = eva_images_float32.shape[0]

    manual_models = {'int8': model_int8, 'int4': model_int4, 'int2': model_int2}
    input_params = {
        'int8': (int8_s_i1, int8_z_i1),
        'int4': (int4_s_i1, int4_z_i1),
        'int2': (int2_s_i1, int2_z_i1),
    }
    model_names = ('int8', 'int4', 'int2', 'int8_tf')

    dif = np.array([0])
    correct = {name: 0 for name in model_names}
    # Plain lists: appending is O(1), unlike np.append which copies the array each time.
    mistakes = {name: [] for name in model_names}

    tflite_file = pathlib.Path(model_int8_tf_path)
    interpreter = tf.lite.Interpreter(model_path=str(tflite_file))
    interpreter.allocate_tensors()

    for i in range(num_images):

        if i % 1000 == 0 and i != 0:
            print("done ", i)

        image = eva_images_float32[i]

        outputs = {}
        for name, quantized_model in manual_models.items():
            scale, zero_point = input_params[name]
            outputs[name] = quantized_model.forward(_quantize_input(image, scale, zero_point))
        outputs['int8_tf'] = run_tflite_model(interpreter, image)

        # Widen to int32 before subtracting: an int8 - int8 difference wraps
        # around (e.g. 100 - (-100)), and abs(-128) is not representable in int8.
        dif[0] += np.sum(np.abs(outputs['int8'].astype(np.int32) - outputs['int8_tf'].astype(np.int32)))

        for name in model_names:
            if np.argmax(outputs[name]) == eva_labels[i]:
                correct[name] += 1
            else:
                mistakes[name].append(i)

    np.save(path_to_save + '/array_dif_int8tf_int8_my-weights-scales-zpoints.npy', dif)
    for name in model_names:
        # float64 keeps the on-disk dtype that the np.append-based version produced.
        np.save(path_to_save + f'/mistakes_{name}.npy', np.array(mistakes[name], dtype=np.float64))

    # Average absolute output difference per image, and per output value
    # (the divisor 10 is the number of outputs of the final Dense(10) layer).
    mistake_output = np.sum(dif) / num_images
    mistake_value = mistake_output / 10

    print(f'Mistake in output: {mistake_output}')
    print(f'Mistake in value: {mistake_value}\n')

    print('Accuracy float32: ', model.evaluate(eva_images_float32, eva_labels)[1])
    print('Accuracy int8 tf: ', correct['int8_tf'] / num_images)
    print('Accuracy int8   : ', correct['int8'] / num_images)
    print('Accuracy int4   : ', correct['int4'] / num_images)
    print('Accuracy int2   : ', correct['int2'] / num_images)

In [13]:
# Evaluate the float32 Keras model on the evaluation split; with the compile
# settings used above the result is [loss, accuracy].
model.evaluate(eva_images_float32, eva_labels)



[0.1665320098400116, 0.9531999826431274]

In [14]:
# dif = np.load('mistakes_layer2_neurons20-10/array_dif_int8tf_int8my-weights-scales-zpoints.npy')
# mistake_output = np.sum(dif) / eva_images_float32.shape[0]
# mistake_value = mistake_output / 10
#
# print(f'Mistake in output: {mistake_output}')
# print(f'Mistake in value: {mistake_value}')

# Bigger nn model float32 100-100-100-10

In [15]:
# Define a simple sequential model
def create_model():
    """Build and compile the larger float model: three Dense(100, relu) layers and Dense(10).

    The input width is read from the module-level ``input_size``. The last
    layer has no activation; the loss is configured with ``from_logits=True``.
    """
    first_hidden = tf.keras.layers.Dense(100, activation='relu', input_dim=input_size)
    second_hidden = tf.keras.layers.Dense(100, activation='relu')
    third_hidden = tf.keras.layers.Dense(100, activation='relu')
    output_layer = tf.keras.layers.Dense(10)
    net = tf.keras.Sequential([first_hidden, second_hidden, third_hidden, output_layer])

    net.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return net

# Create a basic model instance

# Train the larger float model from scratch for 10 epochs (batch size 64),
# reporting metrics on the evaluation split after every epoch.
model = create_model()
model.fit(
    train_images_float32,
    train_labels,
    epochs=10,
    batch_size=64,
    validation_data=(eva_images_float32, eva_labels)
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [None]:
# Make sure the output directory for all saved models exists.
tflite_models_dir = pathlib.Path("models")
tflite_models_dir.mkdir(exist_ok=True, parents=True)

# NOTE(review): float_weights_path is not used in this cell -- confirm whether another cell reads it.
float_weights_path = r"./models/mnist_float_nn_tf_layers4_neurons_100-100-100-10/mnist_float_weights"

# Save the float weights only if no ".index" file exists yet for this weights prefix,
# so an existing save is not overwritten.
float_weights_model_file = tflite_models_dir / "mnist_float_nn_tf_layers4_neurons_100-100-100-10/mnist_float_weights"
float_weights_model_file_index = tflite_models_dir / "mnist_float_nn_tf_layers4_neurons_100-100-100-10/mnist_float_weights.index"
if not float_weights_model_file_index.is_file():
    model.save_weights(float_weights_model_file)
    print("Float weights saved to: ", float_weights_model_file)

In [None]:
# Recreate the model and restore the weights from the path prepared in the previous cell.
model = create_model()
model.load_weights(float_weights_model_file)

In [None]:
# Define a new model that outputs the intermediate layer
# One truncated model per hidden layer: each shares the input of `model`
# and stops at the output of layer 0, 1 or 2 respectively.
intermediate_model_1 = tf.keras.Model(inputs=model.input, outputs=model.layers[0].output)
intermediate_model_2 = tf.keras.Model(inputs=model.input, outputs=model.layers[1].output)
intermediate_model_3 = tf.keras.Model(inputs=model.input, outputs=model.layers[2].output)

# Hidden-layer outputs for the whole training set (used below to derive activation scales).
intermediate_output_1_float32 = intermediate_model_1.predict(train_images_float32)
intermediate_output_2_float32 = intermediate_model_2.predict(train_images_float32)
intermediate_output_3_float32 = intermediate_model_3.predict(train_images_float32)

# Final model outputs for the whole training set (used below to derive the output scale).
outputs_float32 = model.predict(train_images_float32)

# Bigger nn model int8 tf

In [None]:
def representative_data_gen():
    """Yield every training image wrapped in a one-element list, in dataset order."""
    # NOTE(review): each sample is yielded without a batch axis -- confirm the
    # converter accepts this shape for calibration.
    yield from ([sample] for sample in train_images_float32)


# Convert the Keras model to TFLite with default optimizations, using the
# generator above as the representative dataset.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen

# Ensure that if any ops can't be quantized, the converter throws an error
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]

# Set the input and output tensors to int8
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

# Result of the conversion; written to disk with write_bytes in a later cell.
tflite_model_quant = converter.convert()

In [None]:
# Make sure the output directory for all saved models exists.
tflite_models_dir = pathlib.Path("models")
tflite_models_dir.mkdir(exist_ok=True, parents=True)

# Same location as int8_tflite_model_file below, as a string without the ".tflite" suffix.
model_path = r"models/mnist_int8_tflite_model_layers4_neurons100-100-100-10"

# Write the converted model only if the file does not exist yet,
# so an existing file is not overwritten.
int8_tflite_model_file = tflite_models_dir / "mnist_int8_tflite_model_layers4_neurons100-100-100-10.tflite"
if not int8_tflite_model_file.is_file():
    int8_tflite_model_file.write_bytes(tflite_model_quant)
    print("Model saved to: ", int8_tflite_model_file)

# Open the saved model in the netron viewer.
netron.start(model_path + r'.tflite')

In [None]:
# Build the model path from its components instead of a backslash string:
# '\m' is not a valid escape sequence, and a backslash-separated path only
# resolves on Windows.
tflite_file = pathlib.Path('models') / 'mnist_int8_tflite_model_layers4_neurons100-100-100-10.tflite'

# Load the TFLite model and allocate its tensors so it can be invoked.
interpreter = tf.lite.Interpreter(model_path=str(tflite_file))
interpreter.allocate_tensors()

# Bigger nn model int8

In [None]:
# Per layer: weights quantized with limits [-127, 127], input statistics with
# 256 levels, bias quantized with scale s_w * s_i.
# Layer 1 takes its input statistics from the training images.
int8_s_w1, int8_z_w1, int8_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -127, 127)
int8_s_i1, int8_z_i1 = activation_scale_zero_point(train_images_float32, 256)
int8_s_b1, int8_z_b1, int8_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int8_s_w1, int8_s_i1)

# Layers 2-4 take their input statistics from the float output of the preceding layer.
int8_s_w2, int8_z_w2, int8_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -127, 127)
int8_s_i2, int8_z_i2 = activation_scale_zero_point(intermediate_output_1_float32, 256)
int8_s_b2, int8_z_b2, int8_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int8_s_w2, int8_s_i2)

int8_s_w3, int8_z_w3, int8_w3 = weight_arr_quan(model.layers[2].get_weights()[0], -127, 127)
int8_s_i3, int8_z_i3 = activation_scale_zero_point(intermediate_output_2_float32, 256)
int8_s_b3, int8_z_b3, int8_b3 = bias_arr_quan(model.layers[2].get_weights()[1], int8_s_w3, int8_s_i3)

int8_s_w4, int8_z_w4, int8_w4 = weight_arr_quan(model.layers[3].get_weights()[0], -127, 127)
int8_s_i4, int8_z_i4 = activation_scale_zero_point(intermediate_output_3_float32, 256)
int8_s_b4, int8_z_b4, int8_b4 = bias_arr_quan(model.layers[3].get_weights()[1], int8_s_w4, int8_s_i4)

# Each layer's output parameters are the next layer's input parameters;
# the last layer's come from the float model outputs.
int8_s_o1, int8_z_o1 = int8_s_i2, int8_z_i2
int8_s_o2, int8_z_o2 = int8_s_i3, int8_z_i3
int8_s_o3, int8_z_o3 = int8_s_i4, int8_z_i4
int8_s_o4, int8_z_o4 = activation_scale_zero_point(outputs_float32, 256)

# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-128, 127].
model_int8 = Module()
model_int8.add_module(FullyConnected(int8_w1.T, int8_b1, int8_s_w1, int8_s_i1, int8_s_o1, int8_z_i1, int8_z_o1, -128, 127), 'l1')
model_int8.add_module(FullyConnected(int8_w2.T, int8_b2, int8_s_w2, int8_s_i2, int8_s_o2, int8_z_i2, int8_z_o2, -128, 127), 'l2')
model_int8.add_module(FullyConnected(int8_w3.T, int8_b3, int8_s_w3, int8_s_i3, int8_s_o3, int8_z_i3, int8_z_o3, -128, 127), 'l3')
model_int8.add_module(FullyConnected(int8_w4.T, int8_b4, int8_s_w4, int8_s_i4, int8_s_o4, int8_z_i4, int8_z_o4, -128, 127), 'l4')

# Bigger nn model int4

In [None]:
# Per layer: weights quantized with limits [-7, 7], input statistics with
# 16 levels, bias quantized with scale s_w * s_i.
# Layer 1 takes its input statistics from the training images.
int4_s_w1, int4_z_w1, int4_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -7, 7)
int4_s_i1, int4_z_i1 = activation_scale_zero_point(train_images_float32, 16)
int4_s_b1, int4_z_b1, int4_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int4_s_w1, int4_s_i1)

# Layers 2-4 take their input statistics from the float output of the preceding layer.
int4_s_w2, int4_z_w2, int4_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -7, 7)
int4_s_i2, int4_z_i2 = activation_scale_zero_point(intermediate_output_1_float32, 16)
int4_s_b2, int4_z_b2, int4_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int4_s_w2, int4_s_i2)

int4_s_w3, int4_z_w3, int4_w3 = weight_arr_quan(model.layers[2].get_weights()[0], -7, 7)
int4_s_i3, int4_z_i3 = activation_scale_zero_point(intermediate_output_2_float32, 16)
int4_s_b3, int4_z_b3, int4_b3 = bias_arr_quan(model.layers[2].get_weights()[1], int4_s_w3, int4_s_i3)

int4_s_w4, int4_z_w4, int4_w4 = weight_arr_quan(model.layers[3].get_weights()[0], -7, 7)
int4_s_i4, int4_z_i4 = activation_scale_zero_point(intermediate_output_3_float32, 16)
int4_s_b4, int4_z_b4, int4_b4 = bias_arr_quan(model.layers[3].get_weights()[1], int4_s_w4, int4_s_i4)

# Each layer's output parameters are the next layer's input parameters;
# the last layer's come from the float model outputs.
int4_s_o1, int4_z_o1 = int4_s_i2, int4_z_i2
int4_s_o2, int4_z_o2 = int4_s_i3, int4_z_i3
int4_s_o3, int4_z_o3 = int4_s_i4, int4_z_i4
int4_s_o4, int4_z_o4 = activation_scale_zero_point(outputs_float32, 16)

# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-8, 7].
model_int4 = Module()
model_int4.add_module(FullyConnected(int4_w1.T, int4_b1, int4_s_w1, int4_s_i1, int4_s_o1, int4_z_i1, int4_z_o1, -8, 7), 'l1')
model_int4.add_module(FullyConnected(int4_w2.T, int4_b2, int4_s_w2, int4_s_i2, int4_s_o2, int4_z_i2, int4_z_o2, -8, 7), 'l2')
model_int4.add_module(FullyConnected(int4_w3.T, int4_b3, int4_s_w3, int4_s_i3, int4_s_o3, int4_z_i3, int4_z_o3, -8, 7), 'l3')
model_int4.add_module(FullyConnected(int4_w4.T, int4_b4, int4_s_w4, int4_s_i4, int4_s_o4, int4_z_i4, int4_z_o4, -8, 7), 'l4')

# Bigger nn model int2

In [None]:
# Per layer: weights quantized with limits [-1, 1], input statistics with
# 4 levels, bias quantized with scale s_w * s_i.
# Layer 1 takes its input statistics from the training images.
int2_s_w1, int2_z_w1, int2_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -1, 1)
int2_s_i1, int2_z_i1 = activation_scale_zero_point(train_images_float32, 4)
int2_s_b1, int2_z_b1, int2_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int2_s_w1, int2_s_i1)

# Layers 2-4 take their input statistics from the float output of the preceding layer.
int2_s_w2, int2_z_w2, int2_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -1, 1)
int2_s_i2, int2_z_i2 = activation_scale_zero_point(intermediate_output_1_float32, 4)
int2_s_b2, int2_z_b2, int2_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int2_s_w2, int2_s_i2)

int2_s_w3, int2_z_w3, int2_w3 = weight_arr_quan(model.layers[2].get_weights()[0], -1, 1)
int2_s_i3, int2_z_i3 = activation_scale_zero_point(intermediate_output_2_float32, 4)
int2_s_b3, int2_z_b3, int2_b3 = bias_arr_quan(model.layers[2].get_weights()[1], int2_s_w3, int2_s_i3)

int2_s_w4, int2_z_w4, int2_w4 = weight_arr_quan(model.layers[3].get_weights()[0], -1, 1)
int2_s_i4, int2_z_i4 = activation_scale_zero_point(intermediate_output_3_float32, 4)
int2_s_b4, int2_z_b4, int2_b4 = bias_arr_quan(model.layers[3].get_weights()[1], int2_s_w4, int2_s_i4)

# Each layer's output parameters are the next layer's input parameters;
# the last layer's come from the float model outputs.
int2_s_o1, int2_z_o1 = int2_s_i2, int2_z_i2
int2_s_o2, int2_z_o2 = int2_s_i3, int2_z_i3
int2_s_o3, int2_z_o3 = int2_s_i4, int2_z_i4
int2_s_o4, int2_z_o4 = activation_scale_zero_point(outputs_float32, 4)

# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-2, 1].
model_int2 = Module()
model_int2.add_module(FullyConnected(int2_w1.T, int2_b1, int2_s_w1, int2_s_i1, int2_s_o1, int2_z_i1, int2_z_o1, -2, 1), 'l1')
model_int2.add_module(FullyConnected(int2_w2.T, int2_b2, int2_s_w2, int2_s_i2, int2_s_o2, int2_z_i2, int2_z_o2, -2, 1), 'l2')
model_int2.add_module(FullyConnected(int2_w3.T, int2_b3, int2_s_w3, int2_s_i3, int2_s_o3, int2_z_i3, int2_z_o3, -2, 1), 'l3')
model_int2.add_module(FullyConnected(int2_w4.T, int2_b4, int2_s_w4, int2_s_i4, int2_s_o4, int2_z_i4, int2_z_o4, -2, 1), 'l4')


In [None]:
import time

# Number of evaluation images pushed through the models in this cell. The cell
# is used to time a single image, so the statistics printed at the end are
# normalised by this count rather than by the size of the whole evaluation set.
num_images = 1

# Running sum of |manual int8 output - TFLite int8 output|.
dif = np.array([0])
correct_int8 = 0
correct_int4 = 0
correct_int2 = 0
correct_int8_tf = 0

# Indices of the misclassified images, per model.
mistakes_int8 = np.array([])
mistakes_int4 = np.array([])
mistakes_int2 = np.array([])
mistakes_int8_tf = np.array([])

for i in range(num_images):

    if i == 0:
        start_time = time.time()

    # The TFLite helper quantizes the float image itself.
    input_int8_tf = eva_images_float32[i]

    # Quantize the image with each manual model's input scale / zero-point and
    # add a batch axis of size 1.
    # NOTE(review): astype(np.int8) truncates toward zero, whereas
    # activation_quan rounds to nearest -- confirm this is intended.
    input_int8 = eva_images_float32[i]
    input_int8 = input_int8 / int8_s_i1 + int8_z_i1
    input_int8 = np.expand_dims(input_int8, axis=0).astype(np.int8)

    input_int4 = eva_images_float32[i]
    input_int4 = input_int4 / int4_s_i1 + int4_z_i1
    input_int4 = np.expand_dims(input_int4, axis=0).astype(np.int8)

    print(input_int4.shape)

    input_int2 = eva_images_float32[i]
    input_int2 = input_int2 / int2_s_i1 + int2_z_i1
    input_int2 = np.expand_dims(input_int2, axis=0).astype(np.int8)


    output_model_int8 = model_int8.forward(input_int8)
    output_model_int4 = model_int4.forward(input_int4)
    output_model_int2 = model_int2.forward(input_int2)
    output_model_int8_tf = run_tflite_model(interpreter, input_int8_tf)

    # Widen to int32 before subtracting: an int8 - int8 difference wraps
    # around (e.g. 100 - (-100)), and abs(-128) is not representable in int8.
    dif[0] += np.sum(np.abs(output_model_int8.astype(np.int32) - output_model_int8_tf.astype(np.int32)))

    answer_int8 = np.argmax(output_model_int8)
    answer_int4 = np.argmax(output_model_int4)
    answer_int2 = np.argmax(output_model_int2)
    answer_int8_tf = np.argmax(output_model_int8_tf)

    if answer_int8 == eva_labels[i]:
        correct_int8 += 1
    else:
        mistakes_int8 = np.append(mistakes_int8, i)

    if answer_int4 == eva_labels[i]:
        correct_int4 += 1
    else:
        mistakes_int4 = np.append(mistakes_int4, i)

    if answer_int2 == eva_labels[i]:
        correct_int2 += 1
    else:
        mistakes_int2 = np.append(mistakes_int2, i)

    if answer_int8_tf == eva_labels[i]:
        correct_int8_tf += 1
    else:
        mistakes_int8_tf = np.append(mistakes_int8_tf, i)

    if i == 0:
        end_time = time.time()
        print('time for 1 image: ', end_time - start_time)


# NOTE(review): the directory name says 500-500-500-10 while the model built in
# this notebook uses 100-unit layers -- confirm the intended location.
np.save('mistakes_layers4_neurons500-500-500-10/array_dif_int8tf_int8_my-weights-scales-zpoints.npy', dif)
np.save('mistakes_layers4_neurons500-500-500-10/mistakes_int8.npy', mistakes_int8)
np.save('mistakes_layers4_neurons500-500-500-10/mistakes_int4.npy', mistakes_int4)
np.save('mistakes_layers4_neurons500-500-500-10/mistakes_int2.npy', mistakes_int2)
np.save('mistakes_layers4_neurons500-500-500-10/mistakes_int8_tf.npy', mistakes_int8_tf)

# Average absolute output difference per evaluated image, and per output value
# (the divisor 10 is the number of outputs of the final Dense(10) layer).
mistake_output = np.sum(dif) / num_images
mistake_value = mistake_output / 10

print(f'Mistake in output: {mistake_output}')
print(f'Mistake in value: {mistake_value}\n')

# The float32 accuracy is measured on the whole evaluation set; the quantized
# accuracies only cover the num_images images evaluated above.
print('Accuracy float32: ', model.evaluate(eva_images_float32, eva_labels)[1])
print('Accuracy int8 tf: ', correct_int8_tf / num_images)
print('Accuracy int8   : ', correct_int8 / num_images)
print('Accuracy int4   : ', correct_int4 / num_images)
print('Accuracy int2   : ', correct_int2 / num_images)

# PCA reduction

In [None]:
from sklearn.decomposition import PCA
# A fractional n_components is passed to PCA. Each trailing comment records the
# resulting number of components and the accuracies noted for that setting.
pca = PCA(n_components = 0.86) # 64 eva 8-bit accuracy:  0.9564 4-bit accuracy:  0.7812
# pca = PCA(n_components = 0.74) # 32 eva 8-bit accuracy:  0.9448 4-bit accuracy:  0.7912
# pca = PCA(n_components = 0.59) # 16 eva 8-bit accuracy:  0.9292 4-bit accuracy:  0.8102
# pca = PCA(n_components = 0.43) # 8 eva 8-bit accuracy:  0.8717 4-bit accuracy:  0.7355
# pca = PCA(n_components = 0.28) # 4 eva 8-bit accuracy:  0.6484 4-bit accuracy:  0.487
# Fit the projection on the training images only, then apply it to both splits.
train_reduced = pca.fit_transform(train_images_float32)
eva_reduced = pca.transform(eva_images_float32)
print(train_reduced.shape)
print(eva_reduced.shape)

In [None]:
# Rebind the global input_size to the number of PCA components so that
# create_model sizes its input layer for the reduced data.
input_size = train_reduced.shape[1]

In [None]:
# Define a simple sequential model
def create_model():
    """Build and compile the float model for the reduced inputs: Dense(20, relu) and Dense(10).

    The input width is read from the module-level ``input_size``. The last
    layer has no activation; the loss is configured with ``from_logits=True``.
    """
    hidden_layer = tf.keras.layers.Dense(20, activation='relu', input_dim=input_size)
    output_layer = tf.keras.layers.Dense(10)
    net = tf.keras.Sequential([hidden_layer, output_layer])

    net.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return net

# Create a basic model instance

# Train the float model on the PCA-reduced training data for 10 epochs (batch size 64),
# reporting metrics on the reduced evaluation split after every epoch.
model = create_model()
model.fit(
    train_reduced,
    train_labels,
    epochs=10,
    batch_size=64,
    validation_data=(eva_reduced, eva_labels)
)

# Final metrics of the float model on the reduced evaluation split.
model.evaluate(eva_reduced, eva_labels)

# Second model sharing the same input that stops after the first layer,
# used to observe the first layer's output.
intermediate_model = tf.keras.Model(inputs=model.input, outputs=model.layers[0].output)

# First-layer outputs for the whole reduced training set (used below to derive activation scales).
intermediate_output_float32 = intermediate_model.predict(train_reduced)

# Final model outputs for the whole reduced training set (used below to derive the output scale).
outputs_float32 = model.predict(train_reduced)

In [None]:
# Layer 1: weights quantized with limits [-127, 127], input statistics from the
# PCA-reduced training data with 256 levels, bias quantized with scale s_w * s_i.
int8_s_w1, int8_z_w1, int8_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -127, 127)
int8_s_i1, int8_z_i1 = activation_scale_zero_point(train_reduced, 256)
int8_s_b1, int8_z_b1, int8_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int8_s_w1, int8_s_i1)

# Layer 2: same procedure; its input statistics come from the float first-layer outputs.
int8_s_w2, int8_z_w2, int8_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -127, 127)
int8_s_i2, int8_z_i2 = activation_scale_zero_point(intermediate_output_float32, 256)
int8_s_b2, int8_z_b2, int8_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int8_s_w2, int8_s_i2)

# The output of layer 1 is the input of layer 2, so both share scale and zero-point.
int8_s_o1, int8_z_o1 = int8_s_i2, int8_z_i2

# Output scale / zero-point of layer 2 from the float model outputs.
int8_s_o2, int8_z_o2 = activation_scale_zero_point(outputs_float32, 256)


# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-128, 127].
model_int8 = Module()
model_int8.add_module(FullyConnected(int8_w1.T, int8_b1, int8_s_w1, int8_s_i1, int8_s_o1, int8_z_i1, int8_z_o1, -128, 127), 'l1')
model_int8.add_module(FullyConnected(int8_w2.T, int8_b2, int8_s_w2, int8_s_i2, int8_s_o2, int8_z_i2, int8_z_o2, -128, 127), 'l2')

In [None]:
# Layer 1: weights quantized with limits [-7, 7], input statistics from the
# PCA-reduced training data with 16 levels, bias quantized with scale s_w * s_i.
int4_s_w1, int4_z_w1, int4_w1 = weight_arr_quan(model.layers[0].get_weights()[0], -7, 7)
int4_s_i1, int4_z_i1 = activation_scale_zero_point(train_reduced, 16)
int4_s_b1, int4_z_b1, int4_b1 = bias_arr_quan(model.layers[0].get_weights()[1], int4_s_w1, int4_s_i1)

# Layer 2: same procedure; its input statistics come from the float first-layer outputs.
int4_s_w2, int4_z_w2, int4_w2 = weight_arr_quan(model.layers[1].get_weights()[0], -7, 7)
int4_s_i2, int4_z_i2 = activation_scale_zero_point(intermediate_output_float32, 16)
int4_s_b2, int4_z_b2, int4_b2 = bias_arr_quan(model.layers[1].get_weights()[1], int4_s_w2, int4_s_i2)

# The output of layer 1 is the input of layer 2, so both share scale and zero-point.
int4_s_o1, int4_z_o1 = int4_s_i2, int4_z_i2

# Output scale / zero-point of layer 2 from the float model outputs.
int4_s_o2, int4_z_o2 = activation_scale_zero_point(outputs_float32, 16)


# weight_arr_quan returns the weights transposed; .T restores the (inputs, outputs) layout.
# Layer outputs are saturated to [-8, 7].
model_int4 = Module()
model_int4.add_module(FullyConnected(int4_w1.T, int4_b1, int4_s_w1, int4_s_i1, int4_s_o1, int4_z_i1, int4_z_o1, -8, 7), 'l1')
model_int4.add_module(FullyConnected(int4_w2.T, int4_b2, int4_s_w2, int4_s_i2, int4_s_o2, int4_z_i2, int4_z_o2, -8, 7), 'l2')

In [None]:
def test_accuracy(model8, model4, test_images, test_labels):
    """Print the accuracy of an 8-bit and a 4-bit manual model on a labelled set.

    Each image is quantized with the module-level input scale / zero-point of
    the respective model (``int8_s_i1`` / ``int8_z_i1`` and ``int4_s_i1`` /
    ``int4_z_i1``), run through ``forward`` and compared against its label by
    argmax. Nothing is returned; the two accuracies are printed.
    """
    hits8 = 0
    hits4 = 0

    for idx in range(len(test_images)):
        sample = test_images[idx]

        # Quantize and add a batch axis of size 1 for each model.
        batch8 = np.expand_dims(sample / int8_s_i1 + int8_z_i1, axis=0).astype(np.int8)
        batch4 = np.expand_dims(sample / int4_s_i1 + int4_z_i1, axis=0).astype(np.int8)

        prediction8 = model8.forward(batch8)
        prediction4 = model4.forward(batch4)

        if np.argmax(prediction4) == test_labels[idx]:
            hits4 += 1
        if np.argmax(prediction8) == test_labels[idx]:
            hits8 += 1

    total = test_images.shape[0]
    print('8-bit accuracy: ', hits8 / total, '4-bit accuracy: ', hits4 / total)

In [None]:
# test_accuracy prints its own report and returns None, so it is called
# directly instead of being wrapped in print (which only added a "None" line).
test_accuracy(model_int8, model_int4, eva_reduced, eva_labels)

# 2D array

In [None]:
# Jagged 2D list: row i holds the value i repeated i + 1 times.
two_d_array = [[value] * (value + 1) for value in range(5)]

print(two_d_array)

In [None]:
# Walk a 5x5 index grid over the jagged list. Where [i][j] does not exist,
# fall back to the mirrored element [j][i].
for i in range(5):
    for j in range(5):
        try:
            print("try: ", two_d_array[i][j])
        # Only a missing index is expected here; a bare except would also
        # swallow KeyboardInterrupt and unrelated errors.
        except IndexError:
            try:
                print("except: ", two_d_array[j][i])
            except IndexError:
                print("except: error")

In [None]:
# create an empty NumPy array
# Object array with five slots; slot k holds the 1-D array [1, ..., k + 1],
# so every slot has a different length.
arr = np.zeros((5,), dtype=np.ndarray)

for length in range(1, 6):
    arr[length - 1] = np.arange(1, length + 1)

print(arr)
# np.save('arr.npy', arr)
# np.save('arr.npy', arr)

In [None]:
# Walk a 5x5 index grid over the jagged object array. Where [i][j] does not
# exist, fall back to the mirrored element [j][i].
for i in range(5):
    for j in range(5):
        try:
            print("try: ", arr[i][j])
        # Only a missing index is expected here; a bare except would also
        # swallow KeyboardInterrupt and unrelated errors.
        except IndexError:
            try:
                print("except: ", arr[j][i])
            except IndexError:
                print("except: error")