In [1]:
import tensorflow as tf
import numpy as np
from spiking_models import DenseRNN, SpikingReLU, SpikingSigmoid, SpikingTanh, Accumulate
import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Input, Model
from operations_layers import SqueezeLayer, ExpandLayer, MatMulLayer, MatMulLayerTranspose, TransposeLayer, \
    ExtractPatchesLayer, PositionalEncodingLayer, Tokpos

In [2]:
def get_normalized_weights(model, x_test, percentile=100):
    """
    Divide every weight array of ``model`` by the largest ReLU activation seen on a data sample.

    Every tenth sample of ``x_test`` is passed through the network. For each ``ReLU`` layer and each
    ``Dense`` layer whose activation function is named ``'relu'`` the ``percentile``-th percentile of the
    layer output is computed; the largest of these values becomes one global normalization factor.

    :param model: Keras model providing ``layers``, ``inputs`` and ``get_weights()``.
    :param x_test: input samples; only ``x_test[::10]`` is evaluated.
    :param percentile: percentile (0-100) of a layer's activations that is treated as its maximum.
    :return: the list returned by ``model.get_weights()`` with every array divided by the normalization
        factor. The list is returned unscaled if no positive activation was found. The model itself is
        not modified.
    """
    x_test = x_test[::10]
    max_activation = 0
    for layer in model.layers:
        is_relu_layer = isinstance(layer, tf.keras.layers.ReLU)
        # getattr guards against activations passed as objects that carry no __name__ attribute.
        is_relu_dense = (isinstance(layer, tf.keras.layers.Dense) and
                         getattr(layer.activation, '__name__', None) == 'relu')
        if not (is_relu_layer or is_relu_dense):
            continue

        activation = tf.keras.Model(inputs=model.inputs, outputs=layer.output)(x_test).numpy()
        # The percentile scans the whole activation array, so compute it only once per layer.
        layer_max = np.percentile(activation, percentile)
        if layer_max > max_activation:
            max_activation = layer_max

    weights = model.get_weights()
    if max_activation == 0:
        print("\n" + "-"*32 + "\nNo normalization\n" + "-"*32)
    else:
        print("\n" + "-"*32 + "\nNormalizing by", max_activation, "\n" + "-"*32)
        # In-place division keeps the dtype of each weight array.
        for i in range(len(weights)):
            weights[i] /= (max_activation)
    return weights


def evaluate_conversion(converted_model, original_model, x_test, y_test, testacc, timesteps=50):
    """
    Evaluate the converted model ``timesteps`` times and print its accuracy after every pass.

    Each pass calls ``converted_model.evaluate`` on the complete test set with the number of test samples
    as batch size, then prints the accuracy, the reference accuracy and their relative difference.

    :param converted_model: model whose ``evaluate`` returns ``(loss, accuracy)``.
    :param original_model: not used by this function.
    :param x_test: test inputs.
    :param y_test: test labels; ``y_test.shape[0]`` is used as batch size.
    :param testacc: accuracy of the original model, printed as reference value.
    :param timesteps: number of evaluation passes.
    """
    for step in range(1, timesteps + 1):
        _loss, accuracy = converted_model.evaluate(
            x_test, y_test, batch_size=y_test.shape[0], verbose=0)

        progress = str(step) + "/" + str(timesteps) + " -"
        accuracy_report = "acc spiking (orig): %.2f%% (%.2f%%)" % (accuracy*100, testacc*100)
        # Relative accuracy change with respect to the original model, in percent.
        loss_report = "- conv loss: %+.2f%%" % ((-(1 - accuracy/testacc)*100))
        print("Timesteps", progress, accuracy_report, loss_report)

In [3]:
def robust_weight_normalization(model, x_test, ppercentile=1):
    """
    Rescale the ReLU-activated layers of ``model`` in place using activation statistics.

    For every ``ReLU`` layer and every ``Dense`` layer whose activation function is named ``'relu'`` a
    scale factor is determined as the maximum of
      * the ``ppercentile`` quantile of the per-neuron maximum activations on ``x_test`` and
      * the largest weight / bias value of the layer (at least 0).
    The layer's weights are divided by ``scale_factor / previous_scale_factor`` and its bias by
    ``scale_factor``.

    All scale factors are measured before any weight is changed, so that each one describes the original
    network (dividing by the previous layer's factor assumes exactly that).

    :param model: Keras model; its layers are modified in place.
    :param x_test: input samples used to record the activations.
    :param ppercentile: fraction in [0, 1] selecting which of the sorted per-neuron maxima is used
        (1 selects the overall maximum).
    :return: the same ``model`` object with rescaled weights.
    :raises ValueError: if a selected layer does not hold ``[weights]`` or ``[weights, bias]``
        (e.g. a parameter-free ``ReLU`` layer).
    """
    # ---- Pass 1: measure the scale factor of every ReLU-activated layer on the unmodified network ----
    scale_factors = []
    for layer in model.layers:
        is_relu_dense = (isinstance(layer, tf.keras.layers.Dense) and
                         getattr(layer.activation, '__name__', None) == 'relu')
        if not (isinstance(layer, tf.keras.layers.ReLU) or is_relu_dense):
            continue

        params = layer.get_weights()
        if len(params) not in (1, 2):
            raise ValueError(
                f"Cannot normalize layer {layer}: expected [weights] or [weights, bias], "
                f"got {len(params)} weight arrays")

        activation = tf.keras.Model(inputs=model.inputs, outputs=layer.output)(x_test).numpy()
        # Collapse every axis except the last one, so that outputs of any rank yield one maximum
        # per neuron and the selected quantile is always a scalar.
        neuron_max = np.sort(activation.reshape(-1, activation.shape[-1]).max(axis=0))
        max_act = float(neuron_max[int(ppercentile * (len(neuron_max) - 1))])

        max_wt_bias = max(0.0, float(np.max(params[0])))
        if len(params) == 2:
            max_wt_bias = max(max_wt_bias, float(np.max(params[1])))

        scale_factors.append((layer, max(max_act, max_wt_bias)))

    # ---- Pass 2: apply the factors layer by layer ----
    prev_factor = 1.0
    for layer, scale_factor in scale_factors:
        if scale_factor == 0:
            # A layer without any positive activation, weight or bias cannot be normalized;
            # dividing by zero would only fill it with inf/nan.
            print(f"Scale factor for layer {layer} is zero, layer left unchanged")
            continue

        applied_factor = scale_factor / prev_factor

        params = layer.get_weights()
        params[0] = params[0] / applied_factor
        if len(params) == 2:
            params[1] = params[1] / scale_factor
        layer.set_weights(params)

        prev_factor = scale_factor
        print(f"Scale factor for layer {layer}")
        print(f"{applied_factor}")

    return model

In [4]:
def weight_conversion_model(weights, bias):
    """
    Simple model-based conversion model proposed by Diehl et al.

    For every layer the largest possible positive input of a neuron is estimated from the parameters
    alone: the positive weights of each row of the weight matrix are summed and the positive part of the
    bias entry with the same index is added. The layer's scale factor is the maximum of this value and
    the largest single weight / bias value. Weights are divided by ``scale_factor / previous_factor``,
    biases by ``scale_factor``.

    The input lists are not modified; rescaled copies are returned.

    :param weights: list with one 2-D weight matrix per layer, indexed as ``[row, column]``.
    :param bias: list with one bias vector (or ``None``) per layer, or ``None`` if there are no biases.
    :return: tuple ``(rescaled weights, rescaled bias)``.
    """
    # Shallow copies: entries are replaced below, and the caller's lists must stay untouched.
    converted_weights = list(weights)
    converted_bias = list(bias) if bias is not None else None

    # model based normalization
    previous_factor = 1
    for l in range(len(converted_weights)):
        layer_weights = converted_weights[l]
        layer_bias = converted_bias[l] if converted_bias is not None else None

        # Maximum positive input of this layer: per-row sum of the positive weights, plus the positive
        # part of the bias entry belonging to that row. Computed for all rows at once.
        n_rows = layer_weights.shape[0]
        input_sums = tf.math.reduce_sum(tf.math.maximum(0, layer_weights), axis=1)
        if layer_bias is not None:
            input_sums = input_sums + tf.math.maximum(0, layer_bias[:n_rows])
        max_pos_input = tf.math.maximum(0, tf.math.reduce_max(input_sums))

        # Largest single weight / bias value of the layer. Note: if every weight and bias is
        # non-positive, both candidates are <= 0 and the resulting scale factor is 0.
        max_wt = tf.math.reduce_max(layer_weights)
        if layer_bias is not None:
            max_wt = tf.math.maximum(max_wt, tf.math.reduce_max(layer_bias))
        scale_factor = tf.math.maximum(max_wt, max_pos_input)

        # Rescale all weights; the previous layer's factor is divided out again because this layer's
        # input has already been scaled down by it.
        applied_factor = scale_factor / previous_factor
        converted_weights[l] = layer_weights / applied_factor
        if layer_bias is not None:
            converted_bias[l] = layer_bias / scale_factor
        previous_factor = scale_factor
        print(f"Scale factor for this layer is {previous_factor}")

    return converted_weights, converted_bias


def weight_conversion_robust_and_data_based(weights, bias, model, data, normalization_method='robust',
                                            ppercentile=0.99):

    """
    Two methods proposed by Diehl et al and Rueckauer et al. Both methods are data-based, so they use weights and
    activations to find the best scaling factor.

    For layer ``l`` the activations of ``model.layers[l]`` on ``data`` are recorded and reduced to one
    maximum per neuron (index along axis 1). The scale factor of the layer is the maximum of the selected
    quantile of these maxima and the largest weight / bias value. Weights are divided by
    ``scale_factor / previous_factor``, biases by ``scale_factor``.

    The input lists are not modified; rescaled copies are returned.

    :param weights: weights of the network, one entry per layer.
    :param bias: bias of the network, one entry (or ``None``) per layer, or ``None``.
    :param model: ann model; ``model.layers[l]`` is expected to correspond to ``weights[l]``.
    :param data: dataset to determine activations.
    :param normalization_method: type of normalization - robust (Rueckauer) or data (Diehl). ``'data'``
        forces ``ppercentile`` to 1.0; any other value uses ``ppercentile`` as given.
    :param ppercentile: fraction in [0, 1] selecting which of the sorted per-neuron maxima is used.
    :return: tuple ``(rescaled weights, rescaled bias)``.
    """
    if normalization_method == 'data':
        ppercentile = 1.0

    # Shallow copies: entries are replaced below, and the caller's lists must stay untouched.
    converted_weights = list(weights)
    converted_bias = list(bias) if bias is not None else None

    # Record the sorted per-neuron maxima of every layer before any weight is changed.
    activations = []
    for l in range(len(converted_weights)):
        activation = get_activations_layer(model.input, model.layers[l].output, data)
        # Maximum over every axis except axis 1, i.e. one value per index along axis 1.
        reduce_axes = tuple(axis for axis in range(activation.ndim) if axis != 1)
        activations.append(np.sort(np.max(activation, axis=reduce_axes)))

    previous_factor = 1
    for l in range(len(converted_weights)):
        layer_weights = converted_weights[l]
        layer_bias = converted_bias[l] if converted_bias is not None else None

        # get the p-percentile of the activation
        pos_inputs = activations[l]
        max_act = pos_inputs[int(ppercentile * (len(pos_inputs) - 1))]

        # get the maximum weight (or bias) in the layer
        max_wt = tf.math.reduce_max(layer_weights)
        if layer_bias is not None:
            max_wt = tf.math.maximum(max_wt, tf.math.reduce_max(layer_bias))
        scale_factor = tf.math.maximum(max_wt, max_act)

        # rescale weights; the previous layer's factor is divided out again
        applied_factor = scale_factor / previous_factor
        converted_weights[l] = layer_weights / applied_factor

        # rescale bias
        if layer_bias is not None:
            converted_bias[l] = layer_bias / scale_factor
        previous_factor = scale_factor
        print(f"Scale factor for this layer is {previous_factor}")

    return converted_weights, converted_bias


def get_activations_layer(layer_in, layer_out, data, batch_size=32):

    """
    Getting activation for specific layer of neural network.

    A temporary model from ``layer_in`` to ``layer_out`` is built and evaluated on ``data``. Samples of an
    incomplete trailing batch are dropped, so every evaluated batch holds exactly ``batch_size`` samples.
    If ``data`` holds fewer samples than one batch, it is evaluated as it is.

    :param layer_in: input layer of a model. For sequential models first layer, for functional model.layers[0].input can
    be used.
    :param layer_out: layer for which activations should be computed. For functional model.layers[i].output can be used.
    :param data: dataset.
    :param batch_size: batch_size of batches in which dataset should be divided.
    :return: activations of the specified layer for all evaluated samples.
    """

    leftover = len(data) % batch_size
    # Only trim when at least one complete batch remains; otherwise the slice would remove
    # every sample and leave nothing to predict on.
    if leftover != 0 and len(data) > batch_size:
        data = data[:-leftover]

    return Model(layer_in, layer_out).predict(data, batch_size)

In [None]:
# ---- Reproducibility and hyper-parameters (read as globals by the functions defined below) ----
tf.random.set_seed(1234)
batch_size=128  # batch size for training and for evaluating the analog model
epochs=2  # number of training epochs
dv = 25  # NOTE(review): not referenced anywhere in the visible code
nv = -1  # NOTE(review): not referenced anywhere in the visible code
vocab_size = 10000  # Only consider the top 10k words
maxlen = 200  # Every movie review is padded / truncated to 200 tokens
embed_dim = 32  # Embedding size for each token
mlp_dim = 64  # Width of the hidden ReLU layers
l = 50  # Length of the per-head axis used by the reshapes in multi_head_attention;
        # those reshapes need l * num_heads == maxlen (50 * 4 == 200)
num_heads = 4  # Number of attention heads
num_classes = 2  # Number of output classes

def multi_head_attention(x):
    """
    Build one self-attention block on top of the Keras tensor ``x``.

    ``x`` is projected by three separate ``Dense(embed_dim)`` layers (value, query, key). Each projection
    is reshaped to ``[embed_dim, l, num_heads]`` and passed through a ``TransposeLayer``. The attention
    weights are ``Softmax(MatMulLayerTranspose([q, k]))`` and are combined with the values by
    ``MatMulLayer``. The result is transposed, reshaped to ``[-1, l, embed_dim]`` and projected by a
    final ``Dense(embed_dim)`` layer.

    :param x: input Keras tensor.
    :return: tuple ``(out, add)`` - the projected attention output and the sum of that output with the
        input reshaped to ``[-1, l, embed_dim]``.
    """
    layers = tf.keras.layers

    # ---- Linear projections (created in the order value, query, key) ----
    value_proj = layers.Dense(embed_dim)(x)
    query_proj = layers.Dense(embed_dim)(x)
    key_proj = layers.Dense(embed_dim)(x)

    def split_heads(projection):
        # Reshape a projection to [embed_dim, l, num_heads], then transpose it.
        reshaped = layers.Reshape([embed_dim, l, num_heads])(projection)
        return TransposeLayer()(reshaped)

    value = split_heads(value_proj)
    query = split_heads(query_proj)
    key = split_heads(key_proj)

    # ---- Dot-product attention: softmax(Q K^T) V ----
    scores = MatMulLayerTranspose()([query, key])
    attention_weights = layers.Softmax(axis=-1)(scores)
    attended = MatMulLayer()([attention_weights, value])

    # ---- Merge the heads again and apply the output projection ----
    merged = TransposeLayer()(attended)
    merged = layers.Reshape([-1, l, embed_dim])(merged)
    out = layers.Dense(embed_dim)(merged)

    # ---- Residual connection with the equally reshaped input ----
    residual_input = layers.Reshape([-1, l, embed_dim])(x)
    add = layers.Add()([out, residual_input])
    return out, add

def create_ann_approved_version():
    """
    Build, compile and train the analog (non-spiking) network.

    Architecture: ``Tokpos`` embedding, one attention block followed by a two-layer MLP with a residual
    connection, then a dense classification head ending in a softmax over ``num_classes`` outputs.

    Hyper-parameters and the data (``x_train``, ``y_train``, ``x_test``, ``y_test``) are read from module
    level; the test split is used as validation data during training.

    :return: the trained Keras model.
    """
    layers = tf.keras.layers

    token_ids = layers.Input(shape=(maxlen,))
    hidden = Tokpos(maxlen, vocab_size, embed_dim)(token_ids)

    for _block in range(1):
        # Only the residual sum of the attention block is used further on.
        _attention_out, residual = multi_head_attention(hidden)
        hidden = layers.Dense(mlp_dim, activation="relu")(residual)
        hidden = layers.Dense(embed_dim)(hidden)
        hidden = layers.Add()([hidden, residual])

    # ---- Classification head ----
    features = layers.Flatten()(hidden)
    features = layers.Dense(embed_dim, activation="relu")(features)
    features = layers.Dense(embed_dim)(features)
    features = layers.Dense(mlp_dim, activation="relu")(features)
    logits = layers.Dense(num_classes)(features)
    probabilities = layers.Softmax()(logits)

    ann = tf.keras.models.Model(inputs=token_ids, outputs=probabilities)
    ann.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    ann.fit(x_train, y_train,
            validation_data=(x_test, y_test),
            batch_size=batch_size,
            epochs=epochs)
    return ann


def convert_tailored_approved_version(weights, y_test):
    """
    Build the spiking counterpart of the network created by ``create_ann_approved_version``.

    The layer sequence mirrors the analog model, with these differences:
      * the input carries an additional axis of length 1 and a fixed batch size of ``y_test.shape[0]``,
      * every ``Dense(..., activation="relu")`` is replaced by a linear ``Dense`` followed by a stateful
        ``RNN(SpikingReLU(...))``,
      * the class scores pass through a stateful ``RNN(Accumulate(num_classes))`` before the softmax,
      * the output is passed through ``SqueezeLayer``.

    :param weights: list of weight arrays that is loaded into the new model with ``set_weights``.
    :param y_test: test labels; only ``y_test.shape[0]`` is used (as fixed batch size).
    :return: the compiled spiking Keras model with ``weights`` loaded.
    """
    inputs = tf.keras.layers.Input(shape=(1, maxlen,), batch_size=y_test.shape[0])
    x = Tokpos(maxlen, vocab_size, embed_dim)(inputs)
    out = x
    for i in range(1):
        out, add = multi_head_attention(out)
        out = tf.keras.layers.Dense(mlp_dim)(add)
        # The spiking cell works on one flat feature vector per step: flatten, spike, restore the shape.
        out = tf.keras.layers.Reshape([1, num_heads*l*mlp_dim])(out)
        out = tf.keras.layers.RNN(SpikingReLU(num_heads*l*mlp_dim), return_sequences=True, return_state=False,
                            stateful=True)(out)
        out = tf.keras.layers.Reshape([num_heads, l, mlp_dim])(out)

        out = tf.keras.layers.Dense(embed_dim)(out)
        out = tf.keras.layers.Add()([out, add])

    x = tf.keras.layers.Flatten()(out)
    x = ExpandLayer()(x)
    x = tf.keras.layers.Dense(embed_dim)(x)
    x = tf.keras.layers.RNN(SpikingReLU(embed_dim), return_sequences=True, return_state=False,
                            stateful=True)(x)
    x = tf.keras.layers.Dense(embed_dim)(x)
    x = tf.keras.layers.Dense(mlp_dim)(x)
    x = tf.keras.layers.RNN(SpikingReLU(mlp_dim), return_sequences=True, return_state=False,
                            stateful=True)(x)
    # --------------------------------------------------
    x = tf.keras.layers.Dense(num_classes)(x)

    x = tf.keras.layers.RNN(Accumulate(num_classes), return_sequences=True, return_state=False, stateful=True)(x)
    x = tf.keras.layers.Softmax()(x)

    x = SqueezeLayer()(x)

    spiking = tf.keras.models.Model(inputs=inputs, outputs=x)

    print("-"*32 + "\n")
    spiking.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"])
    # summary() prints the table itself and returns None, so it must not be wrapped in print().
    spiking.summary()
    spiking.set_weights(weights)
    return spiking


# ---- Load the IMDB reviews and bring them into a fixed-length, one-hot labelled form ----
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# Analog model
ann = create_ann_approved_version()
# summary() prints the table itself and returns None, so it must not be wrapped in print().
ann.summary()

# Reference accuracy; measured before the weights of `ann` are rescaled below.
_, testacc = ann.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
# weights = ann.get_weights()
# weights = get_normalized_weights(ann, x_train, percentile=85)

# robust_weight_normalization rescales `ann` in place and returns the same object.
model_normalized = robust_weight_normalization(ann, x_train)
weights = model_normalized.get_weights()

##################################################
# Preprocessing for RNN
x_train = np.expand_dims(x_train, axis=1)  # (samples, maxlen) -> (samples, 1, maxlen)
x_test = np.expand_dims(x_test, axis=1)

##################################################
# Conversion to spiking model
# snn = convert(ann, weights, x_test, y_test)
snn = convert_tailored_approved_version(weights, y_test)
evaluate_conversion(snn, ann, x_test, y_test, testacc, timesteps=10)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Epoch 1/2
Epoch 2/2
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
tokpos (Tokpos)                 (None, 200, 32)      326400      input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 200, 32)      1056        tokpos[0][0]                     
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 200, 32)      1056        tokpos[0][0]                     
__________________________________________________________________________