In [1]:
import tensorflow as tf
import numpy as np
from spiking_models import DenseRNN, SpikingReLU, SpikingSigmoid, SpikingTanh, Accumulate
import keras
from tensorflow.keras.utils import to_categorical

In [2]:
def convert(model, weights, x_test, y_test):
    print("Converted model:\n" + "-"*32)
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.InputLayer):
            print("Input Layer")
            inputs = tf.keras.Input(shape=(1, model.layers[0].input_shape[0][1]), batch_size=y_test.shape[0])
            x = inputs        
        elif isinstance(layer, tf.keras.layers.Dense):
            x = tf.keras.layers.Dense(layer.output_shape[1])(x)
            # x = tf.keras.layers.RNN(DenseRNN(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            if layer.activation.__name__ == 'linear':
                print("Dense Layer w/o activation")
                pass
            elif layer.activation.__name__ == 'relu':
                print("Dense Layer with SpikingReLU")
                x = tf.keras.layers.RNN(SpikingReLU(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            elif layer.activation.__name__ == 'sigmoid':
                print("Dense Layer with SpikingSigmoid")
                x = tf.keras.layers.RNN(SpikingSigmoid(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            elif layer.activation.__name__ == 'tanh':
                print("Dense Layer with SpikingTanh")
                x = tf.keras.layers.RNN(SpikingTanh(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            else:
                print('[Info] Activation type', layer.activation.__name__, 'not implemented')
        elif isinstance(layer, tf.keras.layers.ReLU):
            print("SpikingReLU Layer")
            x = tf.keras.layers.RNN(SpikingReLU(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
        elif isinstance(layer, tf.keras.layers.Softmax):
            print("Accumulate + Softmax Layer")
            print(layer.output_shape[1])
            x = tf.keras.layers.RNN(Accumulate(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            x = tf.keras.layers.Softmax()(x)
        else:
            print("[Info] Layer type ", layer, "not implemented")
    spiking = tf.keras.models.Model(inputs=inputs, outputs=x)
    print("-"*32 + "\n")

    spiking.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer="adam",
        metrics=["sparse_categorical_accuracy"],)

    spiking.set_weights(weights)
    return spiking

In [3]:
def get_normalized_weights(model, x_test, percentile=100):
    max_activation = 0
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.ReLU):
            activation = tf.keras.Model(inputs=model.inputs, outputs=layer.output)(x_test).numpy()
            if np.percentile(activation, percentile) > max_activation:
                max_activation = np.percentile(activation, percentile)
        elif isinstance(layer, tf.keras.layers.Dense):
            if layer.activation.__name__ == 'relu':
                activation = tf.keras.Model(inputs=model.inputs, outputs=layer.output)(x_test).numpy()
                if np.percentile(activation, percentile) > max_activation:
                    max_activation = np.percentile(activation, percentile)

    weights = model.get_weights()     
    if max_activation == 0:
        print("\n" + "-"*32 + "\nNo normalization\n" + "-"*32)
    else:
        print("\n" + "-"*32 + "\nNormalizing by", max_activation, "\n" + "-"*32)
        for i in range(len(weights)):
            weights[i] /= (max_activation)
    return weights

In [4]:
def evaluate_conversion(converted_model, original_model, x_test, y_test, testacc, timesteps=50):
    for i in range(1, timesteps+1):
        _, acc = converted_model.evaluate(x_test, y_test, batch_size=y_test.shape[0], verbose=0)
        print(
            "Timesteps", str(i) + "/" + str(timesteps) + " -",
            "acc spiking (orig): %.2f%% (%.2f%%)" % (acc*100, testacc*100),
            "- conv loss: %+.2f%%" % ((-(1 - acc/testacc)*100)))

In [5]:
tf.random.set_seed(1234)
batch_size=512
epochs = 5
act='relu'


def create_ann():
    inputs = tf.keras.Input(shape=(784,))
    x = tf.keras.layers.Dense(500, activation=act)(inputs)
    #x = tf.keras.layers.ReLU()(x)  # max_value=1
    x = tf.keras.layers.Dense(100, activation=act)(x)
    #x = tf.keras.layers.Activation(tf.nn.relu)(x)  # not implemented yet
    x = tf.keras.layers.Dense(10, activation=act)(x)
    x = tf.keras.layers.Softmax()(x)
    ann = tf.keras.Model(inputs=inputs, outputs=x)

    ann.compile(
        optimizer=tf.keras.optimizers.RMSprop(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    ann.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs)
    return ann

In [6]:
##################################################
# Import Data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255., x_test / 255.
x_train = x_train.reshape((60000, 784))
x_test = x_test.reshape((10000, 784))

# Analog model
ann = create_ann()

_, testacc = ann.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
#weights = ann.get_weights()
weights = get_normalized_weights(ann, x_train, percentile=85)

##################################################
# Preprocessing for RNN 
x_train = np.expand_dims(x_train, axis=1)  # (60000, 784) -> (60000, 1, 784)
x_test = np.expand_dims(x_test, axis=1)
#x_rnn = np.tile(x_train, (1, 1, 1))
#y_rnn = y_train  # np.tile(x_test, (1, timesteps, 1))

##################################################
# Conversion to spiking model
snn = convert(ann, weights, x_test, y_test)
evaluate_conversion(snn, ann, x_test, y_test, testacc, timesteps=50)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

--------------------------------
Normalizing by 5.39086127281189 
--------------------------------
Converted model:
--------------------------------
Input Layer
Dense Layer with SpikingReLU
Dense Layer with SpikingReLU
Dense Layer with SpikingReLU
Accumulate + Softmax Layer
10
--------------------------------

Timesteps 1/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 2/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 3/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 4/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 5/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 6/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 7/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 8/50 - acc spiking (orig): 9.80% (97.74%) - conv loss: -89.97%
Timesteps 9/50 - acc spiking (orig): 9.80% (97.74%) -

In [6]:
def extract_patches(images):
    batch_size = tf.shape(images)[0]
    
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, patch_size, patch_size, 1],
        strides=[1, patch_size, patch_size, 1],
        rates=[1, 1, 1, 1],
        padding="VALID",
    )
    patches = tf.reshape(patches, [batch_size, -1, patch_dim])
    return patches

In [7]:
class ScaleLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(ScaleLayer, self).__init__()

    def call(self, inputs):
        scale = inputs[1]
        return inputs[0] / scale

    
class MatMulLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(MatMulLayer, self).__init__()

    def call(self, inputs):
        return tf.matmul(inputs[0], inputs[1])

    
class MatMulLayerTranspose(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(MatMulLayerTranspose, self).__init__()

    def call(self, inputs):
        return tf.matmul(inputs[0], inputs[1], transpose_b=True)

In [8]:
"""
Fully Functional Modular version with Multiply layers instead of einsums
"""

num_classes = 10
patch_size = 4
channels = 1
patch_dim = channels * patch_size ** 2
d_model = 64*3
image_size = 28
num_patches = (image_size // patch_size) ** 2
batch_size = 500

# the data, shuffled and split between tran and test sets
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize input so we can train ANN with it.
# Will be converted back to integers for SNN layer.
x_train = x_train / 255
x_test = x_test / 255

# Add a channel dimension.
axis = 1 if keras.backend.image_data_format() == 'channels_first' else -1
x_train = np.expand_dims(x_train, axis)
x_test = np.expand_dims(x_test, axis)

# One-hot encode target vectors.
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)


# # Flat dataset loading
# (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# x_train, x_test = x_train / 255., x_test / 255.
# x_train = x_train.reshape((60000, 784))
# x_test = x_test.reshape((10000, 784))

input_shape = x_train.shape[1:]

inp = tf.keras.layers.Input(shape=input_shape)

patches = extract_patches(inp)
x = tf.keras.layers.Dense(d_model)(patches)

pos_emb = tf.Variable(initial_value=tf.random.uniform(shape=(1, num_patches + 1, d_model)), 
                      name="pos_emb", validate_shape=(1, num_patches + 1, d_model), trainable=True)
class_emb = tf.Variable(initial_value=tf.random.uniform(shape=(1, 1, d_model)), name="class_emb", 
                        validate_shape=(1, 1, d_model), trainable=True)

class_emb = tf.broadcast_to(class_emb, [batch_size, 1, d_model])

x = tf.concat([class_emb, x], axis=1)
x = x + pos_emb

# x = tf.keras.layers.Reshape([50, 64*3])(x)

# att = MultiHeadsAttModel(l=6*6, d=64*3 , dv=8*3, dout=32, nv = 8 )
# x = att([x,x,x])
l = 50
d = 64*3
dv = 8*3
dout = 32
nv = 8
# v1 = tf.keras.layers.Input()(x)
# q1 = tf.keras.layers.Input()(x)
# k1 = tf.keras.layers.Input()(x)

v2 = tf.keras.layers.Dense(dv*nv, activation="relu")(x)
q2 = tf.keras.layers.Dense(dv*nv, activation="relu")(x)
k2 = tf.keras.layers.Dense(dv*nv, activation="relu")(x)
print(v2.shape)
v = tf.keras.layers.Reshape([l, nv, dv])(v2)
q = tf.keras.layers.Reshape([l, nv, dv])(q2)
k = tf.keras.layers.Reshape([l, nv, dv])(k2)

# att = tf.keras.layers.Multiply()([q, k])
att = MatMulLayerTranspose()
att = att([q, k])
# att = tf.matmul(q, k , transpose_b=True)

# att = att/np.sqrt(dv)
# att = tf.keras.layers.Multiply()([att, 1/np.sqrt(dv)])
# att = tf.math.divide(att, np.sqrt(dv))
att = ScaleLayer()([att, np.sqrt(dv)])
# att = tf.einsum('baik,baij->bakj',q, k)/np.sqrt(dv)
att = tf.keras.layers.Softmax(axis=-1)(att)

# out = tf.einsum('bajk,baik->baji',att, v)

# out = tf.keras.layers.Multiply()([att, v])

out = MatMulLayer()([att, v])
# out = tf.matmul(att, v)

out = tf.keras.layers.Reshape([l, d])(out)
x = tf.keras.layers.Reshape([l, d])(x)
# out = tf.expand_dims(out, axis=3)
# x = tf.expand_dims(x, axis=3)
add = tf.keras.layers.Add()([out, x])

out = tf.keras.layers.Dense(dout, activation="relu")(add)

# x = tf.keras.layers.Reshape([6, 6, 32])(out)

# out = tf.keras.layers.BatchNormalization()(out)

out = tf.keras.layers.Flatten()(out)
# out = tf.keras.layers.Dense(32, activation='relu')(out)
out = tf.keras.layers.Dense(num_classes, activation='softmax')(out)

model = tf.keras.Model(inputs=inp, outputs=out)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=500, epochs=5, verbose=1, validation_data=(x_test, y_test))

The following Variables were used a Lambda layer's call (tf.__operators__.add), but
are not present in its tracked objects:
  <tf.Variable 'pos_emb:0' shape=(1, 50, 192) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
(500, 50, 192)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5b009f2d00>

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 28, 28, 1)]  0                                            
__________________________________________________________________________________________________
tf.compat.v1.shape (TFOpLambda) (4,)                 0           input_1[0][0]                    
__________________________________________________________________________________________________
tf.image.extract_patches (TFOpL (None, 7, 7, 16)     0           input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici ()                   0           tf.compat.v1.shape[0][0]         
______________________________________________________________________________________________

## Standard Vision Transformer

In [10]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    LayerNormalization,
)
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
import tensorflow_datasets as tfds

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = Dense(embed_dim)
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)
        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.projection_dim)
        )
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)

        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )
        output = self.combine_heads(concat_attention)
        return output


class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.mlp = tf.keras.Sequential(
            [
                Dense(mlp_dim, activation=tfa.activations.gelu),
                Dropout(dropout),
                Dense(embed_dim),
                Dropout(dropout),
            ]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def call(self, inputs, training):
        inputs_norm = self.layernorm1(inputs)
        attn_output = self.att(inputs_norm)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = attn_output + inputs

        out1_norm = self.layernorm2(out1)
        mlp_output = self.mlp(out1_norm)
        mlp_output = self.dropout2(mlp_output, training=training)
        return mlp_output + out1

 The versions of TensorFlow you are currently using is 2.4.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [11]:
def extract_patches(self, images):
    batch_size = tf.shape(images)[0]
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, self.patch_size, self.patch_size, 1],
        strides=[1, self.patch_size, self.patch_size, 1],
        rates=[1, 1, 1, 1],
        padding="VALID",
    )
    patches = tf.reshape(patches, [batch_size, -1, self.patch_dim])
    return patches

num_patches = (image_size // patch_size) ** 2
self.patch_dim = channels * patch_size ** 2

self.patch_size = patch_size
self.d_model = d_model
self.num_layers = num_layers

self.rescale = Rescaling(1.0 / 255)
self.pos_emb = self.add_weight(
    "pos_emb", shape=(1, num_patches + 1, d_model)
)
self.class_emb = self.add_weight("class_emb", shape=(1, 1, d_model))
self.patch_proj = Dense(d_model)
self.enc_layers = [
    TransformerBlock(d_model, num_heads, mlp_dim, dropout)
    for _ in range(num_layers)
]
LayerNormalization(epsilon=1e-6)
Dense(mlp_dim, activation=tfa.activations.gelu),
Dropout(dropout),
Dense(num_classes),



batch_size = tf.shape(x)[0]
x = Rescaling(1.0 / 255)(x)
patches = extract_patches(x)
x = Dense(d_model)(patches)

class_emb = tf.broadcast_to(
    self.class_emb, [batch_size, 1, self.d_model]
)
print(x.shape)
print(class_emb.shape)
print(self.pos_emb.shape)
x = tf.concat([class_emb, x], axis=1)
x = x + self.pos_emb

for layer in self.enc_layers:
    x = layer(x, training)

# First (class token) is used for classification
x = self.mlp_head(x[:, 0])

NameError: name 'self' is not defined

In [12]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

logdir = "logs"
image_size = 28
patch_size = 4
num_layers = 4
d_model = 64
num_heads = 4
mlp_dim = 128
lr = 3e-4
weight_decay = 1e-4
batch_size = 64
epochs = 5

In [13]:
ds = tfds.load("mnist", as_supervised=True)

ds_train = (ds["train"].shuffle(5 * batch_size).batch(batch_size))

ds_test = (
    ds["test"]
    .cache()
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)
next(iter(ds_train))[0].shape

TensorShape([64, 28, 28, 1])

In [14]:
import numpy as np
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize input so we can train ANN with it.
# Will be converted back to integers for SNN layer.
# x_train = x_train / 255
# x_test = x_test / 255

# Add a channel dimension.
axis = 1 if tf.keras.backend.image_data_format() == 'channels_first' else -1
x_train = tf.expand_dims(x_train, axis)
x_test = tf.expand_dims(x_test, axis)

# One-hot encode target vectors.
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

In [15]:
model = VisionTransformer(
    image_size=image_size,
    patch_size=patch_size,
    num_layers=num_layers,
    num_classes=10,
    d_model=d_model,
    num_heads=num_heads,
    mlp_dim=mlp_dim,
    channels=1,
)

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"],
)
# model.fit(
#     ds_train,
#     validation_data=ds_test,
#     epochs=1,
# )

# model.compile('adam', 'categorical_crossentropy', ['accuracy'])
model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

NameError: name 'VisionTransformer' is not defined

In [33]:
model.summary()

Model: "vision_transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
rescaling (Rescaling)        multiple                  0         
_________________________________________________________________
dense_57 (Dense)             multiple                  1088      
_________________________________________________________________
transformer_block (Transform multiple                  33472     
_________________________________________________________________
transformer_block_1 (Transfo multiple                  33472     
_________________________________________________________________
transformer_block_2 (Transfo multiple                  33472     
_________________________________________________________________
transformer_block_3 (Transfo multiple                  33472     
_________________________________________________________________
sequential_4 (Sequential)    (None, 10)         

### Spiking Vision Transformer

In [16]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    LayerNormalization,
    RNN
)
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
import tensorflow_datasets as tfds

class SpikingMultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = Dense(embed_dim)
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)
        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.projection_dim)
        )
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)

        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )
        output = self.combine_heads(concat_attention)
        return output


class SpikingTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.att = SpikingMultiHeadSelfAttention(embed_dim, num_heads)
        self.mlp = tf.keras.Sequential(
            [
                Dense(mlp_dim),
                RNN(SpikingReLU(mlp_dim), return_sequences=True, return_state=False, stateful=True),
                Dropout(dropout),
                Dense(embed_dim),
                Dropout(dropout),
            ]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def call(self, inputs, training):
        inputs_norm = self.layernorm1(inputs)
        attn_output = self.att(inputs_norm)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = attn_output + inputs

        out1_norm = self.layernorm2(out1)
        mlp_output = self.mlp(out1_norm)
        mlp_output = self.dropout2(mlp_output, training=training)
        return mlp_output + out1

In [20]:
batch_size = 50
dropout = 0.1
input_shape = x_train.shape[1:]

def extract_patches(images):
    batch_size = tf.shape(images)[0]
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, patch_size, patch_size, 1],
        strides=[1, patch_size, patch_size, 1],
        rates=[1, 1, 1, 1],
        padding="VALID",
    )
    patches = tf.reshape(patches, [batch_size, -1, patch_dim])
    return patches

inp = tf.keras.layers.Input(shape=(input_shape))
x = Rescaling(1.0 / 255)(inp)

# =============== VISION PART =====================
# patching, positional embedding and class embedding
patches = extract_patches(x)
x = Dense(d_model)(patches)

pos_emb = tf.Variable(initial_value=tf.random.uniform(shape=(1, num_patches + 1, d_model)), 
                      name="pos_emb", validate_shape=(1, num_patches + 1, d_model), trainable=True)
class_emb = tf.Variable(initial_value=tf.random.uniform(shape=(1, 1, d_model)), name="class_emb", 
                        validate_shape=(1, 1, d_model), trainable=True)

class_emb = tf.broadcast_to(class_emb, [batch_size, 1, d_model])

x = tf.concat([class_emb, x], axis=1)
x = x + pos_emb

# Transformer Blocks
x = TransformerBlock(d_model, num_heads, mlp_dim, dropout)(x)

# ================= MLP HEAD ===================
x = Dense(mlp_dim, activation=tf.nn.relu)(x[:, 0])
x = Dense(num_classes)(x)

# ================ Model compilation and training ==================
model = tf.keras.models.Model(inputs=inp, outputs=x)

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"],
)

model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

The following Variables were used a Lambda layer's call (tf.__operators__.add_1), but
are not present in its tracked objects:
  <tf.Variable 'pos_emb:0' shape=(1, 50, 64) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


The following Variables were used a Lambda layer's call (tf.__operators__.add_1), but
are not present in its tracked objects:
  <tf.Variable 'pos_emb:0' shape=(1, 50, 64) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
TensorFlow Addons has compiled its custom ops against TensorFlow 2.2.0, and there are no compatibility guarantees between the two versions. 
This means that you might get segfaults when loading the custom op, or other kind of low-level errors.
 If you do, do not file an issue on Github. This is a known limitation.

It might help you to fallback to pure Python ops with TF_ADDONS_PY_OPS . To do that, see https://github.com/tensorflow/addons#gpucpu-custom-ops 

You can also change the TensorFlow version installed on your system. You would need a TensorFlow version equal to or above 2.2.0 and strictly below 2.3.0.



<tensorflow.python.keras.callbacks.History at 0x7f5afdd38e50>

In [21]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 28, 28, 1)]  0                                            
__________________________________________________________________________________________________
rescaling_3 (Rescaling)         (None, 28, 28, 1)    0           input_5[0][0]                    
__________________________________________________________________________________________________
tf.compat.v1.shape_1 (TFOpLambd (4,)                 0           rescaling_3[0][0]                
__________________________________________________________________________________________________
tf.image.extract_patches_1 (TFO (None, 7, 7, 16)     0           rescaling_3[0][0]                
____________________________________________________________________________________________

In [22]:
d_model = 64
num_heads = 4
mlp_dim = 128
dropout = 0.1


def convert(model, weights, x_test, y_test):
    print("Converted model:\n" + "-"*32)
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.InputLayer):
            print("Input Layer")
            inputs = tf.keras.Input(shape=(1, model.layers[0].input_shape[0][1]), batch_size=y_test.shape[0])
            x = inputs        
        elif isinstance(layer, tf.keras.layers.Dense):
            x = tf.keras.layers.Dense(layer.output_shape[1])(x)
            # x = tf.keras.layers.RNN(DenseRNN(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            if layer.activation.__name__ == 'linear':
                print("Dense Layer w/o activation")
                pass
            elif layer.activation.__name__ == 'relu':
                print("Dense Layer with SpikingReLU")
                x = tf.keras.layers.RNN(SpikingReLU(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            elif layer.activation.__name__ == 'sigmoid':
                print("Dense Layer with SpikingSigmoid")
                x = tf.keras.layers.RNN(SpikingSigmoid(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            elif layer.activation.__name__ == 'tanh':
                print("Dense Layer with SpikingTanh")
                x = tf.keras.layers.RNN(SpikingTanh(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            else:
                print('[Info] Activation type', layer.activation.__name__, 'not implemented')
        elif isinstance(layer, tf.keras.layers.ReLU):
            print("SpikingReLU Layer")
            x = tf.keras.layers.RNN(SpikingReLU(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
        elif isinstance(layer, tf.keras.layers.Softmax):
            print("Accumulate + Softmax Layer")
            print(layer.output_shape[1])
            x = tf.keras.layers.RNN(Accumulate(layer.output_shape[1]), return_sequences=True, return_state=False, stateful=True)(x)
            x = tf.keras.layers.Softmax()(x)
        elif isinstance(layer, TransformerBlock):
            x  = SpikingTransformerBlock(d_model, num_heads, mlp_dim, dropout)
        else:
            print("[Info] Layer type ", layer, "not implemented, so we use it's keras version")
            x = layer(x)
    spiking = tf.keras.models.Model(inputs=inputs, outputs=x)
    print("-"*32 + "\n")

    spiking.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer="adam",
        metrics=["sparse_categorical_accuracy"],)

    spiking.set_weights(weights)
    return spiking

In [23]:
weights = get_normalized_weights(model, x_train, percentile=85)

##################################################
# Preprocessing for RNN 
x_train = np.expand_dims(x_train, axis=1)  # (60000, 784) -> (60000, 1, 784)
x_test = np.expand_dims(x_test, axis=1)
#x_rnn = np.tile(x_train, (1, 1, 1))
#y_rnn = y_train  # np.tile(x_test, (1, timesteps, 1))

##################################################
# Conversion to spiking model
snn = convert(ann, weights, x_test, y_test)
evaluate_conversion(snn, ann, x_test, y_test, testacc, timesteps=50)

InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [50,1,64] vs. shape[1] = [60000,49,64] [Op:ConcatV2] name: concat