In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    Dropout,
    LayerNormalization
)

In [27]:
# -----------------------------
# Positional Encoding Layer
# -----------------------------
class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model):
      super().__init__()
      self.max_len = max_len
      self.d_model = d_model
      self.pos_encoding = self.positional_encoding(max_len, d_model)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / tf.pow(10000., (2 * (i//2)) / tf.cast(d_model, tf.float32))
        return pos * angle_rates

    def positional_encoding(self, max_len, d_model):
        angle_rads = self.get_angles(
            pos=tf.range(max_len)[:, tf.newaxis],
            i=tf.range(d_model)[tf.newaxis, :],
            d_model=d_model
        )

        # QUESTION : apply sin to even indices, cos to odd
        angle_rads[:, 0::2] = tf.math.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = tf.math.cos(angle_rads[:, 1::2])

        return angle_rads[tf.newaxis, ...]

    def call(self, x):
        return x + tf.cast(self.pos_encoding[:, :tf.shape(x)[1], :], x.dtype)

In [28]:
# -----------------------------
# Transformer Encoder Block
# -----------------------------
def transformer_encoder_block(embed_dim, num_heads, ff_dim, dropout=0.1):
    inputs = layers.Input(shape=(None, embed_dim))

    # QUESTION : Apply Layer Norm + MultiHead Attention
    x = layers.LayerNormalization()(inputs) #normalization
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim//num_heads,
        dropout=dropout
    )(x, x)
    x = layers.Add()([inputs, attention_output])  # Residual

    # QUESTION : Apply Feed Forward Network with Approrpriate Activation fxn
    ff = layers.LayerNormalization()(x)
    ff = layers.Dense(ff_dim, activation="relu")(ff)
    ff = layers.Dense(embed_dim)(ff)
    x = layers.Add()([x, ff])  # Residual

    return Model(inputs, x, name="TransformerEncoderBlock")


In [29]:

# -----------------------------
# Build the Transformer Model
# -----------------------------
def build_transformer(
        max_len=100,
        vocab_size=10000,
        embed_dim=64,
        num_heads=4,
        ff_dim=128,
        num_layers=3
    ):
    inputs = layers.Input(shape=(None,), dtype=tf.int32)

    # QUESTION : Apply Token Embedding and Positional Encoding

    x = layers.Embedding(vocab_size, embed_dim)(inputs) #embedding
    x = PositionalEncoding(max_len, embed_dim)(x) #positional encoding

    # 3 Transformer Encoder Blocks
    for _ in range(num_layers):
        x = transformer_encoder_block(embed_dim, num_heads, ff_dim)(x)

    # Classification head (example)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = Model(inputs, outputs, name="Transformer_3Layer")
    return model

In [30]:
# -----------------------------
# Instantiate + Compile Model
# -----------------------------
model = build_transformer()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), #fill appropriate learning rate
    loss="binary_crossentropy", #fill appropriate loss fxn
    metrics=["accuracy"]
)

model.summary()

TypeError: `x` and `y` must have the same dtype, got tf.int32 != tf.float32.