In [None]:
import keras
import numpy as np
from keras import layers
from keras import ops  # operations for tensor manipulation functions

In [2]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual (skip)
    connection and layer normalization, in that order.

    Args:
        embed_dim: Width of the feed-forward output; also passed as `key_dim`
            to the attention layer. The residual additions in `call` need the
            input's last axis to have this size.
        num_heads: Number of attention heads.
        ff_dim: Number of units in the hidden (ReLU) layer of the
            feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply the block to `inputs` and return a tensor of the same shape.

        Assumes `inputs` is (batch, seq_len, embed_dim) -- TODO confirm
        against callers.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        # First residual connection + normalization.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        # Second residual connection + normalization.
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be rebuilt.

        Needed for saving or cloning a model that contains this layer.
        """
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table can hold;
            input sequences must not be longer than this.
        vocab_size: Number of distinct token ids the token table can hold.
        embed_dim: Size of each embedding vector.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`).

    Note:
        `mask_zero` is not set on the token embedding, so id 0 is embedded
        like any other token and no mask is produced.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each token's position.

        Assumes `x` holds integer token ids with the sequence on the last
        axis, e.g. (batch, seq_len) -- TODO confirm against callers.
        """
        # Length of the sequence actually passed in (not the constructor's maxlen).
        seq_len = ops.shape(x)[-1]
        # Position indices 0 .. seq_len - 1, looked up in the position table.
        positions = ops.arange(start=0, stop=seq_len, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # (seq_len, embed_dim) position vectors broadcast over the leading axes.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be rebuilt.

        Needed for saving or cloning a model that contains this layer.
        """
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [4]:
vocab_size = 20000  # keep only the 20,000 most frequent words of the corpus
max_len = 200  # number of tokens each review is padded / truncated to (see the padding cell)

# Seed Python, NumPy and the Keras backend so that weight initialization,
# dropout and shuffling are repeatable across "Restart & Run All"
# (as far as the backend allows).
SEED = 42
keras.utils.set_random_seed(SEED)

# NOTE: the second split returned by load_data is used as validation data here.
(X_train, y_train), (X_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
print(f"Training sequences: {len(X_train)}")
print(f"Validation sequences: {len(X_val)}")

Training sequences: 25000
Validation sequences: 25000


In [5]:
# Bring every review to exactly max_len tokens. With the pad_sequences
# defaults, shorter reviews are zero-filled on the left and longer ones keep
# their last max_len tokens.
X_train, X_val = (
    keras.utils.pad_sequences(split, maxlen=max_len) for split in (X_train, X_val)
)

In [6]:
# Peek at the padded training matrix (NumPy elides the middle of large arrays).
# Leading zeros are the fill value pad_sequences inserts for reviews shorter
# than max_len.
X_train

array([[   5,   25,  100, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       ...,
       [   0,    0,    0, ...,    4, 3586,    2],
       [   0,    0,    0, ...,   12,    9,   23],
       [   0,    0,    0, ...,  204,  131,    9]])

In [7]:
# Symbolic model input: one padded review, i.e. a vector of max_len token ids
# (the batch axis is implicit).
inputs = layers.Input(
    shape=(max_len,),
)

In [8]:
# Token + position embedding: maps each token id to a vector and adds a
# learned vector for its position in the sequence.
embed_dim = 32  # size of the embedding vector for each token
embedding_layer = TokenAndPositionEmbedding(
    maxlen=max_len,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
)
x = embedding_layer(inputs)




In [9]:
# One Transformer encoder block on top of the embeddings.
num_heads = 2  # number of attention heads
ff_dim = 32  # hidden layer size of the feed-forward network inside the block
transformer_block = TransformerBlock(
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
)
x = transformer_block(x)

In [10]:
# Average over the sequence (token) axis so each review is summarized by a
# single embed_dim-sized vector.
pooling_layer = layers.GlobalAveragePooling1D()
x = pooling_layer(x)

In [11]:
# Dropout regularization: during training, randomly zero 10% of the pooled
# features to avoid overfitting.
x = layers.Dropout(rate=0.1)(x)

In [12]:
# One fully connected layer with 20 units; ReLU lets it learn non-linear
# relationships.
x = layers.Dense(units=20, activation="relu")(x)

In [13]:
# Dropout regularization again, this time on the hidden layer's activations:
# randomly zero 10% of them during training to avoid overfitting.
x = layers.Dropout(rate=0.1)(x)

In [14]:
# Output layer: a single dense layer with 2 units, one per class
# (binary classification).
# Softmax converts the two output values into probabilities.
outputs = layers.Dense(units=2, activation="softmax")(x)

In [15]:
# Tie the input tensor and the output tensor together into a functional model.
model = keras.Model(inputs, outputs)

In [16]:
# Configure training: Adam optimizer, sparse categorical cross-entropy as the
# loss, and accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [17]:
# Train for 2 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_val, y_val),
)

Epoch 1/2
782/782 ━━━━━━━━━━━━━━━━━━━━ 32s 32ms/step - accuracy: 0.7092 - loss: 0.5250 - val_accuracy: 0.8779 - val_loss: 0.2898
Epoch 2/2
782/782 ━━━━━━━━━━━━━━━━━━━━ 25s 31ms/step - accuracy: 0.9287 - loss: 0.1964 - val_accuracy: 0.8743 - val_loss: 0.3053


In [18]:
# Sanity-check the trained model on a single validation review.
# Slicing with [:1] keeps the leading batch axis, so the array already has the
# (1, max_len) shape the model expects and no reshape is needed.
input_sequence = X_val[:1]
predictions = model.predict(input_sequence)

# Print the predicted class probabilities
print("Predicted class probabilities:", predictions)

# Get the predicted class label (0 or 1): take the argmax along the class axis
# (one label per row, so this stays correct for larger batches too), then
# unwrap the single-row batch.
predicted_class = np.argmax(predictions, axis=-1)[0]
print("Predicted class:", predicted_class)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 328ms/step
Predicted class probabilities: [[0.9364726  0.06352738]]
Predicted class: 0
