# Transformer Encoder for IMDB Sentiment Classification

A small Transformer encoder built with Keras (MultiHeadAttention) used as a text classifier. This demonstrates attention and parallel sequence processing.

## 0. Environment / Install (run if needed)
Uncomment and run the install cell below if TensorFlow is missing from your environment. On Colab, you can skip any package that is already installed.

In [None]:
import sys

# Report the interpreter version up front so environment problems are easy to spot.
python_version = sys.version
print('Python', python_version)


In [None]:
# !pip install -q tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import numpy as np

# Load IMDB as integer-encoded reviews from keras.datasets (getting the raw text via tensorflow_datasets would require an extra install)
from tensorflow.keras.datasets import imdb
# Vocabulary cap handed to the dataset loader (and later to the vectorizer),
# and the fixed sequence length used when vectorizing.
vocab_size = 10000
maxlen = 200

# Each split is a pair of (integer-encoded reviews, labels).
train_split, test_split = imdb.load_data(num_words=vocab_size)
x_train, y_train = train_split
x_test, y_test = test_split

# Build the inverse lookup (id -> word) so reviews can be turned back into
# text for the vectorization demo. Ids are shifted by 3 so that 0-2 stay free
# for the special tokens added afterwards (this matches load_data's default
# index_from=3).
word_index = imdb.get_word_index()
index_word = {rank + 3: word for word, rank in word_index.items()}
index_word.update({0: '<pad>', 1: '<start>', 2: '<unk>'})

def decode_review(seq):
    """Turn a sequence of integer word ids back into a space-separated string.

    Each id is looked up in the module-level ``index_word`` mapping; ids that
    have no entry are rendered as ``'?'``. An empty sequence yields ``''``.
    """
    words = [index_word.get(token_id, '?') for token_id in seq]
    return ' '.join(words)

# Rebuild every review in both splits as a plain-text string.
x_train_text = list(map(decode_review, x_train))
x_test_text = list(map(decode_review, x_test))

# Sanity check: show the first 200 characters of the first training review.
first_review = x_train_text[0]
print('Example:', first_review[:200])

In [None]:
# Text vectorization
from tensorflow.keras.layers import TextVectorization
# Text -> integer-id layer. The vocabulary is capped at vocab_size entries and
# is learned from the training text only; output_sequence_length is set to
# maxlen.
vectorizer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=maxlen,
)
vectorizer.adapt(x_train_text)

# Build simple transformer encoder block
from tensorflow.keras.layers import Embedding, LayerNormalization, Dense, Dropout, MultiHeadAttention, GlobalAveragePooling1D, Input
from tensorflow.keras.models import Model

# Width of each token embedding (second argument to Embedding below).
embedding_dim = 64
# Number of attention heads; also divides embedding_dim to get the per-head size.
num_heads = 4
# Units in the hidden Dense layer of the encoder's feed-forward part.
ff_dim = 128

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one Transformer encoder block to ``inputs``.

    The block has two residual stages. In each stage the sub-layer output is
    passed through dropout and/or layer normalization and is then added to the
    stage's input (normalization is applied to the sub-layer output, before
    the residual addition).

    Args:
        inputs: Tensor whose last axis is the feature dimension. In this
            notebook it is the output of the Embedding layer.
        head_size: ``key_dim`` given to MultiHeadAttention.
        num_heads: Number of attention heads.
        ff_dim: Units in the hidden ReLU Dense layer of the feed-forward part.
        dropout: Rate used by both Dropout layers in the block.

    Returns:
        A tensor with the same last-axis size as ``inputs``.
    """
    # The feed-forward part must project back to this width so the second
    # residual addition lines up.
    model_dim = inputs.shape[-1]

    # Stage 1: self-attention (query and value are both `inputs`) + skip.
    self_attention = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)
    attended = self_attention(inputs, inputs)
    attended = Dropout(dropout)(attended)
    attended = LayerNormalization(epsilon=1e-6)(attended)
    attention_out = attended + inputs

    # Stage 2: position-wise feed-forward network + skip.
    hidden = Dense(ff_dim, activation='relu')(attention_out)
    hidden = Dropout(dropout)(hidden)
    projected = Dense(model_dim)(hidden)
    projected = LayerNormalization(epsilon=1e-6)(projected)
    return projected + attention_out

# Integer token ids of any sequence length go in; each id is embedded.
inputs = Input(shape=(None,), dtype='int64')
x = Embedding(vocab_size, embedding_dim)(inputs)

# Single encoder block; the embedding width is split evenly across the heads.
x = transformer_encoder(
    x,
    head_size=embedding_dim // num_heads,
    num_heads=num_heads,
    ff_dim=ff_dim,
)

# Pool the per-token features into one vector, regularize, and classify with
# a single sigmoid unit.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
outputs = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Prepare vectorized integer sequences.
# tf.constant turns the Python lists into variable-length string tensors.
# np.array() on a list of str would instead build a fixed-width '<U...' array
# in which every review is padded to the length of the longest one at 4 bytes
# per character, wasting a very large amount of memory on long reviews.
x_train_vect = vectorizer(tf.constant(x_train_text))
x_test_vect = vectorizer(tf.constant(x_test_text))

# Train (short for demo); validation_split=0.2 holds out a fifth of the
# training data for validation.
history = model.fit(
    x_train_vect,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
)

In [None]:
# Score the test split; with the single 'accuracy' metric compiled above,
# evaluate returns the loss followed by the accuracy.
test_metrics = model.evaluate(x_test_vect, y_test)
loss, acc = test_metrics
print(f'Test accuracy: {acc:.4f}')

## Notes
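- The Transformer uses attention to capture relationships across the entire sequence. This demo uses a single encoder block. Note that no positional encoding is added to the embeddings, so the model as built here is insensitive to word order.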