# Transformer Encoder Architecture

<img src="./img/encoder_only.png" alt="encoder_only" style="width: 600px;"/>

Encoder-only transformers simplify the original architecture for tasks whose primary focus is understanding and representing the input data rather than generating sequences, for example text classification, named-entity recognition, or intent recognition.

It consists of multiple encoder layers. Each encoder layer incorporates a multi-headed self-attention mechanism to capture relationships between elements in the sequence, followed by feed-forward layers to map this knowledge into abstract nonlinear representations.

## PyTorch TransformerEncoderLayer

In [1]:
import torch
import torch.nn as nn

In [2]:
# Toy sentiment corpus: four short review sentences, one string per example.
sample_texts = [
    "I love this product",
    "This is terrible",
    "Could be better",
    "This is the best",
]

In [3]:
# One sentiment label per entry of sample_texts (1 = positive, 0 = negative).
labels = [1, 0, 0, 1]

In [4]:
# The first three examples are used for training; the remaining one is held out.
train_data = sample_texts[:3]
test_data = sample_texts[3:]
train_labels = labels[:3]
test_labels = labels[3:]

TransformerEncoderLayer:

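* d_model - the dimensionality of the token embeddings (the model's hidden size); larger values give the model more representational capacity
* nhead - the number of attention heads; each head attends to the sequence independently, so more heads let the model track several kinds of word context simultaneously (d_model must be divisible by nhead)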

In [5]:
class TransformerEncoder(nn.Module):
    """Binary sentence classifier built on PyTorch's stock encoder stack.

    The input is run through ``num_layers`` ``nn.TransformerEncoderLayer``
    blocks, averaged over the token dimension, and projected to two logits.

    Args:
        embed_size: dimensionality of each token embedding (``d_model``).
        heads: number of attention heads (``nhead``).
        num_layers: number of stacked encoder layers.
        dropout: dropout probability used inside every encoder layer.
    """

    def __init__(self, embed_size, heads, num_layers, dropout):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=heads,
            # Forward the caller's value; otherwise the layer silently falls
            # back to its own default and the argument has no effect.
            dropout=dropout,
            # forward() averages over dim=1, i.e. it treats the input as
            # (batch, seq, embed). Without batch_first=True the encoder reads
            # dim 0 as the sequence axis, so a (1, seq, embed) input becomes
            # `seq` independent length-1 sequences and no token ever attends
            # to another one.
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Two output logits: one per sentiment class.
        self.fc = nn.Linear(embed_size, 2)

    def forward(self, x):
        """Classify a batch of embedded sentences.

        Args:
            x: float tensor of shape (batch, seq_len, embed_size).

        Returns:
            Tensor of shape (batch, 2) holding unnormalised class logits.
        """
        x = self.encoder(x)
        # Mean-pool over the token axis to get one vector per sentence.
        x = x.mean(dim=1)
        return self.fc(x)

In [None]:
# Build the sentiment classifier: 512-dimensional embeddings, 8 attention
# heads, 3 stacked encoder layers.
model = TransformerEncoder(
    embed_size=512,
    heads=8,
    num_layers=3,
    dropout=0.5,
)

In [None]:
# Cross-entropy over the model's class logits, optimised with Adam.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# training loop
# NOTE(review): token_embeddings is not defined in the cells visible here; it is
# presumably a dict mapping each token to a (1, 512) embedding tensor - TODO confirm.
# Make sure dropout is active even if the model was switched to eval mode earlier.
model.train()
for epoch in range(10):
    for sentence, label in zip(train_data, train_labels):
        # Split the sentence into tokens and stack their embeddings along dim 1
        tokens = sentence.split()
        data = torch.stack([token_embeddings[token] for token in tokens], dim=1)
        output = model(data)
        loss = loss_function(output, torch.tensor([label]))
        # Zero the gradients, backpropagate, and update the weights.
        # The Tensor method is `backward`; `backwards` raises AttributeError.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# testing
def predict(sentence):
    """Return the predicted sentiment of ``sentence``.

    The sentence is split on whitespace, each token is looked up in the
    module-level ``token_embeddings`` mapping, and the stacked embeddings are
    passed through the module-level ``model``.

    Args:
        sentence: the text to classify.

    Returns:
        ``'positive'`` if the arg-max class is 1, otherwise ``'negative'``.

    Note:
        Switches ``model`` to eval mode as a side effect. Tokens missing from
        ``token_embeddings`` get a freshly sampled random (1, 512) embedding,
        so predictions for sentences with unknown tokens are not deterministic.
    """
    model.eval()
    tokens = sentence.split()
    # Deactivate the gradient computations and get the sentiment prediction.
    with torch.no_grad():
        # The lookup key must be the comprehension variable itself; the
        # original iterated with `i` but looked up `token`, a NameError.
        data = torch.stack(
            [token_embeddings.get(token, torch.rand((1, 512))) for token in tokens],
            dim=1,
        )
        output = model(data)
        predicted = torch.argmax(output, dim=1)
    return 'positive' if predicted.item() == 1 else 'negative'

In [2]:
# Two lower-cased, whitespace-separated example sentences.
sample_text = [
    "the animal didn't cross the street because it was too tired",
    "the cat sat on the mat",
]

## Feed Forward transformation

Positional encoding and the Multi-Head Attention mechanism are explained in `./transformers_fundamentals.ipynb`.

In [8]:
class FeedForwardTransformation(nn.Module):
    """Feed-forward sublayer: Linear -> ReLU -> Linear.

    Both linear layers act on the last dimension of the input, so the output
    has the same shape as the input.

    Args:
        d_model: the embeddings' dimensionality (input and output width).
        d_ff: the dimension between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU, and project back to ``d_model``."""
        expanded = self.fc1(x)
        activated = self.relu(expanded)
        return self.fc2(activated)

# Transformer Encoder from scratch

In [9]:
from transformers_utils import PositionalEncoder, MultiHeadAttention

In [10]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward sublayer.

    Each sublayer's output goes through dropout, is added back to its input
    (residual connection), and is then layer-normalised.

    Args:
        d_model: width of the token representations.
        num_heads: number of heads passed to ``MultiHeadAttention``.
        d_ff: inner width of the feed-forward sublayer.
        dropout: dropout probability applied to both sublayer outputs.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardTransformation(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is forwarded to the attention module."""
        # Self-attention: x is passed as the first three arguments
        # (presumably query, key and value - confirm against MultiHeadAttention).
        attended = self.dropout(self.attention(x, x, x, mask))
        x = self.norm1(x + attended)
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)
        

The mask is used during the forward pass, specifically in the attention stage, to prevent the model from attending to padding tokens.

The self-attention mechanism should not attend to padded tokens, since they carry no relevant information for the language task. This is where a padding mask, with zeros at the padded positions of the sequence, is used.

In [11]:
class TransformerEncoder(nn.Module):
    """Encoder body assembled from the hand-written building blocks.

    Token ids are embedded, passed through the positional encoder, and then
    fed through ``num_layers`` ``EncoderLayer`` blocks in sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / hidden width.
        num_layers: how many ``EncoderLayer`` blocks to stack.
        num_heads: attention heads per layer.
        d_ff: inner width of each layer's feed-forward sublayer.
        dropout: dropout probability used inside every layer.
        max_seq_length: forwarded to ``PositionalEncoder``.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # NOTE: the attribute name is misspelled ("poistional"); it is kept
        # as-is so that existing references to it keep working.
        self.poistional_encoding = PositionalEncoder(d_model, max_seq_length)
        blocks = [
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, mask):
        """Encode token ids ``x``; ``mask`` is handed unchanged to every layer."""
        hidden = self.poistional_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden
        

The transformer head is the final output layer, designed for a specific task.

In [12]:
class ClassifierHead(nn.Module):
    """Classification head: a linear projection followed by log-softmax.

    Args:
        d_model: width of the incoming features.
        num_classes: number of output classes.
    """

    def __init__(self, d_model, num_classes):
        super().__init__()
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return log-probabilities over the classes along the last dimension."""
        return self.fc(x).log_softmax(dim=-1)

## Testing Encoder

In [13]:
# Hyperparameters for the smoke test of the from-scratch encoder.
num_classes = 3  # output size of the classification head
vocab_size = 10000  # number of rows in the embedding table
batch_size = 8  # sequences per batch
d_model = 512  # embedding / hidden width
num_heads = 8  # attention heads per encoder layer
num_layers = 6  # stacked encoder layers
d_ff = 2048  # inner width of the feed-forward sublayer
sequence_length = 64  # tokens per sequence; also used as max_seq_length
dropout = 0.1  # dropout probability inside each encoder layer

In [14]:
# Random token ids in [0, vocab_size), one row per sequence in the batch.
input_sequence = torch.randint(
    low=0,
    high=vocab_size,
    size=(batch_size, sequence_length),
)
# Random 0/1 mask, used here only to exercise the mask argument.
mask = torch.randint(
    low=0,
    high=2,
    size=(sequence_length, sequence_length),
)

In [15]:
# Build the transformer's body (the encoder) and its task-specific head
encoder = TransformerEncoder(
    vocab_size=vocab_size,
    d_model=d_model,
    num_layers=num_layers,
    num_heads=num_heads,
    d_ff=d_ff,
    dropout=dropout,
    max_seq_length=sequence_length,
)
classifier = ClassifierHead(d_model=d_model, num_classes=num_classes)

In [16]:
# The forward pass: encode the batch, then apply the classification head
# to the encoder output.
output = encoder(input_sequence, mask)
classification = classifier(output)
print("Classification outputs for a batch of ", batch_size, "sequences:")
# Show only the scores at index [0][0]
# (presumably first sequence, first token position - TODO confirm).
print(classification[0][0])

Classification outputs for a batch of  8 sequences:
tensor([-1.7434, -0.8088, -0.9685], grad_fn=<SelectBackward0>)
