# Encoder-Only Transformer (BERT-like Model)

In this notebook, we implement an encoder-only transformer model similar to BERT. The model is designed for Masked Language Modeling (MLM), where certain tokens in the input are masked, and the model learns to predict the masked tokens.

### Import Libraries

In [1]:
%pip install torch numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F # mainly for ReLU
import numpy as np
import copy


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/arthurnurnberg/Library/Mobile Documents/com~apple~CloudDocs/Arbeit/TUD Tutor LLMs/06_lm_architectures/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/arthurnurnberg/Library/Mobile Documents/com~apple~CloudDocs/Arbeit/TUD Tutor LLMs/06_lm_architectures/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075,

In [3]:
# Seed PyTorch's global RNG first so that the random masking, the weight
# initialisation and dropout draw the same numbers on every "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Set the device: prefer "mps" (M series Mac), then "cuda", otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Special Tokens
You will come across several special tokens when preparing inputs for NLP models:
- The [PAD] token pads a sentence that is shorter than the desired fixed length.
- The [MASK] token replaces a token that the model has to predict during Masked Language Modeling.
- The [BOS] token marks the beginning of a sentence.
- The [EOS] token marks the end of a sentence.
- The [CLS] token is placed at the start of the input; its representation is used for sentence-level classification.
- The [SEP] token separates sentences (used by BERT).
- The [UNK] token stands for out-of-vocabulary (OOV) tokens, i.e. unknown tokens that are not included in the vocabulary.

In [4]:
# Toy vocabulary: the four special tokens come first, the regular words follow.
vocab = [
    '[PAD]', '[MASK]', '[CLS]', '[SEP]',
    'i', 'like', 'eating', 'apples', 'and', 'bananas', 'really',
]
vocab_size = len(vocab)

# Lookup tables in both directions (id -> token and token -> id).
idx_to_word = dict(enumerate(vocab))
word_to_idx = {token: index for index, token in idx_to_word.items()}

# Example sentences for showcasing possible input; each one is wrapped in [CLS] ... [SEP].
sentences = [
    ['[CLS]', *words, '[SEP]']
    for words in (
        ['i', 'like', 'eating', 'apples'],
        ['i', 'like', 'eating', 'bananas'],
        ['i', 'really', 'like', 'apples'],
    )
]

In [5]:
def mask_tokens(sentences, mask_prob=0.15):
    """Convert tokenised sentences to id tensors and randomly mask inner tokens.

    Args:
        sentences: List of token lists. Every token must be a key of the
            notebook-level ``word_to_idx`` mapping. The first and the last
            token of each sentence (``[CLS]`` / ``[SEP]`` in this notebook) are
            never masked.
        mask_prob: Probability with which each inner token is replaced by
            ``[MASK]`` in the returned inputs.

    Returns:
        Tuple ``(inputs, labels)`` of integer tensors, one row per sentence.
        ``inputs`` holds the (partly masked) token ids, ``labels`` holds the
        original id at *every* position, not only at the masked ones.
        Sentences shorter than the longest one are right-padded with the
        ``[PAD]`` id in both tensors, so sentences of different lengths can be
        batched together.
    """
    pad_id = word_to_idx['[PAD]']
    mask_id = word_to_idx['[MASK]']
    # default=0 keeps an empty sentence list from raising in max().
    max_len = max((len(sent) for sent in sentences), default=0)

    inputs = []
    labels = []
    for sent in sentences:
        label_ids = [word_to_idx[word] for word in sent]
        input_ids = label_ids.copy()
        for i in range(1, len(sent) - 1):  # Exclude the first and last token ([CLS] and [SEP])
            if torch.rand(1).item() < mask_prob:
                input_ids[i] = mask_id
        # Right-pad so that every row has the same length; torch.tensor rejects ragged lists.
        padding = [pad_id] * (max_len - len(sent))
        inputs.append(input_ids + padding)
        labels.append(label_ids + padding)
    return torch.tensor(inputs, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

In [6]:
# Prepare data: mask the sample sentences once and move both resulting tensors to the selected device.
inputs, labels = (tensor.to(device) for tensor in mask_tokens(sentences))

In [7]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    The table is computed once in ``__init__`` and stored as the non-trainable
    buffer ``pe`` of shape ``(1, max_len, model_dimension)``. Even columns hold
    sines, odd columns hold cosines of ``position * frequency``.

    Args:
        model_dimension: Size of the last (embedding) dimension. May be odd.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, model_dimension, max_len=512):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        frequency_divisor = torch.exp(
            torch.arange(0, model_dimension, 2).float() * (-np.log(10000.0) / model_dimension)
        )
        angles = position * frequency_divisor  # (max_len, ceil(model_dimension / 2))

        positional_encoding = torch.zeros(max_len, model_dimension)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd model_dimension there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd-indexed columns.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : model_dimension // 2])
        positional_encoding = positional_encoding.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', positional_encoding)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, sequence, model_dimension)``: the table is
        sliced along dimension 1 and broadcast over dimension 0.
        """
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return x

In [8]:
# Encoder Layer
class EncoderLayer(nn.Module):
    """A single encoder block: multi-head self-attention, then a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition and
    a LayerNorm (normalisation is applied *after* the residual sum).

    Args:
        model_dimension: Width of the incoming and outgoing hidden states.
        num_attention_heads: Number of heads of the self-attention module.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside the attention module and in
            the three ``nn.Dropout`` layers.
    """

    def __init__(self, model_dimension, num_attention_heads, dim_feedforward, dropout=0.1):
        super().__init__()
        # Sub-layer 1: multi-head self-attention
        self.self_attention_layer = nn.MultiheadAttention(
            model_dimension, num_attention_heads, dropout=dropout
        )
        # Sub-layer 2: feed-forward network (expand -> ReLU -> dropout -> project back)
        self.linear1 = nn.Linear(model_dimension, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, model_dimension)
        # One LayerNorm per sub-layer
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)
        # One residual dropout per sub-layer
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, input_tensor, source_mask=None):
        """Run the block on ``input_tensor``; ``source_mask`` is forwarded as ``attn_mask``.

        The attention module is built without ``batch_first``, so the input is
        expected in its default layout (sequence, batch, model_dimension).
        """
        # Self-attention with query = key = value = input; keep only the attended values.
        attention_output, _ = self.self_attention_layer(
            input_tensor, input_tensor, input_tensor, attn_mask=source_mask
        )
        hidden = self.norm1(input_tensor + self.dropout1(attention_output))
        # Feed-forward network with its own residual connection and normalisation.
        expanded = F.relu(self.linear1(hidden))
        projected = self.linear2(self.dropout(expanded))
        return self.norm2(hidden + self.dropout2(projected))

In [9]:
# Encoder
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of encoder layers.

    Args:
        num_layers: Number of ``EncoderLayer`` copies in the stack.
        model_dimension: Embedding / hidden-state width.
        num_attention_heads: Attention heads per layer.
        vocab_size: Number of rows of the embedding table.
        dim_feedforward: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability handed to every layer.
    """

    def __init__(self, num_layers, model_dimension, num_attention_heads, vocab_size, dim_feedforward, dropout=0.1):
        super().__init__()
        self.model_dimension = model_dimension
        self.embedding = nn.Embedding(vocab_size, model_dimension)
        self.positional_encoder = PositionalEncoding(model_dimension)
        # Build one template layer and deep-copy it, so all layers start from identical weights.
        template_layer = EncoderLayer(model_dimension, num_attention_heads, dim_feedforward, dropout)
        stacked_layers = [copy.deepcopy(template_layer) for _ in range(num_layers)]
        self.layers = nn.ModuleList(stacked_layers)
        self.norm = nn.LayerNorm(model_dimension)

    def forward(self, input_sequence, src_mask=None):
        """Encode a batch of token ids; the result has sequence and batch dimensions swapped.

        ``src_mask`` is passed unchanged to every layer.
        """
        # Scale the embeddings by sqrt(model_dimension) before adding position information.
        embedding_scale = np.sqrt(self.model_dimension)
        hidden = self.positional_encoder(self.embedding(input_sequence) * embedding_scale)
        # The layers expect (sequence length, batch size, embedding size).
        hidden = hidden.transpose(0, 1)
        for layer in self.layers:
            hidden = layer(hidden, src_mask)
        return self.norm(hidden)

In [10]:
# BERT Model
class BERTModel(nn.Module):
    """Encoder-only model with a linear head that scores every vocabulary entry per position.

    The constructor arguments are forwarded unchanged to ``Encoder``;
    ``model_dimension`` and ``vocab_size`` additionally size the output layer.
    """

    def __init__(self, num_layers, model_dimension, num_attention_heads, vocab_size, dim_feedforward, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(
            num_layers,
            model_dimension,
            num_attention_heads,
            vocab_size,
            dim_feedforward,
            dropout,
        )
        self.output_layer = nn.Linear(model_dimension, vocab_size)

    def forward(self, input_sequence, input_sequence_mask=None):
        """Return logits of shape (batch size, sequence length, vocab_size)."""
        hidden_sequence_first = self.encoder(input_sequence, input_sequence_mask)
        # Swap back to (batch size, sequence length, embedding size) before projecting.
        hidden_batch_first = hidden_sequence_first.transpose(0, 1)
        return self.output_layer(hidden_batch_first)

In [11]:
# Hyperparameters of the toy model
model_dimension = 64      # width of the token embeddings / hidden states
num_attention_heads = 4   # attention heads per encoder layer
dim_feedforward = 128     # hidden width of each feed-forward network
num_layers = 2            # number of stacked encoder layers
dropout = 0.1             # dropout probability handed to the model

In [12]:
# Build the model on the selected device.
model = BERTModel(
    num_layers,
    model_dimension,
    num_attention_heads,
    vocab_size,
    dim_feedforward,
    dropout,
).to(device)

# Cross-entropy over the vocabulary; targets equal to the [PAD] id are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx['[PAD]'])

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [13]:
# Training loop: full-batch gradient descent on the (already masked) sample sentences.
epochs = 50
model.train()  # the mode never changes inside the loop, so enable training mode once
for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass: logits of shape (batch size, sequence length, vocab_size)
    outputs = model(inputs)
    # Flatten batch and sequence dimensions so that every token position is one
    # classification example. reshape (unlike view) also accepts non-contiguous tensors.
    outputs = outputs.reshape(-1, vocab_size)
    labels_flat = labels.reshape(-1)
    loss = criterion(outputs, labels_flat)
    loss.backward()
    optimizer.step()
    # Report the loss every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 10/50, Loss: 0.9608
Epoch 20/50, Loss: 0.2906
Epoch 30/50, Loss: 0.1274
Epoch 40/50, Loss: 0.0760
Epoch 50/50, Loss: 0.0506


In [14]:
# Inference: Predict masked tokens
def predict_masked_tokens(model, input_sentence):
    """Fill every ``[MASK]`` of a tokenised sentence with the model's top prediction.

    Args:
        model: Model that maps a ``(1, sequence length)`` tensor of token ids to
            logits whose last dimension indexes the vocabulary. It is switched
            to evaluation mode and left in that mode.
        input_sentence: List of token strings. Tokens that are not in the
            notebook-level ``word_to_idx`` mapping are fed to the model as ``[PAD]``.

    Returns:
        A list of token strings with the same length as ``input_sentence``.
        ``[MASK]`` positions hold the highest-scoring vocabulary token; all
        other positions keep the token from ``input_sentence`` unchanged, so the
        model can never rewrite words that were not masked.
    """
    model.eval()
    if not input_sentence:
        # Nothing to predict; an empty id list would also yield a float tensor the model cannot embed.
        return []
    input_ids = [word_to_idx.get(word, word_to_idx['[PAD]']) for word in input_sentence]
    input_tensor = torch.tensor([input_ids]).to(device)
    with torch.no_grad():
        outputs = model(input_tensor)
        predictions = torch.argmax(outputs, dim=-1)
    predicted_ids = predictions[0].tolist()
    # Only substitute the model's guess where the input actually was masked.
    predicted_sentence = [
        idx_to_word[predicted_id] if word == '[MASK]' else word
        for word, predicted_id in zip(input_sentence, predicted_ids)
    ]
    return predicted_sentence

In [15]:
# Test the model: hide the last two words of a sentence and let the model fill them in.
test_sentence = ['[CLS]', 'i', 'like', '[MASK]', '[MASK]', '[SEP]']
predicted_sentence = predict_masked_tokens(model, test_sentence)
for caption, tokens in (
    ("Input Sentence:", test_sentence),
    ("Predicted Sentence:", predicted_sentence),
):
    print(caption, tokens)

Input Sentence: ['[CLS]', 'i', 'like', '[MASK]', '[MASK]', '[SEP]']
Predicted Sentence: ['[CLS]', 'i', 'like', 'eating', 'eating', '[SEP]']
