**Multi-Head Self Attention code from scratch**

In [None]:
import numpy as np

# Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.transpose(-2, -1)) / np.sqrt(d_k)

    if mask is not None:
        scores += (mask * -1e9)  # Apply mask to ignore certain tokens

    attention_weights = softmax(scores, axis=-1)
    output = np.matmul(attention_weights, V)
    return output, attention_weights

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

# Multi-Head Self Attention Mechanism
class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0  # Ensure evenly divisible

        self.depth = d_model // num_heads
        self.W_q = np.random.rand(d_model, d_model)
        self.W_k = np.random.rand(d_model, d_model)
        self.W_v = np.random.rand(d_model, d_model)
        self.W_o = np.random.rand(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(0, 2, 1, 3)  # (batch_size, num_heads, seq_length, depth)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]

        Q = np.dot(Q, self.W_q)  # (batch_size, seq_len, d_model)
        K = np.dot(K, self.W_k)
        V = np.dot(V, self.W_v)

        # Split Q, K, V into multiple heads
        Q = self.split_heads(Q, batch_size)
        K = self.split_heads(K, batch_size)
        V = self.split_heads(V, batch_size)

        # Scaled Dot Product Attention for each head
        attention, _ = scaled_dot_product_attention(Q, K, V, mask)

        # Concatenate heads back to original dimensions
        attention = attention.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)

        # Final linear transformation
        output = np.dot(attention, self.W_o)
        return output


**FeedForwardNetwork**

In [None]:
class FeedForwardNetwork:
    def __init__(self, d_model, d_ff):
        self.W_1 = np.random.rand(d_model, d_ff)
        self.W_2 = np.random.rand(d_ff, d_model)

    def forward(self, x):
        return np.dot(np.maximum(0, np.dot(x, self.W_1)), self.W_2)


**Layer Normalization**

In [None]:
class LayerNorm:
    def __init__(self, d_model, eps=1e-6):
        self.gamma = np.ones((d_model,))
        self.beta = np.zeros((d_model,))
        self.eps = eps

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta


**Transformer Encoder**

In [None]:
class TransformerEncoderLayer:
    def __init__(self, d_model, num_heads, d_ff):
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

    def forward(self, x, mask):
        # Self-attention with normalization
        attn_output = self.mha.forward(x, x, x, mask)
        out1 = self.norm1.forward(x + attn_output)  # Residual connection

        # Feed-forward network with normalization
        ffn_output = self.ffn.forward(out1)
        out2 = self.norm2.forward(out1 + ffn_output)  # Residual connection

        return out2


In [None]:
class TransformerEncoder:
    def __init__(self, num_layers, d_model, num_heads, d_ff):
        self.layers = [TransformerEncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)]

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer.forward(x, mask)
        return x


**Next Steps for Full Transformer:**


1.   **Position Encoding:** Implement positional encodings to give the model awareness of the token positions in a sequence.
2.   **Decoder Block:** Build the decoder block similar to the encoder but with additional mechanisms for causal attention (so the model only attends to past tokens).
3.   **Training & Loss Functions:** Set up the model to compute loss and perform backpropagation for tasks like language translation or sequence generation.






**Machine Translation Task using T5**

In [2]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

In [4]:
# Load the WMT14 dataset for English-to-German translation
dataset = load_dataset('wmt14', 'de-en')

# For this example, we'll use only a small subset to speed up training.
train_dataset = dataset['train'].select(range(1000))  # Select the first 1000 samples for training
val_dataset = dataset['validation'].select(range(100))  # Select the first 100 samples for validation

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/280M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/474k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [11]:
# Load pre-trained T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Define a function to preprocess the dataset
def preprocess_data(batch):
    # Tokenize the English input (source) and German output (target)
    inputs = tokenizer([item['en'] for item in batch['translation']], padding='max_length', truncation=True, max_length=128,return_tensors='pt')
    targets = tokenizer([item['de'] for item in batch['translation']], padding='max_length', truncation=True, max_length=128,return_tensors='pt')

    # Return input_ids and labels for the model
    return {
        'input_ids': inputs['input_ids'].squeeze(),
        'attention_mask': inputs['attention_mask'].squeeze(),
        'labels': targets['input_ids'].squeeze()
    }

# Apply the preprocessing to the datasets
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Create DataLoader for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
# Load pre-trained T5 model for sequence-to-sequence tasks
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
from torch.utils.data import DataLoader
import torch

# Define a function to collate the data into tensors
def collate_fn(batch):
    # Convert lists of input_ids and attention_mask to tensors
    input_ids = torch.tensor([example['input_ids'] for example in batch])
    attention_mask = torch.tensor([example['attention_mask'] for example in batch])
    labels = torch.tensor([example['labels'] for example in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Create DataLoader for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)


In [21]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 10  # Adjust based on dataset size and computational resources

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move input data to the device (GPU or CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Training loss: {avg_train_loss}')

    # Evaluation on validation set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}, Validation loss: {avg_val_loss}')

Epoch 1, Training loss: 0.6613120110262007
Epoch 1, Validation loss: 0.32995123096874784
Epoch 2, Training loss: 0.553305519005609
Epoch 2, Validation loss: 0.28129687905311584
Epoch 3, Training loss: 0.49981162283155656
Epoch 3, Validation loss: 0.2573237099817821
Epoch 4, Training loss: 0.4716296460893419
Epoch 4, Validation loss: 0.2498022743633815
Epoch 5, Training loss: 0.44624734114086817
Epoch 5, Validation loss: 0.24666305950709752
Epoch 6, Training loss: 0.4376449707954649
Epoch 6, Validation loss: 0.24506221073014395
Epoch 7, Training loss: 0.4236607825945294
Epoch 7, Validation loss: 0.24462237315518515
Epoch 8, Training loss: 0.4124175291212778
Epoch 8, Validation loss: 0.24348439701965877
Epoch 9, Training loss: 0.4053363544600351
Epoch 9, Validation loss: 0.243513428739139
Epoch 10, Training loss: 0.39981958270072937
Epoch 10, Validation loss: 0.2437643940959658


In [20]:
def translate_sentence(sentence, model, tokenizer):
    model.eval()  # Make sure the model is in evaluation mode

    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    print("Input IDs:", inputs['input_ids'])

    # Generate the translated output (decoder output)
    outputs = model.generate(inputs['input_ids'],
                             attention_mask=inputs['attention_mask'],
                             max_length=50,
                             num_beams=4,  # Use beam search
                             early_stopping=True)  # Stop when all beams finish

    # Print generated token IDs for debugging
    print("Generated Token IDs:", outputs)

    # Decode the output
    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return translated_sentence

# Example translation
english_sentence = "Who are you?"
translated_sentence = translate_sentence(english_sentence, model, tokenizer)
print(f"English: {english_sentence}")
print(f"German: {translated_sentence}")


Input IDs: tensor([[2645,   33,   25,   58,    1]], device='cuda:0')
Generated Token IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]], device='cuda:0')
English: Who are you?
German: 


In [18]:
# Assuming you have a sentence to translate
test_sentences = ["Hello, how are you?"]  # Example input
inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate outputs
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    generated_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Print the generated translations
for text in generated_texts:
    print(text)



