<a href="https://colab.research.google.com/github/syedmahmoodiagents/transformers/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel


In [None]:
### Complete Transformer Model

In [None]:

class Encoder(nn.Module):
    """Single encoder layer: self-attention followed by a linear sublayer.

    Each sublayer is wrapped in a residual connection and normalised. The
    same ``LayerNorm`` instance (``self.norm``) is applied after both
    sublayers.

    Args:
        embed_dim: Size of the feature dimension of the input.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
        )
        self.linear = nn.Linear(in_features=embed_dim, out_features=embed_dim)
        self.norm = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, x):
        """Return the encoded representation of ``x`` (same shape as ``x``).

        ``x`` is used as query, key and value. Because the attention module
        is built with its default ``batch_first=False``, ``x`` is expected
        as ``(sequence_length, batch_size, embed_dim)``.
        """
        # Sublayer 1: self-attention + residual + norm.
        attended = self.multihead_attn(query=x, key=x, value=x)[0]
        hidden = self.norm(x + attended)

        # Sublayer 2: position-wise linear projection + residual + norm.
        projected = self.linear(hidden)
        return self.norm(hidden + projected)

class Decoder(nn.Module):
    """Single decoder layer: self-attention, encoder attention, linear sublayer.

    Every sublayer is followed by a residual connection and the shared
    ``LayerNorm`` instance ``self.norm``.

    Args:
        embed_dim: Size of the feature dimension of the inputs.
        num_heads: Number of attention heads used by both attention modules.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
        )
        self.encoder_attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
        )
        self.linear = nn.Linear(in_features=embed_dim, out_features=embed_dim)
        self.norm = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, x, enc_output):
        """Decode ``x`` while attending to ``enc_output``.

        Both attention modules use the default ``batch_first=False``, so the
        tensors are expected as ``(sequence_length, batch_size, embed_dim)``.
        The result has the same shape as ``x``.
        """
        # Sublayer 1: the target sequence attends to itself (no mask applied).
        self_attended = self.self_attn(query=x, key=x, value=x)[0]
        hidden = self.norm(x + self_attended)

        # Sublayer 2: queries come from the decoder, keys/values from the encoder.
        cross_attended = self.encoder_attn(
            query=hidden, key=enc_output, value=enc_output
        )[0]
        hidden = self.norm(hidden + cross_attended)

        # Sublayer 3: position-wise linear projection.
        projected = self.linear(hidden)
        return self.norm(hidden + projected)

class Transformer(nn.Module):
    """Minimal encoder-decoder model with a final linear projection.

    Args:
        embed_dim: Feature dimension shared by encoder, decoder and output.
        num_heads: Number of attention heads passed to both sub-modules.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.encoder = Encoder(embed_dim, num_heads)
        self.decoder = Decoder(embed_dim, num_heads)
        self.fc = nn.Linear(in_features=embed_dim, out_features=embed_dim)

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` against it, and project the result."""
        memory = self.encoder(src)
        decoded = self.decoder(tgt, memory)
        return self.fc(decoded)


In [None]:
# Example usage: one forward pass of the model on random inputs.
embed_dim, num_heads = 64, 8
transformer_model = Transformer(embed_dim, num_heads)

# Both tensors are laid out as (sequence_length, batch_size, embed_dim).
src, tgt = (torch.rand(10, 32, embed_dim) for _ in range(2))

output = transformer_model(src, tgt)
print(f"Output shape: {output.shape}")

Output shape: torch.Size([10, 32, 64])


In [None]:
# Transformer with BERT-based Embeddings

In [None]:
# A single checkpoint name is used for both the tokenizer and the model so
# that the vocabulary and the weights always match.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

In [None]:
# Parallel sentence pairs: entry i of the target list is the French
# counterpart of entry i of the source list.
source_sentences = ["Hello world", "Machine learning is great"]
# The accented "é" had been stored as mojibake ("√©", i.e. UTF-8 bytes read
# as Mac Roman); it is restored here so the tokenizer sees the real word.
target_sentences = ["Bonjour le monde", "L'apprentissage automatique est génial"]

In [None]:
# Tokenize both sides with identical settings: return PyTorch tensors, pad
# each batch, and truncate anything beyond 512 tokens.
_tokenizer_options = {
    "return_tensors": "pt",
    "padding": True,
    "truncation": True,
    "max_length": 512,
}
source_inputs = tokenizer(source_sentences, **_tokenizer_options)
target_inputs = tokenizer(target_sentences, **_tokenizer_options)

In [None]:
# Get BERT embeddings: run each tokenized batch through the model with
# gradient tracking disabled and keep the `last_hidden_state` of each output.
with torch.no_grad():
    source_outputs = bert_model(**source_inputs)
    source_last_hidden_states = source_outputs.last_hidden_state

    target_outputs = bert_model(**target_inputs)
    target_last_hidden_states = target_outputs.last_hidden_state

In [None]:
class Encoder(nn.Module):
    """Encoder layer that refines BERT hidden states with self-attention.

    The wrapped BERT model is always run under ``torch.no_grad()``, so only
    the attention, linear and normalisation layers defined here receive
    gradients.

    Args:
        bert_model: Callable accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a
            ``last_hidden_state`` attribute (e.g. a HuggingFace ``BertModel``).
        embed_dim: Feature dimension of the attention and linear layers.
            Assumed to equal the hidden size of ``bert_model`` -- TODO confirm
            against the checkpoint in use (768 for ``bert-base-uncased``).
        num_heads: Number of attention heads.
    """

    def __init__(self, bert_model, embed_dim, num_heads):
        super().__init__()
        self.bert = bert_model
        # BERT returns (batch, seq_len, hidden). With the default
        # batch_first=False the attention module would treat the batch axis as
        # the sequence axis and mix tokens of *different* sentences.
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim, num_heads, batch_first=True
        )
        self.linear = nn.Linear(embed_dim, embed_dim)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids, attention_mask):
        """Encode a batch of token ids.

        Args:
            input_ids: Token ids, shape ``(batch, seq_len)``.
            attention_mask: Mask with non-zero entries for real tokens and 0
                for padding, shape ``(batch, seq_len)``; may be ``None`` to
                disable padding masking.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            x = outputs.last_hidden_state

        # nn.MultiheadAttention expects True where a key must be ignored,
        # which is the inverse of the tokenizer's attention mask.
        key_padding_mask = None if attention_mask is None else attention_mask == 0

        attn_output, _ = self.multihead_attn(
            x, x, x, key_padding_mask=key_padding_mask
        )
        x = self.norm(x + attn_output)
        x = self.norm(x + self.linear(x))
        return x



In [None]:
class Decoder(nn.Module):
    """Decoder layer that refines BERT hidden states and attends to an encoder.

    The wrapped BERT model is always run under ``torch.no_grad()``, so only
    the attention, linear and normalisation layers defined here receive
    gradients. One ``LayerNorm`` instance is shared by all three sublayers.

    Args:
        bert_model: Callable accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a
            ``last_hidden_state`` attribute (e.g. a HuggingFace ``BertModel``).
        embed_dim: Feature dimension of the attention and linear layers.
            Assumed to equal the hidden size of ``bert_model`` -- TODO confirm
            against the checkpoint in use.
        num_heads: Number of attention heads for both attention modules.
    """

    def __init__(self, bert_model, embed_dim, num_heads):
        super().__init__()
        self.bert = bert_model
        # BERT returns (batch, seq_len, hidden). The default batch_first=False
        # would make both modules attend across the batch axis, and the
        # cross-attention would fail whenever source and target lengths differ.
        self.self_attn = nn.MultiheadAttention(
            embed_dim, num_heads, batch_first=True
        )
        self.encoder_attn = nn.MultiheadAttention(
            embed_dim, num_heads, batch_first=True
        )
        self.linear = nn.Linear(embed_dim, embed_dim)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids, attention_mask, enc_output, enc_attention_mask=None):
        """Decode target tokens while attending to the encoder output.

        Args:
            input_ids: Target token ids, shape ``(batch, tgt_len)``.
            attention_mask: Mask with non-zero entries for real target tokens
                and 0 for padding, shape ``(batch, tgt_len)``; may be ``None``.
            enc_output: Encoder states, shape ``(batch, src_len, embed_dim)``.
            enc_attention_mask: Optional mask for the encoder side with the
                same convention, shape ``(batch, src_len)``. When omitted, all
                encoder positions (including padding) are attended to.

        Returns:
            Tensor of shape ``(batch, tgt_len, embed_dim)``.
        """
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            x = outputs.last_hidden_state

        # nn.MultiheadAttention expects True where a key must be ignored,
        # which is the inverse of the tokenizer's attention mask.
        tgt_padding = None if attention_mask is None else attention_mask == 0
        src_padding = None if enc_attention_mask is None else enc_attention_mask == 0

        attn_output, _ = self.self_attn(x, x, x, key_padding_mask=tgt_padding)
        x = self.norm(x + attn_output)

        # Queries come from the decoder; keys and values from the encoder.
        attn_output, _ = self.encoder_attn(
            x, enc_output, enc_output, key_padding_mask=src_padding
        )
        x = self.norm(x + attn_output)

        x = self.norm(x + self.linear(x))
        return x

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model whose two halves wrap the same BERT instance.

    Args:
        bert_model: Model handed to both the encoder and the decoder.
        embed_dim: Feature dimension of encoder, decoder and output layer.
        num_heads: Number of attention heads passed to both sub-modules.
    """

    def __init__(self, bert_model, embed_dim, num_heads):
        super().__init__()
        self.encoder = Encoder(bert_model, embed_dim, num_heads)
        self.decoder = Decoder(bert_model, embed_dim, num_heads)
        self.fc = nn.Linear(in_features=embed_dim, out_features=embed_dim)

    def forward(self, src_input_ids, src_attention_mask, tgt_input_ids, tgt_attention_mask):
        """Encode the source batch, decode the target batch, project the result.

        Args:
            src_input_ids: Token ids of the source sentences.
            src_attention_mask: Attention mask belonging to ``src_input_ids``.
            tgt_input_ids: Token ids of the target sentences.
            tgt_attention_mask: Attention mask belonging to ``tgt_input_ids``.

        Returns:
            The decoder output after the final linear layer ``self.fc``.
        """
        memory = self.encoder(src_input_ids, src_attention_mask)
        decoded = self.decoder(tgt_input_ids, tgt_attention_mask, memory)
        return self.fc(decoded)
