<a href="https://colab.research.google.com/github/vedantpople4/LLM/blob/main/Part3_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import tensorflow as tf
print(tf.__version__)

import tensorflow.keras

2.17.1


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import random

In [16]:
# Custom Tokenizer (same as before)
class SimpleTokenizer:
    def __init__(self):
        self.vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
        self.reverse_vocab = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}

    def fit(self, texts):
        words = set(" ".join(texts).split())
        for word in words:
            if word not in self.vocab:
                self.vocab[word] = len(self.vocab)
                self.reverse_vocab[len(self.reverse_vocab)] = word

    def encode(self, text):
        return [self.vocab['<sos>']] + [self.vocab.get(word, self.vocab['<unk>']) for word in text.split()] + [self.vocab['<eos>']]

    def decode(self, indices):
        return " ".join([self.reverse_vocab.get(idx, '<unk>') for idx in indices if idx not in [self.vocab['<sos>'], self.vocab['<eos>'], self.vocab['<pad>']]])

# Custom Dataset (same as before)
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, src_texts, tgt_texts, src_tokenizer, tgt_tokenizer):
        self.src_encoded = [src_tokenizer.encode(text) for text in src_texts]
        self.tgt_encoded = [tgt_tokenizer.encode(text) for text in tgt_texts]

    def __len__(self):
        return len(self.src_encoded)

    def __getitem__(self, idx):
        return self.src_encoded[idx], self.tgt_encoded[idx]

In [17]:
# Collate function
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)

    src_lengths = [len(seq) for seq in src_batch]
    tgt_lengths = [len(seq) for seq in tgt_batch]

    max_src_len = max(src_lengths)
    max_tgt_len = max(tgt_lengths)

    src_padded = [seq + [src_tokenizer.vocab['<pad>']] * (max_src_len - len(seq)) for seq in src_batch]
    tgt_padded = [seq + [tgt_tokenizer.vocab['<pad>']] * (max_tgt_len - len(seq)) for seq in tgt_batch]

    return torch.tensor(src_padded), torch.tensor(tgt_padded)

In [18]:
# Custom Transformer (same as before)
class CustomTransformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers):
        super(CustomTransformer, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src_embed = self.src_embedding(src).transpose(0, 1)
        tgt_embed = self.tgt_embedding(tgt).transpose(0, 1)

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_embed.size(0)).to(tgt_embed.device)

        output = self.transformer(src_embed, tgt_embed, tgt_mask=tgt_mask)
        return self.fc_out(output.transpose(0, 1))

In [19]:
# Toy dataset (same as before)
eng_texts = [
    "hello how are you",
    "i love programming",
    "machine learning is fun",
    "python is a great language",
    "deep learning is powerful"
]

fr_texts = [
    "bonjour comment allez vous",
    "j'aime la programmation",
    "l'apprentissage automatique est amusant",
    "python est un excellent langage",
    "l'apprentissage profond est puissant"
]

In [20]:
# Tokenization
src_tokenizer = SimpleTokenizer()
tgt_tokenizer = SimpleTokenizer()

src_tokenizer.fit(eng_texts)
tgt_tokenizer.fit(fr_texts)

# Create dataset and dataloader
dataset = CustomDataset(eng_texts, fr_texts, src_tokenizer, tgt_tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Initialize model
src_vocab_size = len(src_tokenizer.vocab)
tgt_vocab_size = len(tgt_tokenizer.vocab)
d_model = 32
nhead = 2
num_encoder_layers = 2
num_decoder_layers = 2

In [21]:
model = CustomTransformer(src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)

# Training loop
criterion = nn.CrossEntropyLoss(ignore_index=src_tokenizer.vocab['<pad>'])
optimizer = optim.Adam(model.parameters())

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.reshape(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

# Test the model


Epoch 10/100, Loss: 1.0845
Epoch 20/100, Loss: 0.6032
Epoch 30/100, Loss: 0.3345
Epoch 40/100, Loss: 0.2421
Epoch 50/100, Loss: 0.1677
Epoch 60/100, Loss: 0.1283
Epoch 70/100, Loss: 0.1021
Epoch 80/100, Loss: 0.0768
Epoch 90/100, Loss: 0.0661
Epoch 100/100, Loss: 0.0599


In [22]:
model.eval()
test_sentence = "deep learning is fun"
src_tensor = torch.tensor(src_tokenizer.encode(test_sentence)).unsqueeze(0)
tgt_tensor = torch.tensor([tgt_tokenizer.vocab['<sos>']]).unsqueeze(0)

with torch.no_grad():
    for _ in range(10):
        output = model(src_tensor, tgt_tensor)
        next_word = output.argmax(2)[:, -1]
        tgt_tensor = torch.cat([tgt_tensor, next_word.unsqueeze(0)], dim=1)

        if next_word.item() == tgt_tokenizer.vocab['<eos>']:
            break

translated = tgt_tokenizer.decode(tgt_tensor.squeeze().tolist())
print(f"Input: {test_sentence}")
print(f"Translation: {translated}")

Input: deep learning is fun
Translation: l'apprentissage automatique est amusant
