<a href="https://colab.research.google.com/github/vikasrai19/transformer_scratch/blob/transformers_google_colab/review_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import os
import wget
import zipfile
import nltk
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from nltk.tokenize import word_tokenize
import time
from datasets import load_dataset
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, n_heads, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, hidden_dim))

        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=n_heads,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dropout=dropout
        )

    def forward(self, src, tgt):
        src = self.embedding(src) + self.positional_encoding
        tgt = self.embedding(tgt) + self.positional_encoding

        output = self.transformer(src, tgt)
        return output

In [7]:
def download_cornell_dataset(retries=10, delay=5):

    url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
    if not os.path.exists("cornell_movie_dialogs_corpus.zip"):
        for attempt in range(retries):
            try:
                wget.download(url, "cornell_movie_dialogs_corpus.zip")
                break
            except Exception as e:
                print(f"Error occurred: {e}")
                if attempt < retries - 1:
                    print(f"Retrying in {delay} seconds...")
                    time.sleep(delay)
                else:
                    print("Failed to download the dataset after several attempts.")
                    raise
    with zipfile.ZipFile("cornell_movie_dialogs_corpus.zip", 'r') as zip_ref:
        zip_ref.extractall("cornell_movie_data")

# Load and preprocess the conversation data
def load_conversation_dataset():
    file_path = "cornell_movie_data/cornell movie-dialogs corpus/movie_lines.txt"
    conversations = []
    with open(file_path, 'r', encoding='iso-8859-1') as f:
        for line in f.readlines():
            parts = line.strip().split(" +++$+++ ")
            if len(parts) == 5:
                conversations.append((parts[3], parts[4]))  # Extract conversation pair
    return conversations

# Vocabulary Building and Preprocessing
def build_vocab(data):
    vocab = defaultdict(lambda: len(vocab))
    vocab["<pad>"] = 0
    vocab["<sos>"] = 1
    vocab["<eos>"] = 2
    vocab["<unk>"] = 3

    for pair in data:
        for sentence in pair:
            for word in word_tokenize(sentence.lower()):
                vocab[word]  # Add word to vocab
    return vocab

# Encode conversation pairs
def encode_conv_pair(pair, vocab):
    src, tgt = pair
    src_tokens = [vocab["<sos>"]] + [vocab[word] for word in word_tokenize(src.lower())] + [vocab["<eos>"]]
    tgt_tokens = [vocab["<sos>"]] + [vocab[word] for word in word_tokenize(tgt.lower())] + [vocab["<eos>"]]
    return src_tokens, tgt_tokens

# Pad sequence
def pad_sequence(seq, max_length, pad_value=0):
    return seq[:max_length] + [pad_value] * max(0, max_length - len(seq))

# Cornell Dataset Class
class CornellDataset(Dataset):
    def __init__(self, conversations, vocab, max_length=30):
        self.data = [encode_conv_pair(pair, vocab) for pair in conversations]
        self.max_length = max_length
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        src = pad_sequence(src, self.max_length)
        tgt = pad_sequence(tgt, self.max_length)
        return torch.tensor(src), torch.tensor(tgt), torch.tensor(0)

# Download and load the conversation dataset
download_cornell_dataset()
conversation_data = load_conversation_dataset()
vocab_conversation = build_vocab(conversation_data)
cornell_dataset = CornellDataset(conversation_data, vocab_conversation)

# Create DataLoader for conversation data
conversation_loader = DataLoader(cornell_dataset, batch_size=8, shuffle=True, pin_memory=True, num_workers=0)

In [8]:
def encode_qa_pair(pair, vocab):
    src, tgt = pair[1], pair[2]
    src_tokens = [vocab["<sos>"]] + [vocab[word] for word in word_tokenize(src.lower())] + [vocab["<eos>"]]
    tgt_tokens = [vocab["<sos>"]] + [vocab[word] for word in word_tokenize(tgt.lower())] + [vocab["<eos>"]]
    return src_tokens, tgt_tokens


# Load SQuAD dataset
def load_squad_dataset():
    dataset = load_dataset("squad")
    qa_pairs = []

    for entry in dataset['train']:  # We use the training split for example
        context = entry['context']
        question = entry['question']
        answer = entry['answers']['text'][0]  # Take the first answer
        qa_pairs.append((context, question, answer))

    return qa_pairs

# Preprocess SQuAD dataset
class SQuADDataset(Dataset):
    def __init__(self, qa_pairs, vocab, max_length=30):
        self.data = [encode_qa_pair(pair, vocab) for pair in qa_pairs]
        self.max_length = max_length
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        src = pad_sequence(src, self.max_length)
        tgt = pad_sequence(tgt, self.max_length)
        return torch.tensor(src), torch.tensor(tgt), torch.tensor(1)

# Load the QA dataset
qa_data = load_squad_dataset()
vocab_qa = build_vocab(qa_data)
squad_dataset = SQuADDataset(qa_data, vocab_qa)

# Create DataLoader for QA data
qa_loader = DataLoader(squad_dataset, batch_size=8, shuffle=True, pin_memory=True, num_workers=0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [12]:
def evaluate_model(model, data_loader, loss_fn):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            output, _ = model(src, tgt)
            loss = loss_fn(output, tgt)
            total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    return avg_loss

In [None]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Shared encoder with domain-specific heads
class SharedTransformerWithHeads(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, n_heads, dropout=0.1):
        super(SharedTransformerWithHeads, self).__init__()

        # Shared encoder
        self.encoder = TransformerModel(input_dim, hidden_dim, n_layers, n_heads, dropout)

        # Domain-specific heads
        self.conversation_head = nn.Linear(hidden_dim, output_dim)  # Conversation task
        self.qa_head = nn.Linear(hidden_dim, output_dim)  # QA task

        # Domain classifier to detect conversation vs QA
        self.domain_classifier = nn.Linear(hidden_dim, 2)  # Predict Conversation or QA

    def forward(self, src, tgt):
        enc_output = self.encoder(src, tgt)

        # Predict domain (conversation or QA)
        domain_pred = self.domain_classifier(enc_output.mean(dim=1))  # Classifier based on mean

        # Domain-specific outputs
        conversation_output = self.conversation_head(enc_output)
        qa_output = self.qa_head(enc_output)

        return conversation_output, qa_output, domain_pred


# Training function
def train_model(model, data_loader, optimizer, loss_fn, epochs=10):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        progress_bar = tqdm(enumerate(data_loader, 1), total=len(data_loader), desc=f'Epoch {epoch + 1}/{epochs}')
        # for src, tgt, domain_label in data_loader:
        for batch_idx, (src, tgt, domain_label) in progress_bar:
            optimizer.zero_grad()
            src, tgt, domain_label = src.to(device), tgt.to(device), domain_label.to(device)
            conversation_output, qa_output, domain_pred = model(src, tgt)

            # Loss for each task
            conversation_output = conversation_output.view(-1, conversation_output.size(-1))
            qa_output = qa_output.view(-1, qa_output.size(-1))
            tgt = tgt.view(-1)
            conv_loss = loss_fn(conversation_output, tgt)
            qa_loss = loss_fn(qa_output, tgt)

            # Domain classification loss
            # domain_loss = domain_loss_fn(domain_pred, domain_label)

            # total_loss = conv_loss + qa_loss + domain_loss
            total_loss = conv_loss + qa_loss
            total_loss.backward()
            optimizer.step()
            progress_bar.set_postfix(loss=total_loss.item())

        print(f"Epoch {epoch + 1}: Loss = {total_loss.item()}")
    torch.save(model.state_dict(), f"models/shared_transformer_model.pth")

input_dim = output_dim = max(len(vocab_qa), len(vocab_conversation))
shared_model = SharedTransformerWithHeads(input_dim=input_dim, hidden_dim=256, output_dim=output_dim, n_layers=2, n_heads=4)
optimizer = optim.Adam(shared_model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

# Conversation model train
train_model(model=shared_model, data_loader=conversation_loader, optimizer=optimizer, loss_fn=loss_fn, epochs=5)

# QA Model training
train_model(model=shared_model, data_loader=qa_loader, optimizer=optimizer, loss_fn=loss_fn, epochs=5)

conversation_loss = evaluate_model(shared_model, conversation_loader, loss_fn)
qa_loss = evaluate_model(shared_model, qa_loader, loss_fn)

print(f"Conversation Loss: {conversation_loss}")
print(f"QA Loss: {qa_loss}")

Using device: cuda


Epoch 1/5:  10%|█         | 3869/38056 [04:09<36:21, 15.67it/s, loss=0.222]