In [5]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
import dask.dataframe as dd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [6]:
torch.cuda.empty_cache()

In [7]:
device = torch.device('cuda')

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask = None):
        # Scaled dot product page 4 with figure 2 left
        # Normalizing happens of the Q * K multiplication at 2 points
        # 1. The Q * K multiplication is divided by sqrt(d_k) and then softmax is applied
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim = -1)
        output = torch.matmul(attn_probs, V)
        return output
    
    def split_heads(self, x):
        # What is purpose 
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
    
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
    
    def forward(self, Q, K, V, mask = None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [9]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.sin(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [11]:
# Encoder layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [13]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.EmbeddingBag(tgt_vocab_size, d_model)
        self.positional_embedding = PositionalEncoding(d_model, max_seq_length)

        # How many decoeder layers and encoder we want to run through
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = ((1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()).to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_embedding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_embedding(self.encoder_embedding(tgt)))
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        output = self.fc(dec_output)

        return output

In [14]:
# src_vocab_size = 5000
# tgt_vocab_size = 5000
# d_model = 512
# # Number of heads in multi head attention
# num_heads = 8
# # num_layers here meaning how many stack of attention layers we are producing.
# num_layers = 6
# d_ff = 2048
# max_seq_length = 100
# dropout = 0.1

# transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# transformer.to(device)

# # Generate random sample data
# src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# src_data = src_data.to(device)
# tgt_data = tgt_data.to(device)

In [15]:
# output = transformer(src_data, tgt_data)

In [16]:
csv = dd.read_csv('/home/paperspace/Documents/en-fr.csv')
csv = csv.compute()

In [17]:
csv = csv.dropna()

In [18]:
class TranslateDataset(data.Dataset):
    def __init__(self, csv):
        self.csv = csv
        self.english_values = self.csv['en'].values
        self.french_values = self.csv['fr'].values
        # self.english_values = english_values
        # self.french_values = french_values
        self.english_tokenizer = AutoTokenizer.from_pretrained('/home/paperspace/Documents/bert-base-cased')
        self.french_tokenizer = AutoTokenizer.from_pretrained('/home/paperspace/Documents/flaubert-base-cased')

    def __len__(self):
        return len(self.english_values)

    def __getitem__(self, idx):
        english = self.english_values[idx]
        french = self.french_values[idx]
        english = self.english_tokenizer(english, max_length=100, padding='max_length', truncation=True, return_tensors='pt')
        french = self.french_tokenizer(french, max_length=100, padding='max_length', truncation=True, return_tensors='pt')
        return english['input_ids'].squeeze(0), french['input_ids'].squeeze(0)

In [19]:
def split_datasets(dataset, test_size=0.01):
    # total_size = len(dataset)
    # test_size = int(total_size * test)
    # val_size = int(test_size * test)
    # print(total_size)
    # print(test_size)
    # print(val_size)
    train, test = train_test_split(dataset, test_size=test_size, random_state=42)
    train, val = train_test_split(test, test_size=test_size, random_state=42)
    return train, val, test

In [20]:
train, val, test = split_datasets(csv)

In [21]:
# del csv

In [22]:
train_dataset = TranslateDataset(train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataset = TranslateDataset(val)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_dataset = TranslateDataset(test)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [20]:
src_vocab_size = 30000
tgt_vocab_size = 30145
d_model = 512
# Number of heads in multi head attention
num_heads = 8
# num_layers here meaning how many stack of attention layers we are producing.
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

transformer.to(device)

# Generate random sample data
# src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# src_data = src_data.to(device)
# tgt_data = tgt_data.to(device)

Transformer(
  (encoder_embedding): Embedding(30000, 512)
  (decoder_embedding): EmbeddingBag(30145, 512, mode='mean')
  (positional_embedding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 

In [21]:
def train_transformer(model, dataloader, optimizer, criterion, device, num_epochs):
    model.to(device)
    model.train()
    
    for epoch in tqdm(range(num_epochs)):
        total_loss = 0
        
        for batch in dataloader:
            src, tgt = batch
            src, tgt = src.to(device), tgt.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            # tgt[:, :-1] --> Remove the last token from the target
            output = model(src, tgt[:, :-1])
            
            # Reshape output and target for loss computation
            output = output.permute(0, 2, 1)  # [batch_size, vocab_size, seq_len]
            # Why? --> We don't need to predict the <sos> token
            tgt = tgt[:, 1:]  # Shift target to the right
            
            # Compute loss
            loss = criterion(output, tgt)
            total_loss += loss.item()
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

In [22]:
criterion = nn.CrossEntropyLoss(ignore_index=0) 

In [23]:
train_transformer(transformer, train_loader, optim.Adam(transformer.parameters(), lr=1e-4), criterion, device, 50)

  0%|          | 0/50 [00:00<?, ?it/s]


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)) 