In [1]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
import dask.dataframe as dd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import sys
sys.path.append('../scripts')
from model import Transformer, PositionalEncoding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

In [3]:
device = torch.device('cuda')

In [4]:
csv = dd.read_csv('/home/paperspace/text_translataion_dataset/en-fr.csv')
csv = csv.compute()

In [5]:
csv = csv.dropna()

In [6]:
class TranslateDataset(data.Dataset):
    def __init__(self, csv):
        self.csv = csv
        self.english_values = self.csv['en'].values
        self.french_values = self.csv['fr'].values
        # self.english_values = english_values
        # self.french_values = french_values
        self.english_tokenizer = AutoTokenizer.from_pretrained('/home/paperspace/bert-base-cased')
        self.french_tokenizer = AutoTokenizer.from_pretrained('/home/paperspace/flaubert-base-cased')

    def __len__(self):
        return len(self.english_values)

    def __getitem__(self, idx):
        english = self.english_values[idx]
        french = self.french_values[idx]
        input_tokenized = self.english_tokenizer(english, max_length=100, padding='max_length', truncation=True, return_tensors='pt')
        output_tokenized = self.french_tokenizer(french, max_length=100, padding='max_length', truncation=True, return_tensors='pt')
        return input_tokenized['input_ids'].squeeze(0), output_tokenized['input_ids'].squeeze(0)
        # return input_tokenized['input_ids'].squeeze(0), output_tokenized['input_ids']

In [7]:
def split_datasets(dataset, test_size=0.01):
    train, test = train_test_split(dataset, test_size=test_size, random_state=42)
    train, val = train_test_split(test, test_size=test_size, random_state=42)
    return train, val, test

In [8]:
train, val, test = split_datasets(csv)

In [9]:
# del csv

In [20]:
train_dataset = TranslateDataset(train)
train_loader = DataLoader(train_dataset, batch_size=312, shuffle=True)
val_dataset = TranslateDataset(val)
val_loader = DataLoader(val_dataset, batch_size=312, shuffle=True)
test_dataset = TranslateDataset(test)
test_loader = DataLoader(test_dataset, batch_size=312, shuffle=False)

In [11]:
src_vocab_size = train_dataset.english_tokenizer.vocab_size
tgt_vocab_size = train_dataset.french_tokenizer.vocab_size
d_model = 512
# Number of heads in multi head attention
num_heads = 8
# num_layers here meaning how many stack of attention layers we are producing.
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

transformer.to(device)

# Generate random sample data
# src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# src_data = src_data.to(device)
# tgt_data = tgt_data.to(device)

Transformer(
  (encoder_embedding): Embedding(28996, 512)
  (decoder_embedding): Embedding(68729, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(

In [12]:
# temp, temp2 = None, None
# for batch in train_loader:
#     src_data, tgt_data = batch
#     temp = src_data
#     temp2 = tgt_data
#     break
#     # src_data = src_data.to(device)
#     # tgt_data = tgt_data.to(device)
#     # output = transformer(src_data, tgt_data)
#     # print(output)
#     # break

In [13]:
# # transformer(temp.to(device), temp2.to(device))
# transformer(temp.to(device), temp2.to(device))

In [17]:
def train_transformer(model, dataloader, optimizer, criterion, device, num_epochs):
    model.to(device)
    model.train()
    
    for epoch in (range(num_epochs)):
        total_loss = 0
        
        for batch in tqdm(dataloader):
            src, tgt = batch
            src, tgt = src.to(device), tgt.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            # tgt[:, :-1] --> Remove the last token from the target
            output = model(src, tgt[:, :-1])
            
            # Reshape output and target for loss computation
            output = output.permute(0, 2, 1)  # [batch_size, vocab_size, seq_len]
            # Why? --> We don't need to predict the <sos> token
            tgt = tgt[:, 1:]  # Shift target to the right
            
            # Compute loss
            loss = criterion(output, tgt)
            total_loss += loss.item()
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

In [18]:
criterion = nn.CrossEntropyLoss(ignore_index=0) 

In [21]:
train_transformer(transformer, train_loader, optim.Adam(transformer.parameters(), lr=1e-4), criterion, device, 50)

  0%|          | 0/195 [00:00<?, ?it/s]

100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Epoch [1/50], Loss: 1.9910


100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Epoch [2/50], Loss: 1.8378


100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Epoch [3/50], Loss: 1.7123


100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Epoch [4/50], Loss: 1.5970


100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Epoch [5/50], Loss: 1.4925


100%|██████████| 195/195 [04:00<00:00,  1.23s/it]


Epoch [6/50], Loss: 1.4010


100%|██████████| 195/195 [03:59<00:00,  1.23s/it]


Epoch [7/50], Loss: 1.3218


  7%|▋         | 13/195 [00:15<03:39,  1.21s/it]


KeyboardInterrupt: 

In [25]:
# Step 1: Create a random tensor
random_tensor = torch.randn(100, 10)  # 100 samples, each with 10 features

# Step 2: Create a dataset
# dataset = TensorDataset(random_tensor)

In [26]:
src_data = torch.randint(1, src_vocab_size, (100000, max_seq_length)).float()  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (100000, max_seq_length)).float()  # (batch_size, seq_length)
src_dataset = data.TensorDataset(src_data)
tgt_dataset = data.TensorDataset(tgt_data)
src_loader = data.DataLoader(src_dataset, batch_size=16, shuffle=True)
tgt_loader = data.DataLoader(tgt_dataset, batch_size=16, shuffle=True)

In [18]:
src_vocab_size = 30000
tgt_vocab_size = 30145
d_model = 512
# Number of heads in multi head attention
num_heads = 8
# num_layers here meaning how many stack of attention layers we are producing.
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# transformer.to(device)

In [19]:
src_data = torch.randint(1, src_vocab_size, (128, max_seq_length))
tgt_data = torch.randint(1, tgt_vocab_size, (128, max_seq_length))
# src_data = src_data.float()
# tgt_data = tgt_data.float()
# src_data = src_data.to(device)
# tgt_data = tgt_data.to(device)

In [20]:
transformer(src_data, tgt_data)

IndexError: index out of range in self