# **Import Libraries**

In [31]:
!pip install transformers datasets sentencepiece sacremoses
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
%env TORCH_USE_CUDA_DSA=1
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq

env: TORCH_USE_CUDA_DSA=1


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Build Model**

In [4]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_k, d_model, n_heads, max_len, causal=False):
    super().__init__()

    self.d_k = d_k
    self.n_heads = n_heads

    self.key = nn.Linear(d_model, d_k * n_heads)
    self.query = nn.Linear(d_model, d_k * n_heads)
    self.value = nn.Linear(d_model, d_k * n_heads)

    self.fc = nn.Linear(d_k * n_heads, d_model)

    self.causal = causal
    if causal:
      cm = torch.tril(torch.ones(max_len, max_len))
      self.register_buffer(
          "causal_mask",
          cm.view(1, 1, max_len, max_len)
      )

  def forward(self, q, k, v, pad_mask=None):
    q = self.query(q) # N x T x (hd_k)
    k = self.key(k)
    v = self.value(v)

    N = q.shape[0]
    T_output = q.shape[1]
    T_input = k.shape[1]

    # (N, T, h, d_k) -> (N, h, T, d_k)
    q = q.view(N, T_output, self.n_heads, self.d_k).transpose(1, 2)
    k = k.view(N, T_input, self.n_heads, self.d_k).transpose(1, 2)
    v = v.view(N, T_input, self.n_heads, self.d_k).transpose(1, 2)

    # (N, h, T, d_k) x (N, h, d_k, T) --> (N, h, T, T)
    attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)
    if pad_mask is not None:
      attn_scores = attn_scores.masked_fill(
          pad_mask[:, None, None, :] == 0, float('-inf'))
    if self.causal:
      attn_scores = attn_scores.masked_fill(
          self.causal_mask[:, :, :T_output, :T_input] == 0, float('-inf'))
    attn_weights = F.softmax(attn_scores, dim=-1)

    # compute attention-weighted values: (N, h, T, T) x (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    A = A.transpose(1, 2) # (N, T, h, d_k)
    A = A.contiguous().view(N, T_output, self.d_k * self.n_heads)

    # projection
    return self.fc(A)

In [5]:
class EncoderBlock(nn.Module):
  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=False)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model * 4),
        nn.GELU(),
        nn.Linear(d_model * 4, d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, pad_mask=None):
    x = self.ln1(x + self.mha(x, x, x, pad_mask))
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)
    return x

In [6]:
class DecoderBlock(nn.Module):
  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.ln3 = nn.LayerNorm(d_model)
    self.mha1 = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=True)
    self.mha2 = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=False)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model * 4),
        nn.GELU(),
        nn.Linear(d_model * 4, d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, enc_output, dec_input, enc_mask=None, dec_mask=None):
    x = self.ln1(dec_input + self.mha1(dec_input, dec_input, dec_input, dec_mask))

    #multi-head attention include encoder output
    x = self.ln2(x + self.mha2(x, enc_output, enc_output, enc_mask))

    x = self.ln3(x + self.ann(x))
    x = self.dropout(x)
    return x

In [7]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)
    exp_term = torch.arange(0, d_model, 2)
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(position * div_term)
    pe[0, :, 1::2] = torch.cos(position * div_term)
    self.register_buffer('pe', pe)

  def forward(self, x):
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [8]:
class Encoder(nn.Module):
  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
              #  n_classes,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformer_blocks = [
        EncoderBlock(
            d_k,
            d_model,
            n_heads,
            max_len,
            dropout_prob) for _ in range(n_layers)]
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)

  def forward(self, x, pad_mask=None):
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, pad_mask)

    x = self.ln(x)
    return x

In [9]:
class Decoder(nn.Module):
  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformer_blocks = [
        DecoderBlock(
            d_k,
            d_model,
            n_heads,
            max_len,
            dropout_prob) for _ in range(n_layers)]
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, enc_output, dec_input, enc_mask=None, dec_mask=None):
    x = self.embedding(dec_input)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(enc_output, x, enc_mask, dec_mask)
    x = self.ln(x)
    x = self.fc(x)
    return x

In [10]:
class Transformer(nn.Module):
  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, enc_input, dec_input, enc_mask, dec_mask):
    enc_output = self.encoder(enc_input, enc_mask)
    dec_output = self.decoder(enc_output, dec_input, enc_mask, dec_mask)
    return dec_output

In [11]:
class ScheduledOptimizer():
    def __init__(self, optimizer, init_lr, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0


    def step_and_update_lr(self):
        self._update_learning_rate()
        self._optimizer.step()


    def zero_grad(self):
        self._optimizer.zero_grad()


    def _get_lr_scale(self):
        d_model = self.d_model
        n_steps, n_warmup_steps = self.n_steps, self.n_warmup_steps
        return (d_model ** -0.5) * min(n_steps ** (-0.5), n_steps * n_warmup_steps ** (-1.5))

    def state_dict(self):
        optimizer_state_dict = {
            'init_lr':self.init_lr,
            'd_model':self.d_model,
            'n_warmup_steps':self.n_warmup_steps,
            'n_steps':self.n_steps,
            '_optimizer':self._optimizer.state_dict(),
        }

        return optimizer_state_dict

    def load_state_dict(self, state_dict):
        self.init_lr = state_dict['init_lr']
        self.d_model = state_dict['d_model']
        self.n_warmup_steps = state_dict['n_warmup_steps']
        self.n_steps = state_dict['n_steps']

        self._optimizer.load_state_dict(state_dict['_optimizer'])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

# **Load and Preprocess Data**

In [12]:
raw_dataset = load_dataset('csv', data_files='/content/gdrive/MyDrive/Transformer/data/dataset.csv')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-f27de3802791d4db/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f27de3802791d4db/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
raw_dataset

### *Split Data (70/15/15)**

In [13]:
#train
dataset = raw_dataset['train'].train_test_split(test_size=0.3, train_size=0.7, shuffle=True, seed=42)
train_dataset = dataset['train']
#val
val_test_dataset = dataset['test']
val_test_split = val_test_dataset.train_test_split(test_size=0.5, shuffle=True, seed=42)
val_dataset = val_test_split['train']
#test
test_dataset = val_test_split['test']

In [14]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-vi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

In [15]:
#example
en_sentence = train_dataset[2]["en"]
vi_sentence = train_dataset[2]["vi"]

inputs = tokenizer(en_sentence)
targets = tokenizer(text_target=vi_sentence)

tokenizer.convert_ids_to_tokens(targets['input_ids'])

['▁anh',
 '▁che',
 '▁giấu',
 '▁sự',
 '▁thật',
 '▁rằng',
 '▁anh',
 '▁đã',
 '▁ly',
 '▁dị',
 '▁vợ',
 '</s>']

In [16]:
vi_sentence

'anh che giấu sự thật rằng anh đã ly dị vợ'

In [17]:
max_input_length = 128
max_target_length = 128

def preprocess_function(batch):
    model_inputs = tokenizer(
        batch['en'], max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    labels = tokenizer(
        text_target=batch['vi'], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [18]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/658936 [00:00<?, ? examples/s]

Map:   0%|          | 0/282402 [00:00<?, ? examples/s]

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 658936
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 282402
    })
})

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [21]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(0, 5)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [22]:
tokenizer.all_special_ids

[0, 1, 53684]

In [23]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)
valid_loader = DataLoader(
    tokenized_datasets["test"],
    batch_size=32,
    collate_fn=data_collator
)

In [24]:
tokenizer.add_special_tokens({"cls_token": "<s>"})


1

In [25]:
tokenizer.vocab_size

53685

# **Config Model**

In [26]:
encoder = Encoder(vocab_size=tokenizer.vocab_size + 1,
                  max_len=512,
                  d_k=16,
                  d_model=512,
                  n_heads=8,
                  n_layers=6,
                  dropout_prob=0.1)
decoder = Decoder(vocab_size=tokenizer.vocab_size + 1,
                  max_len=512,
                  d_k=16,
                  d_model=512,
                  n_heads=8,
                  n_layers=6,
                  dropout_prob=0.1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
encoder.to(device)
decoder.to(device)
transformer = Transformer(encoder, decoder)

cuda:0


### *Load old model and optimizer from previous trainning*

In [27]:
# Load old model and optimization
transformer.load_state_dict(torch.load('/content/gdrive/MyDrive/Transformer/data/transformer_model.pth'))
optimizer = torch.optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09)
optimizer = ScheduledOptimizer(optimizer,0.2, 64, 4000)
optimizer.load_state_dict(torch.load('/content/gdrive/MyDrive/Transformer/data/optimizer.pth'))

In [28]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=-100)

### *Train*

In [None]:
from datetime import datetime
def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for batch in train_loader:
      batch = {k: v.to(device) for k, v in batch.items()}

      optimizer.zero_grad()

      enc_input = batch['input_ids']
      enc_mask = batch['attention_mask']
      targets = batch['labels']

      dec_input = targets.clone().detach()
      dec_input = torch.roll(dec_input, shifts=1, dims=1)
      dec_input[:, 0] = 53_685

      dec_input = dec_input.masked_fill(
          dec_input == -100, tokenizer.pad_token_id)

      dec_mask = torch.ones_like(dec_input)
      dec_mask = dec_mask.masked_fill(dec_input == tokenizer.pad_token_id, 0)

      outputs = model(enc_input, dec_input, enc_mask, dec_mask)
      loss = criterion(outputs.transpose(2, 1), targets)

      loss.backward()
      optimizer.step_and_update_lr()
      train_loss.append(loss.item())

    train_loss = np.mean(train_loss)

    model.eval()
    test_loss = []
    for batch in valid_loader:
      batch = {k: v.to(device) for k, v in batch.items()}

      enc_input = batch['input_ids']
      enc_mask = batch['attention_mask']
      targets = batch['labels']

      dec_input = targets.clone().detach()
      dec_input = torch.roll(dec_input, shifts=1, dims=1)
      dec_input[:, 0] = 53_685

      dec_input = dec_input.masked_fill(dec_input == -100, tokenizer.pad_token_id)

      dec_mask = torch.ones_like(dec_input)
      dec_mask = dec_mask.masked_fill(dec_input == tokenizer.pad_token_id, 0)

      outputs = model(enc_input, dec_input, enc_mask, dec_mask)
      loss = criterion(outputs.transpose(2, 1), targets)
      test_loss.append(loss.item())
    test_loss = np.mean(test_loss)

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0

    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')
    # path to save
    model_path = "/content/gdrive/MyDrive/Transformer/data/transformer_model.pth"
    optimizer_path = "/content/gdrive/MyDrive/Transformer/data/optimizer.pth"

    # save model
    torch.save(model.state_dict(), model_path)
    torch.save(optimizer.state_dict(), optimizer_path)
  return train_losses, test_losses

In [None]:
train_losses, test_losses = train(transformer, criterion, optimizer, train_loader, valid_loader, epochs=13)

# **Predict**

In [32]:
def translate(input_sentence, transformer):
  enc_input = tokenizer(input_sentence, return_tensors='pt').to(device)
  enc_output = transformer.encoder(enc_input['input_ids'], enc_input['attention_mask'])

  dec_input_ids = torch.tensor([[53_685]], device=device)
  dec_attn_mask = torch.ones_like(dec_input_ids, device=device)

  for _ in range(32):
    dec_output = transformer.decoder(
        enc_output,
        dec_input_ids,
        enc_input['attention_mask'],
        dec_attn_mask,
    )

    prediction_id = torch.argmax(dec_output[:, -1, :], axis=-1)
    dec_input_ids = torch.hstack((dec_input_ids, prediction_id.view(1, 1)))
    dec_attn_mask = torch.ones_like(dec_input_ids)

    if prediction_id == 0:
      break

  translation = tokenizer.decode(dec_input_ids[0, 1:-1])
  return translation

In [35]:
translate("Why are people so pressed about the clapping, for that's the last thing you should think about in this situation.",transformer)

'tại sao mọi người lại áp đặt về tiếng vỗ tay, vì đó là điều cuối cùng bạn nên nghĩ về trong tình huống này'

In [36]:
val_df = pd.DataFrame(val_dataset)
test_df = pd.DataFrame(test_dataset)

In [37]:
val_df

Unnamed: 0,en,vi
0,Try to eat whole grains over refined grains .,Bạn nên ăn ngũ cốc nguyên hạt thay cho ngũ cốc...
1,It 's also worth noting that although the idea...,Nó sẽ không xứng đáng mặc dù ý kiến dường như ...
2,Dampening your skin is a great way to lower yo...,Làm mát da là một cách tuyệt vời để hạ nhiệt đ...
3,I didn't know why I wasn't supposed to go to t...,tôi không biết tại sao tôi không nên đến một p...
4,Could one have been mixed up for the other ?,Có khi nào hai câu chuyện đã bị nhầm lẫn với n...
...,...,...
141196,Now you can browse and stream movies and telev...,Bây giờ bạn có thể duyệt tìm và xem trực tuyến...
141197,Clear your mind of all the details you 're wor...,Dọn dẹp tâm trí khỏi những điều bạn đang lo lắ...
141198,"Instead , good sources of calcium for nursing ...","Thay vào đó , những thực phẩm giàu canxi dành ..."
141199,Women age faster than men,phụ nữ tuổi nhanh hơn nam giới


In [43]:
# Giới hạn số dòng dịch thành 5 dòng đầu tiên
df_subset = val_df[:5000]

# Áp dụng hàm dịch vào DataFrame con
df_subset['vi_candidate'] = df_subset['en'].apply(lambda x: translate(x, transformer))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['vi_candidate'] = df_subset['en'].apply(lambda x: translate(x, transformer))


In [46]:
df_subset.to_csv('/content/gdrive/MyDrive/Transformer/data/val_candidate.csv', index=False)

In [None]:
test_df['vi_candidate'] = test_df['en'].apply(lambda x: translate(x, transformer))
