<a href="https://colab.research.google.com/github/yhfga18/ML_stuff/blob/main/6_Transformer_for_Machine_Translation_1022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer for Machine Translation



In [None]:
!pip install torch<=1.2.0
!pip install torchtext==0.5

/bin/bash: =1.2.0: No such file or directory


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
# from torchtext.legacy.data import Field, BucketIterator

from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint


In [None]:
# Download the spaCy English and German models through spaCy's CLI using the
# "en" / "de" shortcut names, which are the names passed to spacy.load() later.
!python -m spacy download en
!python -m spacy download de

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 13.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 14.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-

In [None]:
# STEPS:
# 1. Specify how preprocessing should be done -> Fields
### 2. Use Dataset to load the data -> TabularDataset (JSON/CSV/TSV Files)
# 3. Construct an iterator to do batching & padding -> BucketIterator




# spaCy pipelines for both languages; the tokenize_* helpers use their tokenizers.
spacy_eng = spacy.load("en")
spacy_ger = spacy.load("de")

# build tokenizer 
def tokenize_ger(text):
    """Return the token strings produced by the German spaCy tokenizer for ``text``."""
    tokens = []
    for token in spacy_ger.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_eng(text):
    """Return the token strings produced by the English spaCy tokenizer for ``text``."""
    tokens = []
    for token in spacy_eng.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Preprocessing spec per language: tokenize with the matching spaCy helper,
# lowercase, and use "<sos>" / "<eos>" as the init and end-of-sequence tokens.
german = Field(
    tokenize=tokenize_ger,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)
english = Field(
    tokenize=tokenize_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# Multi30k train / validation / test splits. The ".de" files are paired with
# the `german` field and the ".en" files with the `english` field.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english),
)

# Both vocabularies are built from the training split only, limited with
# max_size=10000 and min_freq=2.
english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

### Bucket Iterator

In [None]:
# transformer 

class Transformer(nn.Module):
    """Sequence-to-sequence translation model built around ``nn.Transformer``.

    Source and target token indices are embedded, summed with learned position
    embeddings, passed through ``nn.Transformer`` and projected onto the target
    vocabulary.

    Args:
        embedding_size: Width of the embeddings; used as ``d_model``.
        src_vocab_size: Number of entries in the source vocabulary.
        trg_vocab_size: Number of entries in the target vocabulary.
        src_pad_idx: Index of the padding token in the source batch.
        num_heads: Number of attention heads (``nhead``).
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        forward_expansion: Passed to ``nn.Transformer`` as ``dim_feedforward``.
        dropout: Dropout probability, used inside ``nn.Transformer`` and on
            the summed word + position embeddings.
        max_len: Number of rows in each position-embedding table; sequences
            longer than this cannot be embedded.
        device: Device on which position indices and the target mask are created.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()

        self.device = device

        # Encoder input: token embedding plus learned position embedding.
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)

        # Decoder input: same structure, with its own tables.
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        # NOTE(review): forward_expansion is handed over unchanged as
        # dim_feedforward (the absolute feed-forward width). If it is meant as
        # a multiplier of embedding_size, scale it here -- confirm the intent.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )

        # Projects decoder outputs onto the target vocabulary.
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)

        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Build the key-padding mask for the source batch.

        Args:
            src: Source token indices of shape ``(src_len, N)``.

        Returns:
            Boolean tensor of shape ``(N, src_len)`` that is ``True`` wherever
            ``src`` holds ``src_pad_idx``.
        """
        # Transposed to (N, src_len) so it can be passed to nn.Transformer as
        # src_key_padding_mask.
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask

    def forward(self, src, trg):
        """Compute target-vocabulary scores for every target position.

        Args:
            src: Source token indices of shape ``(src_len, N)``.
            trg: Target token indices of shape ``(trg_len, N)``.

        Returns:
            Tensor of shape ``(trg_len, N, trg_vocab_size)``.
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Position indices 0..len-1 as a column, repeated for every sequence
        # in the batch: shape (len, N).
        src_positions = (
            torch.arange(0, src_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(src_seq_length, N)
        )
        trg_positions = (
            torch.arange(0, trg_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        # Padding mask for the source; nn.Transformer provides the helper that
        # builds the square subsequent mask for the target.
        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(
            trg_seq_length
        ).to(self.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )

        out = self.fc_out(out)

        return out



NameError: ignored

In [None]:
import torch

# 1-D tensor holding the positions 0..3.
a1 = torch.arange(4)
print(a1)

# Add a trailing axis: shape (4,) -> (4, 1).
a2 = a1[:, None]
print(a2)

# Repeat the single column across 3 columns: shape (4, 1) -> (4, 3).
a3 = a2.expand(-1, 3)
print(a3)

# The same chain as a one-liner: torch.arange(0, 5).unsqueeze(1).expand(5, 5)

tensor([0, 1, 2, 3])
tensor([[0],
        [1],
        [2],
        [3]])
tensor([[0, 0, 0],
        [1, 1, 1],
        [2, 2, 2],
        [3, 3, 3]])


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint switches read by the setup and training cells.
save_model = True
load_model = False

In [None]:
# Training hyperparameters
num_epochs = 5
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
embedding_size = 512
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1  # lower for Seq2Seq
# Size of the position-embedding tables: sentences longer than this must be
# dropped, or max_len increased.
max_len = 100
forward_expansion = 4
# The padding mask is applied to the source (German) batch, so the index is
# looked up in the German vocabulary (stoi = string-to-index).
src_pad_idx = german.vocab.stoi["<pad>"]

# TensorBoard writer for the training-loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

In [None]:
# Batches are sorted by source length within each batch (sort_key uses the
# length of the `src` field).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padded target positions are excluded from the loss; they are not relevant
# when computing the cost.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # runtime (and vice versa).
    # NOTE(review): ".pth.ptar" looks like a typo for ".pth.tar" -- confirm
    # against the filename utils.save_checkpoint writes.
    load_checkpoint(
        torch.load("my_checkpoint.pth.ptar", map_location=device), model, optimizer
    )


# Example German sentence for the (currently disabled) per-epoch translation.
sentence = "ein pferd geht unter einer brücke neben einem boot."


In [None]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Checkpoint at the start of every epoch.
    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }

        save_checkpoint(checkpoint)

    # Evaluation phase: the example translation is currently disabled.
    model.eval()
    # translated_sentence = translate_sentence(
    #     model, sentence, german, english, device, max_length = 100
    # )
    # print(f"Translated example sentence \n {translated_sentence}")

    # Training phase
    model.train()
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # The decoder is fed the target without its last token and is scored
        # against the target without its first token, i.e. shifted one step
        # ahead of the decoder input.
        output = model(inp_data, target[:-1])

        # Flatten to (positions * batch, vocab) scores and a matching 1-D
        # vector of target indices, as CrossEntropyLoss expects.
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()

        loss = criterion(output, target)
        loss.backward()

        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # Log a plain float rather than a tensor still attached to the graph.
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1


# Score the test split with dropout disabled.
model.eval()
score = bleu(test_data, model, german, english, device)
print(f"Bleu score {score*100:.2f}")

[Epoch 0 / 5]
=> Saving checkpoint


AttributeError: ignored