In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install transformers

In [3]:
import os
PROJECT_PATH = "/content/drive/MyDrive/projects/project1"
os.chdir(PROJECT_PATH)
print(os.getcwd())

/content/drive/MyDrive/projects/project1


In [None]:
from transformer.model import Transformer
import numpy as np
from utils.dataloader import batch_loader
from utils.tokenizer import en_vocab_size, cn_vocab_size, batch_tokenize

In [5]:
combined_sentences = np.load("dataset/dataset.npy")

In [6]:
# Prepare data
BATCH_SIZE = 16
train_dataloader, val_dataloader = batch_loader(combined_sentences, BATCH_SIZE=16)
num_train_batches = len(train_dataloader)
num_val_batches = (len(val_dataloader)
                        if val_dataloader is not None else 0)
len(train_dataloader), len(val_dataloader)

Training set size: 17891
Validation set size: 4473


(1119, 280)

In [19]:
# Prepare model
max_len = 256
num_hiddens = 512
ffn_hiddens = 48
num_heads = 8
num_layers = 2
drop_prob = 0.3
model = Transformer(en_vocab_size, cn_vocab_size, max_len, num_hiddens, ffn_hiddens, num_heads, drop_prob, num_layers)
#model

In [20]:
# configure loss and optimization
import torch
from torch import nn

# When computing the loss, we are ignoring cases when the label is the padding token
#criterian = nn.NLLLoss(ignore_index=0, reduction="none")
# index of padding token is 0
criterian = nn.CrossEntropyLoss(ignore_index=0)

# Initialize parameters
for params in model.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.SGD(model.parameters(), lr=0.1)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
for params in model.parameters():
    print(params)

Parameter containing:
tensor([[ 3.4785e-03,  3.7195e-03,  4.4735e-03,  ..., -9.4871e-03,
          1.1240e-02,  5.0808e-03],
        [ 1.6372e-03,  1.1049e-02,  1.3005e-02,  ...,  9.9636e-03,
         -9.1842e-03, -5.9640e-03],
        [ 8.0815e-03,  1.3578e-02,  7.6332e-03,  ..., -1.3631e-02,
          1.0846e-02,  4.5835e-03],
        ...,
        [-7.0538e-03, -3.8137e-03,  1.9973e-03,  ...,  1.3423e-02,
         -1.1269e-02, -2.9455e-03],
        [-9.6588e-03,  8.5613e-03,  8.0871e-03,  ...,  1.4665e-03,
         -2.0318e-03, -3.7139e-04],
        [-8.6069e-03, -4.9576e-03,  1.0922e-02,  ..., -3.5362e-03,
          7.2834e-05,  9.8188e-04]], requires_grad=True)
Parameter containing:
tensor([[ 0.0154,  0.0323, -0.0183,  ...,  0.0461,  0.0106, -0.0395],
        [-0.0402, -0.0341,  0.0341,  ..., -0.0478,  0.0016,  0.0295],
        [ 0.0397, -0.0011, -0.0001,  ...,  0.0298,  0.0071,  0.0528],
        ...,
        [-0.0264,  0.0119,  0.0453,  ...,  0.0309, -0.0527,  0.0138],
        [ 0

## Training

In [22]:
import time
import math

grad_clip_val = 1

# batch training for each epoch
def fit_epoch():
    # Training
    model.train()
    model.to(device)
    epoch_loss = 0
    train_iterator = iter(train_dataloader)
    for _, batch in enumerate(train_iterator):
        en_tokenized, en_valid_lens, cn_tokenized, cn_valid_len = batch_tokenize(batch)
        optim.zero_grad()

        output = model(en_tokenized, en_valid_lens, cn_tokenized)
        output = output[1:].view(-1, output.shape[-1]).to(device)
        trg = cn_tokenized[1:].view(-1).to(device)
        train_loss = criterian(output, trg).to(device)

        train_loss.backward()
        if grad_clip_val > 0:  # To be discussed later
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)
        optim.step()
        epoch_loss += train_loss.item()

    # Validation
    if val_dataloader is None:
        return
    model.eval()
    val_epoch_loss = 0
    val_iterator = iter(val_dataloader)
    for _, batch in enumerate(val_iterator):
        with torch.no_grad():
            val_en_tokenized, val_en_valid_lens, val_cn_tokenized, val_cn_valid_len = batch_tokenize(batch)
            val_output = model(val_en_tokenized, val_en_valid_lens, val_cn_tokenized)
            val_output = val_output[1:].view(-1, val_output.shape[-1])
            val_trg = val_cn_tokenized[1:].view(-1)
            val_loss = criterian(val_output, val_trg)
            val_epoch_loss += val_loss.item()
    return epoch_loss / len(train_iterator), val_epoch_loss / len(val_iterator)

In [23]:
def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
import copy

best_valid_loss = float('inf')
max_epochs = 10
train_losses, val_losses = [], []

for epoch in range(max_epochs):
    start_time = time.time()
    train_loss, valid_loss = fit_epoch()
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    torch.save(model.cpu().state_dict(), f'checkpoint/epoch_{epoch}.pt')
    torch.save(model.cpu().state_dict(), f'checkpoint/latest.pt')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_model = copy.deepcopy(model)
        torch.save(best_model.cpu().state_dict(), f'checkpoint/best.pt')
    train_losses.append(train_loss)
    val_losses.append(valid_loss)

print(f'\t Val. Loss of best model: {valid_loss:.3f} |  Val. PPL of best model: {math.exp(valid_loss):7.3f}')


Epoch: 01 | Time: 17m 3s
	Train Loss: 5.848 | Train PPL: 346.623
	 Val. Loss: 5.129 |  Val. PPL: 168.784
Epoch: 02 | Time: 17m 40s
	Train Loss: 4.444 | Train PPL:  85.122
	 Val. Loss: 4.340 |  Val. PPL:  76.744
Epoch: 03 | Time: 17m 36s
	Train Loss: 3.410 | Train PPL:  30.279
	 Val. Loss: 3.461 |  Val. PPL:  31.837
Epoch: 04 | Time: 18m 7s
	Train Loss: 2.495 | Train PPL:  12.126
	 Val. Loss: 2.873 |  Val. PPL:  17.698
Epoch: 05 | Time: 18m 0s
	Train Loss: 1.846 | Train PPL:   6.332
	 Val. Loss: 2.472 |  Val. PPL:  11.849
Epoch: 06 | Time: 18m 4s
	Train Loss: 1.352 | Train PPL:   3.866
	 Val. Loss: 2.395 |  Val. PPL:  10.973
Epoch: 07 | Time: 18m 5s
	Train Loss: 0.963 | Train PPL:   2.619
	 Val. Loss: 1.860 |  Val. PPL:   6.427
Epoch: 08 | Time: 19m 14s
	Train Loss: 0.652 | Train PPL:   1.919
	 Val. Loss: 1.487 |  Val. PPL:   4.424
