In [None]:
%reload_ext autoreload
%autoreload 2

import copy
import math
import time
import torch
import sys
from torch import nn, Tensor

sys.path.append("..")
from model import TransformerModel, generate_square_subsequent_mask
from data_utils import data_process, batchify, get_batch
from train import train_epoch, evaluate

# Seed PyTorch's random number generators so that steps drawing from them
# (e.g. parameter initialisation, dropout) are repeatable across
# "Restart Kernel -> Run All". Some CUDA kernels can still be
# non-deterministic; see the PyTorch reproducibility notes.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"torch_version: {torch.__version__}")
print(f"device: {device}")

## Dataset

In [None]:
from torch.utils.data import dataset
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


# Build the vocabulary from the tokenized training split only; "<unk>" is the
# single special token.
train_iter = WikiText2(split="train")
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(
    (tokenizer(line) for line in train_iter),
    specials=["<unk>"],
)
# Tokens that are not in the vocabulary resolve to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])

# Request all three splits again: the train_iter above has already been
# iterated once while building the vocabulary.
train_iter, val_iter, test_iter = WikiText2()
train_data, val_data, test_data = [
    data_process(split_iter, vocab, tokenizer)
    for split_iter in (train_iter, val_iter, test_iter)
]

# Training uses a batch size of 20; validation and test share a batch size of 10.
batch_size, eval_batch_size = 20, 10

# Re-bind each split to the result of batchify (the target device is passed in).
train_data = batchify(train_data, batch_size, device)
val_data, test_data = [
    batchify(split, eval_batch_size, device)
    for split in (val_data, test_data)
]

In [None]:
# Index -> token lookup built from the vocabulary's index-to-string list.
idx2word = dict(enumerate(vocab.get_itos()))


def to_sentence(x):
    """Return the tokens for the indices in ``x`` joined by single spaces.

    Each element of ``x`` is converted with ``int`` and looked up in
    ``idx2word``.
    """
    tokens = (idx2word[int(idx)] for idx in x)
    return " ".join(tokens)

# Print a 30-row window from column 0 of test_data, followed by the same
# window shifted forward by one position (presumably the input/target pairing
# used for language modelling -- confirm against get_batch).
i = 50
span = 30
x = to_sentence(test_data[i:i + span][:, 0])
y = to_sentence(test_data[i + 1:i + 1 + span][:, 0])
print(x, y, sep="\n\n")

## Build Model

In [None]:
# Hyperparameters, passed positionally to TransformerModel below.
ntokens = len(vocab)       # number of entries in the vocabulary
emsize, d_hid = 200, 200   # presumably embedding width / feed-forward width -- confirm in model.py
nlayers, nhead = 2, 2      # presumably layer count / attention heads -- confirm in model.py
dropout = 0.2

# Build the model, then move it to the selected device.
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout)
model = model.to(device)

## Training

In [None]:
lr = 5.0  # initial learning rate handed to SGD
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Decay the learning rate by a factor of 0.95 after every scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [None]:
epochs = 2
best_val_loss = float("inf")
best_model = None  # deep copy of the model from the epoch with the lowest validation loss
rule = '-' * 89

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()

    # NOTE(review): the scheduler is handed to train_epoch and is also stepped
    # at the bottom of this loop -- confirm train_epoch does not step it too.
    train_epoch(
        train_data, model, criterion, optimizer, scheduler, ntokens, epoch,
        log_interval=10, device=device,
    )

    val_loss = evaluate(val_data, model, criterion, ntokens, device=device)
    val_ppl = math.exp(val_loss)  # reported as perplexity: exp of the validation loss
    elapsed = time.time() - epoch_start_time

    summary = (f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
               f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print(rule, summary, rule, sep='\n')

    # Snapshot the model whenever the validation loss improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

## Eval

In [None]:
# best_model is only assigned when an epoch's validation loss is strictly below
# the running best (initially +inf). With zero epochs, or a NaN validation loss
# (nan < inf is False), it is still None -- fail here with a clear message
# rather than with an opaque error from inside evaluate().
if best_model is None:
    raise RuntimeError(
        "best_model is None: no epoch produced a validation loss below inf "
        "(zero epochs or a NaN loss?). Re-run the training cell before evaluating."
    )

# Score the best checkpoint on the held-out test split and report the loss
# together with its exponential (perplexity).
test_loss = evaluate(test_data, best_model, criterion, ntokens, device=device)
test_ppl = math.exp(test_loss)

print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)