# Transformer tutorial
Feb 2021 \
https://github.com/pytorch/tutorials/blob/master/beginner_source/transformer_tutorial.py

In [1]:
import math
import numpy as np


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import io

from ruamel.yaml import YAML
import logging
import time


In [2]:
import matplotlib.pyplot as plt

In [3]:

class YParams():
  """Yaml file parser.

  Loads one named section of a YAML file and exposes every entry both as a
  dictionary item (``p['lr']``, stored in ``self.params``) and as an
  attribute (``p.lr``).

  Args:
    yaml_filename: path of the YAML file to read.
    config_name: top-level key of the section to load.
    print_params: when True, print every key/value pair while loading.

  Raises:
    KeyError: if ``config_name`` is not a top-level key of the file.
  """

  def __init__(self, yaml_filename, config_name, print_params=False):
    self._yaml_filename = yaml_filename
    self._config_name = config_name
    self.params = {}

    if print_params:
      print("------------------ Configuration ------------------")

    # Read the config as UTF-8 instead of relying on the platform's default
    # encoding, so the same file parses identically on every machine.
    with open(yaml_filename, encoding="utf-8") as _file:
      section = YAML().load(_file)[config_name]

    for key, val in section.items():
      if print_params:
        print(key, val)
      # The literal string 'None' in the file stands for a real None.
      if val == 'None':
        val = None
      self[key] = val

    if print_params:
      print("---------------------------------------------------")

  def __getitem__(self, key):
    """Return the value stored under ``key`` (KeyError if absent)."""
    return self.params[key]

  def __setitem__(self, key, val):
    """Store ``val`` under ``key`` in ``self.params`` and as an attribute."""
    self.params[key] = val
    setattr(self, key, val)

  def __contains__(self, key):
    """Return True if ``key`` has been loaded or set."""
    return key in self.params

  def update_params(self, config):
    """Set every key/value pair of the mapping ``config`` on this object."""
    for key, val in config.items():
      self[key] = val

  def log(self):
    """Write the configuration source and every parameter to the root logger."""
    logging.info("------------------ Configuration ------------------")
    logging.info("Configuration file: %s", self._yaml_filename)
    logging.info("Configuration name: %s", self._config_name)
    for key, val in self.params.items():
      logging.info("%s %s", key, val)
    logging.info("---------------------------------------------------")

In [4]:
%matplotlib widget

## Terminology
|Term|meaning| Eg |
|:-|:-|:-|
| **ntokens** | size of vocabulary| 400 |
| **ninp** | Embedding dimension (model width, `d_model`; `emsize` in the config) | 200 | 
| **nhead** | Number of heads in attention mechanism | 8 | 
| **nhid** | Dimension of FF in encoder | |
| **nlayers**  | Layers in encoder | |


In [5]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed and
    registered as a buffer: even feature columns hold sines and odd columns
    hold cosines of ``position * div_term``.

    Args:
        d_model: size of the feature (last) dimension of the input.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Insert a singleton dim 1 so the table broadcasts over dim 1 of the input.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        ``x`` is indexed by position along dim 0 and must have ``d_model`` as
        its last dimension (presumably ``(seq_len, batch, d_model)``).
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> Transformer encoder stack -> linear layer.

    Args:
        ntoken: number of embedding rows and of output features of the final linear layer.
        ninp: embedding width, also used as the encoder model dimension.
        nhead: number of attention heads per encoder layer.
        nhid: feed-forward width passed to each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.ninp = ninp
        # Sub-modules are created in a fixed order so parameter registration
        # (and random initialisation) order stays stable.
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(self.encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.decoder = nn.Linear(ninp, ntoken)
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        visible = torch.tril(torch.ones(sz, sz, dtype=torch.bool))
        additive = torch.zeros(sz, sz, dtype=torch.float)
        return additive.masked_fill(~visible, float('-inf'))

    def init_weights(self):
        """Uniformly initialise embedding and output weights in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, src_mask):
        """Embed ``src`` (scaled by sqrt(ninp)), encode it under ``src_mask`` and project with the final linear layer."""
        embedded = self.encoder(src) * math.sqrt(self.ninp)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)


In [6]:
# class PatchEmbedding(nn.Module):
#     def __init__(self, in_channels: int = 4, patch_size: int = 16, emb_size: int = 768):
#         self.patch_size = patch_size
#         super().__init__()
#         self.model = nn.Sequential(
#             nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
#         )
                
#     def forward(self, x):
#         x = self.model(x)
#         return x
    
# # x=torch.rand(4,4,16,16)
# # PatchEmbedding()(x).shape


### Data setup

In [7]:

def f_data_process(raw_text_iter,vocab,tokenizer):
    '''Get tensor with vocab indices for given words.
    Convert words -> indices

    Each item of raw_text_iter is split with tokenizer, every token is looked
    up in vocab, and the indices of all items are concatenated into a single
    1-D long tensor. Items that produce no tokens are skipped. If nothing
    produces a token, an empty long tensor is returned (torch.cat would
    raise on an empty sequence).
    '''
    pieces = []
    for item in raw_text_iter:
        indices = [vocab[token] for token in tokenizer(item)]
        if indices:
            pieces.append(torch.tensor(indices, dtype=torch.long))
    if not pieces:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(pieces)

def f_batchify(data, bsz,device):
    '''Arrange a 1-D tensor into bsz columns and move it to device.

    The data is cut into bsz contiguous chunks of equal length (trailing
    elements that do not fit are dropped); chunk j becomes column j of the
    result, whose shape is (len(data) // bsz, bsz).
    '''
    # Number of rows each of the bsz columns will hold.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Use the explicit row count rather than -1: with fewer than bsz elements
    # nothing is left after trimming and view(bsz, -1) cannot infer a size.
    data = data.view(bsz, nbatch).t().contiguous()
    return data.to(device)

def f_get_batch(source, i, bptt=None):
    ''' Create data and target from source.

    data is the slice of up to bptt rows of source starting at row i.
    target is the same slice shifted forward by one row (the next row for
    every row of data), flattened to 1-D.

    bptt is the maximum number of rows per chunk. When omitted it falls back
    to the module-level params.bptt, so existing two-argument calls behave
    as before.
    '''
    if bptt is None:
        bptt = params.bptt
    # Leave one row at the end so that every data row has a following target row.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target


class dataset:
    ''' Class for storing datasets and other info

    Downloads and extracts the WikiText-2 archive, builds a vocabulary from
    the training file, converts the three splits to index tensors and
    batchifies them. Reads batch_size, eval_batch_size and 'device' from the
    module-level params object.

    Attributes set: tokenizer, vocab, train_data, val_data, test_data.
    '''
    def __init__(self):
        url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'
        ## Download file names for train, val, test
        # NOTE(review): relies on extract_archive returning the paths in
        # (test, valid, train) order -- confirm for the torchtext version in use.
        test_filepath, valid_filepath, train_filepath = extract_archive(download_from_url(url))
        ## Get function to Convert sentence to list of words
        self.tokenizer = get_tokenizer('basic_english')
        ## Get vocab list
        # Read under `with` so the file handle is closed once the vocab is built.
        with io.open(train_filepath, encoding="utf8") as train_file:
            self.vocab = build_vocab_from_iterator(map(self.tokenizer, train_file))

        train_data = self._load_split(train_filepath)
        val_data = self._load_split(valid_filepath)
        test_data = self._load_split(test_filepath)
        print(train_data.shape,val_data.shape,test_data.shape)

        self.train_data = f_batchify(train_data, params.batch_size,params['device'])
        self.val_data = f_batchify(val_data, params.eval_batch_size,params['device'])
        self.test_data = f_batchify(test_data, params.eval_batch_size,params['device'])

        print(self.train_data.shape,self.val_data.shape,self.test_data.shape)

    def _load_split(self, filepath):
        ''' Convert one text file to a tensor of vocab indices, closing the file afterwards'''
        with io.open(filepath, encoding="utf8") as text_file:
            return f_data_process(text_file, self.vocab, self.tokenizer)


In [8]:
class Trainer():
    '''Bundle a TransformerModel with its loss, SGD optimizer and StepLR scheduler.

    All hyper-parameters are read from the params object given to the
    constructor (ntokens, emsize, nhead, nhid, nlayers, dropout, lr, bptt and
    the 'device' item). Batches are still produced by f_get_batch, which is
    called with its original two arguments.
    '''
    def __init__(self,params):
        self.params=params
        self.model = TransformerModel(params.ntokens, params.emsize, params.nhead, params.nhid, params.nlayers, params.dropout).to(params['device'])
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=params.lr)
        # Decay the learning rate by 5% on every scheduler.step().
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.95)
        # Number of train() calls so far; used only in the progress log.
        self.epoch = 0

    def train(self,dataset):
        '''Run one pass over the batchified tensor `dataset`, updating the model.

        Prints a progress line every 200 batches. The scheduler is not
        stepped here; the caller does that once per epoch.
        '''
        params = self.params
        device = params['device']
        self.epoch += 1
        self.model.train() # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        log_interval = 200
        num_batches = len(dataset) // params.bptt
        full_mask = self.model.generate_square_subsequent_mask(params.bptt).to(device)
        for batch, i in enumerate(range(0, dataset.size(0) - 1, params.bptt)):
            data, targets = f_get_batch(dataset, i)
            self.optimizer.zero_grad()
            # A shorter chunk (the last one) needs a mask of matching size.
            if data.size(0) == params.bptt:
                src_mask = full_mask
            else:
                src_mask = self.model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = self.model(data, src_mask)
            loss = self.criterion(output.view(-1, params.ntokens), targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            self.optimizer.step()

            total_loss += loss.item()
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                # get_last_lr() reports the rate actually in use; get_lr() is
                # only meaningful inside scheduler.step().
                print('| epoch {:3d} | {:5d}/{:5d} batches | '
                      'lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                        self.epoch, batch, num_batches, self.scheduler.get_last_lr()[0],
                        elapsed * 1000 / log_interval,
                        cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()


    def evaluate(self, eval_model, data_source):
        '''Return the average loss of `eval_model` over the batchified tensor `data_source`.

        Each chunk's loss is weighted by its number of rows and the sum is
        divided by len(data_source) - 1. No gradients are computed.
        '''
        params = self.params
        device = params['device']
        # Put the model being evaluated (not necessarily self.model) in eval mode.
        eval_model.eval()
        total_loss = 0.
        full_mask = self.model.generate_square_subsequent_mask(params.bptt).to(device)
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, params.bptt):
                data, targets = f_get_batch(data_source, i)
                if data.size(0) == params.bptt:
                    src_mask = full_mask
                else:
                    src_mask = self.model.generate_square_subsequent_mask(data.size(0)).to(device)
                output = eval_model(data, src_mask)
                output_flat = output.view(-1, params.ntokens)
                total_loss += len(data) * self.criterion(output_flat, targets).item()
        return total_loss / (len(data_source) - 1)


In [9]:
if __name__=="__main__":
    # Script entry point: load the config, build the data and trainer, train
    # for params.epochs epochs, then report the test loss of the best model.
    import copy  # only needed here, to snapshot the best model

    torch.backends.cudnn.benchmark = True

    params = YParams('config_1.yaml', 'default')
    params['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    Dset=dataset()
    params['ntokens'] = len(Dset.vocab.stoi)  # the size of vocabulary

    params['epochs'] = 10  # override num_epochs

    trainer=Trainer(params)

    ########################################################################
    # Train
    best_val_loss = float("inf")
    best_model = None

    for epoch in range(1, params.epochs + 1):
        epoch_start_time = time.time()
        trainer.train(Dset.train_data)
        val_loss = trainer.evaluate(trainer.model, Dset.val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Keep a copy: trainer.model keeps changing in later epochs, so a
            # plain reference would always end up being the last model.
            best_model = copy.deepcopy(trainer.model)

        trainer.scheduler.step()

    ######################################################################
    # Evaluate the model with the test dataset
    # Apply the best model to check the result with the test dataset.

    test_loss = trainer.evaluate(best_model, Dset.test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)

36718lines [00:01, 29352.78lines/s]


torch.Size([2049990]) torch.Size([214417]) torch.Size([241859])
torch.Size([102499, 20]) torch.Size([21441, 10]) torch.Size([24185, 10])




| epoch   1 |   200/ 2928 batches | lr 1.00 | ms/batch 21.42 | loss  7.56 | ppl  1911.31
| epoch   1 |   400/ 2928 batches | lr 1.00 | ms/batch 18.01 | loss  6.92 | ppl  1010.56
| epoch   1 |   600/ 2928 batches | lr 1.00 | ms/batch 18.14 | loss  6.67 | ppl   791.60
| epoch   1 |   800/ 2928 batches | lr 1.00 | ms/batch 18.07 | loss  6.58 | ppl   723.44
| epoch   1 |  1000/ 2928 batches | lr 1.00 | ms/batch 18.01 | loss  6.47 | ppl   646.90
| epoch   1 |  1200/ 2928 batches | lr 1.00 | ms/batch 17.95 | loss  6.43 | ppl   622.97
| epoch   1 |  1400/ 2928 batches | lr 1.00 | ms/batch 18.06 | loss  6.34 | ppl   564.63
| epoch   1 |  1600/ 2928 batches | lr 1.00 | ms/batch 17.96 | loss  6.32 | ppl   555.64
| epoch   1 |  1800/ 2928 batches | lr 1.00 | ms/batch 17.68 | loss  6.23 | ppl   507.89
| epoch   1 |  2000/ 2928 batches | lr 1.00 | ms/batch 17.87 | loss  6.21 | ppl   495.54
| epoch   1 |  2200/ 2928 batches | lr 1.00 | ms/batch 18.02 | loss  6.08 | ppl   435.55
| epoch   1 |  2400/ 

In [10]:
params.params

{'batch_size': 20,
 'eval_batch_size': 10,
 'bptt': 35,
 'emsize': 200,
 'nhid': 200,
 'nlayers': 6,
 'nhead': 8,
 'dropout': 0.2,
 'lr': 1.0,
 'epochs': 10,
 'device': device(type='cuda'),
 'ntokens': 28783}