In [None]:
! pip install labml
! pip install labml-nn

Collecting labml-nn
  Downloading labml_nn-0.4.118-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 7.8 MB/s 
Collecting labml-helpers>=0.4.84
  Downloading labml_helpers-0.4.84-py3-none-any.whl (18 kB)
Collecting einops
  Downloading einops-0.3.2-py3-none-any.whl (25 kB)
Installing collected packages: labml-helpers, einops, labml-nn
Successfully installed einops-0.3.2 labml-helpers-0.4.84 labml-nn-0.4.118


In [119]:
import math
from typing import Optional, List
import torch
from torch import nn as nn
from labml_nn.utils import clone_module_list
import argparse
from google.colab import drive
drive.mount('/content/gdrive/')
import argparse
import time
import math
import os, sys
import itertools

import numpy as np

import torch.optim as optim

Mounted at /content/gdrive/


In [None]:
import sys
sys.path.append('/content/gdrive/MyDrive/Study/NLP/')
sys.path.append('/content/gdrive/MyDrive/Study/NLP/utils')

In [None]:
import utils
import data_utils

# Model

In [None]:
class PrepareForMultiHeadAttention(nn.Module):
  def __init__(self, d_model, heads,d_k, bias):
    super().__init__()
    self.linear = nn.Linear(d_model, heads*d_k, bias= bias)
    self.heads = heads
    self.d_k = d_k
  def forward(self, x: torch.Tensor):
    head_shape = x.shape[:-1]
    x = self.linear(x)
    x = x.view(*head_shape, self.heads, self.d_k)
    return x



In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1,bias=True):
      super().__init__()
      self.d_k = d_model // heads
      self.heads = heads
      self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
      self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
      self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
      self.softmax = nn.Softmax(dim=1)
      self.output = nn.Linear(d_model, d_model)
      self.dropout = nn.Dropout(droupout)
      self.scale = 1/math.sqrt(self.d_k)
      self.attn = None
    def get_scores(self, query, key):
      return torch.einsum('ibhd,jbhd->ijbh', query, key)

    def prepare_mask(self, mask, query_shape, key_shape):
      assert mask.shape[0] == 1 or mask.shape[0] == query_shape[0]
      assert mask.shape[1] == key_shape[0]
      assert mask.shape[2] == 1 or mask.shape[2] == query_shape[1]

      mask = mask.unsqueeze(-1)
      return mask

    def forward(self, *, query, key, value, mask=None):
      seq_len, batch_size, _ = query.shape
      
      if mask is not None:
        mask = self.prepare_mask(mask, query.shape, key.shape)

      query = self.query(query)
      key = self.key(key)
      value = self.value(value)
      scores = self.get_scores(query, key)

      scores *= self.scale

      if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))

      attn = self.softmax(scores)  
      attn = self.dropout(attn)
      x = torch.einsum("ijbh,jbhd->ibhd", attn, value)
      self.attn = attn.detach()
      x = x.reshape(seq_len, batch_size, -1)
      return self.output(X)

In [None]:
class FeedForWard(nn.Module):
  def __init__(self, d_model, d_ff, dropout=0.1, activation=nn.ReLU(), is_gated=False, bias_1 = True, bias_2=True, bias_gate=True):
    super().__init__()
    self.layer1 = nn.Linear(d_model, d_ff, bias=bias_1)
    self.layer2 = nn.Linear(d_model, d_ff, bias=bias_2)
    self.dropout = nn.Dropout(dropout)

    self.activation = activation

    self.is_gated = is_gated
    if is_gated:
      self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate)
    
  def forward(self, x):
    g = self.activation(self.layer1(x))
    if self.is_gated:
      x = g*self.linear_v(x)
    else:
      x = g
    x = self.dropout(x)
    return self.layer2(x)

In [None]:
def shift_right(x):
  zero_pad = x.new_zeros(x.shape[0], 1, *x.shape[2:])
  x_padded = torch.cat([x, zero_pad], dim=1)
  x_padded = x_padded.view(x.shape[1] + 1, x.shape[0], *x.shape[2:])
  x = x_padded[:-1].view_as(x)
  return x
class RelativeMultiHeadAttention(MultiHeadAttention):
  def __init__(self, heads, d_model, dropout_prob = 0.1):
    super().__init__(heads, d_model, dropout_prob, bias=False)
    self.P = 2 ** 12
    self.key_pos_embeddings = nn.Parameter(torch.zeros((self.P * 2, heads, self.d_k)), requires_grad=True)
    self.key_pos_bias = nn.Parameter(torch.zeros((self.P * 2, heads)), requires_grad=True)
    self.query_pos_bias = nn.Parameter(torch.zeros((heads, self.d_k)), requires_grad=True)

  def get_scores(self, query, key):
    key_pos_emb = self.key_pos_embeddings[self.P - key.shape[0]:self.P + query.shape[0]]
    key_pos_bias = self.key_pos_bias[self.P - key.shape[0]:self.P + query.shape[0]]
    query_pos_bias = self.query_pos_bias[None, None, :, :]
    
    ac = torch.einsum('ibhd,jbhd->ijbh', query + query_pos_bias, key)
    b = torch.einsum('ibhd,jhd->ijbh', query, key_pos_emb)
    d = key_pos_bias[None, :, None, :]
    bd = shift_right(b + d)
    bd = bd[:, -key.shape[0]:]
    return ac + bd



In [None]:
class TransformerXLLayer(nn.Module):
  def __init__(self,*, d_model, self_attn, feed_forwar, dropout_prob):
    super().__init__()
    self.size = d_model
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.droupout = nn.Dropout(dropout_prob)
    self.norm_self_attn = nn.LayerNorm([d_model])
    self.norm_ff = nn.LayerNorm([d_model])

  def forward(self, *, x, mem, mask):
    z = self.norm_self_attn(x)

    if mem is not None:
      mem = self.norm_self_attn(mem)
      m_z = torch.cat((mem,z), dim=0)
    else:
      m_z = z
    self_attn = self.self_attn(query = z, key = m_z, value = m_z, mask = mask)

    x = x + self.dropout(self_attn)
    z = self.norm_ff(x)
    ff = self.feed_forward(z)
    x = x + self.dropout(ff)

    return x

class Transformer(nn.Module):
  def __init__(self, layer, n_layers):
    super().__init__()
    self.layers = clone_module_list(layer, n_layers)
    self.norm = nn.LayerNorm([layer.size])

  def forward(self, x, mem, mask):
    new_mem = []
    for i, layer in enumerate(self.layers):
      new_mem.append(x.detach())
      m = mem[i] if mem else None
      x = layer(x=x, meme=m, mask=mask)
    return self.norm(x), new_mem

# Tiền xử lý dữ liệu

In [None]:
from data_utils import get_lm_corpus, LMOrderedIterator
from exp_utils import create_exp_dir

In [115]:
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument('--data', type=str, default='/content/drive/MyDrive/Data/',
                    help='location of the data corpus')
parser.add_argument('--dataset', type=str, default='wt2',
                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
                    help='dataset name')
parser.add_argument('--n_layer', type=int, default=12,
                    help='number of total layers')
parser.add_argument('--n_head', type=int, default=10,
                    help='number of heads')
parser.add_argument('--d_head', type=int, default=50,
                    help='head dimension')
parser.add_argument('--d_embed', type=int, default=-1,
                    help='embedding dimension')
parser.add_argument('--d_model', type=int, default=500,
                    help='model dimension')
parser.add_argument('--d_inner', type=int, default=1000,
                    help='inner dimension in FF')
parser.add_argument('--dropout', type=float, default=0.0,
                    help='global dropout rate')
parser.add_argument('--dropatt', type=float, default=0.0,
                    help='attention probability dropout rate')
parser.add_argument('--init', default='normal', type=str,
                    help='parameter initializer to use.')
parser.add_argument('--emb_init', default='normal', type=str,
                    help='parameter initializer to use.')
parser.add_argument('--init_range', type=float, default=0.1,
                    help='parameters initialized by U(-init_range, init_range)')
parser.add_argument('--emb_init_range', type=float, default=0.01,
                    help='parameters initialized by U(-init_range, init_range)')
parser.add_argument('--init_std', type=float, default=0.02,
                    help='parameters initialized by N(0, init_std)')
parser.add_argument('--proj_init_std', type=float, default=0.01,
                    help='parameters initialized by N(0, init_std)')
parser.add_argument('--optim', default='adam', type=str,
                    choices=['adam', 'sgd', 'adagrad'],
                    help='optimizer to use.')
parser.add_argument('--lr', type=float, default=0.00025,
                    help='initial learning rate (0.00025|5 for adam|sgd)')
parser.add_argument('--mom', type=float, default=0.0,
                    help='momentum for sgd')
parser.add_argument('--scheduler', default='cosine', type=str,
                    choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'],
                    help='lr scheduler to use.')
parser.add_argument('--warmup_step', type=int, default=0,
                    help='upper epoch limit')
parser.add_argument('--decay_rate', type=float, default=0.5,
                    help='decay factor when ReduceLROnPlateau is used')
parser.add_argument('--lr_min', type=float, default=0.0,
                    help='minimum learning rate during annealing')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--clip_nonemb', action='store_true',
                    help='only clip the gradient of non-embedding params')
parser.add_argument('--max_step', type=int, default=100000,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=60,
                    help='batch size')
parser.add_argument('--batch_chunk', type=int, default=1,
                    help='split batch into chunks to save memory')
parser.add_argument('--tgt_len', type=int, default=70,
                    help='number of tokens to predict')
parser.add_argument('--eval_tgt_len', type=int, default=50,
                    help='number of tokens to predict for evaluation')
parser.add_argument('--ext_len', type=int, default=0,
                    help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=0,
                    help='length of the retained previous heads')
parser.add_argument('--not_tied', action='store_true',
                    help='do not tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--adaptive', action='store_true',
                    help='use adaptive softmax')
parser.add_argument('--div_val', type=int, default=1,
                    help='divident value for adapative input and softmax')
parser.add_argument('--pre_lnorm', action='store_true',
                    help='apply LayerNorm to the input instead of the output')
parser.add_argument('--varlen', action='store_true',
                    help='use variable length')
parser.add_argument('--multi_gpu', action='store_true',
                    help='use multiple GPU')
parser.add_argument('--log-interval', type=int, default=200,
                    help='report interval')
parser.add_argument('--eval-interval', type=int, default=4000,
                    help='evaluation interval')
parser.add_argument('--work_dir', default='/content/drive/MyDrive/Study/NLP/', type=str,
                    help='experiment directory.')
parser.add_argument('--restart', action='store_true',
                    help='restart training from the saved checkpoint')
parser.add_argument('--restart_dir', type=str, default='',
                    help='restart dir')
parser.add_argument('--debug', action='store_true',
                    help='run in debug mode (do not create exp dir)')
parser.add_argument('--same_length', action='store_true',
                    help='use the same attn length for all tokens')
parser.add_argument('--attn_type', type=int, default=0,
                    help='attention type. 0 for ours, 1 for Shaw et al,'
                    '2 for Vaswani et al, 3 for Al Rfou et al.')
parser.add_argument('--clamp_len', type=int, default=-1,
                    help='use the same pos embeddings after clamp_len')
parser.add_argument('--eta_min', type=float, default=0.0,
                    help='min learning rate for cosine scheduler')
parser.add_argument('--gpu0_bsz', type=int, default=-1,
                    help='batch size on gpu 0')
parser.add_argument('--max_eval_steps', type=int, default=-1,
                    help='max eval steps')
parser.add_argument('--sample_softmax', type=int, default=-1,
                    help='number of samples in sampled softmax')
parser.add_argument('--patience', type=int, default=0,
                    help='patience')
parser.add_argument('--finetune_v2', action='store_true',
                    help='finetune v2')
parser.add_argument('--finetune_v3', action='store_true',
                    help='finetune v3')
parser.add_argument('--fp16', action='store_true',
                    help='Run in pseudo-fp16 mode (fp16 storage fp32 math).')
parser.add_argument('--static-loss-scale', type=float, default=1,
                    help='Static loss scale, positive power of 2 values can '
                    'improve fp16 convergence.')
parser.add_argument('--dynamic-loss-scale', action='store_true',
                    help='Use dynamic loss scaling.  If supplied, this argument'
                    ' supersedes --static-loss-scale.')
parser.add_argument('-f')
args = parser.parse_args()

In [None]:
corpus = get_lm_corpus(args.data, 'wt2')
ntokens = len(corpus.vocab)
args.n_token = ntokens



eval_batch_size = 10
tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)
va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
    device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len,
    device=device, ext_len=args.ext_len)

Loading cached dataset...


In [None]:
layer = TransformerXLLayer()
model = 

In [None]:
if args.scheduler == 'cosine':
    # here we do not set eta_min to lr_min to be backward compatible
    # because in previous versions eta_min is default to 0
    # rather than the default value of lr_min 1e-6
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
        args.max_step, eta_min=args.eta_min) # should use eta_min arg
    if args.sample_softmax > 0:
        scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse,
            args.max_step, eta_min=args.eta_min) # should use eta_min arg

<data_utils.LMOrderedIterator at 0x7f01bb26bed0>

In [120]:
if args.optim.lower() == 'adam':
    if args.sample_softmax > 0:
        dense_params, sparse_params = [], []
        for param in model.parameters():
            if param.size() == model.word_emb.weight.size():
                sparse_params.append(param)
            else:
                dense_params.append(param)
        optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
        optimizer = optim.Adam(dense_params, lr=args.lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

NameError: ignored

In [116]:

def evaluate(eval_iter):
    # Turn on evaluation mode which disables dropout.
    model.eval()

    # If the model does not use memory at all, make the ext_len longer.
    # Otherwise, make the mem_len longer and keep the ext_len the same.
    if args.mem_len == 0:
        model.reset_length(args.eval_tgt_len,
            args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len)
    else:
        model.reset_length(args.eval_tgt_len,
            args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len)

    # Evaluation
    total_len, total_loss = 0, 0.
    with torch.no_grad():
        mems = tuple()
        for i, (data, target, seq_len) in enumerate(eval_iter):
            if args.max_eval_steps > 0 and i >= args.max_eval_steps:
                break
            ret = model(data, target, *mems)
            loss, mems = ret[0], ret[1:]
            loss = loss.mean()
            total_loss += seq_len * loss.float().item()
            total_len += seq_len

    # Switch back to the training mode
    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    model.train()

    return total_loss / total_len

In [117]:
def train():
    # Turn on training mode which enables dropout.
    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
    model.train()
    train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
    for batch, (data, target, seq_len) in enumerate(train_iter):
        model.zero_grad()
        ret = para_model(data, target, *mems)
        loss, mems = ret[0], ret[1:]
        loss = loss.float().mean().type_as(loss)
        loss.backward()
        train_loss += loss.float().item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        if args.sample_softmax > 0:
            optimizer_sparse.step()

        # step-wise learning rate annealing
        train_step += 1
        if args.scheduler in ['cosine', 'constant', 'dev_perf']:
            # linear warmup stage
            if train_step < args.warmup_step:
                curr_lr = args.lr * train_step / args.warmup_step
                optimizer.param_groups[0]['lr'] = curr_lr
                if args.sample_softmax > 0:
                    optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
            else:
                if args.scheduler == 'cosine':
                    scheduler.step(train_step)
                    if args.sample_softmax > 0:
                        scheduler_sparse.step(train_step)
        elif args.scheduler == 'inv_sqrt':
            scheduler.step(train_step)

        if train_step % args.log_interval == 0:
            cur_loss = train_loss / args.log_interval
            elapsed = time.time() - log_start_time
            log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
                      '| ms/batch {:5.2f} | loss {:5.2f}'.format(
                epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss)
            if args.dataset in ['enwik8', 'text8']:
                log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))
            else:
                log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
            logging(log_str)
            train_loss = 0
            log_start_time = time.time()

        if train_step % args.eval_interval == 0:
            val_loss = evaluate(va_iter)
            logging('-' * 100)
            log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
                      '| valid loss {:5.2f}'.format(
                train_step // args.eval_interval, train_step,
                (time.time() - eval_start_time), val_loss)
            if args.dataset in ['enwik8', 'text8']:
                log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
            else:
                log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
            logging(log_str)
            logging('-' * 100)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if not args.debug:
                    with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f:
                        torch.save(model, f)
                    with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f:
                        torch.save(optimizer.state_dict(), f)
                best_val_loss = val_loss

            # dev-performance based learning rate annealing
            if args.scheduler == 'dev_perf':
                scheduler.step(val_loss)
                if args.sample_softmax > 0:
                    scheduler_sparse.step(val_loss)

            eval_start_time = time.time()

        if train_step == args.max_step:
            break


tensor([[    0,   284, 15178,  ...,  1352,  1335,    16],
        [    1,   357,    43,  ...,    46,    43,  2015],
        [    2,  1496,  7369,  ...,   380,    27, 33001],
        ...,
        [  357,   415,   173,  ...,   212,    78,  1575],
        [ 2520,     9,  3890,  ...,   208,    27,   808],
        [   33,    35,    19,  ...,  8832,  6091,   209]])