In [None]:
# pip install -r requirements.txt

In [None]:
# !pip install -q torchdata=0.3.0 torchtext==0.12
# !pip install spacy==3.0, altair GPUtil
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [None]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
from torch.optim.lr_scheduler import LambdaLR

import pandas as pd
import altair as alt

from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator

import torchtext.datasets as datasets

import spacy
import GPUtil
import warnings
warnings.filterwarnings('ignore')

from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

run_exaples_True

In [None]:
def is_interactive notebook() :
  return __name__ == '__main__'

def show_example(fn, args=[]) :
  if __name__ == '__main__' & run_examples :
    return fn(*args)
  
def execute_example(fn, args=[] ) :
  if __name__ == '__main__' & run_examples :
    return fn(*args))

  
class DummyOptimizer(torch.optim.Optimizer) :
  def __init__(self) :
    self.param_groups = [{'lr' : 0}]
    None
  
  def step(self) :
    None
  
  def zero_grad(self, set_to_none = False) :
    None
  

Class DummyScheduler :
def step(self) :
  None
  

#Model Architecture

In [None]:
class EncoderDecoder(nn.Module) :

  def __init__(self, encoder, decoder, src_embed, tgt_embed, generator) :
    super(EncoderDecoder,self).__init__()

    self.encoder = encoder
    self.decoder = decoder
    self. src_embed = src_embed
    self.tgt_embed = tgt_embed
    self.generator = generator

  def forward(self, src, tgt, src_mask, tgt_mask) :
    return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)
  
  def encode(self, src, src_mask) :
    reutrn self.encoder(self.src_embed(src), src_mask)
  
  def decode(self, memory, src_mask, tgt, tgt_mask) :
    return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [None]:
class Generator(nn.Module) :
  def __init__(self, d_model, vocab) :
    super(Generator,self).__init__()

    self.prob = nn.Linear(d_model, vocab)

  def forward(self, x) :
    return log_softmax(self.prob(x), dim=-1)

    

##Encoder and Decoder Stacks

Encoder 

The encoder is composed of a stack of N=6 identical layers

In [None]:
def clones(module, N) :
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [None]:
class Encoder(nn.Module) :
  def __init__(self, layer, N) :
    super(Encoder,self).__init__()
    self.layers =clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask) :
    for layer in self.layers :
      x = layer(x, mask)
  
    return self.norm(x)

In [None]:
class LayerNorm(nn.Module) :

  def __init__(self, features, eps=1e-6) :
    super(LayerNorm,self).__init__() 
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(feature))
    self.eps = eps

  def forward(self, x) :
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)

    return self.a_2 * (x-mean) / (std+self.eps) + self.b_2

That is, the output of each sub-layer is LayerNorm(x+ sublayer(x))

sublayer(x) is the function implemented by the sublayer itself.

To ficilitate these residual connections, all sublayers in the odel, as well as the embedding layers, produce outputs of dim=512

In [None]:
class SublayerConnection(nn.Module) :
  '''
  A residual connection followed by a layer norm.
  '''

  def __init__(self, size, dropout) :
    super(SublayerConnection, self).__init__()
    self.norm = LayerNorm
    self. dropout = nn.Dropout(dropout)

  def forward(self, x) :
    return x + self.dropout(sublayer(self.norm(x)))

Each layer has two sublayers. 

The first is a multi head self att.

The Second is a simple, position-wise fc ffn.

In [None]:
class EncoderLayer(nn.Module) :
  def __init__(self, size, self_attn, ff, dropout) :
    super(EncoderLayer,self).__init__()

    self.size = size
    self.self_attn = self_attn
    self. ff = ff
    # clones  를 통해 sublayerconnection을 진행하며, 각각의 레이어는 2개
    self.sublayer = clones(SublayerConnection(size, dropout),2)
    
  def forward(self,x, mask) :
    x = self.sublayer[0](x, lambda x : self.self_attn(x,x,x,mask)) 
    return self.sublayer[1](x, self.ff)

Decoder

the decoder is also composed of a stack of N=6 identical layers

In [None]:
class Decoder(nn.Module) :
  def __init__(self, layers, N) :
    super(Decoder,self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, memory, src_mask, tgt_mask) :
    for layer in self.layers :
      x = layer(x, memory, src_mask, tgt_mask) 
      return self.norm(x)

the decoder insets a third sublayer, which performs multi head attn oer the output of the encoder stack.

In [None]:
class DecoderLayer(nn.Module) :

  def __init__(self, size, self_attn, src_attn, ff, dropout) :
    super(DecoderLayer,self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn =src_attn
    self.ff = ff
    self.sublayer = clones(SublayerConnection(size, dropout), 3)

  def forward(self, x, memory, src_mask, tgt_mask) :
    m = memory
    x = self.sublayer[0](x, lambda x : self.self_attn(x,x,x,tgt_mask))

    x = self.sublayer[1](x, lambda x : self.src_attn(x,m,m, src_mask))

    return self.sublayer[2](x, self.ff)

In [None]:
def subsequent_mask(size) :
  attn_shape = (1, size,size)

  subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.unit8)

  return subsequent_mask == 0

In [None]:
def example_mask() :
  LS_data = pd.concat(
      [
          pd.DataFrame(
              {'Subsequent_mask' : subsequent_mask(20)[0][x,y].flatten(),
              'Window' : y,
               'Masking' : x
               }
          )
          for y in range(20)
          for x in range(20)
      ]
  )

return(
    alt.Chart(Ls_data)
    .mark_rect()
    .properties(height = 250, width=250)
    .encode(
        alt.x('window:0'),
        alt.y('masking:0'),
        alt.color('Subsequent_mask : Q', scale = alt.Sclae(scheme= 'viridis')),

    )
    .interactive()
)

show_example(example_mask)

##Attention

query, key, value

The output is computed a s a weighted sum of the values, where the weight assigned to each value i scomputed by a compatibility function of the query with the corresponding key.

We call out particular attn 'Scaled dot product attn".



In [None]:
def attentino(query, key, value, mask=None, dropout=None) :
  d_k = query.size(-1)
  scores = torch.matmul(query, key.Transpose(-2, -1)) / math.sqrt(d_k)

  if mask is not None :
    scores=  scores.masked_fill(mask == 0, -1e9)

  p_attn = scores.softmax(dim=-1)
  
  if dropout is not None :
    p_attn = dropout(p_attn)

  return torch.matmul(p_attn, value), p_attn

Multihead(Q,K,V) = Concat(head1,head2.....,head_h)*WO

where head_i = Attention(QW_i^Q, Kw_i^K, VW_i^V)

In [None]:
class MultiHeadedAttentino(nn.Module) :
  def __init__(self, h, d_model, dropout=0.1) :
    super(MultiHeadedAttention,self).__init__()
    assert d_model % h == 0

    self.d_k = d_model / h
    self. h = h
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(dropout)

  def forward(self, query, key, value, mask=None) :
    if mask is not None :
      mask = mask.unsqueeze(1)
    
    nbatches = query.size(0)

    query, key, value = [
        lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1,2)
        for lin, x, in zip(self.linears, (query, key, value))
    ]

    x, self.attn = attention(
        query, key, value, mask=mask, dropout = self.dropout
    )

    x  = concat(
        x.transpose(1,2)
        .contiguous()
        .view(nbatches, -1, self.h * self.d_k)
    )

    del query
    del key
    del value
    return self.linears[-1](x)

Position Wise FFN

FFN(x) = max(0, xW1 + b1)*W2 + b2


 

While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convlitions with kernel size 1. The dimensionality of input and output is d_model = 512, and the inner_layer, it has dimensionality d_ff = 2048.

In [None]:
class PositionwiseFF(nn.Module) :

  def __init__(self, d_model, d_ff, dropout=0.1) :
    super(PositionwiseFF,self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self. w_2 = nn.Linear(d_ff, d_model)
    self.dropout == nn.Dropout(p=dropout)

  def forward(self, x) :
    return self.w_2(self.dropout(self.w_1(x).relu()))

##Embeddings and Softmax

Similariy to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of d_model.

We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next token probalities. In our model, we share the same weight matrix btw the two embedding layers and the pre softmax linear transformation.

In [None]:
class Embeddings(nn.Module) :
  def __init__(self, d_model, vocab) :
    super(Embeddings,self).__init__()
    self.d_model = d_model
    self.lut = nn.Embeddings(vocab, d_model)

  def forward(self,x) :
    return self.lut(x) * math.sqrt(self.d_model)

Positional Encoding


Pe(pos, 2i) = sin(pos/10000^2i/d_model)
Pe(pos, 2i+1) = cos(pos/10000^2i/d_model)

In [None]:
class PositionalEncoding(nn.Module) :
  def __init__(self, d_model, dropout, max_len  = 10000) :
    super(PositionalEncoding,self).__init__()
    self.dropout = nn.Dropout(dropout)

    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_tern = torch.exp(
        torch.arange(0, d_model, 2)* - (math.log(10000) / d_model)
    )

    pe[:, 0::2] = torch.sin(position * div_tern)
    pe[:, 1::2] = torch.cos(position * div_term)

    pe = pe.unsqueeze(0)
    self.register_buffer('pe',pe)

  def forward(self, x) :
    x = x + self.pe[: , : x.size(1)].requres_grad_(False)
    return self.dropout(x)

# Full model

In [None]:
def make_model (src_vocab, tgt_vocab, N=6, d_model = 512, d_ff = 2048, h=8, dropout=0.1) :

  c =copy.deepcopy
  attn = MultiHeadedAttn(h, model) 
  ff = PositionwiseFF(d_model, d_ff, dropout)
  position = PositionalEncoding(d_model, dropout)

  model = EncoderDecoder(
      Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
      Decoder(DecoderLayer(d_model, c(attn), , c(attn), c(ff), dropout), N),
      
      nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
      nn.Sequential(Embeddings(d_model, tgt_vocab, c(position))),
      Generator(d_model, tgt_vocab)
  )

  for p in model.parameters() :
    if p.dim() >1 :
      nn.init.xavier_uniform(_p)
  return model

#Model Training

##Batches and Masking

In [1]:
class batch :

  def __init__(self, src, tgt=None, pad=2) :    # 2 == blank
    self.src = src
    self.src_mask = (src != pad).unsqueeze(-2)
    if tgt is not None :
      self.tgt = tgt[:, :-1]
      self.tgt_y = tgt[:, 1:]
      self.tgt_mask = self.make_std_mask(self.tgt, pad)
      self.ntokens = (self.tgt_y != pad).data.sum()

  @staticmethod
  def make_std_mask(tgt, pad) :
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)

    return tgt_mask

IndentationError: ignored

##Trainig Loop

In [None]:
class TrainState :
  '''Track # steps, examples, and tokens processed'''

  step : int = 0
  accum_step : int = 0
  sample : int = 0
  tokens : int = 0

In [None]:
def run_epoch(data_iter, model, loss_compute, optimizer, scheduler, mode='train', accum_iter=1, train_state = TrainState()) :
  '''Train a single epoch'''
  start = time.time()
  total_tokens = 0
  totla_loss = 0
  tokens = 0
  n_accum = 0

  for i,batch in enumerate(data_iter) :
    out = model.forward(batch.src, batch.tgt, batch.src_mask, batch_tgt_mask)
    loss, loss_node = loss_compute(out, bath.tgt_y, batch.ntokens)

    # loss_node = loss_node / accum_iter
    if mode == 'train' or mode == 'train+log' :
      loss_node.backward()
      train_state.step +=1
      train_state.smaples += batch.src.shape[0]
      train_state.tokens += bathc.ntokens
      if i % accum_iter == 0 :
        optimizer.step()
        optimizer.zero_grad(set_to_none = True)
        n_accum+=1
        train_state.accum_iter +=1
      
      scheduler.step()
    
    total_loss += loss
    total_tokens += bathc.ntokens
    tokens += bathc.ntokens

    if i % 40 == 1 and ( mode == 'train' or mode == 'train+log') :
      lr = optimizer.param_groups[0]['lr']
      elapsed = time.time - start

      print(
          'epoch step : %6d | accumulatino step : %3d | loss : %6.2f' + '| tokens  / sec : %7.1f |'

      )
      start = time.time()
      tokens = 0

    del loss
    del loss_node
  return total_loss / total_token, train_state

##Optmizer

we used the adam with b1 = 0.9, b2= 0.98, ϵ=1e-.

lrate = d_model^-0.5 * min(step_num^0.5, step_num.warmup_steps^-1.5)

also, used warmup_steps was 4000.

In [None]:
def rate(step, model_size, factor, warmup) :
  '''
  we have to default the step to 1 for lambdaLR fucntion to avoid zero raising to negative power.
  '''

  if step == 0 :
    step =1
  return factgor * (model_size ** (-0.5) * min(step **(-0.5), step * warmup ** (-1.5)))

  

In [None]:
def examples_learning_schedule() :
  opts = [ 
      [512,1,4000], #example1
      [512,1,8000], #example2
      [245,1,4000]  #example3
  ]

  dummy_model = torch.nn.Linear(1,1)
  learning_rates = []

  # we have 3examples in opts list.

  for idx, example in enumerate(opts) :
    optimizer = torch.optim.Adam(dummy_model.parameters(), lr=1, betas=(0.9,0.98), eps=1e-9)
    lr_scheduler = LambdaLR(optimizer= optimizer, lr_lambda = lambda step: rate(stpe, *example))

    temp = []

    for step in range(20000) :
      temp.append(optimizer.parma_groups[0]['lr'])
      optimizer.step()
      lr_scheduler.step()
    learning_rates.append(temp)

  learning_rates = torch.tensor(learning_rates)

  # Enable altair to handle more than 5000 rows
  alt.data_transformers.disable_max_rows()

  opts_data = pd.concat(
      [
          pd.DataFrame(
              {
              'learning rate' : learning_rates[warmup_idx, :],
              'model_size : warmup' : ["512:4000", "512:8000", "256:4000"][warmup_idx],
              'step' : range(20000)
         }
      )
          for warmpup_idx in [0,1,2]
    ]
  )

return (
    alt.Chart(opts_data)
    .mark_line()
    .properties(width=600)
    .encode(x=  'spte', y='learning rate', color = 'model_size : warmup:N')
    .interactive()
)

##Regularization


###Label Smoothing

we implement label smoothing using the KL div loss.

In [None]:
class LabelSmoothing(nn.Module) :
  def __init__(self, size, padding_idx, smoothing=0.1) :
    super(LabelSmoothing,self).__init__()
    self.criterion = nn.KLDivLoss(reduction='sum')
    self.padding_idx = padding_idx
    self.confidence = 1 - smoothing
    self.smoothing = smoothing
    self.size = size
    self.true_dist = None

  
  def forward(self, x, target) :
    assert x.size(1) == self.size
    true_dist = x.data.clone()
    true.dist_fill_(self.smoothing / (self.size - 2))
    true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
    true_dist[:, self.padding_idx] = 0
    mask = torch.nonzero(target.data == self.padding_idx)

    if mask_dim() > 0 :
      true_dist.index_fill(0, mask.squeeze(), 0.0)
    self.true_dist = true_dist
    return self.criterion(x, true_dist.clone().detach())

Label smoothing actually starts to penalize the model if it gets very confident about a given choice.

In [None]:
def loss(x,crit) :
  d = x +3 * 1
  predict = torch.FloatTensor([[0, x / d, 1 / d, 1 / d, 1 / d]])
  return crit(predict.log(), torch.LongTensor([1])).data

def penalization_visualization() :
  crit = LabelSmoothing(5,0,0.1)
  loss_data = pd.DataFrame({
      'loss' : [loss(x,crit) for x in range( 1,100)],
      'steps' : list(range(99))}
  ).astype('float')

  return(
      alt.Chart(loss_data).
      mark_line()
      .properties(width=350)
      .encode(
          x= ' steps',
          y=  'loss',

      )
      .interactive()
  )

show_examples(penalization_visualization)