<a href="https://colab.research.google.com/github/yinghao1019/NLP_and_DL_practice/blob/master/Packed_PAD(Prac).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
from torch import nn,optim
import torchtext
from torchtext import datasets
from torchtext.data import Field,BucketIterator

import numpy as np
import spacy
import matplotlib.pyplot as plt

import random
import tqdm
import math
import os
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [None]:
#Load the spaCy pipelines whose tokenizers are used to split raw sentences into tokens.
#NOTE(review): the import cell only downloads the German ('de') model; the English ('en')
#model is presumably pre-installed in the runtime - confirm before running on a fresh machine.
en_nlp=spacy.load('en')
de_nlp=spacy.load('de')
def tokenize_en(text):
  """Split an English string into a list of token strings with the `en_nlp` tokenizer."""
  spacy_tokens=en_nlp.tokenizer(text)
  return [token.text for token in spacy_tokens]
def tokenize_de(text):
  """Split a German string into a list of token strings with the `de_nlp` tokenizer."""
  spacy_tokens=de_nlp.tokenizer(text)
  return [token.text for token in spacy_tokens]
#Build the source (German) and target (English) sentence fields.
#Both wrap every sentence in <sos>/<eos>; the source field also sets include_lengths=True,
#so a batch exposes `src` as a (tensor,lengths) pair - this is what the training loop unpacks.
SRC=Field(init_token='<sos>',eos_token='<eos>',tokenize=tokenize_de,include_lengths=True)
TRG=Field(init_token='<sos>',eos_token='<eos>',tokenize=tokenize_en)
#Load the Multi30k splits; exts orders the files as (source,target) to match fields=(SRC,TRG).
train_data,val_data,test_data=datasets.Multi30k.splits(exts=('.de','.en'),fields=(SRC,TRG))
#Build each vocabulary from the training split only.
#min_freq=2: tokens seen fewer than twice are left out of the vocabulary.
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.04MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 273kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 266kB/s]


In [None]:
#Number of examples per batch for the train/validation/test iterators built below.
BATCH_SIZE=64
#Use the first CUDA device when one is available, otherwise fall back to the CPU.
device=torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
#sort_key + sort_within_batch=True order the examples inside every batch by source length.
#The Encoder in this notebook packs its input with pack_padded_sequence, which by default
#expects the lengths in decreasing order, so this sorting is required rather than cosmetic.
train_iter=BucketIterator(train_data,device=device,
                          batch_size=BATCH_SIZE,sort_key=lambda x:len(x.src),
                          sort_within_batch=True,shuffle=True)
#Validation and test iterators share the same batching configuration (no explicit shuffle).
val_iter,test_iter=BucketIterator.splits((val_data,test_data),
                                          device=device,
                                          batch_size=BATCH_SIZE,
                                          sort_key=lambda x:len(x.src),
                                          sort_within_batch=True)

class Encoder(nn.Module):
  """Bidirectional GRU encoder that reads padded batches through packed sequences.

  Args:
    input_dim: size of the source vocabulary (rows of the embedding table).
    embed_dim: embedding width.
    encoder_hid_dim: hidden size of each GRU direction.
    decoder_hid_dim: hidden size expected by the decoder; the final forward and
      backward encoder states are projected to this width.
    n_layers: number of stacked GRU layers.
    drop_rate: dropout probability applied to the embedded input.
  """
  def __init__(self,input_dim,embed_dim,encoder_hid_dim,decoder_hid_dim,n_layers,drop_rate):
    super(Encoder,self).__init__()
    self.decode_hid_dim=decoder_hid_dim
    self.input_dim=input_dim
    self.encode_hid_dim=encoder_hid_dim
    self.n_layers=n_layers

    self.embed=nn.Embedding(input_dim,embed_dim)
    self.rnn_layer=nn.GRU(embed_dim,encoder_hid_dim,n_layers,bidirectional=True)
    self.fc_layer=nn.Linear(encoder_hid_dim*2,decoder_hid_dim)

    self.dropout_layer=nn.Dropout(drop_rate)
    self.tanh=nn.Tanh()
  def forward(self,input_tensors,input_lens):
    """Encode a batch of padded source sentences.

    Args:
      input_tensors: token ids, shape [seqlen,bs].
      input_lens: 1-D tensor with the true length of every sentence, shape [bs],
        in decreasing order (pack_padded_sequence default enforce_sorted=True).

    Returns:
      padded_seq: encoder outputs, shape [seqlen,bs,encoder_hid_dim*2]; padded steps are zero.
      hidden: initial decoder hidden state, shape [bs,decoder_hid_dim].
    """
    #input_embed=[seqlen,bs,embed_dim]; dropout was declared but never applied before
    input_embed=self.dropout_layer(self.embed(input_tensors))
    #Pack so the GRU skips padded steps. Lengths must live on the CPU; the packed data
    #stays on the device of the embeddings, so no hard-coded .cuda() is needed
    #(the previous .cuda() call crashed on CPU-only runtimes).
    packed_embed=torch.nn.utils.rnn.pack_padded_sequence(input_embed,input_lens.cpu())
    packed_output,hidden=self.rnn_layer(packed_embed)
    #Unpack back to a padded tensor; total_length keeps the time dimension equal to the
    #input's even if the longest sentence is shorter than the padded width.
    padded_seq,_=torch.nn.utils.rnn.pad_packed_sequence(packed_output,total_length=input_tensors.shape[0])
    #hidden[-2]/hidden[-1] are the last layer's forward/backward final states;
    #concatenate and project them into the decoder's hidden size.
    hidden=self.tanh(self.fc_layer(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)))

    return padded_seq,hidden

In [None]:
class Attention(nn.Module):
  """Additive attention that scores every encoder step against the decoder state.

  Args:
    encoder_hid_dim: hidden size of one encoder direction (outputs are twice this wide).
    decoder_hid_dim: hidden size of the decoder state.
  """
  def __init__(self,encoder_hid_dim,decoder_hid_dim):
    super(Attention,self).__init__()
    self.fc_layer=nn.Linear(encoder_hid_dim*2+decoder_hid_dim,decoder_hid_dim)
    self.attn=nn.Linear(decoder_hid_dim,1,bias=False)

    self.softmax=nn.Softmax(dim=1)
    self.tanh=nn.Tanh()
  def forward(self,hidden,encoder_outputs,mask):
    """Return attention weights over the source positions.

    Args:
      hidden: decoder state, shape [bs,decoder_hid_dim].
      encoder_outputs: shape [bs,seqlen,encoder_hid_dim*2].
      mask: shape [bs,seqlen]; entries equal to 0 mark padded positions.

    Returns:
      Weights of shape [bs,seqlen,1] that sum to 1 over seqlen.
    """
    src_len=encoder_outputs.shape[1]
    #broadcast the decoder state along the source axis -> [bs,seqlen,decoder_hid_dim]
    tiled_hidden=hidden.unsqueeze(1).expand(-1,src_len,-1)
    #energy=[bs,seqlen,decoder_hid_dim]
    joined=torch.cat((encoder_outputs,tiled_hidden),dim=2)
    energy=self.tanh(self.fc_layer(joined))

    #raw scores before softmax, [bs,seqlen]
    scores=self.attn(energy).squeeze(2)
    #push padded positions to a very negative score (-1e10) so softmax gives them ~0 weight
    scores=scores.masked_fill(mask==0,-1e10)

    weights=self.softmax(scores)
    return weights.unsqueeze(2)

In [None]:
class Decoder(nn.Module):
  """Single-step GRU decoder with attention over the encoder outputs.

  Args:
    embed_dim: target embedding width.
    encoder_hid_dim: hidden size of one encoder direction (outputs are twice this wide).
    decoder_hid_dim: decoder GRU hidden size.
    output_dim: size of the target vocabulary.
    attention: module called as attention(hidden,encoder_outputs,mask) -> [bs,seqlen,1].
    dropout_rate: dropout probability applied to the embedded input token.
  """
  def __init__(self,embed_dim,encoder_hid_dim,decoder_hid_dim,output_dim,attention,dropout_rate):
    super(Decoder,self).__init__()
    self.output_dim=output_dim
    self.encoder_hid_dim=encoder_hid_dim
    self.decoder_hid_dim=decoder_hid_dim
    self.embed_dim=embed_dim
    self.attn=attention

    #build layer
    self.embed=nn.Embedding(output_dim,embed_dim)
    self.rnn_layer=nn.GRU((encoder_hid_dim*2+embed_dim),decoder_hid_dim)
    self.fc_layer=nn.Linear((encoder_hid_dim*2+decoder_hid_dim+embed_dim),output_dim)

    self.dropout=nn.Dropout(dropout_rate)
  def forward(self,input_word,hidden_state,encoder_outputs,mask):
    """Decode one target step.

    Args:
      input_word: previous target token ids, shape [bs].
      hidden_state: previous decoder state, shape [bs,decoder_hid_dim].
      encoder_outputs: shape [bs,seqlen,encoder_hid_dim*2].
      mask: shape [bs,seqlen]; 0 marks padded source positions.

    Returns:
      pred_logitics: unnormalised vocabulary scores, shape [bs,output_dim].
      hidden_state: new decoder state, shape [bs,decoder_hid_dim].
      attention weights for this step, shape [bs,seqlen].
    """
    #input_embed=[1,bs,embed_dim]; dropout was declared but never applied before
    input_embed=self.dropout(self.embed(input_word)).unsqueeze(0)

    #attention weights for the current step, attn_w=[bs,seqlen,1]
    attn_w=self.attn(hidden_state,encoder_outputs,mask)
    #weighted sum of encoder outputs: [bs,1,seqlen] x [bs,seqlen,enc*2] -> [bs,1,enc*2]
    context_vector=torch.bmm(attn_w.permute(0,2,1),encoder_outputs)
    #time-major layout expected by the GRU: [1,bs,enc*2]
    context_vector=context_vector.permute(1,0,2)

    decoder_output,hidden_state=self.rnn_layer(torch.cat((context_vector,input_embed),dim=2),hidden_state.unsqueeze(0))

    #sanity check: for a one-step, one-layer, unidirectional GRU the output equals the new state
    assert (decoder_output==hidden_state).all()

    context_vector=context_vector.squeeze(0)
    input_embed=input_embed.squeeze(0)
    hidden_state=hidden_state.squeeze(0)

    #pred logitics=[bs,output_dim]
    pred_logitics=self.fc_layer(torch.cat((input_embed,hidden_state,context_vector),dim=1))

    return pred_logitics,hidden_state,attn_w.squeeze(2)

In [None]:
class seq2seq(nn.Module):
  """Encoder/attention/decoder wrapper that decodes a whole target sequence step by step.

  Args:
    input_dim: source vocabulary size.
    embed_dim: embedding width shared by encoder and decoder.
    encoder_hid_dim: hidden size of one encoder direction.
    decoder_hid_dim: decoder hidden size.
    output_dim: target vocabulary size.
    n_layers: number of encoder GRU layers.
    src_pad_idx: index of the padding token in the source vocabulary.
    dropout_rate: dropout probability handed to encoder and decoder.
    device: device on which the prediction/attention buffers are allocated.
  """
  def __init__(self,input_dim,embed_dim,encoder_hid_dim,
               decoder_hid_dim,output_dim,n_layers,
               src_pad_idx,dropout_rate,device):
    super(seq2seq,self).__init__()

    #sub modules: the attention module is shared with the decoder
    self.encoder=Encoder(input_dim,embed_dim,encoder_hid_dim,
                         decoder_hid_dim,n_layers,dropout_rate)
    self.attner=Attention(encoder_hid_dim,decoder_hid_dim)
    self.decoder=Decoder(embed_dim,encoder_hid_dim,decoder_hid_dim,
                         output_dim,self.attner,dropout_rate)

    self.src_pad_idx=src_pad_idx
    self.softmax=nn.Softmax(dim=1)
    self.device=device
  def create_mask(self,src_tensor):
    """Return a [bs,seqlen] boolean mask that is False on source padding positions."""
    not_pad=src_tensor!=self.src_pad_idx
    return not_pad.permute(1,0)
  def forward(self,src_tensor,src_len,trg_tensor,teaching_forcing_ratio=0.5):
    """Run the encoder once and the decoder for every target position after <sos>.

    Args:
      src_tensor: source ids, shape [src_seqlen,bs].
      src_len: true source lengths, shape [bs].
      trg_tensor: target ids, shape [trg_seqlen,bs]; row 0 is fed as the first decoder input.
      teaching_forcing_ratio: probability of feeding the gold token instead of the
        model's own prediction at each step.

    Returns:
      trg_preds: logits, shape [trg_seqlen,bs,output_dim]; row 0 stays all zero.
      attentions: attention weights, shape [trg_seqlen,bs,src_seqlen]; row 0 stays all zero.
    """
    trg_steps,batch_size=trg_tensor.shape[0],trg_tensor.shape[1]
    src_steps=src_tensor.shape[0]
    vocab_size=self.decoder.output_dim

    #buffers for per-step logits and attention weights
    trg_preds=torch.zeros(trg_steps,batch_size,vocab_size,device=self.device)
    attentions=torch.zeros(trg_steps,batch_size,src_steps,device=self.device)

    #encoder stage
    encoder_outputs,decoder_hidden=self.encoder(src_tensor,src_len)
    encoder_mask=self.create_mask(src_tensor)
    #batch-major layout for the attention: [bs,src_seqlen,encoder_hid_dim*2]
    encoder_outputs=encoder_outputs.permute(1,0,2)

    #decoder stage, starting from the first target row (<sos>)
    next_input=trg_tensor[0,:]
    for step in range(1,trg_steps):
      step_logits,decoder_hidden,step_attn=self.decoder(next_input,decoder_hidden,encoder_outputs,encoder_mask)
      trg_preds[step]=step_logits
      attentions[step]=step_attn

      use_gold=random.random()<teaching_forcing_ratio
      if use_gold:
        next_input=trg_tensor[step,:]
      else:
        #most probable word index becomes the next input
        next_input=torch.argmax(self.softmax(step_logits),dim=1)
    return trg_preds,attentions

In [None]:
def getBatchCorrect(predict_tensor,true_tensor,true_len):
  """Count the rows whose prediction matches the reference exactly.

  Row i is compared on positions 1 .. true_len[i]-1 (position 0 is skipped), so both
  tensors are indexed batch-first.

  Returns:
    The number of rows that match as an int.
  """
  assert predict_tensor.shape[0]==true_tensor.shape[0]==len(true_len)

  matches=0
  for row in range(predict_tensor.shape[0]):
    stop=true_len[row]
    if torch.equal(predict_tensor[row,1:stop],true_tensor[row,1:stop]):
      matches+=1

  return matches
def saveModel(model_dir,model_path,model,optim,model_info,ep):
  """Write a training checkpoint to model_dir/model_path.

  Args:
    model_dir: directory that receives the checkpoint; created (including any missing
      parent directories) when it does not exist yet.
    model_path: file name of the checkpoint inside model_dir.
    model: module whose state_dict is stored under the 'model' key.
    optim: optimizer whose state_dict is stored under the 'optimizer' key.
    model_info: constructor arguments / metadata stored under 'model_info'.
    ep: epoch index stored under 'EPOCHS'.
  """
  if os.path.isdir(model_dir):
    print('Model dir already exists')
  else:
    print('Model dir not exists')
    #makedirs handles nested paths and tolerates the directory appearing concurrently,
    #both of which made the previous os.mkdir call raise
    os.makedirs(model_dir,exist_ok=True)
    print('Model dir already build!')

  save_path=os.path.join(model_dir,model_path)
  torch.save({
      'model':model.state_dict(),
      'optimizer':optim.state_dict(),
      'model_info':model_info,
      'EPOCHS':ep,
  },save_path)
  print(f'Already save model to {save_path}')
def init_weight(m):
  """Re-initialise every parameter reachable from module `m`.

  Parameters whose name contains 'weight' are drawn from N(0,0.01^2); all others
  (e.g. biases) are set to 0.
  """
  for param_name,param in m.named_parameters():
    is_weight='weight' in param_name
    if not is_weight:
      torch.nn.init.constant_(param.data,0)
      continue
    torch.nn.init.normal_(param.data,mean=0,std=0.01)
def count_parameters(model):
  """Return the total number of scalar entries across the model's trainable parameters."""
  total=0
  for param in model.parameters():
    if param.requires_grad:
      total+=param.numel()
  return total

In [None]:
def train(model,train_loader,optimizer,loss_fn,clip):
  """Run one optimisation pass over train_loader and return the mean batch loss.

  Each batch must expose `src` as a (tensor,lengths) pair and `trg` as a tensor.
  The first target row is dropped before the loss is computed. Gradients are clipped
  to a total norm of `clip` before every optimizer step and cleared after it.
  """
  running_loss=0
  for batch in train_loader:
      src_tensors,src_lens=batch.src
      targets=batch.trg
      #logits=[seqlen,bs,vocab_size]
      logits,_=model(src_tensors,src_lens,targets)
      #flatten everything after the first row for the loss
      flat_logits=logits[1:].view(-1,model.decoder.output_dim)
      flat_targets=targets[1:].view(-1)

      loss=loss_fn(flat_logits,flat_targets)
      loss.backward()
      running_loss+=loss.item()

      #clip the gradient norm
      torch.nn.utils.clip_grad_norm_(model.parameters(),clip)

      optimizer.step()
      optimizer.zero_grad()

  return running_loss/len(train_loader)

In [None]:
def evaluateModel(model,val_loader,loss_fn):
  """Return the mean batch loss of `model` over val_loader as a Python float.

  The model is switched to eval mode (and left there), gradients are disabled, and
  decoding runs without teacher forcing. Each batch must expose `src` as a
  (tensor,lengths) pair and `trg` as a tensor; the first target row is skipped when
  computing the loss.
  """
  total_loss=0.0
  model.eval()
  with torch.no_grad():
    for batch in val_loader:
      src_tensors,src_len=batch.src
      trg_tensors=batch.trg

      #teaching_forcing_ratio=0: the decoder always consumes its own predictions
      model_output,_=model(src_tensors,src_len,trg_tensors,teaching_forcing_ratio=0)

      #compute batch loss
      loss=loss_fn(model_output[1:].view(-1,model.decoder.output_dim),trg_tensors[1:].view(-1))
      #accumulate a plain float (as train() does) instead of a tensor on the device
      total_loss+=loss.item()

  return total_loss/len(val_loader)


In [None]:
def train_pipeline(model,train_loader,val_loader,optimizer,loss_fn,epochs,clip,model_dir,model_path,model_structer):
  """Train for `epochs` epochs, reporting losses and checkpointing after every epoch.

  Every epoch: switch to train mode, run train(), run evaluateModel(), print both losses
  with the zero-based epoch index, then overwrite the checkpoint via saveModel().
  """
  for epoch_idx in range(epochs):
    #evaluateModel leaves the model in eval mode, so re-enable train mode each epoch
    model.train()
    train_loss=train(model,train_loader,optimizer,loss_fn,clip)
    val_loss=evaluateModel(model,val_loader,loss_fn)

    report='[{}/{}] train loss:{} val loss:{}'.format(epoch_idx,epochs,train_loss,val_loss)
    print(report)
    saveModel(model_dir,model_path,model,optimizer,model_structer,epoch_idx)

In [None]:
#Hyper-parameters and paths for the training run.
#NOTE(review): BATCH_size is not referenced anywhere else in the visible notebook; the
#iterators were built earlier with BATCH_SIZE=64, so this value has no effect.
BATCH_size=128
#vocabulary sizes determine the embedding / output layer dimensions
input_dim=len(SRC.vocab)
output_dim=len(TRG.vocab)
embed_dim=256
encoder_hid_dim=512
decoder_hid_dim=512
dropout_rate=0.5
epochs=10
n_layers=1
#maximum gradient norm passed to clip_grad_norm_ inside train()
clip=1
#padding indices: the source one builds the attention mask, the target one is ignored by the loss
src_pad_idx=SRC.vocab.stoi[SRC.pad_token]
trg_pad_idx=TRG.vocab.stoi[TRG.pad_token]
#checkpoint location used by saveModel (model_dir/model_path)
model_dir='./Model'
model_path='seq2seq.pt'

#constructor arguments of the model; also stored in every checkpoint as 'model_info'
model_dict={'input_dim':input_dim,'embed_dim':embed_dim,'encoder_hid_dim':encoder_hid_dim,'decoder_hid_dim':decoder_hid_dim,
            'output_dim':output_dim,'n_layers':n_layers,'src_pad_idx':src_pad_idx,'dropout_rate':dropout_rate,'device':device}
model=seq2seq(**model_dict)
#initialise the weights; apply() calls init_weight on every sub-module and on the model itself
model.apply(init_weight)

model.to(device)
#count the model's trainable parameters
print('Model total parameters num:{}'.format(count_parameters(model)))
#build optimizer & loss; target padding positions do not contribute to the loss
optimizer=optim.Adam(model.parameters(),lr=0.001)
criterion=nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
#run training; a checkpoint is written after every epoch
train_pipeline(model,train_iter,val_iter,optimizer,criterion,epochs,clip,model_dir,model_path,model_dict)

Model total parameters num:21170223
[0/10] train loss:4.824056623790757 val loss:4.62297248840332
Model dir not exists
Model dir already build!
Already save model to ./Model/seq2seq.pt
[1/10] train loss:3.4048942674099085 val loss:3.648510217666626
Model dir already exists
Already save model to ./Model/seq2seq.pt
[2/10] train loss:2.6091284371157575 val loss:3.2830631732940674
Model dir already exists
Already save model to ./Model/seq2seq.pt
[3/10] train loss:2.1177981443342135 val loss:3.1920197010040283
Model dir already exists
Already save model to ./Model/seq2seq.pt
[4/10] train loss:1.7799209717349334 val loss:3.2499167919158936
Model dir already exists
Already save model to ./Model/seq2seq.pt
[5/10] train loss:1.4866672822557356 val loss:3.337244749069214
Model dir already exists
Already save model to ./Model/seq2seq.pt
[6/10] train loss:1.2822412714559077 val loss:3.4594545364379883
Model dir already exists
Already save model to ./Model/seq2seq.pt
[7/10] train loss:1.09975715999

In [None]:
def ModelTranslation(model,translated_sents,SRC_Field,TRG_Field,device,max_len=50):
  """Greedily translate one source sentence.

  Args:
    model: seq2seq-style model exposing encoder, create_mask, decoder and eval().
    translated_sents: the source sentence, either a raw string (tokenized with
      tokenize_de) or an already tokenized list of strings.
    SRC_Field: source field; provides init/eos tokens and numericalize.
    TRG_Field: target field; provides init/eos tokens and the vocabulary.
    device: device on which the input tensors are created.
    max_len: maximum number of tokens to generate. Decoding stops earlier when <eos>
      is produced; the limit prevents an endless loop when it never is.

  Returns:
    predict_tokens: generated target tokens without the leading <sos>; ends with
      <eos> unless max_len was reached first.
    attn_list: attention weights, shape [generated_len,src_len], or None when
      max_len < 1.

  Note:
    The model is switched to eval mode and left in it.
  """
  #inference must not use dropout
  model.eval()

  #source sentence preprocessing
  if isinstance(translated_sents,str):
    src_tokens=tokenize_de(translated_sents)
  else:
    src_tokens=list(translated_sents)
  #Only lowercase when the field itself lowercases its training data; lowercasing
  #unconditionally turns cased vocabulary entries into unknown tokens.
  if getattr(SRC_Field,'lower',False):
    src_tokens=[t.lower() for t in src_tokens]

  src_tokens=[SRC_Field.init_token]+src_tokens+[SRC_Field.eos_token]
  src_seqlen=len(src_tokens)

  #src_tensor=[src_len,1], srcLen_tensor=[1]
  src_tensor,srcLen_tensor=SRC_Field.numericalize(([src_tokens],[src_seqlen]),device=device)

  eos_idx=TRG_Field.vocab.stoi[TRG_Field.eos_token]
  token_indexes=[TRG_Field.vocab.stoi[TRG_Field.init_token]]
  attn_steps=[]

  with torch.no_grad():
    #encode once, then decode token by token
    encoder_outputs,decoder_hidden=model.encoder(src_tensor,srcLen_tensor)
    src_masking=model.create_mask(src_tensor)
    encoder_outputs=encoder_outputs.permute(1,0,2)

    for _ in range(max_len):
      decoder_input=torch.tensor([token_indexes[-1]],dtype=torch.long,device=device)

      #logits=[1,output_dim], decoder_attn=[1,src_len]
      logits,decoder_hidden,decoder_attn=model.decoder(decoder_input,decoder_hidden,encoder_outputs,src_masking)
      attn_steps.append(decoder_attn)

      #greedy choice; softmax is monotonic, so argmax over the logits is equivalent
      next_idx=torch.argmax(logits,dim=1).item()
      token_indexes.append(next_idx)

      if next_idx==eos_idx:
        break

  attn_list=torch.cat(attn_steps,0) if attn_steps else None
  #map ids back to tokens with the field that was passed in (not the global TRG)
  predict_tokens=[TRG_Field.vocab.itos[t_id] for t_id in token_indexes]
  return predict_tokens[1:],attn_list

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
#define plot translation lang attn weight
def display_attnetion(sentences,translated,attention):
  """Plot the attention weights of one translation as a heat map.

  Args:
    sentences: list of source tokens (without <sos>/<eos>, which are added here).
    translated: list of generated target tokens, one per attention row.
    attention: weights of shape [len(translated),len(sentences)+2]; a torch tensor
      (on any device) or anything matplotlib's matshow accepts.
  """
  #matplotlib cannot consume autograd/CUDA tensors, so move them to host memory first
  if hasattr(attention,'detach'):
    attention=attention.detach().cpu().numpy()

  x_labels=['<sos>']+[t.lower() for t in sentences]+['<eos>']
  y_labels=list(translated)

  fig=plt.figure(figsize=(10,10))
  ax=fig.add_subplot(1,1,1)

  ax.matshow(attention,cmap='bone')
  ax.tick_params(labelsize=15)

  #Fix the tick positions first (one per cell), then attach the labels; setting labels
  #without fixed positions misaligns them on recent matplotlib versions.
  ax.set_xticks(range(len(x_labels)))
  ax.set_yticks(range(len(y_labels)))
  #45 degrees keeps long source tokens from overlapping (0.45 was a typo)
  ax.set_xticklabels(x_labels,rotation=45)
  ax.set_yticklabels(y_labels)

  plt.show()
  plt.close(fig)

In [None]:
from torchtext.data.metrics import bleu_score
def compute_bleu(model,data,SRC_Field,TRG_Field,device):
  """Translate every example in `data` and return the corpus BLEU score.

  Args:
    model: model handed to ModelTranslation.
    data: iterable of examples whose attributes include 'src' and 'trg' token lists.
    SRC_Field, TRG_Field, device: forwarded to ModelTranslation.

  NOTE(review): the recorded output of this cell shows a ModuleNotFoundError, presumably
  raised by the `torchtext.data.metrics` import above; bleu_score needs a torchtext
  version that ships that module - confirm.
  """
  trans_corpus=[]
  ref_corpus=[]
  eos_token=TRG_Field.eos_token

  #a distinct loop name avoids shadowing the `data` argument
  for example in data:
    fields=vars(example)
    src_tokens=fields['src']
    trg_tokens=fields['trg']

    predict_tokens,_=ModelTranslation(model,src_tokens,SRC_Field,TRG_Field,device)

    #drop the trailing <eos> only when it is really there, so no real word is lost
    if predict_tokens and predict_tokens[-1]==eos_token:
      predict_tokens=predict_tokens[:-1]

    trans_corpus.append(predict_tokens)
    #bleu_score expects a list of references per candidate
    ref_corpus.append([trg_tokens])

  return bleu_score(trans_corpus,ref_corpus)

ModuleNotFoundError: ignored

In [None]:
#Translate one test example and report the corpus BLEU score on the test split.
examples_id=3
src_example=vars(test_data.examples[examples_id])['src']
trg_example=vars(test_data.examples[examples_id])['trg']

#translate the example (the call previously misspelled the function name as
#ModelTranlation, which raised NameError)
trans_tokens,predict_attn=ModelTranslation(model,src_example,SRC,TRG,device)
test_bleu=compute_bleu(model,test_data,SRC,TRG,device)
print(f'Source sentences:{src_example}')
print(f'translation Target sentences:{trg_example}')
print(f'Machine translation:{trans_tokens}')
print(f'Model belu score:{test_bleu}')