<a href="https://colab.research.google.com/github/yinghao1019/NLP_and_DL_practice/blob/master/NMT_jointLearn(Prac).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Attention-Based Sequence-to-Sequence Model (German → English)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext
from torch.utils.data import RandomSampler

import spacy
import numpy as np

import os
import time
import tqdm
import random

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the German and English spaCy pipelines registered under the 'de' / 'en' names.
de_nlp = spacy.load('de')
en_nlp = spacy.load('en')

Build text *tokenizer* func

In [None]:
def en_tokenizer(text):
  """Return the list of token strings produced by the English spaCy tokenizer for `text`."""
  tokens = []
  for token in en_nlp.tokenizer(text):
    tokens.append(token.text)
  return tokens
def de_tokenizer(text):
  """Return the list of token strings produced by the German spaCy tokenizer for `text`."""
  tokens = []
  for token in de_nlp.tokenizer(text):
    tokens.append(token.text)
  return tokens

# **Sequential data 的處理步驟**


1.   進行tokenize
2.   建立各Language的vocabulary(涵蓋special token-<sos>,<eos>,<pad>,<unk>)
3.   將一個sequence加入sos,eos special token
4.   轉換成index


In [None]:
# Source (German) and target (English) fields: each one tokenizes its text with
# the matching tokenizer and is configured with <sos>/<eos> boundary tokens.
SRC = torchtext.data.Field(
    tokenize=de_tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
)
TRG = torchtext.data.Field(
    tokenize=en_tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
)

# Load the Multi30k train / validation / test splits as (.de, .en) sentence pairs.
train_data, val_data, test_data = torchtext.datasets.Multi30k.splits(
    ('.de', '.en'),
    (SRC, TRG),
)

# Report split sizes and show one tokenized source sentence.
print(f'Train exmaples num:{len(train_data)}')
print(f'Val exmaples num:{len(val_data)}')
print(f'test exmaples num:{len(test_data)}')
print(f'One example from Train data:{train_data[0].src}')

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 856kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 224kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 216kB/s]


Train exmaples num:29000
Val exmaples num:1014
test exmaples num:1000
One example from Train data:['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']


In [None]:
# Build each language's vocabulary from the training split only (min_freq=2)
# and request the GloVe 42B / 300-d vectors for both vocabularies.
# NOTE(review): GloVe 42B is presumably an English-only embedding set; confirm it
# is really intended for the German (SRC) vocabulary as well.
SRC.build_vocab(
    train_data,
    min_freq=2,
    vectors='glove.42B.300d',
)
TRG.build_vocab(
    train_data,
    min_freq=2,
    vectors='glove.42B.300d',
)

.vector_cache/glove.42B.300d.zip: 1.88GB [14:31, 2.15MB/s]                           
100%|█████████▉| 1916654/1917494 [03:50<00:00, 9086.25it/s]

In [None]:
# Vocabulary sizes (special tokens included).
print(f'SRC vocab size:{len(SRC.vocab)}')
print(f'TRG vocab size:{len(TRG.vocab)}')

# Re-register the 300-d vectors on each vocabulary.
# NOTE(review): this feeds every vocabulary's own stoi / vectors back into
# set_vectors, which looks like a no-op refresh - confirm it is still needed.
SRC.vocab.set_vectors(
    SRC.vocab.stoi,
    SRC.vocab.vectors,
    dim=300,
)
TRG.vocab.set_vectors(
    TRG.vocab.stoi,
    TRG.vocab.vectors,
    dim=300,
)

# Shape of the embedding matrix attached to each vocabulary.
print(f'SRC glove embedding size:{SRC.vocab.vectors.size()}')
print(f'TRG glove embedding size:{TRG.vocab.vectors.size()}')

SRC vocab size:8014
TRG vocab size:6191
SRC glove embedding size:torch.Size([8014, 300])
TRG glove embedding size:torch.Size([6191, 300])


In [None]:
# Batch size shared by the train / validation / test iterators.
Batch_size = 256

# One BucketIterator per split, yielding batches already placed on `device`.
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=Batch_size,
    device=device,
)

# **Encoder(Bidirectional)架構**
**模型架構**
*   輸入-每個時間點的token id
*   輸出-每個時間點的h_state,最後一個時間點的不同layer之h_state

**演算流程:**

對於每個 time step 的 embedding vector，RNN layer 會計算出當前 word 的 h_state；若為雙向 RNN，則輸出 forward 與 backward hidden state 的 concat。

In [None]:
#build Encoder Model
class BiEncoder(nn.Module):
  """Bidirectional multi-layer GRU encoder.

  Args:
    input_dim: Source vocabulary size (number of rows of the embedding table).
    hid_dim: GRU hidden size per direction; also the embedding width when no
      pretrained table is supplied.
    n_layers: Number of stacked GRU layers.
    dropout_rate: Dropout applied to the embeddings and passed to the GRU.
    pretrain_embed: Optional 2-D float tensor [vocab, emb_dim]. When given, the
      embedding layer is built with ``nn.Embedding.from_pretrained`` and the
      GRU input size follows ``emb_dim`` instead of ``hid_dim``.
  """
  def __init__(self,input_dim,hid_dim,n_layers,dropout_rate,pretrain_embed=None):
    super().__init__()
    self.input_dim=input_dim
    self.hid_dim=hid_dim
    self.n_layers=n_layers

    #use a freshly initialised embedding unless a pretrained table is supplied
    if pretrain_embed is None:
      self.embed=nn.Embedding(input_dim,hid_dim)
    else:
      self.embed=nn.Embedding.from_pretrained(pretrain_embed)
    #size the GRU from the embedding actually in use; a pretrained table
    #(e.g. 300-d GloVe) does not have to match hid_dim
    emb_dim=self.embed.embedding_dim
    self.rnn_layer=nn.GRU(emb_dim,hid_dim,n_layers,dropout=dropout_rate,bidirectional=True)
    self.linear_layer=nn.Linear(hid_dim*2,hid_dim)
    self.tanh=nn.Tanh()
    self.dropout=nn.Dropout(dropout_rate)
  def forward(self,input_tensors):
    """Encode a batch of source token ids.

    Args:
      input_tensors: Long tensor of token ids, shape [seq_len, bs].

    Returns:
      outputs: Per-step GRU outputs, shape [seq_len, bs, hid_dim*2].
      h_state: Summary of the last layer's final forward and backward states
        after a linear + tanh projection, shape [1, bs, hid_dim].
    """
    embed_input=self.dropout(self.embed(input_tensors))

    #outputs shape=[seq_len,bs,hid_dim*2]
    #h_state shape=[n_layers*2,bs,hid_dim]; the last two rows are the final
    #forward and backward states of the top layer
    outputs,h_state=self.rnn_layer(embed_input)
    last_layer_state=torch.cat((h_state[-2,:,:],h_state[-1,:,:]),dim=1)
    h_state=self.tanh(self.linear_layer(last_layer_state))
    return outputs,h_state.unsqueeze(0)

# **Attention match layer**
**模型架構**
*   輸入-decoder前一次的h_state,encoder的每個time step hidden output
*   輸出-每個time step的attention weight

**演算流程:**

對於每一個 time step，將前一次的 decoder h_state 與 encoder output 進行 concat 並做 non-linear transform (tanh)，
然後再與一個 trainable vector 進行 dot-product，得到基於前一次 h_state 的當前 time step attention score；最後將所有 time step 的 score 通過 softmax 輸出成 attention weight。


In [None]:
#build attention Model
class Attention(nn.Module):
  """Additive attention over bidirectional encoder outputs.

  Scores every source position with v(tanh(W[decoder_hidden; encoder_output]))
  and normalises the scores over the source sequence with a softmax.

  Args:
    hid_dim: Decoder hidden size; encoder outputs are expected to be
      2*hid_dim wide (forward and backward states concatenated).
  """
  def __init__(self,hid_dim):
    super().__init__()
    self.attn_layer=nn.Linear(2*hid_dim+hid_dim,hid_dim)
    self.v=nn.Linear(hid_dim,1,bias=False)
    self.softmax=nn.Softmax(dim=1)
    self.tanh=nn.Tanh()
  def forward(self,decoder_hidden,encoder_outputs,mask=None):
    """Compute attention weights for one decoding step.

    Args:
      decoder_hidden: Previous decoder state, shape [bs, hid_dim].
      encoder_outputs: Encoder outputs, shape [seq_len, bs, 2*hid_dim].
      mask: Optional tensor of shape [bs, seq_len]; zero/False entries mark
        positions (e.g. padding) that must receive no attention.

    Returns:
      Attention weights of shape [bs, seq_len]; each row sums to 1.
    """
    seqLen=encoder_outputs.size(0)

    #repeat the decoder state for every source position, shape=[bs,seqLen,hid_dim]
    decoder_hidden=decoder_hidden.unsqueeze(1).repeat(1,seqLen,1)

    #concat decoder_hidden & encoder_outputs
    #shape=[bs,seqLen,decoder_hid+encoder_hid]
    attn_hid=torch.cat((decoder_hidden,encoder_outputs.permute(1,0,2)),dim=2)
    #non-linear transform
    attn_hid=self.tanh(self.attn_layer(attn_hid))
    #unnormalised scores, shape=[bs,seqLen]
    scores=self.v(attn_hid).squeeze(2)
    if mask is not None:
      #most negative finite value (not -inf) so a fully masked row stays NaN-free
      scores=scores.masked_fill(~mask.bool(),torch.finfo(scores.dtype).min)

    return self.softmax(scores)

# **建立Decoder**

**Decoder架構**
*   輸入-前一個時間點預測的 word embedding、h_state、關注 encoder output 的 attention weight，以及 encoder outputs
*   輸出-當前時間點預測的word Dist. 以及hidden_state

 
 **演算過程**:
  將輸入word的Embedding與Context(將encoder output藉由attention weight來weight sum)進行concat然後輸入至rnn_layer中，隨後將rnn layer的hidden_state與input word embedding,context vector送進linear classifier

In [None]:
class Decoder(nn.Module):
  """Single-step GRU decoder that consumes an attention context vector.

  Args:
    input_dim: Target vocabulary size (number of rows of the embedding table).
    hid_dim: GRU hidden size; encoder outputs are expected to be 2*hid_dim wide.
    output_dim: Number of output classes (target vocabulary size).
    n_layers: Number of stacked GRU layers.
    dropout_rate: Dropout passed to the GRU.
    pretrain_embed: Optional 2-D float tensor [vocab, emb_dim]. When given, the
      embedding layer is built with ``nn.Embedding.from_pretrained`` and layer
      sizes follow ``emb_dim`` instead of ``hid_dim``.
  """
  def __init__(self,input_dim,hid_dim,output_dim,n_layers,dropout_rate,pretrain_embed=None):
    super().__init__()
    self.input_dim=input_dim
    self.hid_dim=hid_dim
    self.output_dim=output_dim
    self.n_layers=n_layers

    #use a freshly initialised embedding unless a pretrained table is supplied
    if pretrain_embed is None:
      self.embed=nn.Embedding(input_dim,hid_dim)
    else:
      self.embed=nn.Embedding.from_pretrained(pretrain_embed)
    #derive layer sizes from the embedding actually in use so a pretrained
    #table whose width differs from hid_dim still fits
    emb_dim=self.embed.embedding_dim
    #GRU input = [word embedding ; context vector(2*hid_dim)]
    self.rnn_layer=nn.GRU(emb_dim+hid_dim*2,hid_dim,n_layers,dropout=dropout_rate)
    #classifier input = [word embedding ; context vector ; GRU output]
    self.fc=nn.Linear(emb_dim+hid_dim*2+hid_dim,output_dim)
  def forward(self,input_tensors,hidden_state,attn_weight,encoder_outputs):
    """Run one decoding step.

    Args:
      input_tensors: Previous target token ids, shape [bs].
      hidden_state: Previous GRU state, shape [n_layers, bs, hid_dim].
      attn_weight: Attention weights, shape [bs, src_len].
      encoder_outputs: Batch-first encoder outputs, shape [bs, src_len, 2*hid_dim].

    Returns:
      outputs: Unnormalised scores over the vocabulary, shape [bs, output_dim].
      h_state: Updated GRU state, shape [n_layers, bs, hid_dim].
    """
    input_embed=self.embed(input_tensors)
    #attention-weighted sum of encoder outputs, shape=[bs,2*hid_dim]
    context_vector=torch.bmm(attn_weight.unsqueeze(dim=1),encoder_outputs).squeeze(1)
    #add the length-1 time axis expected by the GRU
    rnn_input=torch.cat((input_embed,context_vector),dim=1).unsqueeze(0)

    outputs,h_state=self.rnn_layer(rnn_input,hidden_state)
    #output shape=[bs,output_dim]
    outputs=self.fc(torch.cat((input_embed,context_vector,outputs[0]),dim=1))
    return outputs,h_state

# **Sequence to Sequence Model(Conditional Generation)**

模型架構


*   輸入-source sentences index.
*   輸出-target sentences index.

演算流程
 

1.   將 src sents 透過 encoder 進行 encoding，得到該 sent 每個 word 的 Repr. 以及最後一個 time step 的 hidden state
2.   將 encoder 輸出的最後 hidden state 視為最初要輸入至 decoder 的 hidden state，sos token id 則設置為第一個要輸入至 decoder 的 word
3.   以下流程會反覆執行 seqLen-1 次（seqLen 為含 `<sos>`/`<eos>` 的 target 長度）
    

*   使用attention layer計算前一次h_state與encoder output的attention
*   將得到的attention weight與encoder output進行weighted sum得到context vector
*   將 context vector、前一次 decoder 預測的詞彙之 word embedding & h_state 輸入至 decoder 中
*   將當前decoder輸出的h_state 與前一次的word embedding以及context vector輸入至linear classifier
*   將Linear classifier的output以及當前計算的h_state當作是下一個decoder要輸入的word,h_state




 

In [None]:
class AttnSeq2Seq(nn.Module):
  """Encoder / attention / decoder wrapper for sequence-to-sequence translation.

  Args:
    encoder: Module returning (encoder_outputs [src_len, bs, enc_dim],
      final_hidden [1, bs, hid_dim]) and exposing ``hid_dim``.
    decoder: Single-step decoder exposing ``hid_dim`` and ``output_dim``
      (and optionally ``n_layers``).
    attner: Attention module mapping (decoder_hidden [bs, hid_dim],
      encoder_outputs) to weights of shape [bs, src_len].
    device: Device on which the prediction buffer and start token are created.
  """
  def __init__(self,encoder,decoder,attner,device):
    super().__init__()
    self.encoder=encoder
    self.decoder=decoder
    self.attner=attner
    self.device=device

    assert self.encoder.hid_dim==self.decoder.hid_dim,'encoder and decoder must share hid_dim'
  def _init_decoder_hidden(self,final_hidden):
    """Tile the [1, bs, hid_dim] encoder summary so every decoder layer gets an initial state."""
    n_layers=getattr(self.decoder,'n_layers',1)
    if n_layers>1:
      return final_hidden.repeat(n_layers,1,1)
    return final_hidden
  def forward(self,src_tensors,trg_tensors,teach_ratio):
    """Decode the target sequence step by step with teacher forcing.

    Args:
      src_tensors: Source token ids, shape [src_len, bs].
      trg_tensors: Target token ids, shape [trg_len, bs]; row 0 is the start token.
      teach_ratio: Probability of feeding the ground-truth token (instead of the
        model's own argmax prediction) as the next decoder input.

    Returns:
      Scores of shape [trg_len, bs, vocab]; row 0 is left as zeros because no
      prediction is made for the start token.
    """
    seqLen=trg_tensors.size(0)
    Batch_size=trg_tensors.size(1)
    OutputVocab=self.decoder.output_dim
    preds=torch.zeros(seqLen,Batch_size,OutputVocab,device=self.device)
    #per-token encoder states and the summary used as first decoder state
    encoder_outputs,final_hidden=self.encoder(src_tensors)
    #batch-first view for the decoder; identical at every step, so build it once
    encoder_outputs_bf=encoder_outputs.permute(1,0,2)

    decoder_hidden=self._init_decoder_hidden(final_hidden)
    decoder_input=trg_tensors[0,:]
    for ti in range(1,seqLen):
      #attention weight from the top decoder layer, shape=[bs,src_len]
      attn_weight=self.attner(decoder_hidden[-1,:,:],encoder_outputs)
      #decoder_outputs shape=[bs,vocab]
      #decoder_hidden shape=[n_layers,bs,hid_dim]
      decoder_outputs,decoder_hidden=self.decoder(decoder_input,decoder_hidden,attn_weight,encoder_outputs_bf)
      preds[ti]=decoder_outputs

      #choose the next input: ground truth (teacher forcing) or own prediction
      teach_force=random.random()<teach_ratio
      top1=decoder_outputs.argmax(dim=1)
      decoder_input=trg_tensors[ti] if teach_force else top1

    #preds shape=[seq_len,bs,vocab_dim]
    return preds
  def getPredict(self,src_tensors,trg_initTokenId,end_tokenId,max_len=100):
    """Greedily translate a single source sentence.

    Args:
      src_tensors: Source token ids for one sentence, shape [src_len, 1].
      trg_initTokenId: Id of the target start token fed at the first step.
      end_tokenId: Id of the end token; decoding stops once it is produced.
      max_len: Upper bound on generated tokens, so decoding always terminates
        even when the model never emits ``end_tokenId``.

    Returns:
      List of predicted token ids (the end token is included when produced).
    """
    preds=[]

    #per-token encoder states and the summary used as first decoder state
    encoder_outputs,final_hidden=self.encoder(src_tensors)
    encoder_outputs_bf=encoder_outputs.permute(1,0,2)
    decoder_hidden=self._init_decoder_hidden(final_hidden)
    decoder_input=torch.tensor([trg_initTokenId],device=self.device)

    for _ in range(max_len):
      #compute attention weight
      attn_weight=self.attner(decoder_hidden[-1,:,:],encoder_outputs)

      decoder_outputs,decoder_hidden=self.decoder(decoder_input,decoder_hidden,attn_weight,encoder_outputs_bf)

      #greedy choice becomes the next decoder input
      decoder_input=decoder_outputs.argmax(dim=1)
      token_id=decoder_input.item()
      preds.append(token_id)
      if token_id==end_tokenId:
        break
    #return list of token id
    return preds

In [None]:
def init_weights(m):
  """Initialise the trainable parameters of module `m` in place.

  Parameters whose name contains 'weight' are drawn from a normal distribution
  with mean 0 and std 0.01; every other parameter (e.g. biases) is set to 0.
  Frozen parameters (``requires_grad=False``), such as an embedding table built
  with ``nn.Embedding.from_pretrained``, are left untouched so pretrained
  vectors are not overwritten with noise.

  Args:
    m: An ``nn.Module``; all of its parameters, including those of its
      sub-modules, are visited.
  """
  for named,params in m.named_parameters():
    if not params.requires_grad:
      #frozen (pretrained) values must survive initialisation
      continue
    if 'weight' in named:
      torch.nn.init.normal_(params.data,mean=0,std=0.01)
    else:
      torch.nn.init.constant_(params.data,0)


Define training Model process

In [None]:
# Constructor arguments for the encoder, decoder and attention modules.
encoder_config = {
    'input_dim': len(SRC.vocab),
    'hid_dim': 256,
    'n_layers': 2,
    'dropout_rate': 0.3,
    'pretrain_embed': None,
}
decoder_config = {
    'input_dim': len(TRG.vocab),
    'hid_dim': 256,
    'output_dim': len(TRG.vocab),
    'n_layers': 1,
    'dropout_rate': 0,
    'pretrain_embed': None,
}
attner_config = {'hid_dim': 256}
model_configs = {
    'encoder': encoder_config,
    'decoder': decoder_config,
    'attner': attner_config,
}

# Training hyper-parameters.
EPOCHS = 20
# NOTE(review): training() clips gradients with its own value of 1 and does not
# read GRAD_NORM - confirm which one is meant to be authoritative.
GRAD_NORM = 1
Learning_rate = 1e-3
per_epoch_evaluate = 5

# Checkpoint locations.
# NOTE(review): the file names say 'fra2eng' although the data is German-English;
# confirm before renaming, since existing checkpoints may rely on these names.
model_dir = './MTmodel'
model_path = 'fra2eng_model.pt'
config_path = 'fra2eng_training_config.bin'

# Assemble the seq2seq model, move it to the target device and initialise weights.
encoder_model = BiEncoder(**encoder_config)
decoder_model = Decoder(**decoder_config)
attn_model = Attention(**attner_config)
model = AttnSeq2Seq(encoder_model, decoder_model, attn_model, device)
model.to(device)
model.apply(init_weights)

# Adam optimizer plus a cross-entropy loss that ignores <pad> targets.
optimizer = torch.optim.Adam(model.parameters(), lr=Learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi['<pad>'])

In [None]:
#save Model and model config
def save_model(model_dir,model_path,config_path,model,optimizer,configs,epochs):
  """Write a training checkpoint and its configuration into `model_dir`.

  Args:
    model_dir: Target directory; created (including missing parents) if absent.
    model_path: File name of the checkpoint inside `model_dir`.
    config_path: File name of the saved configuration inside `model_dir`.
    model: Module whose ``state_dict()`` is stored under the key 'model'.
    optimizer: Optimizer whose ``state_dict()`` is stored under 'optimizer'.
    configs: Object saved as-is with ``torch.save`` to `config_path`.
    epochs: Value stored under the key 'epochs'.
  """
  if os.path.isdir(model_dir):
    print('Model dir already existed')
  else:
    print('Model dir is not existed')
    #makedirs instead of mkdir so nested paths work; exist_ok tolerates the
    #directory appearing between the check and the creation
    os.makedirs(model_dir,exist_ok=True)
  #save model state and config
  print(f'Save Model to dir:{model_dir}')
  model_state={'model':model.state_dict(),'optimizer':optimizer.state_dict(),'epochs':epochs}

  torch.save(model_state,os.path.join(model_dir,model_path))
  torch.save(configs,os.path.join(model_dir,config_path))
  print('Save Model success!')

#load Model
def load_model(model_dir,model_path,device):
  """Load an object saved with ``torch.save`` from `model_dir`/`model_path`.

  Args:
    model_dir: Directory that holds the saved file.
    model_path: File name inside `model_dir`.
    device: Passed to ``torch.load`` as ``map_location``.

  Returns:
    Whatever object was stored in the file.

  Raises:
    FileNotFoundError: If `model_dir` is not an existing directory.
    RuntimeError: If the file cannot be read; the underlying error is chained
      as ``__cause__``. Both are ``Exception`` subclasses, so callers that
      catch ``Exception`` keep working.
  """
  if not os.path.isdir(model_dir):
    raise FileNotFoundError('Model dir is not existed')
  load_path=os.path.join(model_dir,model_path)
  try:
    state_dict=torch.load(load_path,map_location=device)
  except Exception as err:
    #keep the original failure visible instead of swallowing it
    raise RuntimeError('Model file is loss...') from err
  print('Read model file is successed')
  return state_dict
  
#training Model stage
def training(model,train_iters,optimizer,criterion,teach_ratio,max_norm=1):
  """Run one training epoch and return the mean batch loss.

  Args:
    model: Seq2seq module called as ``model(src, trg, teach_ratio)`` and
      returning scores of shape [trg_len, bs, vocab].
    train_iters: Iterable of batches exposing ``.src`` and ``.trg`` tensors;
      must support ``len()``.
    optimizer: Optimizer updating ``model.parameters()``.
    criterion: Loss taking (scores [N, vocab], targets [N]).
    teach_ratio: Teacher-forcing probability forwarded to the model.
    max_norm: Gradient-norm clipping threshold.

  Returns:
    Sum of batch losses divided by the number of batches (0 when there are
    no batches).
  """
  #evaluation may have left the model in eval mode; re-enable dropout
  model.train()
  progress_bar=tqdm.tqdm(train_iters,desc='Iteration')
  epoch_loss=0
  for b in progress_bar:
    #fetch tensors
    src_tensors=b.src
    trg_tensors=b.trg
    #clear gradients first so nothing stale leaks into this step
    optimizer.zero_grad()
    preds=model(src_tensors,trg_tensors,teach_ratio)
    #row 0 belongs to the start token and carries no prediction, so skip it
    loss=criterion(preds[1:,:,:].reshape(-1,preds.size(2)),trg_tensors[1:,:].reshape(-1))
    loss.backward()

    #clipping gradient
    nn.utils.clip_grad_norm_(model.parameters(),max_norm=max_norm)
    #update params
    optimizer.step()
    epoch_loss+=loss.item()
  progress_bar.close()
  #guard against an empty iterator
  return epoch_loss/max(len(train_iters),1)
def get_evaluate(model,eval_data,de_field,en_field,device):
  """Translate one randomly chosen example from `eval_data` and print the result.

  Args:
    model: Seq2seq module exposing ``getPredict(src, sos_id, eos_id)``.
    eval_data: Indexable dataset whose items expose token lists ``.src`` / ``.trg``.
    de_field: Source field; its ``numericalize`` turns the token list into ids.
    en_field: Target field; its ``vocab.stoi`` / ``vocab.itos`` map tokens and ids.
    device: Device on which the source tensor is created.

  Returns:
    The predicted target tokens as a list of strings.
  """
  #random sampling example from eval_data
  idx=next(iter(RandomSampler(eval_data)))
  example=eval_data[idx]
  src_sents=['<sos>']+example.src+['<eos>']
  trg_sents=example.trg
  #convert tensors shape=[seqLen,batch]
  src_tensors=de_field.numericalize([src_sents],device=device)
  print(f'Origin eval sents:{src_sents}')
  print(f'Origin eval sents index:{src_tensors}')

  #translate in eval mode, then restore the previous mode so a following
  #training epoch does not silently run with dropout disabled
  was_training=model.training
  model.eval()
  try:
    with torch.no_grad():
      pred_index=model.getPredict(src_tensors,en_field.vocab.stoi['<sos>'],en_field.vocab.stoi['<eos>'])
  finally:
    model.train(was_training)
  pred_sent=[en_field.vocab.itos[token_id] for token_id in pred_index]

  print(f'Original target sents:{trg_sents}')
  print(f'Translated target sents:{pred_sent}')
  return pred_sent

可修改的不同方法
1.init Embedding：GloVe、fastText
2.Gated Recurrent Unit
 Layer-2,3
 hid_dim-256
3.輸出到Decoder的context vector 的策略
  3.1.將第一層,最後一層layer的forward,backward h_state進行concat,然後non-linear transform (tanh)
  3.2採用forward or backward的每層layer h_state進行concat transform or pooling(Max,average)
  3.3只採用backward or forward 的第一、最後一層layer h_state