Attention Based Sequence to Sequence Model.(German-english)

In [4]:
import os
os.chdir('./gdrive/MyDrive/nl_prac')
os.getcwd()

'/content/gdrive/MyDrive/nl_prac'

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext
from torch.utils.data import RandomSampler

import spacy
import numpy as np
from nlp_utils import Model_pipeline,set_rnd_seed,init_model_weights,count_parameters

import os
import time
import tqdm
import random

In [20]:
set_rnd_seed(1234)


#set device
device=torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
spacy.require_gpu()
#load German & english spacy tokenizer
de_nlp=spacy.load('de_core_news_sm')
en_nlp=spacy.load('en_core_web_sm')

Build text *tokenizer* func

In [13]:
def en_tokenizer(text):
  return [t.text for t in en_nlp.tokenizer(text)]
def de_tokenizer(text):
  return [t.text for t in de_nlp.tokenizer(text)]

# **sequential data的處理步驟**


1.   進行tokenize
2.   建立各Language的vocabulary(涵蓋special token-<sos>,<eos>,<pad>,<unk>)
3.   將一個sequence加入sos,eos special token
4.   轉換成index


In [14]:
#build source sents & target field
SRC=torchtext.data.Field(init_token='<sos>',eos_token='<eos>',tokenize=de_tokenizer)
TRG=torchtext.data.Field(init_token='<sos>',eos_token='<eos>',tokenize=en_tokenizer)
#load data
train_data,val_data,test_data=torchtext.datasets.Multi30k.splits(('.de','.en'),(SRC,TRG))
print(f'Train exmaples num:{len(train_data)}')
print(f'Val exmaples num:{len(val_data)}')
print(f'test exmaples num:{len(test_data)}')
print(f'One example from Train data:{train_data[0].src}')



downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:03<00:00, 322kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 90.9kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 77.5kB/s]


Train exmaples num:29000
Val exmaples num:1014
test exmaples num:1000
One example from Train data:['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']


In [15]:
#build vocab
SRC.build_vocab(train_data,min_freq=2,vectors='glove.42B.300d')
TRG.build_vocab(train_data,min_freq=2,vectors='glove.42B.300d')

.vector_cache/glove.42B.300d.zip: 1.88GB [05:56, 5.27MB/s]                            
100%|█████████▉| 1917135/1917494 [05:01<00:00, 7376.54it/s]

In [16]:
print(f'SRC vocab size:{len(SRC.vocab)}')
print(f'TRG vocab size:{len(TRG.vocab)}')
#set vocab size embedding weight for glove
SRC.vocab.set_vectors(SRC.vocab.stoi,SRC.vocab.vectors,dim=300)
TRG.vocab.set_vectors(TRG.vocab.stoi,TRG.vocab.vectors,dim=300)
print(f'SRC glove embedding size:{SRC.vocab.vectors.size()}')
print(f'TRG glove embedding size:{TRG.vocab.vectors.size()}')

SRC vocab size:8014
TRG vocab size:6191
SRC glove embedding size:torch.Size([8014, 300])
TRG glove embedding size:torch.Size([6191, 300])


In [18]:
#set data iterator
Batch_size=256
train_iter=torchtext.data.BucketIterator(train_data,batch_size=Batch_size,shuffle=True,device=device)
val_iter,test_iter=torchtext.data.BucketIterator.splits((val_data,test_data),batch_size=Batch_size,device=device)



# **Encoder(Bidirectional)架構**
**模型架構**
*   輸入-每個時間點的token id
*   輸出-每個時間點的h_state,最後一個時間點的不同layer之h_state

**演算流程:**

對於每個time step的embedding vector，rnn layer來extract當前word的h_state，然後if 雙向RNN，則是Output foward,backward的concat hid

In [21]:
#build Encoder Model
class BiEncoder(nn.Module):
  def __init__(self,input_dim,hid_dim,n_layers,dropout_rate,pretrain_embed=None):
    super(BiEncoder,self).__init__()
    self.input_dim=input_dim
    self.hid_dim=hid_dim
    self.n_layers=n_layers

    #determined use embed layer whether is pretrained weight or not
    if pretrain_embed is None:
      self.embed=nn.Embedding(input_dim,hid_dim)
    else:
      self.embed=nn.Embedding.from_pretrained(pretrain_embed)
    self.rnn_layer=nn.GRU(hid_dim,hid_dim,n_layers,dropout=dropout_rate,bidirectional=True)
    self.linear_layer=nn.Linear(hid_dim*2,hid_dim)
    self.tanh=nn.Tanh()
    self.dropout=nn.Dropout(dropout_rate)
  def forward(self,input_tensors):
    #input tensor shape=[seqL,bs]
    embed_input=self.dropout(self.embed(input_tensors))
    
    outputs,h_state=self.rnn_layer(embed_input)
    #output shape=[seq_len,bs,hid_dim*2]
    #h_state shape=[layer_num,direction,bs,hid_dim]
    #concat forward & backward last layer
    h_state=self.tanh(self.linear_layer(torch.cat((h_state[-2,:,:],h_state[-1,:,:]),dim=1)))
    return outputs,h_state.unsqueeze(0)

# **Attention match layer**
**模型架構**
*   輸入-decoder前一次的h_state,encoder每個time step的Repr.(hidden_output)
*   輸出-每個time step的attention weight


  演算流程:  
      將decoder的h_state與每個Repr.的concatenate進行降維，隨後再讓他與一個weight matrix進行inner product，得到的wieght即為前一次h_state對encoder output中的每個Repr之關注程度。


In [22]:
#build attention Model
class Attention(nn.Module):
  def __init__(self,hid_dim):
    super(Attention,self).__init__()
    self.attn_layer=nn.Linear(2*hid_dim+hid_dim,hid_dim)
    self.v=nn.Linear(hid_dim,1,bias=False)
    self.softmax=nn.Softmax(dim=1)
    self.tanh=nn.Tanh()
  def forward(self,decoder_hidden,encoder_outputs):
    batch_size=encoder_outputs.size(1)
    seqLen=encoder_outputs.size(0)

    decoder_hidden=decoder_hidden.unsqueeze(1)#insert dim shape=[bs,1,hid_dim]
    decoder_hidden=decoder_hidden.repeat(1,seqLen,1)

    #concat decoder_hidden & encoder_outputs
    #shape=[bs,seqLen,encoder_hid+decoder_hid]
    attn_hid=torch.cat((decoder_hidden,encoder_outputs.permute(1,0,2)),dim=2)
    #non-linear transform
    attn_hid=self.tanh(self.attn_layer(attn_hid))
    #compute attnetion weight
    #shape=[bs,seqLen]
    attn_weight=self.softmax(self.v(attn_hid).squeeze(2))

    return attn_weight

# **建立Decoder**

**Decoder架構**
*   輸入-前一個時間點decoder輸出的token之Embedding,h_state以及關注encoder ouutput的attention weight,encoder output
*   輸出-當前時間點=的word Dist.以及hidden_state

 
 **演算過程**:
  將輸入word的Embedding與Context(將encoder output藉由attention weight來weight sum)進行concat然後輸入至rnn_layer中，隨後將rnn layer的hidden_state與input word embedding,context vector送進linear classifier

In [24]:
class Decoder(nn.Module):
  def __init__(self,input_dim,hid_dim,output_dim,n_layers,dropout_rate,pretrain_embed=None):
    super(Decoder,self).__init__()
    self.input_dim=input_dim
    self.hid_dim=hid_dim
    self.output_dim=output_dim
    self.n_layers=n_layers

    #determined use embed layer whether is pretrained weight or not
    if pretrain_embed is None:
      self.embed=nn.Embedding(input_dim,hid_dim)
    else:
      self.embed=nn.Embedding.from_pretrained(pretrain_embed)
    self.rnn_layer=nn.GRU(hid_dim*3,hid_dim,n_layers,dropout=dropout_rate)
    self.fc=nn.Linear(hid_dim+hid_dim+hid_dim*2,output_dim)
  def forward(self,input_tensors,hidden_state,attn_weight,encoder_outputs):
    input_embed=self.embed(input_tensors)
    context_vector=torch.bmm(attn_weight.unsqueeze(dim=1),encoder_outputs)[:,0,:]
    #compute input rnn tensor and insert dim0
    input_tensor=torch.cat((input_embed,context_vector),dim=1).unsqueeze(0)
    
    outputs,h_state=self.rnn_layer(input_tensor,hidden_state)
    #output shape=[bs,output_dim]
    outputs=self.fc(torch.cat((input_embed,context_vector,outputs[0]),dim=1))
    return outputs,h_state

# **Sequence to Sequence Model(Coditional Generation)**

模型架構


*   輸入-source sentences index.
*   輸出-target sentences index.

演算流程
 

1.   將src sents 透過encoder進行encoding,得到該sent每個word的Repr. 以及 最後一個time step的hidden state
2.   將context vector視為最初要輸入至decoder的hidden state，sos token id則設置為第一個要輸入至decoder的word token
3.   以下流程則是反覆iter seqlen+1次
    

*   使用attention layer計算前一次h_state與encoder output的attention
*   將得到的attention weight與encoder output進行weighted sum得到context vector
*   將context vector、前一次decoder 預測的詞彙之word embedding & h_sate輸入至decoder中
*   將當前decoder輸出的h_state 與前一次的word embedding以及context vector輸入至linear classifier
*   將Linear classifier的output以及當前計算的h_state當作是下一個decoder要輸入的word,h_state




 

In [25]:
class AttnSeq2Seq(nn.Module):
  def __init__(self,encoder,decoder,attner,device,pad_idx):
    super(AttnSeq2Seq,self).__init__()
    self.encoder=encoder
    self.decoder=decoder
    self.attner=attner
    self.device=device
    self.criterion=nn.CrossEntropyLoss(ignore_index=pad_idx)
    assert self.encoder.hid_dim==self.decoder.hid_dim
  def forward(self,src,trg=None,teach_ratio=0.5):
    seqLen=trg.size(0)
    Batch_size=trg.size(1)
    OutputVocab=self.decoder.output_dim
    preds=torch.zeros(seqLen,Batch_size,OutputVocab,device=self.device)
    #get seq hidden vector and Context vector
    encoder_outputs,final_hidden=self.encoder(src)

    decoder_hidden=final_hidden
    decoder_input=trg[0,:]
    for ti in range(1,seqLen):
      #compute attention weight
      #shape=[bs,seqLen]
      attn_weight=self.attner(decoder_hidden[-1,:,:],encoder_outputs)
      #output vector,shape=[bs,vocab]
      #decoder hidden,shape=[n_layers,bs,hid_dim]
      decoder_outputs,decoder_hidden=self.decoder(decoder_input,decoder_hidden,attn_weight,encoder_outputs.permute(1,0,2))
      preds[ti]=decoder_outputs
      
      #determined next input using teacher forcing or not
      teach_force=True if random.random()<teach_ratio else False
      top1=decoder_outputs.argmax(dim=1)
      decoder_input=trg[ti] if teach_force else top1

    loss=None
    if trg is not None:
      #output=[seqLen*Bs,class_num]
      #labels=[seqlen*Bs,]
      outputs=preds[1:,:,:].reshape(-1,OutputVocab)
      labels=trg[1:,:].reshape(-1)
      loss=self.criterion(outputs,labels)
    #preds shape=[seq_len,bs,vocab_dim]
    return preds,loss
  def getPredict(self,src_tensors,trg_initTokenId,end_tokenId):
    preds=[]

    #get hidden_vector,context vector
    encoder_outputs,final_hidden=self.encoder(src_tensors)
    decoder_hidden=final_hidden
    decoder_input=torch.tensor([trg_initTokenId],device=self.device)
    
    while True:
      #compute attention weight
      attn_weight=self.attner(decoder_hidden[-1,:,:],encoder_outputs)
      
      #output
      decoder_outputs,decoder_hidden=self.decoder(decoder_input,decoder_hidden,attn_weight,encoder_outputs.permute(1,0,2))

      top1=decoder_outputs.argmax(dim=1)
      decoder_input=top1
      #append to preds
      preds.append(decoder_input.item())
      if decoder_input.item()==end_tokenId:
        break
    #return list of token id
    return preds

In [33]:
def get_evaluate(model,eval_data,de_field,en_field,device):
  #random sampling example from eval_data
  idx=random.choice(range(0,len(eval_data)))
  eval_example=eval_data.examples[idx]
  src_sents=['<sos>']+eval_example.src+['<eos>']
  trg_sents=eval_example.trg
  #convert tensors shape=[seqLen,batch]
  src_tensors=de_field.numericalize([src_sents],device=device)
  print(f'Origin eval sents:{src_sents}')

  #translate sent
  model.eval()
  with torch.no_grad():
    pred_index=model.getPredict(src_tensors,en_field.vocab.stoi['<sos>'],en_field.vocab.stoi['<eos>'])
  pred_sent=[en_field.vocab.itos[id] for id in pred_index if en_field.vocab.itos[id]!='<eos>']

  print(f'Original target sents:{trg_sents}')
  print(f'Translated target sents:{pred_sent}')

In [43]:
#define Model
encoder_config={'input_dim':len(SRC.vocab),'hid_dim':256,
        'n_layers':2,'dropout_rate':0.3,
        'pretrain_embed':None}
decoder_config={'input_dim':len(TRG.vocab),'hid_dim':256,
        'output_dim':len(TRG.vocab),'n_layers':1,
        'dropout_rate':0,'pretrain_embed':None}
attner_config={'hid_dim':256}
EPOCHS=70
GRAD_NORM=1
pad_idx=TRG.vocab.stoi['<pad>']
Learning_rate=1e-3
model_dir='./MTmodel'
model_path='fra2eng_model.pt'

model_configs={
    'encoder':encoder_config,
    'decoder':decoder_config,
    'attner':attner_config,
    'pad_idx':pad_idx,
}

#build Model
encoder_model=BiEncoder(**encoder_config)
decoder_model=Decoder(**decoder_config)
attn_model=Attention(**attner_config)
model=AttnSeq2Seq(encoder_model,decoder_model,attn_model,device,pad_idx=pad_idx)
model.to(device)
model.apply(init_model_weights)
#build optimizer &loss func
optimizer=torch.optim.Adam(model.parameters(),lr=Learning_rate)

print('Model total params:{}'.format(count_parameters(model)))

Model total params:13070895


In [44]:
#build pipeline
train_pipe=Model_pipeline(model,train_iter,optimizer,None,val_iter,model_configs)
train_pipe.amp_training(EPOCHS,model_dir,'src','trg',max_norm=1,teach_ratio=0.5,per_ep_eval=0)




Model dir already existed
[1/70] training loss:                   5.351160438437211
Start save Model to dir:./MTmodel



EPOCHS:   1%|▏         | 1/70 [00:31<35:49, 31.15s/it][A

Save Model success!
[2/70] training loss:                   4.425472556499013
Start save Model to dir:./MTmodel



EPOCHS:   3%|▎         | 2/70 [01:02<35:22, 31.21s/it][A

Save Model success!
[3/70] training loss:                   4.130690794242056
Start save Model to dir:./MTmodel



EPOCHS:   4%|▍         | 3/70 [01:33<34:43, 31.10s/it][A

Save Model success!
[4/70] training loss:                   3.8972834746042886
Start save Model to dir:./MTmodel



EPOCHS:   6%|▌         | 4/70 [02:04<34:16, 31.16s/it][A

Save Model success!
[5/70] training loss:                   3.6236027562827395
Start save Model to dir:./MTmodel



EPOCHS:   7%|▋         | 5/70 [02:35<33:46, 31.18s/it][A

Save Model success!
[6/70] training loss:                   3.3036988597167167
Start save Model to dir:./MTmodel



EPOCHS:   9%|▊         | 6/70 [03:07<33:25, 31.34s/it][A

Save Model success!
[7/70] training loss:                   2.9751720679433724
Start save Model to dir:./MTmodel



EPOCHS:  10%|█         | 7/70 [03:38<32:52, 31.31s/it][A

Save Model success!
[8/70] training loss:                   2.701392546034696
Start save Model to dir:./MTmodel



EPOCHS:  11%|█▏        | 8/70 [04:09<32:17, 31.25s/it][A

Save Model success!
[9/70] training loss:                   2.474060853322347
Start save Model to dir:./MTmodel



EPOCHS:  13%|█▎        | 9/70 [04:41<31:50, 31.32s/it][A

Save Model success!
[10/70] training loss:                   2.268858304149226
Start save Model to dir:./MTmodel



EPOCHS:  14%|█▍        | 10/70 [05:13<31:35, 31.60s/it][A

Save Model success!
[11/70] training loss:                   2.0678740060120298
Start save Model to dir:./MTmodel



EPOCHS:  16%|█▌        | 11/70 [05:45<31:02, 31.56s/it][A

Save Model success!
[12/70] training loss:                   1.9359404235555415
Start save Model to dir:./MTmodel



EPOCHS:  17%|█▋        | 12/70 [06:16<30:26, 31.49s/it][A

Save Model success!
[13/70] training loss:                   1.8277555244010792
Start save Model to dir:./MTmodel



EPOCHS:  19%|█▊        | 13/70 [06:47<29:48, 31.37s/it][A

Save Model success!
[14/70] training loss:                   1.7079379903642755
Start save Model to dir:./MTmodel



EPOCHS:  20%|██        | 14/70 [07:18<29:12, 31.30s/it][A

Save Model success!
[15/70] training loss:                   1.6187288844794558
Start save Model to dir:./MTmodel



EPOCHS:  21%|██▏       | 15/70 [07:49<28:39, 31.27s/it][A

Save Model success!
[16/70] training loss:                   1.5295341046232926
Start save Model to dir:./MTmodel



EPOCHS:  23%|██▎       | 16/70 [08:21<28:08, 31.27s/it][A

Save Model success!
[17/70] training loss:                   1.4728523105905766
Start save Model to dir:./MTmodel



EPOCHS:  24%|██▍       | 17/70 [08:52<27:39, 31.30s/it][A

Save Model success!
[18/70] training loss:                   1.420649786790212
Start save Model to dir:./MTmodel



EPOCHS:  26%|██▌       | 18/70 [09:23<27:05, 31.26s/it][A

Save Model success!
[19/70] training loss:                   1.3535083605532061
Start save Model to dir:./MTmodel



EPOCHS:  27%|██▋       | 19/70 [09:54<26:33, 31.24s/it][A

Save Model success!
[20/70] training loss:                   1.3016763925552368
Start save Model to dir:./MTmodel



EPOCHS:  29%|██▊       | 20/70 [10:26<26:05, 31.31s/it][A

Save Model success!
[21/70] training loss:                   1.2204064779114305
Start save Model to dir:./MTmodel



EPOCHS:  30%|███       | 21/70 [10:57<25:31, 31.25s/it][A

Save Model success!
[22/70] training loss:                   1.1711236918181704
Start save Model to dir:./MTmodel



EPOCHS:  31%|███▏      | 22/70 [11:28<24:58, 31.21s/it][A

Save Model success!
[23/70] training loss:                   1.138314986438082
Start save Model to dir:./MTmodel



EPOCHS:  33%|███▎      | 23/70 [11:59<24:26, 31.21s/it][A

Save Model success!
[24/70] training loss:                   1.0993932909087132
Start save Model to dir:./MTmodel



EPOCHS:  34%|███▍      | 24/70 [12:30<23:53, 31.16s/it][A

Save Model success!
[25/70] training loss:                   1.0670770364895201
Start save Model to dir:./MTmodel



EPOCHS:  36%|███▌      | 25/70 [13:02<23:27, 31.28s/it][A

Save Model success!
[26/70] training loss:                   0.9904010865771979
Start save Model to dir:./MTmodel



EPOCHS:  37%|███▋      | 26/70 [13:33<22:57, 31.31s/it][A

Save Model success!
[27/70] training loss:                   0.9734097719192505
Start save Model to dir:./MTmodel



EPOCHS:  39%|███▊      | 27/70 [14:04<22:22, 31.21s/it][A

Save Model success!
[28/70] training loss:                   0.9673836173718435
Start save Model to dir:./MTmodel



EPOCHS:  40%|████      | 28/70 [14:35<21:49, 31.17s/it][A

Save Model success!
[29/70] training loss:                   0.8922524426067084
Start save Model to dir:./MTmodel



EPOCHS:  41%|████▏     | 29/70 [15:07<21:19, 31.21s/it][A

Save Model success!
[30/70] training loss:                   0.9008331445225498
Start save Model to dir:./MTmodel



EPOCHS:  43%|████▎     | 30/70 [15:38<20:47, 31.19s/it][A

Save Model success!
[31/70] training loss:                   0.833664123426404
Start save Model to dir:./MTmodel



EPOCHS:  44%|████▍     | 31/70 [16:09<20:17, 31.21s/it][A

Save Model success!
[32/70] training loss:                   0.8212109140136785
Start save Model to dir:./MTmodel



EPOCHS:  46%|████▌     | 32/70 [16:40<19:47, 31.24s/it][A

Save Model success!
[33/70] training loss:                   0.8044969575446949
Start save Model to dir:./MTmodel



EPOCHS:  47%|████▋     | 33/70 [17:12<19:21, 31.38s/it][A

Save Model success!
[34/70] training loss:                   0.7800719884403965
Start save Model to dir:./MTmodel



EPOCHS:  49%|████▊     | 34/70 [17:43<18:48, 31.34s/it][A

Save Model success!
[35/70] training loss:                   0.7604020180409414
Start save Model to dir:./MTmodel



EPOCHS:  50%|█████     | 35/70 [18:15<18:21, 31.46s/it][A

Save Model success!
[36/70] training loss:                   0.7224832773208618
Start save Model to dir:./MTmodel



EPOCHS:  51%|█████▏    | 36/70 [18:46<17:43, 31.29s/it][A

Save Model success!
[37/70] training loss:                   0.699154008375971
Start save Model to dir:./MTmodel



EPOCHS:  53%|█████▎    | 37/70 [19:17<17:11, 31.27s/it][A

Save Model success!
[38/70] training loss:                   0.6906261846684573
Start save Model to dir:./MTmodel



EPOCHS:  54%|█████▍    | 38/70 [19:48<16:38, 31.20s/it][A

Save Model success!
[39/70] training loss:                   0.6872122716485408
Start save Model to dir:./MTmodel



EPOCHS:  56%|█████▌    | 39/70 [20:19<16:06, 31.19s/it][A

Save Model success!
[40/70] training loss:                   0.631361142846576
Start save Model to dir:./MTmodel



EPOCHS:  57%|█████▋    | 40/70 [20:51<15:38, 31.27s/it][A

Save Model success!
[41/70] training loss:                   0.6250406725887667
Start save Model to dir:./MTmodel



EPOCHS:  59%|█████▊    | 41/70 [21:22<15:06, 31.25s/it][A

Save Model success!
[42/70] training loss:                   0.6124159615290793
Start save Model to dir:./MTmodel



EPOCHS:  60%|██████    | 42/70 [21:53<14:33, 31.21s/it][A

Save Model success!
[43/70] training loss:                   0.6027265371460664
Start save Model to dir:./MTmodel



EPOCHS:  61%|██████▏   | 43/70 [22:24<14:03, 31.23s/it][A

Save Model success!
[44/70] training loss:                   0.5668498214922453
Start save Model to dir:./MTmodel



EPOCHS:  63%|██████▎   | 44/70 [22:56<13:33, 31.27s/it][A

Save Model success!
[45/70] training loss:                   0.5545815679064968
Start save Model to dir:./MTmodel



EPOCHS:  64%|██████▍   | 45/70 [23:27<13:01, 31.24s/it][A

Save Model success!
[46/70] training loss:                   0.5269413917211064
Start save Model to dir:./MTmodel



EPOCHS:  66%|██████▌   | 46/70 [23:58<12:30, 31.27s/it][A

Save Model success!
[47/70] training loss:                   0.5155393238130369
Start save Model to dir:./MTmodel



EPOCHS:  67%|██████▋   | 47/70 [24:30<12:00, 31.31s/it][A

Save Model success!
[48/70] training loss:                   0.5116030273207447
Start save Model to dir:./MTmodel



EPOCHS:  69%|██████▊   | 48/70 [25:01<11:30, 31.39s/it][A

Save Model success!
[49/70] training loss:                   0.4730681502505353
Start save Model to dir:./MTmodel



EPOCHS:  70%|███████   | 49/70 [25:33<10:59, 31.41s/it][A

Save Model success!
[50/70] training loss:                   0.4639512452117184
Start save Model to dir:./MTmodel



EPOCHS:  71%|███████▏  | 50/70 [26:04<10:26, 31.32s/it][A

Save Model success!
[51/70] training loss:                   0.45877804986217563
Start save Model to dir:./MTmodel



EPOCHS:  73%|███████▎  | 51/70 [26:35<09:55, 31.32s/it][A

Save Model success!
[52/70] training loss:                   0.4675797150846113
Start save Model to dir:./MTmodel



EPOCHS:  74%|███████▍  | 52/70 [27:06<09:20, 31.16s/it][A

Save Model success!
[53/70] training loss:                   0.4509265945668806
Start save Model to dir:./MTmodel



EPOCHS:  76%|███████▌  | 53/70 [27:37<08:49, 31.14s/it][A

Save Model success!
[54/70] training loss:                   0.4342719596206096
Start save Model to dir:./MTmodel



EPOCHS:  77%|███████▋  | 54/70 [28:08<08:18, 31.13s/it][A

Save Model success!
[55/70] training loss:                   0.42633265910441415
Start save Model to dir:./MTmodel



EPOCHS:  79%|███████▊  | 55/70 [28:39<07:46, 31.10s/it][A

Save Model success!
[56/70] training loss:                   0.4056122948726018
Start save Model to dir:./MTmodel



EPOCHS:  80%|████████  | 56/70 [29:10<07:15, 31.09s/it][A

Save Model success!
[57/70] training loss:                   0.3803158120105141
Start save Model to dir:./MTmodel



EPOCHS:  81%|████████▏ | 57/70 [29:41<06:43, 31.06s/it][A

Save Model success!
[58/70] training loss:                   0.37167666577979136
Start save Model to dir:./MTmodel



EPOCHS:  83%|████████▎ | 58/70 [30:13<06:13, 31.14s/it][A

Save Model success!
[59/70] training loss:                   0.37454060059890415
Start save Model to dir:./MTmodel



EPOCHS:  84%|████████▍ | 59/70 [30:43<05:41, 31.06s/it][A

Save Model success!
[60/70] training loss:                   0.3656554904423262
Start save Model to dir:./MTmodel



EPOCHS:  86%|████████▌ | 60/70 [31:14<05:10, 31.02s/it][A

Save Model success!
[61/70] training loss:                   0.36635740336618927
Start save Model to dir:./MTmodel



EPOCHS:  87%|████████▋ | 61/70 [31:45<04:38, 30.98s/it][A

Save Model success!
[62/70] training loss:                   0.34608043612618195
Start save Model to dir:./MTmodel



EPOCHS:  89%|████████▊ | 62/70 [32:16<04:07, 30.99s/it][A

Save Model success!
[63/70] training loss:                   0.36131198665029124
Start save Model to dir:./MTmodel



EPOCHS:  90%|█████████ | 63/70 [32:47<03:36, 30.97s/it][A

Save Model success!
[64/70] training loss:                   0.3283959601009101
Start save Model to dir:./MTmodel



EPOCHS:  91%|█████████▏| 64/70 [33:18<03:05, 30.94s/it][A

Save Model success!
[65/70] training loss:                   0.3165012492161048
Start save Model to dir:./MTmodel



EPOCHS:  93%|█████████▎| 65/70 [33:49<02:34, 30.92s/it][A

Save Model success!
[66/70] training loss:                   0.3021888757745425
Start save Model to dir:./MTmodel



EPOCHS:  94%|█████████▍| 66/70 [34:20<02:03, 30.91s/it][A

Save Model success!
[67/70] training loss:                   0.29470028166185347
Start save Model to dir:./MTmodel



EPOCHS:  96%|█████████▌| 67/70 [34:51<01:33, 31.02s/it][A

Save Model success!
[68/70] training loss:                   0.3011055354747856
Start save Model to dir:./MTmodel



EPOCHS:  97%|█████████▋| 68/70 [35:22<01:01, 30.97s/it][A

Save Model success!
[69/70] training loss:                   0.2838251220813969
Start save Model to dir:./MTmodel



EPOCHS:  99%|█████████▊| 69/70 [35:53<00:30, 30.94s/it][A

Save Model success!
[70/70] training loss:                   0.29845728280774336
Start save Model to dir:./MTmodel



EPOCHS: 100%|██████████| 70/70 [36:24<00:00, 31.21s/it]

Save Model success!





([5.351160438437211,
  4.425472556499013,
  4.130690794242056,
  3.8972834746042886,
  3.6236027562827395,
  3.3036988597167167,
  2.9751720679433724,
  2.701392546034696,
  2.474060853322347,
  2.268858304149226,
  2.0678740060120298,
  1.9359404235555415,
  1.8277555244010792,
  1.7079379903642755,
  1.6187288844794558,
  1.5295341046232926,
  1.4728523105905766,
  1.420649786790212,
  1.3535083605532061,
  1.3016763925552368,
  1.2204064779114305,
  1.1711236918181704,
  1.138314986438082,
  1.0993932909087132,
  1.0670770364895201,
  0.9904010865771979,
  0.9734097719192505,
  0.9673836173718435,
  0.8922524426067084,
  0.9008331445225498,
  0.833664123426404,
  0.8212109140136785,
  0.8044969575446949,
  0.7800719884403965,
  0.7604020180409414,
  0.7224832773208618,
  0.699154008375971,
  0.6906261846684573,
  0.6872122716485408,
  0.631361142846576,
  0.6250406725887667,
  0.6124159615290793,
  0.6027265371460664,
  0.5668498214922453,
  0.5545815679064968,
  0.5269413917211064,

# Inference Stage 
### 德文翻譯英文成果

In [46]:
#loading Model in given path
# model_ckp=load_modelState(model_dir,model_path,device)
# model_configs=load_modelState(model_dir,config_path,device)
# encoder_model=BiEncoder(**model_configs['encoder'])
# decoder_model=Decoder(**model_configs['decoder'])
# attn_model=Attention(**model_configs['attner'])
# model=AttnSeq2Seq(encoder_model,decoder_model,attn_model,device)
# model.load_state_dict(model_ckp['model'])

#using test_data to evaluate Model
get_evaluate(model,test_data,SRC,TRG,device)

Origin eval sents:['<sos>', 'Ein', 'Mann', 'lädt', 'gebackene', 'Brezeln', 'auf', 'einen', 'Koffer-LKW', '.', '<eos>']
Original target sents:['A', 'man', 'is', 'loading', 'a', 'box', 'truck', 'with', 'lots', 'of', 'baked', 'pretzels', '.']
Translated target sents:['A', 'man', 'is', 'loading', '<unk>', 'makeup', 'in', 'a', 'metal', 'fence', '.']


可修該的不同方法
1.init Embedding,GLoVe,fastText
2.Gated Rucurrent Unit
 Layer-2,3
 hid_dim-256
3.輸出到Decoder的context vector 的策略
  3.1.將第一層,最後一層layer的forward,backward h_state進行concat,然後non-linear transform (tanh)
  3.2採用forward or backward的每層layer h_state進行concat transform or pooling(Max,average)
  3.3只採用backward or forward 的第一、最後一層layer h_state