<a href="https://colab.research.google.com/github/yinghao1019/NLP_and_DL_practice/blob/master/Conv_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare Data

In [40]:
import torch
from torch import nn,optim
import torch.nn.functional as F
from torchtext.datasets import WMT14,Multi30k
from torchtext.data import Field,BucketIterator,metrics
import numpy as np
import matplotlib.pyplot
import spacy

import os
import tqdm
import random
import math
import time

In [41]:
#fix the seed of every random number generator used below so runs are repeatable
Random_SEED=1234
random.seed(Random_SEED)
np.random.seed(Random_SEED)
torch.random.manual_seed(Random_SEED)
torch.cuda.manual_seed_all(Random_SEED)
if torch.backends.cudnn.is_available():
  torch.backends.cudnn.deterministic=True
#use the first GPU when CUDA is available and fall back to the CPU otherwise,
#so the notebook still runs on a machine without a GPU
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#prefer_gpu() reports True/False instead of raising on CPU-only machines
#(require_gpu() raises when no GPU can be activated)
spacy.prefer_gpu()

True

In [42]:
#load the spaCy pipelines; only their tokenizers are used below
de_tokenizer=spacy.load('de_core_news_sm')
# fr_tokenizer=spacy.load('fr_core_news_sm')
en_tokenizer=spacy.load('en_core_web_sm')

#tokenize functions that are handed to the torchtext Fields
def _lowercase_tokens(pipeline,text):
  """Split `text` with `pipeline.tokenizer` and return the lower-cased token strings."""
  return [token.text.lower() for token in pipeline.tokenizer(text)]
#English tokenize process
def en_tokenize(text):
  """Tokenize an English string into a list of lower-cased tokens."""
  return _lowercase_tokens(en_tokenizer,text)
#German tokenize process
def de_tokenize(text):
  """Tokenize a German string into a list of lower-cased tokens."""
  return _lowercase_tokens(de_tokenizer,text)
#French tokenize process
def fr_tokenize(text):
  """Tokenize a French string into a list of lower-cased tokens.

  The eager load of the French pipeline is commented out above, so the
  pipeline is loaded on first use here instead of failing with a NameError.
  """
  global fr_tokenizer
  if 'fr_tokenizer' not in globals():
    fr_tokenizer=spacy.load('fr_core_news_sm')
  return _lowercase_tokens(fr_tokenizer,text)

#Fields describe how each language is tokenized and wrapped in <sos>/<eos>;
#German is the source side and English the target side
TRG_en=Field(tokenize=en_tokenize,
             init_token='<sos>',
             eos_token='<eos>',
             batch_first=True)
SRC_de=Field(tokenize=de_tokenize,
             init_token='<sos>',
             eos_token='<eos>',
             batch_first=True)
# TRG_fr=Field(init_token='<sos>',eos_token='<eos>',tokenize=fr_tokenize)

#download / read the Multi30k German-English splits
train_data,val_data,test_data=Multi30k.splits(fields=(SRC_de,TRG_en),exts=('.de','.en'))
print(f'Dataset size:{len(train_data)}\n{len(val_data)}\n{len(test_data)}\n')

#show one randomly chosen training pair as a sanity check
train_idx=random.choice(range(0,len(train_data)))
train_src=train_data.examples[train_idx].src
train_trg=train_data.examples[train_idx].trg
print(f'train src_sents:{train_src}\n train trg sents:{train_trg}')

#vocabularies are built from the training split only; tokens seen fewer than 2 times are dropped
SRC_de.build_vocab(train_data,specials=['<unk>','<pad>'],min_freq=2)
TRG_en.build_vocab(train_data,specials=['<unk>','<pad>'],min_freq=2)




Dataset size:29000
1014
1000

train src_sents:['ein', 'actionfoto', 'bei', 'einem', 'roller-derby-spiel', 'mit', 'einer', 'spielerin', 'im', 'vordergrund', '.']
 train trg sents:['a', 'roller', 'derby', 'match', 'with', 'an', 'action', 'shot', 'of', 'a', 'derby', 'girl', 'in', 'the', 'foreground', '.']


In [43]:
#one BucketIterator per split, every split using mini-batches of 128 pairs placed on `device`
Batch_size=128
train_iter,val_iter,test_iter=BucketIterator.splits(
    (train_data,val_data,test_data),
    batch_sizes=(Batch_size,)*3,
    device=device)
#index of the padding token in the target vocabulary
print(TRG_en.vocab.stoi['<pad>'])

1




## Encoder implementation
![](https://github.com/bentrevett/pytorch-seq2seq/raw/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/convseq2seq1.png)
圖片來源: [bentrevett/pytorch-seq2seq](https://github.com/bentrevett/pytorch-seq2seq)  

Encoder每層Layer的說明-  
1.token and position Embed:首先，我們會將input sentence的word idx轉換成word Embedding，除此之外，為了加強每個token位置資訊的重要性，採用element-wise sum的方法將位置資訊的embedding加入至word Embedding.

2.linear layer:隨後我們將組合後的Embedding vector的embed_dim轉換成hidden dim.  

3.N*conv block:然後我們將轉換後的Embedding vector餵至'conv block'，藉由N個'conv block'提取出word的重要資訊，此時output的vector就是conved vector.  

4.linear_layer:將轉換後的embedding feature之hid dim轉換成原來的embed_dim，此步驟的目的為由於後續會與Word Embedding進行residual connection,所以在dim部分必須一致

5.residual connection:將一開始該token的word Embedding與conved vector進行加總成為combined output

#### 而在Conv block中演算流程如下圖:
![](https://github.com/bentrevett/pytorch-seq2seq/raw/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/convseq2seq2.png)  
圖片來源: [bentrevett/pytorch-seq2seq](https://github.com/bentrevett/pytorch-seq2seq)

假設當前是採用 n*embed_dim 的2d filter,以及1個block的情況下來說明運算流程:  
1.在Input 部分:採用前述合併postion與word資訊的Embedding  
2.convolution layer:我們將使用n*Embedding dim的filter來extract feature_map，這邊要注意的是只能在seqLen那個方向上的dim進行移動，並且N的大小可以把它看作如同採用N-gram Model來extract Feature一樣。
Note:由於我們希望通過layer後的input、output seqLen一致，所以採用padding的方式來進行調整。另外，由於output須通過GLU activation func.(GLU會將channel數減半)，為了讓input vector與output vector的dim一致，須將filter的channel數量設成hidden_dim的2倍才能達成條件。  
3.最後我們將output與Input在進行一次residual connection.  
4.第二個block的input則是吃第一個block的output，後續的block也是這樣。  


In [44]:
#convolutional encoder of the seq2seq model
class cnnEncoder(nn.Module):
  """Encode a batch of source token ids with a stack of gated 1-D convolutions.

  Token and position embeddings are summed, projected to `hid_dim`, passed
  through `n_layers` Conv1d+GLU blocks with residual connections and finally
  projected back to `embed_dim`.

  Args:
    input_dim: size of the source vocabulary.
    embed_dim: size of the token / position embeddings.
    hid_dim: number of channels flowing between the convolution blocks.
    kernel_size: width of every convolution kernel.
    n_layers: number of convolution blocks.
    max_length: number of positions in the position-embedding table.
    drop_rate: dropout probability.
    device: device on which helper tensors (positions, scale) are placed.
  """
  def __init__(self,input_dim,embed_dim,hid_dim,kernel_size,n_layers,max_length,drop_rate,device):
    super().__init__()
    #sizes of the embedding part
    self.input_dim=input_dim
    self.pos_dim=max_length
    self.embed_dim=embed_dim
    #sizes of the convolution part; every conv emits 2*hid_dim channels
    #because GLU halves the channel dimension again
    self.n_layers=n_layers
    self.hid_dim=hid_dim
    self.n_channels=2*hid_dim
    self.kernel_size=kernel_size
    self.padding=(kernel_size-1)//2
    self.device=device

    #layers are created in a fixed order so seeded initialisation is reproducible
    self.word_embed=nn.Embedding(input_dim,embed_dim)
    self.pos_embed=nn.Embedding(max_length,embed_dim)
    self.embed2hid=nn.Linear(embed_dim,hid_dim)
    conv_layers=[]
    for _ in range(n_layers):
      conv_layers.append(nn.Conv1d(hid_dim,
                                   self.n_channels,
                                   kernel_size=kernel_size,
                                   padding=self.padding))
    self.conv_blocks=nn.ModuleList(conv_layers)
    self.hid2embed=nn.Linear(hid_dim,embed_dim)
    self.dropout=nn.Dropout(drop_rate)
    #residual sums are multiplied by sqrt(0.5)
    self.scale=torch.sqrt(torch.FloatTensor([0.5])).to(device)
  def forward(self,src_tensors):
    """Return `(conved_output, combined_output)`, both of shape [Bs,seqlen,embed_dim].

    `src_tensors` holds token ids with shape [Bs,seqlen].
    """
    #position ids 0..seqlen-1 repeated for every sentence of the batch
    n_sents=src_tensors.size()[0]
    n_tokens=src_tensors.size()[1]
    positions=torch.arange(0,n_tokens).unsqueeze(0).repeat(n_sents,1).to(self.device)

    #token + position embedding, [Bs,seqlen,embed_dim]
    token_part=self.word_embed(src_tensors)
    position_part=self.pos_embed(positions)
    src_embeds=self.dropout(token_part+position_part)

    #project to hid_dim and put channels second for Conv1d: [Bs,hid_dim,seqlen]
    layer_input=self.embed2hid(src_embeds).permute(0,2,1)

    for conv in self.conv_blocks:
      #conv output is [Bs,2*hid_dim,seqlen]; GLU gates it back to [Bs,hid_dim,seqlen]
      gated=F.glu(conv(self.dropout(layer_input)),dim=1)
      #residual connection around the block
      block_output=(layer_input+gated)*self.scale
      layer_input=block_output

    #back to embedding size, [Bs,seqlen,embed_dim]
    conved_output=self.hid2embed(block_output.permute(0,2,1))

    #residual connection with the input embeddings
    combined_output=(conved_output+src_embeds)*self.scale
    return conved_output,combined_output

Decoder 架構部分

In [45]:
class cnnDecoder(nn.Module):
  """Convolutional decoder that attends over the encoder outputs in every block.

  Target token and position embeddings are summed, projected to `hid_dim` and
  passed through `n_layers` blocks. Each block left-pads its input with
  `kernel_size-1` columns (so a position never sees later positions), applies
  Conv1d+GLU, attends over the encoder outputs and adds a residual connection.

  Args:
    output_dim: size of the target vocabulary.
    embed_dim: size of the token / position embeddings.
    hid_dim: number of channels flowing between the convolution blocks.
    kernel_size: width of every convolution kernel.
    n_layers: number of convolution blocks.
    max_length: number of positions in the position-embedding table.
    drop_rate: dropout probability.
    trg_pad_idx: value used to fill the left padding of every block.
    device: device on which helper tensors (positions, padding, scale) are placed.
  """
  def __init__(self,output_dim,embed_dim,hid_dim,kernel_size,n_layers,max_length,drop_rate,trg_pad_idx,device):
    super(cnnDecoder,self).__init__()
    #set decoder attr
    self.output_dim=output_dim
    self.pos_dim=max_length
    self.embed_dim=embed_dim
    self.device=device
    self.trg_pad_idx=trg_pad_idx
    #set conv_block attr.
    self.hid_dim=hid_dim
    self.n_filters=hid_dim*2
    self.kernel_size=kernel_size
    self.n_layers=n_layers
    #number of columns prepended to the sequence before each convolution
    self.padding=kernel_size-1

    #build layer (creation order kept so seeded init and state_dict keys are unchanged)
    self.word_embed=nn.Embedding(output_dim,embed_dim)
    self.pos_embed=nn.Embedding(self.pos_dim,embed_dim)
    self.embed2hid=nn.Linear(self.embed_dim,self.hid_dim)
    self.hid2embed=nn.Linear(self.hid_dim,self.embed_dim)
    self.attn_hid2embed=nn.Linear(self.hid_dim,self.embed_dim)
    self.attn_embed2hid=nn.Linear(self.embed_dim,self.hid_dim)
    self.conv_blocks=nn.ModuleList([nn.Conv1d(self.hid_dim,self.n_filters,kernel_size=self.kernel_size) for _ in range(self.n_layers)])
    self.dropout=nn.Dropout(drop_rate)
    self.output_layer=nn.Linear(self.embed_dim,self.output_dim)
    self.scale=torch.sqrt(torch.FloatTensor([0.5])).to(device)
  def calculate_attn(self,hid_tensors,embed_tensors,encoder_conved,encoder_combined):
    """Attend over the encoder outputs for every target position.

    Args:
      hid_tensors: gated convolution output, [Bs,hid_dim,trg_seqlen].
      embed_tensors: target embeddings, [Bs,trg_seqlen,embed_dim].
      encoder_conved: encoder conved output, [Bs,src_seqlen,embed_dim].
      encoder_combined: encoder combined output, [Bs,src_seqlen,embed_dim].

    Returns:
      `(attn_tensors, attn_w)`: the attended hidden states
      [Bs,hid_dim,trg_seqlen] and the softmax-normalised attention weights
      [Bs,trg_seqlen,src_seqlen] (each row sums to 1).
    """
    #residual connection
    #attn_query=[Bs,trg_seqlen,embed_dim]
    attn_query=(self.attn_hid2embed(hid_tensors.permute(0,2,1))+embed_tensors)*self.scale

    #unnormalised scores against every source position
    #attn_energy=[Bs,trg_seqlen,src_seqlen]
    attn_energy=torch.matmul(attn_query,encoder_conved.permute(0,2,1))

    #normalise over the source positions; these are the weights that are returned
    attn_w=F.softmax(attn_energy,dim=2)

    #weighted sum of encoder_combined
    #attn_tensors=[Bs,trg_seqlen,embed_dim]
    attn_tensors=torch.matmul(attn_w,encoder_combined)

    #convert to hid_dim and permute
    #attn_tensors=[Bs,hid_dim,trg_seqlen]
    attn_tensors=self.attn_embed2hid(attn_tensors).permute(0,2,1)

    #apply residual connection with input_hid and attended_input
    attn_tensors=(attn_tensors+hid_tensors)*self.scale
    return attn_tensors,attn_w
  def forward(self,trg_tensors,encoder_conved,encoder_combined):
    """Return `(logits, attn_w)` for target ids `trg_tensors` of shape [Bs,seqlen].

    `logits` is [Bs,seqlen,output_dim]; `attn_w` holds the attention weights of
    the last block, [Bs,seqlen,src_seqlen], or None when there are no blocks.
    """
    #create pos embedding
    bs=trg_tensors.size()[0]
    seqlen=trg_tensors.size()[1]
    pos_tensors=torch.arange(0,seqlen).repeat(bs,1).to(self.device)

    #pos_embed=[bs,seqlen,Embed_dim],token_embed=[bs,seqlen,Embed_dim]
    token_embed=self.word_embed(trg_tensors)
    pos_embed=self.pos_embed(pos_tensors)

    #element-wise sum,embed_tensors=[Bs,seqLen,embed_dim]
    embed_tensors=self.dropout(token_embed+pos_embed)
    #convert into hid dim and transpose,conv_input=[Bs,hid_dim,seqLen]
    conv_inputs=self.embed2hid(embed_tensors).permute(0,2,1)

    #the left padding is identical for every block, so build it once;
    #it is filled with the numeric value of trg_pad_idx
    left_pad=torch.zeros(bs,self.hid_dim,self.padding).fill_(self.trg_pad_idx).to(self.device)

    #defaults used when the decoder has no convolution blocks
    conv_outputs,conv_attnW=conv_inputs,None
    for conv_b in self.conv_blocks:
      conv_inputs=self.dropout(conv_inputs)
      #pad on the left only so position t cannot look at positions > t
      #conv_pad=[Bs,hid_dim,(filter_size-1)+orig_seqlen]
      conv_pad=torch.cat((left_pad,conv_inputs),dim=2)

      #conv_outputs=[Bs,hid_dim*2,seqLen]
      conv_outputs=conv_b(conv_pad)
      #GLU halves the channels: [Bs,hid_dim,seqLen]
      conv_outputs=F.glu(conv_outputs,dim=1)
      #attention over the encoder, conv_outputs=[Bs,hid_dim,seqLen]
      conv_outputs,conv_attnW=self.calculate_attn(conv_outputs,embed_tensors,encoder_conved,encoder_combined)

      #Residual connection attn output & conv_inputs
      #conv_outputs=[Bs,hid_dim,seqLen]
      conv_outputs=(conv_outputs+conv_inputs)*self.scale
      conv_inputs=conv_outputs

    #transform hid_dim to embed_dim=[Bs,seqlen,embed_dim]
    embed_outputs=self.hid2embed(conv_outputs.permute(0,2,1))

    #logits of every target position=[Bs,seqlen,output_dim]
    return self.output_layer(embed_outputs),conv_attnW

Seq2Seq

In [46]:
class cnn_Seq2Seq(nn.Module):
  """Tie an encoder and a decoder together into one trainable module."""
  def __init__(self,encoder,decoder):
    super().__init__()
    self.encoder=encoder
    self.decoder=decoder
  def forward(self,src_tensors,trg_tensors):
    """Encode the source batch, then decode the target batch against it.

    Args:
      src_tensors: source token ids, [Bs,src_seqlen].
      trg_tensors: target token ids without the last token, [Bs,trg_seqlen-1].

    Returns:
      The decoder's `(logits, attention)` pair.
    """
    #both encoder outputs are [Bs,src_seqlen,Embed_dim]
    enc_conved,enc_combined=self.encoder(src_tensors)
    #the decoder consumes the target prefix together with both encoder outputs
    logits,attention=self.decoder(trg_tensors,enc_conved,enc_combined)
    return logits,attention

Training Model

In [47]:
#vocabulary sizes fix the width of the embedding tables and of the output layer
Input_dim=len(SRC_de.vocab)
Output_dim=len(TRG_en.vocab)
TRG_pad_idx=TRG_en.vocab.stoi['<pad>']

#network sizes shared by encoder and decoder
Embed_dim=256
hid_dim=512 #every convolution therefore emits 2*512=1024 channels
Max_length=100

#encoder / decoder specific settings
Encoder_layers=10
Decoder_layers=10
Enc_kernel_size=3
Dec_kernel_size=3
Enc_dropout=0.25
Dec_dropout=0.25

#keyword arguments for the model constructors, grouped by who consumes them
model_params={
    'encoder':dict(input_dim=Input_dim,
                   kernel_size=Enc_kernel_size,
                   n_layers=Encoder_layers,
                   drop_rate=Enc_dropout),
    'decoder':dict(output_dim=Output_dim,
                   kernel_size=Dec_kernel_size,
                   n_layers=Decoder_layers,
                   drop_rate=Dec_dropout,
                   trg_pad_idx=TRG_pad_idx),
    'common':dict(embed_dim=Embed_dim,
                  hid_dim=hid_dim,
                  max_length=Max_length,
                  device=device),
}

#checkpoint location
model_dir='./model'
model_path='s2s_model.pt'

In [48]:
def count_parameters(m):
  """Return how many scalar values of module `m` are trainable (requires_grad)."""
  total=0
  for param in m.parameters():
    if param.requires_grad:
      total+=param.numel()
  return total

In [49]:
#build encoder and decoder from their own settings plus the shared ones
encoder_model=cnnEncoder(**model_params['common'],**model_params['encoder'])
decoder_model=cnnDecoder(**model_params['common'],**model_params['decoder'])
#wrap both halves and move the complete model to `device`
s2s_model=cnn_Seq2Seq(encoder_model,decoder_model)
s2s_model=s2s_model.to(device)
#report the model size and layout
print(f'Model trainable params num:{count_parameters(s2s_model)}')
print(f'Model artictcure:{s2s_model}')
#padding positions of the target do not contribute to the loss
criterion=nn.CrossEntropyLoss(ignore_index=TRG_pad_idx)
optimizer=optim.Adam(s2s_model.parameters())

Model trainable params num:37351685
Model artictcure:cnn_Seq2Seq(
  (encoder): cnnEncoder(
    (word_embed): Embedding(7855, 256)
    (pos_embed): Embedding(100, 256)
    (embed2hid): Linear(in_features=256, out_features=512, bias=True)
    (conv_blocks): ModuleList(
      (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (8): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (9): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), paddin

In [50]:
#define model training pipeline
def train_model(model,train_iter,criterion,optimizer,device,clip):
  """Train `model` for one pass over `train_iter` and return the mean batch loss.

  Args:
    model: seq2seq module called as `model(src, trg_prefix)`; it must expose
      `model.decoder.output_dim`.
    train_iter: sized iterable of batches with `.src` and `.trg` tensors.
    criterion: loss applied to the flattened logits and labels.
    optimizer: optimizer that updates the parameters of `model`.
    device: device the batches are moved to when the model lives on CUDA.
    clip: maximum gradient norm passed to `clip_grad_norm_`.

  Returns:
    Sum of the batch losses divided by `len(train_iter)`.
  """
  train_loss=0
  model.train()
  #the model does not move during the epoch, so check its placement once
  model_on_gpu=next(model.parameters()).is_cuda
  for bs in train_iter:
    src_tensors,trg_tensors=bs.src,bs.trg
    if model_on_gpu:
      src_tensors=src_tensors.to(device)
      trg_tensors=trg_tensors.to(device)

    #clear gradients BEFORE the backward pass so gradients that already exist
    #when this function is entered cannot leak into the first update
    optimizer.zero_grad()

    #teacher forcing: feed the target without its last token ...
    output_trg,_=model(src_tensors,trg_tensors[:,:-1])
    #... and score the predictions against the target without its first token
    output_trg=output_trg.reshape(-1,model.decoder.output_dim)
    labels_trg=trg_tensors[:,1:].reshape(-1)
    loss=criterion(output_trg,labels_trg)
    train_loss+=loss.item()

    #compute gradient & clipping
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(),clip)

    #update model weight
    optimizer.step()
  return train_loss/len(train_iter)
#evaluation counterpart of the training pipeline
def evaluate_model(model,val_iter,criterion,device):
  """Return the mean batch loss of `model` over `val_iter` without updating it.

  The model is put in eval mode and every batch is scored with gradients
  disabled. Batches are moved to `device` only when the model lives on CUDA.
  """
  model.eval()
  running_loss=0
  with torch.no_grad():
    for batch in val_iter:
      on_gpu=next(model.parameters()).is_cuda
      src_tensors=batch.src.to(device) if on_gpu else batch.src
      trg_tensors=batch.trg.to(device) if on_gpu else batch.trg
      #feed the target without its last token, score against it without its first
      logits,_=model(src_tensors,trg_tensors[:,:-1])
      flat_logits=logits.reshape(-1,model.decoder.output_dim)
      flat_labels=trg_tensors[:,1:].reshape(-1)
      running_loss+=criterion(flat_logits,flat_labels).item()
  return running_loss/len(val_iter)
def save_modelCkp(model,optimizer,model_params,epochs,model_dir,model_path):
  """Write a training checkpoint to `model_dir/model_path`.

  The checkpoint is a dict with the model state, the optimizer state, the
  model hyper-parameters and the number of finished epochs. `model_dir` is
  created first when it does not exist, including missing parent directories.
  """
  if not os.path.exists(model_dir):
    print('Model dir not exists')
    #makedirs also creates missing parents and tolerates a concurrent creation,
    #whereas mkdir fails for a nested path such as 'runs/exp1'
    os.makedirs(model_dir,exist_ok=True)
    print(f'Already create directory:{model_dir}!')
  save_path=os.path.join(model_dir,model_path)
  torch.save({
        #NOTE: the misspelled 'optimzier' key is kept on purpose so checkpoints
        #written before and after this change share the same layout
        'model':model.state_dict(),'optimzier':optimizer.state_dict(),
        'model_params':model_params,'epochs':epochs
        },save_path)
  print(f'Model checkpoint already saved to{save_path}!')
def load_modelCkp(model_dir,model_path):
  """Load the checkpoint stored at `model_dir/model_path`.

  Returns:
    The object read by `torch.load`, or None (after printing a message) when
    the directory exists but the checkpoint file does not.

  Raises:
    FileNotFoundError: when `model_dir` itself does not exist.
  """
  save_path=os.path.join(model_dir,model_path)
  #detect dir_path
  if os.path.exists(model_dir):
    print('Model dir exists...')
  else:
    raise FileNotFoundError('Model dir not exists')

  try:
    #NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source
    ck_point=torch.load(save_path)
    print('Successful load Model info.')
    return ck_point
  except FileNotFoundError:
    #a missing file is reported, not raised; callers receive None
    print("Save path don't exists")
    return None

In [51]:
#training schedule
train_epochs=15
grad_clip=0.1
epochs_progress=tqdm.trange(train_epochs)
save_everyEpochs=3
for ep in epochs_progress:
  #train Model; only the training pass is timed
  start_time=time.time()
  train_loss=train_model(s2s_model,train_iter,criterion,optimizer,device,grad_clip)
  end_time=time.time()
  epochs_loss=evaluate_model(s2s_model,val_iter,criterion,device)#compute val loss

  #save Model checkpoints in every 3 epochs and after the final epoch
  #(the final-epoch test compares the epoch number itself; a remainder modulo
  #save_everyEpochs can never equal train_epochs)
  finished_epochs=ep+1
  if finished_epochs%save_everyEpochs==0 or finished_epochs==train_epochs:
    save_modelCkp(s2s_model,optimizer,model_params,finished_epochs,model_dir,model_path)

  #display Training information
  print(f'[{ep+1}/{train_epochs}] train_loss:{train_loss} \t eval_loss:{epochs_loss} cost_time:{end_time-start_time}s')



  7%|▋         | 1/15 [01:00<14:10, 60.77s/it][A

[1/15] train_loss:5.593705547013472 	 eval_loss:3.853549540042877 cost_time:60.29278635978699s



 13%|█▎        | 2/15 [02:04<13:22, 61.73s/it][A

[2/15] train_loss:3.540686007638335 	 eval_loss:2.7958647310733795 cost_time:63.46308350563049s



 20%|██        | 3/15 [03:09<12:33, 62.75s/it][A

Model checkpoint already saved to./model/s2s_model.pt!
[3/15] train_loss:2.789023985421605 	 eval_loss:2.302263021469116 cost_time:63.15321946144104s



 27%|██▋       | 4/15 [04:13<11:34, 63.10s/it][A

[4/15] train_loss:2.421034505188728 	 eval_loss:2.0697994977235794 cost_time:63.41914439201355s



 33%|███▎      | 5/15 [05:17<10:32, 63.29s/it][A

[5/15] train_loss:2.203066254502351 	 eval_loss:1.9650320261716843 cost_time:63.25152349472046s



 40%|████      | 6/15 [06:23<09:35, 63.99s/it][A

Model checkpoint already saved to./model/s2s_model.pt!
[6/15] train_loss:2.0616136369201055 	 eval_loss:1.9085756987333298 cost_time:63.59196972846985s



 47%|████▋     | 7/15 [07:26<08:31, 63.91s/it][A

[7/15] train_loss:1.9559736236076524 	 eval_loss:1.8502509742975235 cost_time:63.21267509460449s



 53%|█████▎    | 8/15 [08:30<07:26, 63.82s/it][A

[8/15] train_loss:1.8709784856451765 	 eval_loss:1.8321518748998642 cost_time:63.12448811531067s



 60%|██████    | 9/15 [09:35<06:25, 64.27s/it][A

Model checkpoint already saved to./model/s2s_model.pt!
[9/15] train_loss:1.8059868077349557 	 eval_loss:1.8221572786569595 cost_time:63.32690167427063s



 67%|██████▋   | 10/15 [10:39<05:20, 64.07s/it][A

[10/15] train_loss:1.7476308718651927 	 eval_loss:1.801610380411148 cost_time:63.12400960922241s



 73%|███████▎  | 11/15 [11:43<04:16, 64.03s/it][A

[11/15] train_loss:1.6993651374321153 	 eval_loss:1.7833548039197922 cost_time:63.436952352523804s



 80%|████████  | 12/15 [12:48<03:13, 64.37s/it][A

Model checkpoint already saved to./model/s2s_model.pt!
[12/15] train_loss:1.6537788281881862 	 eval_loss:1.7976618558168411 cost_time:63.155280113220215s



 87%|████████▋ | 13/15 [13:51<02:08, 64.11s/it][A

[13/15] train_loss:1.6192423520109203 	 eval_loss:1.7762237936258316 cost_time:63.01210379600525s



 93%|█████████▎| 14/15 [14:55<01:03, 63.91s/it][A

[14/15] train_loss:1.5832782500640936 	 eval_loss:1.7591708451509476 cost_time:62.95311903953552s



100%|██████████| 15/15 [16:00<00:00, 64.03s/it]

Model checkpoint already saved to./model/s2s_model.pt!
[15/15] train_loss:1.550870656967163 	 eval_loss:1.7740721553564072 cost_time:63.148723125457764s





Inference stage

In [52]:
def translate_sents(model,sentence,src_Field,trg_field,max_len,device):
  """Greedily translate one source sentence with a trained seq2seq model.

  Args:
    model: module exposing `encoder(src)` and `decoder(trg, conved, combined)`.
    sentence: raw source string, or an already tokenized list of strings.
    src_Field: source field; its `tokenize` callable splits raw strings and its
      `numericalize` turns tokens into ids.
    trg_field: target field whose `vocab.stoi` / `vocab.itos` map tokens and ids.
    max_len: maximum number of decoding steps.
    device: device on which the input tensors are created.

  Returns:
    `(tokens, attn_weights)`: the predicted target tokens without `<sos>` /
    `<eos>`, and the attention returned by the last decoder call (None when no
    decoding step was taken).
  """
  model.eval()
  #text process: a raw string is split with the tokenizer of the SOURCE field,
  #i.e. the same tokenizer that built the source vocabulary
  if isinstance(sentence,str):
    source_tokens=src_Field.tokenize(sentence)
  else:
    source_tokens=sentence
  translated=['<sos>']+[t.lower() for t in source_tokens]+['<eos>']
  print(f'After tokenized:{translated}')
  #convert to word indx
  token_idx=src_Field.numericalize([translated],device)
  print(f'transform word index:{token_idx}')
  #create source sents Tensor
  src_tensors=token_idx.to(device)

  eos_idx=trg_field.vocab.stoi['<eos>']
  trg_idx=[trg_field.vocab.stoi['<sos>']]
  attn_weights=None
  #inference only: neither encoding nor decoding needs an autograd graph
  with torch.no_grad():
    #encoding source sents
    conved_output,combined_output=model.encoder(src_tensors)

    for _ in range(max_len):
      #feed every token generated so far, shape [1,seqlen]
      model_output=torch.LongTensor(trg_idx).unsqueeze(0).to(device)
      #output_logitics=[1,seqlen,output_dim]
      output_logitics,attn_weights=model.decoder(model_output,conved_output,combined_output)
      #the prediction at the last position is the next token
      #(argmax of the logits equals argmax of their softmax)
      predict_idx=output_logitics.argmax(2)[:,-1].item()

      #stop as soon as the end-of-sentence token is predicted
      if predict_idx==eos_idx:
        break
      trg_idx.append(predict_idx)
  #convert word idx into string
  translated_sents=[trg_field.vocab.itos[idx] for idx in trg_idx]

  #drop the leading <sos>
  return translated_sents[1:],attn_weights

In [53]:
#draw one random sentence pair from the test split
test_idx=random.choice(range(len(test_data)))
test_example=test_data.examples[test_idx]
test_src,test_trg=test_example.src,test_example.trg

#translate the German side and compare it with the reference English side
translated_trg,attn_w=translate_sents(model=s2s_model,
                                      sentence=test_src,
                                      src_Field=SRC_de,
                                      trg_field=TRG_en,
                                      max_len=100,
                                      device=device)
print(f'Translated source sentence:{test_src}\n')
print(f'Actual target sentence:{test_trg}\n')
joined_translation=' '.join(translated_trg)
print(f"model translated target sentence:{joined_translation}\n")
#attention returned by the last decoding step
print(attn_w)

After tokenized:['<sos>', 'eine', 'afrikanische', 'familie', 'steht', 'vor', 'ein', 'paar', 'provisorischen', 'behausungen', '.', '<eos>']
transform word index:tensor([[   2,    8, 1088,  323,   29,   27,    5,  116,    0,    0,    4,    3]])
Translated source sentence:['eine', 'afrikanische', 'familie', 'steht', 'vor', 'ein', 'paar', 'provisorischen', 'behausungen', '.']

Actual target sentence:['an', 'african', 'family', 'are', 'standing', 'in', 'front', 'of', 'some', 'makeshift', 'houses', '.']

model translated target sentence:a african family stands in front of some <unk> <unk> .



檢視test example的Target sents對source sents的注意力權重，顏色越白者代表關注度越高

In [None]:
# Unfinished cell: prepares a figure for the attention map of the translated test example.
import matplotlib.pyplot as plt
import matplotlib as mpl
# tick positions: one per source token (x) and one per translated token (y)
# NOTE(review): translate_sents wraps the source in <sos>/<eos> before encoding, so the
# attention presumably has len(test_src)+2 source columns - confirm before labelling ticks
xticks_loc=range(len(test_src))
yticks_loc=range(len(translated_trg))
# single-axes figure of 10x8 inches
fig=plt.figure(figsize=(10,8))
ax=fig.add_subplot(1,1,1)
# NOTE(review): Axes.imshow requires the image data as its first argument; called without
# arguments it raises TypeError. The attention matrix (attn_w) is never passed in here.
ax.imshow()