<a href="https://colab.research.google.com/github/zhihong1224/RNN_demo/blob/master/hw8_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 MXNet 数据集

## 1.1 MXNet 实现

### 1.1.1 读取和预处理数据

In [0]:
import collections
import os
import io
import math
import torch
from torch import nn,optim
import torch.nn.functional as F
import torchtext.vocab as Vocab
import torch.utils.data as Data
import sys

PAD,BOS,EOS='<pad>','<bos>','<eos>'
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def process_one_seq(seq_tokens,all_tokens,all_seqs,max_seq_len):
  all_tokens.extend(seq_tokens)
  seq_tokens+=[EOS]+[PAD]*(max_seq_len-len(seq_tokens)-1)
  all_seqs.append(seq_tokens)

def build_data(all_tokens,all_seqs):
  vocab=Vocab.Vocab(collections.Counter(all_tokens),specials=[PAD,BOS,EOS])
  indices=[[vocab.stoi[w] for w in seq] for seq in all_seqs]
  return vocab,torch.tensor(indices)

In [0]:
ROOT='drive/My Drive/Colab Notebooks/MXNet/MX_data'
def read_data(max_seq_len):
  in_tokens,out_tokens,in_seqs,out_seqs=[],[],[],[]
  with open(os.path.join(ROOT,'fr-en-small.txt')) as f:
    lines=f.readlines()
  for line in lines:
    in_seq,out_seq=line.rstrip().split('\t')
    in_seq_tokens,out_seq_tokens=in_seq.split(' '),out_seq.split(' ')
    if max(len(in_seq_tokens),len(out_seq_tokens))>max_seq_len-1:
      continue
    process_one_seq(in_seq_tokens,in_tokens,in_seqs,max_seq_len)
    process_one_seq(out_seq_tokens,out_tokens,out_seqs,max_seq_len)
  in_vocab,in_data=build_data(in_tokens,in_seqs)
  out_vocab,out_data=build_data(out_tokens,out_seqs)
  return in_vocab,out_vocab,Data.TensorDataset(in_data,out_data)

In [0]:
max_seq_len=7
in_vocab,out_vocab,dataset=read_data(max_seq_len)
dataset[0]

(tensor([ 5,  4, 45,  3,  2,  0,  0]), tensor([ 8,  4, 27,  3,  2,  0,  0]))

### 1.1.2 含注意力机制的编码器-解码器

In [0]:
# 编码器
class Encoder(nn.Module):
  def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,drop_prob=0,**kwargs):
    super(Encoder,self).__init__(**kwargs)
    self.embedding=nn.Embedding(vocab_size,embed_size)
    self.rnn=nn.GRU(embed_size,num_hiddens,num_layers,dropout=drop_prob)
  
  def forward(self,inputs,state):
    # inputs:(n,seq_len)
    embedding=self.embedding(inputs.long()).permute(1,0,2)  #(seq_len,n,embed_size)
    return self.rnn(embedding,state)

  def begin_state(self):
    return None

In [0]:
# 测试编码器输入输出
encoder=Encoder(vocab_size=10,embed_size=8,num_hiddens=16,num_layers=2)
output,state=encoder(torch.zeros((4,7)),encoder.begin_state())
print(output.shape,state.shape)

torch.Size([7, 4, 16]) torch.Size([2, 4, 16])


In [0]:
# 注意力机制
def attention_model(input_size,attention_size):
  model=nn.Sequential(
    nn.Linear(input_size,attention_size,bias=False),
    nn.Tanh(),
    nn.Linear(attention_size,1,bias=False)
  )
  return model

In [0]:
def attention_forward(model,enc_states,dec_state):
  # enc_states:(seq_len,n,num_hiddens)
  # dec_states:(n,num_hiddens)
  dec_states=dec_state.unsqueeze(dim=0).expand_as(enc_states)
  enc_and_dec_states=torch.cat((enc_states,dec_states),dim=2) #(seq_len,n,2*num_hiddens)
  e=model(enc_and_dec_states)  #(seq_len,n,1)
  alpha=F.softmax(e,dim=0) #(seq_len,n,1)
  return (alpha*enc_states).sum(dim=0)

In [0]:
# 测试注意力机制
seq_len,batch_size,num_hiddens=10,4,8
model=attention_model(2*num_hiddens,10)
enc_states=torch.zeros((seq_len,batch_size,num_hiddens))
dec_state=torch.zeros((batch_size,num_hiddens))
attention_forward(model,enc_states,dec_state).shape

torch.Size([4, 8])

In [0]:
# 含注意力机制的解码器
class Decoder(nn.Module):
  def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,attention_size,drop_prob=0):
    super(Decoder,self).__init__()
    self.embedding=nn.Embedding(vocab_size,embed_size)
    self.attention=attention_model(2*num_hiddens,attention_size)
    self.rnn=nn.GRU(num_hiddens+embed_size,num_hiddens,num_layers,dropout=drop_prob)
    self.out=nn.Linear(num_hiddens,vocab_size)
  
  def forward(self,cur_input,state,enc_states):
    # cur_input:(n,)
    # state:(num_layers,n,num_hiddens)
    c=attention_forward(self.attention,enc_states,state[-1]) #(n,num_hiddens)
    input_and_c=torch.cat((self.embedding(cur_input),c),dim=1) #(n,embed_size+num_hiddens)
    output,state=self.rnn(input_and_c.unsqueeze(0),state)
    output=self.out(output).squeeze(dim=0)
    return output,state
  
  def begin_state(self,enc_state):
    return enc_state

### 1.1.3 带mask的损失函数

In [0]:
# 损失函数
def batch_loss(encoder,decoder,X,Y,loss):
  batch_size=X.shape[0]
  enc_state=encoder.begin_state()
  enc_outputs,enc_state=encoder(X,enc_state)
  dec_state=decoder.begin_state(enc_state)
  dec_input=torch.tensor([out_vocab.stoi[BOS]]*batch_size).cuda()
  mask,num_not_pad_tokens=torch.ones(batch_size,).cuda(),0
  l=torch.tensor([0.]).cuda()
  for y in Y.permute(1,0):
    dec_output,dec_state=decoder(dec_input,dec_state,enc_outputs)
    l=l+(mask*loss(dec_output,y)).sum()
    dec_input=y
    num_not_pad_tokens+=mask.sum().item()
    mask=mask*(y!=out_vocab.stoi[PAD]).float()
  return l/num_not_pad_tokens

### 1.1.4 模型训练

In [0]:
def train(encoder,decoder,dataset,lr,batch_size,num_epochs):
  encoder=encoder.cuda()
  decoder=decoder.cuda()
  enc_optimizer=optim.Adam(encoder.parameters(),lr=lr)
  dec_optimizer=optim.Adam(decoder.parameters(),lr=lr)

  loss=nn.CrossEntropyLoss()
  data_iter=Data.DataLoader(dataset,batch_size,shuffle=True)
  for epoch in range(num_epochs):
    l_sum=0.
    for X,Y in data_iter:
      X=X.cuda()
      Y=Y.cuda()
      enc_optimizer.zero_grad()
      dec_optimizer.zero_grad()
      l=batch_loss(encoder,decoder,X,Y,loss)
      l.backward()
      enc_optimizer.step()
      dec_optimizer.step()
      l_sum+=l.item()
    if (epoch+1)%10==0:
      print('epoch %d,loss %.3f'%(epoch+1,l_sum/len(data_iter)))

In [0]:
embed_size,num_hiddens,num_layers=64,64,2
attention_size,drop_prob,lr,batch_size,num_epochs=10,0.5,0.01,2,50
encoder=Encoder(len(in_vocab),embed_size,num_hiddens,num_layers,drop_prob)
decoder=Decoder(len(out_vocab),embed_size,num_hiddens,num_layers,attention_size,drop_prob)
train(encoder,decoder,dataset,lr,batch_size,num_epochs)

epoch 10,loss 0.395
epoch 20,loss 0.173
epoch 30,loss 0.127
epoch 40,loss 0.037
epoch 50,loss 0.033


### 1.1.5 预测不定长的序列

In [0]:
def translate(encoder,decoder,input_seq,max_seq_len):
  in_tokens=input_seq.split(' ')
  in_tokens+=[EOS]+[PAD]*(max_seq_len-len(in_tokens)-1)
  enc_input=torch.tensor([[in_vocab.stoi[tk] for tk in in_tokens]]).cuda()
  enc_state=encoder.begin_state()
  enc_output,enc_state=encoder(enc_input,enc_state)
  dec_input=torch.tensor([out_vocab.stoi[BOS]]).cuda()
  dec_state=decoder.begin_state(enc_state)
  output_tokens=[]
  for _ in range(max_seq_len):
    dec_output,dec_state=decoder(dec_input,dec_state,enc_output)
    pred=dec_output.argmax(dim=1)
    pred_token=out_vocab.itos[int(pred.item())]
    if pred_token==EOS:
      break
    else:
      output_tokens.append(pred_token)
      dec_input=pred
  return output_tokens

In [0]:
input_seq='ils regardent .'
translate(encoder,decoder,input_seq,max_seq_len)

['they', 'are', 'watching', '.']

### 1.1.6 BLEU

In [0]:
def bleu(pred_tokens,label_tokens,k):
  len_pred,len_label=len(pred_tokens),len(label_tokens)
  score=math.exp(min(0,1-len_label/len_pred))
  for n in range(1,k+1):
    num_matches,label_subs=0,collections.defaultdict(int)
    for i in range(len_label-n+1):
      label_subs[''.join(label_tokens[i:i+n])]+=1
    for i in range(len_pred-n+1):
      if label_subs[''.join(pred_tokens[i:i+n])]>0:
        num_matches+=1
        label_subs[''.join(pred_tokens[i:i+n])]-=1
    score*=math.pow(num_matches/(len_pred-n+1),math.pow(0.5,n))
  return score

In [0]:
def score(input_seq,label_seq,k):
  pred_tokens=translate(encoder,decoder,input_seq,max_seq_len)
  label_tokens=label_seq.split(' ')
  print('bleu %.3f,predict:%s'%(bleu(pred_tokens,label_tokens,k),' '.join(pred_tokens)))

In [0]:
score('ils regardent .','they are watching .',k=2)

bleu 1.000,predict:they are watching .


In [0]:
score('ils sont canadiens .','they are canadian .',k=2)

bleu 0.658,predict:they are actors .


## 1.2 自己实现Seq2Seq & attention

### 1.2.1 读取并处理数据

In [0]:
# 读入并处理数据
BOS,EOS,PAD='<bos>','<eos>','<pad>'
def get_data(filename):
  in_lines,out_lines,in_tokens,out_tokens=[],[],[],[]
  max_in_len,max_out_len=0,0
  with open(filename) as f:
    lines=f.readlines()
  for line in lines:
    line=line.rstrip().split('\t')
    in_line=line[0].split(' ')
    out_line=line[1].split(' ')
    in_tokens.extend(in_line)
    out_tokens.extend(out_line)
    in_lines.append(in_line)
    out_lines.append(out_line)
    if len(in_line)>max_in_len:
      max_in_len=len(in_line)
    if len(out_line)>max_out_len:
      max_out_len=len(out_line)
  max_in_len+=1
  max_out_len+=1
  for i in range(len(in_lines)):
    in_lines[i]+=[EOS]+[PAD]*(max_in_len-1-len(in_lines[i]))
    out_lines[i]+=[EOS]+[PAD]*(max_out_len-1-len(out_lines[i]))
  return in_lines,out_lines,in_tokens,out_tokens,max_in_len,max_out_len

In [35]:
ROOT='drive/My Drive/Colab Notebooks/MXNet/MX_data'
filename=os.path.join(ROOT,'fr-en-small.txt')
in_lines,out_lines,in_tokens,out_tokens,max_in_len,max_out_len=get_data(filename)
print(in_lines[0],out_lines[0])

['elle', 'est', 'vieille', '.', '<eos>', '<pad>', '<pad>'] ['she', 'is', 'old', '.', '<eos>', '<pad>', '<pad>']


In [23]:
# 创建字典
in_vocab=Vocab.Vocab(collections.Counter(in_tokens),specials=[EOS,BOS,PAD])
out_vocab=Vocab.Vocab(collections.Counter(out_tokens),specials=[EOS,BOS,PAD])
print(len(in_vocab),len(out_vocab))

46 38


In [0]:
# 将句子转换为corpus
def get_corpus(in_lines,out_lines,in_vocab,out_vocab):
  in_corpus,out_corpus=[],[]
  for i in range(len(in_lines)):
    in_corpus.append([in_vocab.stoi[w] for w in in_lines[i]])
    out_corpus.append([out_vocab.stoi[w] for w in out_lines[i]])
  return torch.tensor(in_corpus),torch.tensor(out_corpus)

In [25]:
in_corpus,out_corpus=get_corpus(in_lines,out_lines,in_vocab,out_vocab)
print(in_corpus.shape,out_corpus.shape,in_corpus[0],out_corpus[0])

torch.Size([20, 7]) torch.Size([20, 7]) tensor([ 5,  4, 45,  3,  0,  2,  2]) tensor([ 8,  4, 27,  3,  0,  2,  2])


In [28]:
dataset=Data.TensorDataset(in_corpus,out_corpus)
dataset[0]

(tensor([ 5,  4, 45,  3,  0,  2,  2]), tensor([ 8,  4, 27,  3,  0,  2,  2]))

### 1.2.2 编码器

In [0]:
class Encoder(nn.Module):
  def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,drop_prob=0.5):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,embed_size)
    self.rnn=nn.GRU(embed_size,num_hiddens,num_layers,batch_first=True,dropout=drop_prob)
  
  def forward(self,x,state):
    # x:(batch_size,seq_len)
    embed=self.embedding(x.long())  #(batch_size,seq_len,embed_size)
    output,state=self.rnn(embed,state) #output:(batch_size,seq_len,num_hiddens) state:(num_layers,batch_size,num_hiddens)
    return output,state

  def begin_state(self):
    return None 

In [21]:
# 测试编码器输入输出
batch_size=3
seq_len=8
vocab_size,embed_size,num_hiddens,num_layers=40,10,16,2
encoder=Encoder(vocab_size,embed_size,num_hiddens,num_layers).cuda()
x=torch.zeros((batch_size,seq_len)).cuda()
state=encoder.begin_state()
enc_outputs,enc_state=encoder(x,state)
print(enc_outputs.shape,enc_state.shape)

torch.Size([3, 8, 16]) torch.Size([2, 3, 16])


# 1.2.3 带注意力机制的解码器

In [0]:
class Decoder(nn.Module):
  def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,attention_size,drop_prob=0.5):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,embed_size)
    self.rnn=nn.GRU(embed_size+num_hiddens,num_hiddens,num_layers,batch_first=True,dropout=drop_prob)
    self.attention=nn.Sequential(
      nn.Linear(2*num_hiddens,attention_size),
      nn.Tanh(),
      nn.Linear(attention_size,1)
    )
    self.fc=nn.Linear(num_hiddens,vocab_size)
    self.vocab_size=vocab_size

  def forward(self,x,state,enc_outputs):
    # x:(batch_size,seq_len)
    # state:(num_layers,batch_size,num_hiddens)
    # enc_outputs:(batch_size,seq_len,num_hiddens)
    output=torch.zeros((batch_size,x.shape[1],self.vocab_size)).cuda()
    x_step=torch.tensor([out_vocab.stoi[BOS]]*batch_size).cuda() #(batch_size,)
    for i in range(x.shape[1]):
      embed_step=self.embedding(x_step.long())  #(batch_size,embed_size)
      h=state[-1]  #(batch_size,num_hiddens)
      h_expand=h.unsqueeze(dim=1).expand_as(enc_outputs) #(batch_size,seq_len,num_hiddens)
      enc_and_h=torch.cat((enc_outputs,h_expand),dim=2) #(batch_size,seq_len,2*num_hiddens)
      alpha=F.softmax(self.attention(enc_and_h),dim=1) #(batch_size,seq_len,1)
      c=(alpha*enc_outputs).sum(dim=1) #(batch_size,num_hiddens)
      x_and_c=torch.cat((embed_step,c),dim=1).unsqueeze(dim=1) #(batch_size,1,embed+num_hiddens)
      out_step,state=self.rnn(x_and_c,state) #(batch_size,1,num_hiddens) (num_layers,batch_size,num_hiddens)
      out=self.fc(out_step)  #(batch_size,1,vocab_size)
      # output.append(out.squeeze())
      output[:,i,:]=out.squeeze(dim=1)
      x_step=x[:,i] #(batch_size,)
    return output

  def begin_state(self,enc_state):
    # enc_state:(num_layers,batch_size,num_hiddens)
    return enc_state

In [26]:
# 测试带注意力机制的解码器
seq_len=9
vocab_size,embed_size,num_hiddens,num_layers,attention_size=30,10,16,2,7
decoder=Decoder(vocab_size,embed_size,num_hiddens,num_layers,attention_size).cuda()
state=decoder.begin_state(enc_state)
x=torch.zeros((batch_size,seq_len)).cuda()
output=decoder(x,state,enc_outputs)
# print(len(output),output[0].shape)
print(output.shape)

torch.Size([3, 9, 30])


### 1.2.4 损失函数

In [0]:
class cross_loss(nn.Module):
  def __init__(self):
    # real_Y:(batch_size,seq_len)
    # output:(batch_size,seq_len,vocab_size)
    super().__init__()
    self.loss=nn.CrossEntropyLoss(reduction='none')
  def forward(self,real_Y,output):
    sum_loss,num_not_pad=0.0,0
    for i in range(real_Y.shape[1]):
      y=real_Y[:,i]  #(batch_size,)
      mask=y!=torch.tensor([out_vocab.stoi[PAD]]).cuda()  #(batch_size,)
      loss=self.loss(output[:,i,:],y.long()) #(batch_size)
      sum_loss+=(mask*loss).sum()
      num_not_pad+=sum(mask).item()
    return sum_loss,num_not_pad

### 1.2.5 训练

In [0]:
def train(encoder,decoder,lr,num_epochs,train_iter,loss_fn,print_every=10):
  encoder=encoder.cuda()
  decoder=decoder.cuda()
  e_optimizer=optim.Adam(encoder.parameters(),lr=lr)
  d_optimizer=optim.Adam(decoder.parameters(),lr=lr)

  encoder.train()
  decoder.train()

  for epoch in range(num_epochs):
    train_loss,sum_to_compute=0.0,0
    for X,Y in train_iter:  #(batch_size,seq_len),(batch_size,seq_len)
      X=X.cuda()
      Y=Y.cuda()
      e_optimizer.zero_grad()
      d_optimizer.zero_grad()
      enc_state=encoder.begin_state()
      enc_outputs,enc_state=encoder(X,enc_state) #(batch_size,seq_len,num_hiddens),(num_layers,batch_size,num_hiddens)

      dec_state=decoder.begin_state(enc_state)  #(batch_size,num_hiddens)
      output=decoder(Y,dec_state,enc_outputs) #(batch_size,seq_len,vocab_size)

      loss,num_not_pad=loss_fn(Y,output)

      loss.backward()
      e_optimizer.step()
      d_optimizer.step()

      sum_to_compute+=num_not_pad
      train_loss+=loss.item()
    if (epoch+1)%print_every==0:
      print('Epoch:{} | Loss:{}'.format(epoch+1,train_loss/sum_to_compute))

In [32]:
batch_size=2
train_iter=Data.DataLoader(dataset,batch_size=batch_size,shuffle=True)
embed_size,num_hiddens,num_layers,attention_size=16,16,2,10
encoder=Encoder(len(in_vocab),embed_size,num_hiddens,num_layers)
decoder=Decoder(len(out_vocab),embed_size,num_hiddens,num_layers,attention_size)
loss_fn=cross_loss()
num_epochs,lr=200,0.003
train(encoder,decoder,lr,num_epochs,train_iter,loss_fn)

Epoch:10 | Loss:1.9978419019464861
Epoch:20 | Loss:1.370218561406721
Epoch:30 | Loss:1.0945608448563962
Epoch:40 | Loss:0.9123836985805578
Epoch:50 | Loss:0.8233084344027335
Epoch:60 | Loss:0.6963301666995936
Epoch:70 | Loss:0.6384744895131964
Epoch:80 | Loss:0.5627626745324386
Epoch:90 | Loss:0.52522873878479
Epoch:100 | Loss:0.4579703975142094
Epoch:110 | Loss:0.4482660962824236
Epoch:120 | Loss:0.387531506387811
Epoch:130 | Loss:0.3505792722367404
Epoch:140 | Loss:0.34979275234958584
Epoch:150 | Loss:0.3206354191428737
Epoch:160 | Loss:0.3142714416771604
Epoch:170 | Loss:0.31412462184303686
Epoch:180 | Loss:0.2946873380426775
Epoch:190 | Loss:0.27587059924477025
Epoch:200 | Loss:0.26231403936419573


### 1.2.6 预测

In [0]:
def predict(in_line,encoder,decoder,max_in_len,max_out_len):
  in_tokens=in_line.rstrip().split(' ')
  in_tokens+=[EOS]+[PAD]*(max_in_len-1-len(in_tokens))
  corpus=torch.tensor([in_vocab.stoi[w] for w in in_tokens]).unsqueeze(dim=0).cuda() #(1,seq_len)
  with torch.no_grad():
    encoder.eval()
    decoder.eval()
    enc_state=encoder.begin_state()
    enc_outputs,enc_state=encoder(corpus,enc_state) #(1,seq_len,num_hiddens),(num_layers,1,num_hiddens)

    dec_state=decoder.begin_state(enc_state)  #(2,1,num_hiddens)
    # output=decoder.predict(max_out_len,dec_state,enc_outputs) #(1,seq_len,vocab_size)


    # x:(batch_size,seq_len)
    # state:(num_layers,batch_size,num_hiddens)
    # enc_outputs:(batch_size,seq_len,num_hiddens)
    output=torch.zeros((1,max_out_len,decoder.vocab_size)).cuda()
    x_step=torch.tensor([out_vocab.stoi[BOS]]).cuda() #(batch_size,)
    for i in range(max_out_len):
      embed_step=decoder.embedding(x_step.long())  #(batch_size,embed_size)
      h=dec_state[-1]  #(batch_size,num_hiddens)
      h_expand=h.unsqueeze(dim=1).expand_as(enc_outputs) #(batch_size,seq_len,num_hiddens)
      enc_and_h=torch.cat((enc_outputs,h_expand),dim=2) #(batch_size,seq_len,2*num_hiddens)
      alpha=F.softmax(decoder.attention(enc_and_h),dim=1) #(batch_size,seq_len,1)
      c=(alpha*enc_outputs).sum(dim=1) #(batch_size,num_hiddens)
      x_and_c=torch.cat((embed_step,c),dim=1).unsqueeze(dim=1) #(batch_size,1,embed+num_hiddens)
      out_step,dec_state=decoder.rnn(x_and_c,dec_state) #(batch_size,1,num_hiddens) (num_layers,batch_size,num_hiddens)
      out=decoder.fc(out_step)  #(batch_size,1,vocab_size)
      # output.append(out.squeeze())
      output[:,i,:]=out.squeeze(dim=1)
      x_step=out.argmax(dim=2).squeeze(dim=1) #(batch_size,)
  pred=output.argmax(dim=2)  #(1,seq_len)
  pred_tokens=[out_vocab.itos[int(idx.item())] for idx in pred[0]]
  out=[]
  for w in pred_tokens:
    if w==EOS:
      break
    else:
      out.append(w)
  return out

In [33]:
in_line='ils regardent .'
predict(in_line,encoder,decoder,max_in_len,max_out_len)

['they', 'are', 'watching', '.']

> 注意：在进行预测的时候代码中要加上：with tprch.no_grad() 和 encoder.eval(),decoder.eval()，否则多次运行预测代码，相同输入会导致不同输出。

In [36]:
in_line='elle est vieille .'
predict(in_line,encoder,decoder,max_in_len,max_out_len)

['she', 'is', 'quiet', '.']

### 1.2.7 评价BLEU

In [0]:
def bleu(pred,real):
  import math
  c=len(pred)
  real_list=real.rstrip().split()
  r=len(real_list)
  BP=1 if c>r else math.exp(1-r/c)
  num_right=0
  for w in pred:
    if w in real_list:
      num_right+=1
  Precision=num_right/c
  bleu=BP*Precision
  return bleu

In [39]:
# 使用bleu进行评价
in_line='elle est vieille .'
pred=predict(in_line,encoder,decoder,max_in_len,max_out_len)
real='she is old .'
print(pred,bleu(pred,real))

['she', 'is', 'quiet', '.'] 0.75


# 2 使用Hongyi_Li数据集

## 2.1 自己实现作业

### 2.1.1 读取并处理数据

In [40]:
Li_root='drive/My Drive/Colab Notebooks/Hongyi_Li/data'
!gdown --id '1r4px0i-NcrnXy1-tkBsIwvYwbWnxAhcg' --output '{Li_root}/seq2seq_data.tar.gz'
!tar -zxvf '{Li_root}/seq2seq_data.tar.gz'

Downloading...
From: https://drive.google.com/uc?id=1r4px0i-NcrnXy1-tkBsIwvYwbWnxAhcg
To: /content/drive/My Drive/Colab Notebooks/Hongyi_Li/data/seq2seq_data.tar.gz
5.83MB [00:00, 90.4MB/s]
cmn-eng/
cmn-eng/int2word_cn.json
cmn-eng/int2word_en.json
cmn-eng/preprocess/
cmn-eng/preprocess/build_dataset.py
cmn-eng/preprocess/build_dictionary.sh
cmn-eng/preprocess/cmn.txt
cmn-eng/preprocess/cn.txt
cmn-eng/preprocess/dict.txt.big
cmn-eng/preprocess/dict.txt.small
cmn-eng/preprocess/en.txt
cmn-eng/preprocess/en_code.txt
cmn-eng/preprocess/en_refine.txt
cmn-eng/preprocess/en_vocab.txt
cmn-eng/preprocess/tokenizer.py
cmn-eng/testing.txt
cmn-eng/training.txt
cmn-eng/validation.txt
cmn-eng/word2int_cn.json
cmn-eng/word2int_en.json


In [0]:
train_file='./cmn-eng/training.txt'
valid_file='./cmn-eng/validation.txt'
test_file='./cmn-eng/testing.txt'

In [42]:
in_lines,out_lines,in_tokens,out_tokens,max_in_len,max_out_len=get_data(train_file)
print(in_lines[0],out_lines[0])

['it', "'s", 'none', 'of', 'your', 'concern', '.', '', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'] ['這不關', '你', '的', '事', '。', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [43]:
# 创建字典
in_vocab=Vocab.Vocab(collections.Counter(in_tokens),specials=[EOS,BOS,PAD])
out_vocab=Vocab.Vocab(collections.Counter(out_tokens),specials=[EOS,BOS,PAD])
print(len(in_vocab),len(out_vocab))

4404 9949


In [44]:
# 将句子转为corpus
in_corpus,out_corpus=get_corpus(in_lines,out_lines,in_vocab,out_vocab)
print(in_corpus.shape,out_corpus.shape,in_corpus[0],out_corpus[0])

torch.Size([18000, 32]) torch.Size([18000, 26]) tensor([  16,   18,  861,   19,   32, 2039,    4,    3,    0,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2]) tensor([4544,    7,    5,   63,    3,    0,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2])


In [45]:
# 建立数据集
dataset=Data.TensorDataset(in_corpus,out_corpus)
dataset[0]

(tensor([  16,   18,  861,   19,   32, 2039,    4,    3,    0,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2]),
 tensor([4544,    7,    5,   63,    3,    0,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2]))

### 2.1.2 训练

In [47]:
batch_size=150
train_iter=Data.DataLoader(dataset,batch_size=batch_size,shuffle=True)
embed_size,num_hiddens,num_layers,attention_size=100,64,2,30
encoder=Encoder(len(in_vocab),embed_size,num_hiddens,num_layers)
decoder=Decoder(len(out_vocab),embed_size,num_hiddens,num_layers,attention_size)
loss_fn=cross_loss()
num_epochs,lr=20,0.003
train(encoder,decoder,lr,num_epochs,train_iter,loss_fn,print_every=1)

Epoch:1 | Loss:5.714402944493042
Epoch:2 | Loss:4.911403534971058
Epoch:3 | Loss:4.563551391723723
Epoch:4 | Loss:4.309564571819631
Epoch:5 | Loss:4.1036216816969135
Epoch:6 | Loss:3.923650357256636
Epoch:7 | Loss:3.759410717015689
Epoch:8 | Loss:3.608635410184233
Epoch:9 | Loss:3.4703566909348744
Epoch:11 | Loss:3.231262918464312
Epoch:12 | Loss:3.1273974740500154
Epoch:13 | Loss:3.0266089341241846
Epoch:14 | Loss:2.938236445272846
Epoch:15 | Loss:2.8496086191847647
Epoch:16 | Loss:2.770799174000113
Epoch:17 | Loss:2.6953737122455173
Epoch:18 | Loss:2.6235850066590487
Epoch:19 | Loss:2.5608888374512677
Epoch:20 | Loss:2.4960548272730874


In [48]:
in_line='he is a teacher .'
pred=predict(in_line,encoder,decoder,max_in_len,max_out_len)
real='他 是 老师 。'
print(pred,bleu(pred,real))

['他', '是', '個', '好人', '。'] 0.6
