# 模型部分

In [105]:
import torch
import torch.nn as nn
import math
from torch.autograd import Variable
import torch.nn.functional as F
import copy

### (1):  embedding 层
* transformaer 有两种embedding。一种是词的embedding,一种是位置的embedding。Word embedding就不多说了，pos embedding 是由facebook提出的。这里用pos embedding 的原因是机器翻译需要很强的顺序性，但是self-attention没有关注到顺序，所以加入pos embedding。在transformer里是将pos  embedding和word embedding 相加，得到最终的embedding。
* pos embedding 的计算方法是

    $PE_(pos, 2i) = sin(pos/1000^{2i/d_model})$
    
    $PE_(pos, 2i+1) = cos(pos/1000^{2i/d_model})$
    
    $pos$ 是在词在句子中的相对位置，$i$ 是pos embedding 中的相对位置

In [106]:
class Embedder(nn.Module):
    def __init__(self, vocab_szie, emb_dim):
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(vocab_szie, emb_dim)

    def forward(self, x):
        return self.embed(x)

In [107]:
class PositionalEncoder(nn.Module):
    def __init__(self, emb_dim, max_seq_len=200, dropout=0.1):
        super(PositionalEncoder, self).__init__()
        self.emb_dim = emb_dim

        pe = torch.zeros(max_seq_len, emb_dim)
        for pos in range(max_seq_len):
            for i in range(0, emb_dim, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / emb_dim)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / emb_dim)))

        # self.register_buffer可以将tensor注册成buffer
        #  网络存储时也会将buffer存下，当网络load模型时，会将存储的模型的buffer也进行赋值。
        pe = pe.unsqueeze(0) # word embedding shape为)(ba, seq_len, dim),这里为了后两维相同
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.emb_dim)

        seq_len = x.size(1)
        # 据说训练出来的pos embedding 和计算出来的embedding效果相当，所以这里不需要求导
        pe = Variable(self.pe[:, :seq_len,:], requires_grad=False)

        x = x + pe
        return x

### (2)：Attention

#### Scaled Dot-Product Attention
计算公式: $\text{Attention}(Q,K,V)=softmax(\frac{QK^T}{\sqrt d_k})V$

Q, K, V 分别代表Query, Key, Value,论文中是这样解释这个公式: 通过$Q$和$K$的相似程度来确定$V$。除以$\sqrt d_k$的原因是$Q$和$K$可能得到比较大的值，而有一些值特别小，而导致梯度过小。
<img src="image/scaled_dot_product_attention_arch.png"  height="400" width="250">
    
#### Multi-Head Attention

将Q、K、V通过一个线性映射之后，分成 h 份，对每一份进行scaled dot-product attention效果更好。然后，把各个部分的结果合并起来，再次经过线性映射，得到最终的输出。这就是所谓的multi-head attention。
 <img src="image/multi_head_attention_arch.png"  height="400" width="250">


In [108]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    # 这里mask的在decoder的时候输入的，在训练的时候，不需要得到当前翻译词
    # 后面的知识， 所以需要mask来表示哪些是后面的词。
    # 把当前词后面的词的位置的值改为-1e9,这样后面的位置经过softmax就会接近于0
    # 从而得不到后面的信息
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

In [109]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # 经过线性映射，然后将q,k,v分为 n 份
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * N * seq_len * d_model
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        # calculate attention using function we will define next
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1, 2).contiguous() \
            .view(bs, -1, self.d_model)
        output = self.out(concat)

        return output

### (3):  Position-wise Feed-Forward Networks
计算公式: $FFN(x)=max(0,xW_1+b_1)W_2+b_2$

先经过一个线性变换，然后经过relu,最后再经过线性变换

In [110]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()

        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

### (4):  Layer Normalization

我们对batch normalization 很熟悉，batch normalization在整个训练样本上计算均值和方差，然后对样本进行归一化。然而Layer Normalization是对每一个样本上计算均值和方差，然后进行归一化。也就是说每一个样本都是根据自己的方差和均值进行归一化的。


pytorch 有已经实现了的库

In [111]:
class Norm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()

        self.size = d_model

        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        # nn.LayerNorm
        self.eps = eps

    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
               / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

### (5): EncoderLayer
* 在encoder的self-attention中，$Q、K、V$都来自同一个地方（相等），他们是上一层encoder的输出。对于第一层encoder，它们就是word embedding和positional encoding相加得到的输入。
* $Q、K、V$三者的维度一样，即 $dq=dk=dv$。
* 每一个encoder-layer都由self-attention 和 Position-wise Feed-Forward Networks 组成。
* 含有残差操作
<img src="image/encoder_layer.png" width="250" height="150">

In [112]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

### (6): DecoderLayer
* 在decoder的self-attention中，Q、K、V都来自于同一个地方（相等），它们是上一层decoder的输出。对于第一层decoder，它们就是word embedding和positional encoding相加得到的输入。
* 在encoder-decoder attention中，Q来自于decoder的上一层的输出，K和V来自于encoder的输出，K和V是一样的。
* Q、K、V三者的维度一样，即 $dq=dk=dv$。
* 含有残差操作。
<img src="image/decoder_layer.png" height="300" width="200">

In [113]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

### (7): Encoder Decoder 
encoder 和decoder 都是分别由N个encoderLayer 和 decoderlayer组成

In [114]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
   http://localhost:8888/notebooks/image/transfomer.png     return self.norm(x)

### (8): Transformer
只需把encoder 和 decoder组装在一起。
<img src="image/transfomer.png" >

In [115]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        # print("DECODER")
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output

# 二: 处理数据

In [116]:
import jieba
import nltk
import torch
from torchtext import data
import pandas as pd
import params
from sklearn.utils import shuffle
import os

In [117]:
def getData(path):
    data = []
    with open(path, encoding='utf-8') as f:
        data = f.readlines()
    lang = []
    lang2 = []

    for idx, da in enumerate(data):
        #         print(da)
        d = da.split("\t")
        lang.append(d[0].strip())
        lang2.append(d[1].strip())
    # print(lang[0:3])
    #     print(lang2[0:3])
    return lang, lang2


In [118]:
def cn_tokenize(sentence):
    seg_list = jieba.cut(sentence)
#     print(", ".join(seg_list))
    return [toekn for toekn in  seg_list]
# cn_tokenize(cn_lang[307])

def en_tokenize(sentence):
    seg_list = nltk.word_tokenize(sentence)
    return [token for token in seg_list]
# en_tokenize(en_lang[307])


In [119]:
def createDataSet(SRC, TRG, lang, lang2, batch_size):
    raw_data = {'src': lang, 'trg': lang2}
    df = pd.DataFrame(raw_data, columns=['src', 'trg'])

    if not os.path.exists("./data/train.csv") or not os.path.exists("./data/test.csv"):
        df = shuffle(df) # 打乱数据
        test_df = df[:params.test_nums] # 得到后test nums 个数据做测试
        train_df = df[params.test_nums:] # 剩下的做训练
        test_df.to_csv("data/test.csv", index=False, encoding='utf-8')
        train_df.to_csv("data/train.csv", index=False, encoding='utf-8')

    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./data/train.csv', format='csv', fields=data_fields, skip_header=True)
    test = data.TabularDataset('./data/test.csv', format='csv', fields=data_fields, skip_header=True)

    if params.is_cuda:
        device = torch.device("cuda:0")
    train_iter = data.Iterator(train, sort_key=lambda x: (len(x.src), len(x.trg)),
                                      batch_size=params.batch_size,device=device)
    test_iter = data.Iterator(test, sort_key=lambda x: (len(x.src), len(x.trg)),
                               batch_size=params.test_batch_size, device=device)

    SRC.build_vocab(train, min_freq=params.src_min_freq)
    TRG.build_vocab(train, min_freq=params.trg_min_freq)

    return train_iter, test_iter

In [120]:
def getFiled(path, batch_size=1):
    en_lang, cn_lang = getData(path)
    SRC = data.Field(lower=True, tokenize=en_tokenize, init_token='<sos>', eos_token='<eos>')
    # trg是中文
    TRG = data.Field(tokenize=cn_tokenize, eos_token='<eos>', init_token='<sos>')
    createDataSet(SRC, TRG, en_lang, cn_lang, batch_size)

    return SRC, TRG

In [121]:
def preparedData(path, batch_size):
    # path = "./data/en-cn.txt"
    en_lang, cn_lang = getData(path)
    SRC = data.Field(lower=True, tokenize=en_tokenize, init_token='<sos>', eos_token='<eos>')
    # trg是中文
    TRG = data.Field(tokenize=cn_tokenize, eos_token='<eos>', init_token='<sos>')
    createDataSet(SRC, TRG, en_lang, cn_lang, batch_size)
    train_iter, test_iter = createDataSet(SRC, TRG, en_lang, cn_lang, batch_size)
    return SRC, TRG, train_iter, test_iter

In [122]:
# path = "./data/en-cn.txt"
# preparedData(path, 1)
# en_lang, cn_lang = getData(path)
# SRC = data.Field(lower=True, tokenize=en_tokenize, init_token='<sos>', eos_token='<eos>')
# # trg是中文
# TRG = data.Field(tokenize=cn_tokenize, eos_token='<eos>', init_token='<sos>')
# train_iter, test_iter = createDataSet(SRC, TRG, en_lang, cn_lang, batch_size=1)

# 三: 训练模型

In [123]:
import time
import params
from tensorboardX import SummaryWriter
import mask


In [124]:
SRC, TRG,  train_iter, test_iter = preparedData(params.data_path, params.batch_size)
src_pad = SRC.vocab.stoi['<pad>']
trg_pad = TRG.vocab.stoi['<pad>']
print('trg_vocal_len: ', len(TRG.vocab))
print('src_vocab_len: ', len(SRC.vocab))
print(SRC.vocab.stoi['<unk>'], " ", TRG.vocab.stoi['<unk>'])
print(SRC.vocab.stoi['<sos>'], " ", TRG.vocab.stoi['<sos>'])
print(SRC.vocab.stoi['<eos>'], " ", TRG.vocab.stoi['<eos>'])

trg_vocal_len:  2316
src_vocab_len:  1910
0   0
2   2
3   3


In [125]:
model = Transformer(len(SRC.vocab), len(TRG.vocab), params.d_model, params.n_layers, params.heads, params.dropout)
if params.is_cuda:
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=params.lr, betas=(0.9, 0.98), eps=1e-9)

In [25]:
print("\nbegin training model")
model.train()
if os.path.exists("models/tfs.pkl"):
    model.load_state_dict(torch.load("models/tfs.pkl"))

best_loss = None
writer = SummaryWriter()
for epoch in range(params.epochs):
    model.train()
    start = time.time()
    total_loss = 0.0
    step = 0
    for i, batch in enumerate(train_iter):
        # print(batch)
        src = batch.src.transpose(0, 1)
        trg = batch.trg.transpose(0, 1)

        trg_input = trg[:, :-1]
        src_mask, trg_mask = mask.createMask(src, src_pad, trg_input, trg_pad)

        preds = model(src, trg_input, src_mask, trg_mask)
        _, pred = torch.max(preds, 2)
        # print(preds.size())
        # print(pred.size())
        # print(pred[0, :].size())
        # print(trg[0,1:].size())
#         print(pred[0,:].cpu().numpy())
#         print(trg[0,1:].cpu().numpy())
        # exit(1)
        ys = trg[:, 1:].contiguous().view(-1)
        if params.is_cuda:
            params = ys.cuda()

        optimizer.zero_grad()
        loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=trg_pad)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        step = i+1
        # print(src.size())
        # print(trg.size())
        # print(trg_input.size())
        # print(src_mask.size())
        # print(trg_mask.size())
        # print(preds.size())
        # print(ys.size())
        # exit(1)
        # print(loss.item())
        # break
    test_total_loss = 0.0
    test_total_step = 0
    for i, batch in enumerate(test_iter):
        # print(batch)
        src = batch.src.transpose(0, 1)
        trg = batch.trg.transpose(0, 1)

        trg_input = trg[:, :-1]
        src_mask, trg_mask = mask.createMask(src, src_pad, trg_input, trg_pad)

        preds = model(src, trg_input, src_mask, trg_mask)
        _, pred = torch.max(preds, 2)
        # print(pred[0, :].cpu().numpy())
        # print(trg[0, 1:].cpu().numpy())
        # exit(1)
        ys = trg[:, 1:].contiguous().view(-1)
        if params.is_cuda:
            params = ys.cuda()

        loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=trg_pad)
        test_total_loss += loss.item()
        test_total_step = i + 1
        # print(src.size())
        # print(trg.size())
        # print(trg_input.size())
        # print(src_mask.size())
        # print(trg_mask.size())
        # print(preds.size())
        # print(ys.size())
        # exit(1)
        # print(loss.item())

        # break

    writer.add_scalar('scalar/train_loss', total_loss/step, epoch)
    writer.add_scalar('scalar/test_loss', test_total_loss/test_total_step, epoch)
    print("epoch:{} train_loss:{} test_loss:{} time:{:.2f}".format(epoch+1, total_loss/step, test_total_loss/test_total_step, time.time()-start))
    if best_loss is None:
        best_loss = test_total_loss/test_total_step
        torch.save(model.state_dict(), "models/tfs.pkl")
    if best_loss > test_total_loss/test_total_step :
        best_loss = test_total_loss/test_total_step
        torch.save(model.state_dict(), "models/tfs.pkl")



print("training end!")


begin training model
epoch:1 train_loss:3.9178114094620193 test_loss:3.4168846864700315 time:577.90
epoch:2 train_loss:3.269623666977155 test_loss:3.1415644764900206 time:555.91
epoch:3 train_loss:2.967221039538529 test_loss:2.943972107887268 time:553.65
epoch:4 train_loss:2.717956202539254 test_loss:2.8046658070087434 time:559.36
epoch:5 train_loss:2.4985898237825914 test_loss:2.7050731492042543 time:555.36
epoch:6 train_loss:2.3037051836125437 test_loss:2.627584369182587 time:548.67
epoch:7 train_loss:2.1178745956137863 test_loss:2.5776197934150695 time:568.21
epoch:8 train_loss:1.9447398918872025 test_loss:2.562746128082275 time:686.68
epoch:9 train_loss:1.7756196801955308 test_loss:2.5576643748283385 time:828.80
epoch:10 train_loss:1.6192045680570484 test_loss:2.550012111902237 time:775.78
epoch:11 train_loss:1.4661846965281116 test_loss:2.58151980304718 time:692.78
epoch:12 train_loss:1.3258367551326358 test_loss:2.620600311756134 time:655.46
epoch:13 train_loss:1.196697258497208

KeyboardInterrupt: 

In [126]:
# print(model)

# 四: 预测

In [None]:
from new_process import createDataSet, getData, preparedData
import params
from model import Transformer
from torchtext import data
import torch
import time
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
import os
from tensorboardX import SummaryWriter
import mask
from performance import Performance
from my_optim import ScheduledOptim

SRC, TRG,  train_iter, test_iter = preparedData(params.data_path, params.eval_batch_size)
src_pad = SRC.vocab.stoi['<pad>']
trg_pad = TRG.vocab.stoi['<pad>']
model = Transformer(len(SRC.vocab), len(TRG.vocab), params.d_model, params.n_layers, params.heads, params.dropout)
if params.is_cuda:
    model = model.cuda()

# print(model)
print('trg_vocal_len: ', len(TRG.vocab))
print('src_vocab_len: ', len(SRC.vocab))

model.load_state_dict(torch.load('models/tfs.pkl'))
model.eval()

cnt = 10
for i, batch in enumerate(train_iter):
    # print(batch)
    src = batch.src.transpose(0, 1)
    trg = batch.trg.transpose(0, 1)
    print(src.size())
    print(trg.size())
    # print(src)
    print("trg: ", trg)
    trg_input = trg[:, 0].unsqueeze(1)
    # print(trg_input.size())

    src_mask = None
    trg_mask = None
    for i in range(params.max_len):
        preds = model(src, trg_input, src_mask, trg_mask)
        # print("pred ", torch.max(preds, 2)[1])
        pred = torch.max(preds, 2)[1]
        trg_input = torch.cat((trg_input, pred[:, i].unsqueeze(1)), dim=1)
        # print(pred[:, i].unsqueeze(1))
    print("pred:", trg_input)
    cnt -= 1
    if cnt==0:
        break
    # break



