# Transformer PyTorch实现

## 1.数据导入 预处理（分词，建立词表）

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy
from torch.autograd import Variable

import matplotlib.pyplot as plt

In [None]:
from torchtext import data,datasets
!python -m spacy download en
!python -m spacy download de
#下载预处理模型

In [None]:
import spacy
# Load the German and English spaCy pipelines used by the tokenizers below.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [None]:

def tokenize_de(text):
    """Split German ``text`` into a list of token strings using ``spacy_de``."""
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """Split English ``text`` into a list of token strings using ``spacy_en``."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Special tokens: beginning-of-sentence, end-of-sentence and padding.
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
# torchtext.data.Field defines how a field (text field, label field) is processed.
# The source side uses the German tokenizer; the target side uses the English
# tokenizer and additionally gets BOS/EOS markers.
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)
# Cap sentence length at 20 tokens for now; longer sentences were too heavy
# for the author's machine.

MAX_LEN = 20
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(SRC, TGT), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)
# filter_pred (callable or None): keep only the examples for which
# filter_pred(example) is True; None keeps every example.
# Build both vocabularies from the training split only.
MIN_FREQ = 1
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

In [None]:
# Print the first target-side example; per the author's note, train.trg is a generator.
print(next(train.trg))

In [None]:
# Look up the frequency count recorded for ',' in the source vocabulary.
SRC.vocab.freqs[',']

## 2.重写批处理函数，实现按长度分批

In [None]:
BATCH_SIZE = 4096
# NOTE: a module-level ``global`` statement has no effect; the names are
# actually created by the first ``batch_size_fn`` call with ``count == 1``.
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    """Return the padded size of the batch after adding example ``new``.

    ``count`` is the number of examples in the batch including ``new``; a
    value of 1 starts a fresh batch and resets the running maxima.  The
    result is ``count`` times the longest source length or ``count`` times
    the longest target length (plus 2 for the BOS/EOS markers), whichever
    is larger.  ``sofar`` is accepted for the torchtext callback signature
    and is not used.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = max_tgt_in_batch = 0

    src_len = len(new.src)
    tgt_len = len(new.trg) + 2
    if src_len > max_src_in_batch:
        max_src_in_batch = src_len
    if tgt_len > max_tgt_in_batch:
        max_tgt_in_batch = tgt_len

    return max(count * max_src_in_batch, count * max_tgt_in_batch)

In [None]:
class MyIterator(data.Iterator):
    """``data.Iterator`` with an overridden ``create_batches``.

    Training batches are built from length-sorted pools so that sentences
    of similar length end up together; evaluation batches are built in
    data order and sorted internally.
    """

    def create_batches(self):
        """Populate ``self.batches`` (a generator when training, else a list)."""
        if not self.train:
            self.batches = [
                sorted(chunk, key=self.sort_key)
                for chunk in data.batch(self.data(), self.batch_size,
                                        self.batch_size_fn)
            ]
            return

        def pool(examples, random_shuffler):
            # Take a large chunk (100x the batch size), sort it by length,
            # cut it into real batches, then shuffle the batch order.
            for big_chunk in data.batch(examples, self.batch_size * 100):
                ordered = sorted(big_chunk, key=self.sort_key)
                minibatches = list(
                    data.batch(ordered, self.batch_size, self.batch_size_fn))
                for minibatch in random_shuffler(minibatches):
                    yield minibatch

        self.batches = pool(self.data(), self.random_shuffler)

In [None]:
class Batch:
    """Plain container for one batch: tensors, masks and target token count.

    The constructor stores its arguments unchanged; no processing is done.
    """

    def __init__(self, src, trg, src_mask, trg_mask, ntokens):
        self.src, self.trg = src, trg
        self.src_mask, self.trg_mask = src_mask, trg_mask
        self.ntokens = ntokens
        
def rebatch(pad_idx, batch):
    """Convert a torchtext batch into the batch-first ``Batch`` used here.

    Args:
        pad_idx: vocabulary index of the padding token.
        batch: torchtext batch exposing ``src`` and ``trg`` tensors; both are
            transposed on their first two axes so the batch axis comes first.

    Returns:
        A ``Batch`` holding the transposed tensors, their masks from
        ``make_std_mask`` and ``ntokens``, the number of non-padding target
        tokens to be predicted.
    """
    src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    src_mask, trg_mask = make_std_mask(src, trg, pad_idx)
    # The training targets are trg[:, 1:] (everything after the first token
    # of each sentence), so the normaliser must be counted over the same
    # slice.  The former trg[1:] dropped the first *sentence* of the batch
    # instead of the first token of every sentence.
    ntokens = (trg[:, 1:] != pad_idx).data.sum()
    return Batch(src, trg, src_mask, trg_mask, ntokens)


In [None]:
# Build the training and validation iterators.  Both sort by
# (source length, target length) and size batches with batch_size_fn;
# only the training iterator is created with train=True.
# NOTE(review): device=0 presumably selects the first GPU in the torchtext
# version this was written for - confirm against the installed version.
train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=True)
valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=False)


In [None]:
# Take a look at the batches produced by valid_iter.
for i, batch in enumerate(valid_iter):
    print(i,'-',batch)

## 3.创建掩码

In [None]:
pad_idx = TGT.vocab.stoi["<blank>"]
def make_std_mask(src, tgt, pad):
    """Build the source padding mask and the target padding + causal mask.

    Both masks gain an extra axis at position -2.  The target mask is the
    padding mask AND-ed with ``subsequent_mask`` so a position cannot attend
    to later positions.  Returns ``(src_mask, tgt_mask)``.
    """
    src_mask = (src != pad).unsqueeze(-2)
    tgt_pad_mask = (tgt != pad).unsqueeze(-2)
    # type_as converts the causal mask to the same tensor type as the padding mask.
    causal = subsequent_mask(tgt.size(-1)).type_as(tgt_pad_mask.data)
    return src_mask, tgt_pad_mask & Variable(causal)
def subsequent_mask(size):
    """Return a ``(1, size, size)`` mask that is true on and below the diagonal.

    Entry ``[0, i, j]`` is true when ``j <= i``, i.e. position ``i`` may look
    at itself and at earlier positions only.
    """
    # np.triu with k=1 keeps only the strictly-upper triangle (future positions).
    future = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    return torch.from_numpy(future) == 0

In [None]:
# Explore the masks: rebuild them by hand for every training batch and print shapes.
for i, batch in enumerate(train_iter):
    src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    src_mask = (src != pad_idx).unsqueeze(-2)
    
    # unsqueeze adds one dimension
    tgt_mask = (trg != pad_idx).unsqueeze(-2)
    #print(Variable(subsequent_mask(trg.size(-1)).type_as(tgt_mask.data)).size())
    
    tgt_mask = tgt_mask & Variable(subsequent_mask(trg.size(-1)).type_as(tgt_mask.data))
    #print(tgt_mask)
    #print((trg[1:] != pad_idx).data.sum())
    # Intended to count the words in trg other than BOS and padding.
    # NOTE(review): trg has been transposed above, so trg[1:] slices the first
    # axis (it drops the first row) rather than the first token of each
    # sentence - compare the two sizes printed below.
    print(trg[1:].size())
    print(trg.size())
    print('--')
    #print(src_mask)

In [None]:
# Compare trg with its last column removed against the full trg.
print(trg[:, :-1])
print(trg)
# Question noted by the author: why does model.forward always receive the
# target sentences with the last token removed?
# The removed token is necessarily EOS or padding.

In [None]:
# Size of the full target mask versus the mask trimmed by one on its last two axes.
print(tgt_mask.size())
print(tgt_mask[:, :-1, :-1].size())

## 4.建立整个模型框架

In [None]:
class EncoderDecoder(nn.Module):
    """Standard encoder-decoder framework.

    Holds the encoder, decoder, the two embedding stacks and the generator.
    ``forward`` returns the decoder output; the generator is stored but not
    applied here.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode the masked source, then decode the masked target against it."""
        embedded_src = self.src_embed(src)
        memory = self.encoder(embedded_src, src_mask)
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [None]:
def clones(module, N):
    """Return an ``nn.ModuleList`` of ``N`` independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

In [None]:
class Encoder(nn.Module):
    """Stack of ``N`` cloned encoder layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer in order, then normalise."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.norm(hidden)

In [None]:
class LayerNorm(nn.Module):
    """Normalisation layer over the last dimension with learnable gain and bias.

    Computes ``a_2 * (x - mean) / (std + eps) + b_2`` where mean and std are
    taken over the last axis.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        x = x.float()
        # keepdim=True keeps the reduced axis so the statistics broadcast against x.
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [None]:
class SublayerConnection(nn.Module):
    """Residual wrapper computing ``x + dropout(sublayer(norm(x)))``.

    Note that the layer norm is applied to the input *before* the sublayer
    (pre-norm), not after the residual addition.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalised input and add the residual."""
        transformed = sublayer(self.norm(x))
        return x + self.dropout(transformed)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each of the two sublayers is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Run self-attention (restricted by ``mask``), then the feed-forward net."""
        def self_attend(normed):
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, self_attend)
        return self.sublayer[1](attended, self.feed_forward)

In [None]:
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer with the encoder ``memory``, then normalise."""
        hidden = x
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [None]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, feed-forward.

    Each of the three sublayers is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size,self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x,memory, src_mask, tgt_mask):
        """Attend over the target (``tgt_mask``), then over ``memory`` (``src_mask``)."""
        def self_attend(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def source_attend(normed):
            # Queries come from the decoder; keys and values from the encoder memory.
            return self.src_attn(normed, memory, memory, src_mask)

        x = self.sublayer[0](x, self_attend)
        x = self.sublayer[1](x, source_attend)
        return self.sublayer[2](x, self.feed_forward)

In [None]:
def attention(query, key, value, mask = None, dropout = 0.0):
    """Scaled dot-product attention.

    Scores are ``query @ key^T / sqrt(d_k)``; positions where ``mask == 0``
    are filled with -1e9 before the softmax over the last axis.  Dropout
    with probability ``dropout`` is applied to the attention weights via
    ``F.dropout`` with its default arguments.

    Returns:
        ``(weights @ value, weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.dropout(F.softmax(scores, dim=-1), p=dropout)
    return torch.matmul(weights, value), weights


In [None]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention with ``h`` heads over a ``d_model``-wide input.

    Args:
        h: number of attention heads; must divide ``d_model``.
        d_model: model width.
        dropout: dropout probability applied to the attention weights
            while the module is in training mode.

    Attributes:
        attn: attention weights from the most recent forward pass.
    """

    def __init__(self, h, d_model, dropout = 0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.p = dropout
        # Three input projections (query, key, value) plus the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None

    def forward(self, query, key, value, mask = None):
        """Project, attend per head, merge the heads and project again."""
        if mask is not None:
            # Add a head axis so the same mask applies to every head.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Linear projections, reshaped from d_model to h x d_k.
        #    zip stops after three pairs, so the fourth linear is untouched here.
        query, key, value = [
            proj(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for proj, x in zip(self.linears, (query, key, value))
        ]

        # 2) Attention over all heads at once.  ``attention`` applies
        #    F.dropout with its default training=True, so dropout would stay
        #    active under model.eval(); pass p=0 when not training.
        attn_dropout = self.p if self.training else 0.0
        x, self.attn = attention(query, key, value, mask = mask,
                                 dropout = attn_dropout)

        # 3) Concatenate the heads with a view and apply the final linear.
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network.

    Every encoder and decoder layer ends with this fully connected network,
    applied identically and independently at each position:
    ``w_2(dropout(relu(w_1(x))))``.
    """

    def __init__(self, d_model, d_ff, dropout = 0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [None]:
class Embeddings(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: width of the vectors being encoded.
        dropout: dropout probability applied to the sum.
        max_len: number of positions precomputed; inputs longer than this
            cannot be encoded.

    The table is stored as the buffer ``pe`` with shape
    ``(1, max_len, d_model)``: even columns hold sines, odd columns cosines.
    """

    def __init__(self, d_model, dropout, max_len = 5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p = dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to fit.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])``; ``x`` is indexed as (batch, seq, d_model)."""
        # Slice the first seq_len positions.  Indexing with a bare integer
        # (pe[:, seq_len]) would pick a single position and broadcast that
        # one vector to every token, destroying the positional signal.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class Generator(nn.Module):
    """Word generation head: linear projection to the vocabulary + log-softmax."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

In [None]:
def make_model(src_vocab, tgt_vocab, N = 6, d_model = 512, d_ff = 2048, h = 8, dropout = 0.1):
    """Assemble the full model from the building blocks defined above.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size.
        N: number of encoder layers and of decoder layers.
        d_model: model width.
        d_ff: hidden width of the position-wise feed-forward networks.
        h: number of attention heads.
        dropout: dropout probability used throughout.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        have been Xavier-uniform initialised.
    """
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Every component receives its own deep copy so no weights are shared.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab)
    )
    for p in model.parameters():
        if p.dim() > 1:
            # xavier_uniform_ is the in-place replacement for the deprecated
            # nn.init.xavier_uniform.
            nn.init.xavier_uniform_(p)
    return model

In [None]:
# Smoke test: build a tiny model (both vocabulary sizes 10, N=1) and display it.
tmp_model = make_model(10, 10, 1)
tmp_model

## 5.定义学习率，误差计算，进行训练

In [None]:
class NoamOpt:
    """Optimizer wrapper implementing the warmup learning-rate schedule.

    Args:
        model_size: model width (``d_model``) used to scale the rate.
        factor: constant multiplier applied to the rate.
        warmup: number of warmup steps; the rate grows linearly up to this
            step and decays with the inverse square root of the step after.
        optimizer: wrapped optimizer; must expose ``param_groups`` and ``step()``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance one step: set the new rate on every param group, then step."""
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step = None):
        """Return the learning rate for ``step`` (default: the current step).

        Step 0 yields 0.0: the linear warmup term is zero there, and
        evaluating ``0 ** -0.5`` would raise ``ZeroDivisionError``.
        """
        if step is None:
            step = self._step
        if step == 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
             min(step ** (-0.5), step * self.warmup ** (-1.5)))
def get_std_opt(model):
    """Return a ``NoamOpt`` (factor 2, warmup 4000) wrapping Adam for ``model``.

    The model width is read from the first module of ``model.src_embed``.
    """
    d_model = model.src_embed[0].d_model
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(d_model, 2, 4000, adam)

In [None]:
class LabelSmoothing(nn.Module):
    """Label-smoothed KL-divergence loss.

    Args:
        size: number of classes (vocabulary size); must equal ``x.size(1)``.
        padding_idx: class index of the padding token.
        smoothing: probability mass spread over the non-target classes.

    The target class receives ``1 - smoothing``; every other class except
    padding receives ``smoothing / (size - 2)``.  Rows whose target is the
    padding index are zeroed so they contribute nothing to the loss.  The
    most recent target distribution is kept in ``true_dist``.
    """

    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        # reduction='sum' is the replacement for the deprecated size_average=False.
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        """Return the summed KL divergence between ``x`` and the smoothed targets.

        ``x`` holds log-probabilities indexed as (rows, size); ``target``
        holds one class index per row.
        """
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero whole rows whose target is padding.  A broadcast mask works for
        # zero, one or many padded rows, unlike nonzero().squeeze(), which
        # collapses a single hit to a 0-d index.
        pad_rows = (target.data == self.padding_idx).unsqueeze(1)
        true_dist.masked_fill_(pad_rows, 0.0)
        self.true_dist = true_dist
        return self.criterion(x, true_dist)

In [None]:
def loss_backprop(generator, criterion, out, targets, normalize):
    """Compute the loss one timestep at a time and backpropagate into ``out``.

    Memory optimisation: for each position along axis 1 the generator and
    criterion run on a detached copy of that column, the per-column loss
    (divided by ``normalize``) is backpropagated, and the column gradient is
    saved.  The stacked gradients are finally pushed through ``out`` in a
    single backward call.

    Returns:
        The sum of the per-column normalised losses as a Python float.
    """
    assert out.size(1) == targets.size(1)
    running_total = 0.0
    column_grads = []
    for step in range(out.size(1)):
        # Detached leaf: gradients stop here instead of flowing into the model.
        column = Variable(out[:, step].data, requires_grad=True)
        step_loss = criterion(generator(column), targets[:, step]) / normalize
        running_total += step_loss.item()
        step_loss.backward()
        column_grads.append(column.grad.data.clone())
    stacked = torch.stack(column_grads, dim=1)
    out.backward(gradient=stacked)
    return running_total

In [None]:
def train_epoch(train_iter, model, criterion, opt, transpose=False):
    """Train ``model`` for one pass over ``train_iter``.

    Args:
        train_iter: iterable of batches exposing ``src``, ``trg``,
            ``src_mask``, ``trg_mask`` and ``ntokens``.
        model: model providing ``forward`` and ``generator``.
        criterion: loss passed through to ``loss_backprop``.
        opt: optimizer wrapper providing ``step()``, ``optimizer.zero_grad()``
            and ``_rate``.
        transpose: unused; kept for interface compatibility.

    Every batch feeds the target without its last token and scores the
    prediction against the target shifted by one.  Progress (batch index,
    loss, learning rate) is printed whenever ``i % 10 == 1``.
    """
    model.train()
    for i, batch in enumerate(train_iter):
        src, trg, src_mask, trg_mask = \
            batch.src, batch.trg, batch.src_mask, batch.trg_mask
        out = model.forward(src, trg[:, :-1], src_mask, trg_mask[:, :-1, :-1])
        loss = loss_backprop(model.generator, criterion, out, trg[:, 1:], batch.ntokens)

        # Use the optimizer that was passed in rather than a module-level global.
        opt.step()
        opt.optimizer.zero_grad()
        if i % 10 == 1:
            print(i, loss, opt._rate)

In [None]:
def valid_epoch(valid_iter, model, criterion, transpose=False):
    """Evaluate ``model`` on ``valid_iter`` without updating any weights.

    The model is switched to eval mode and gradients are disabled; no
    optimizer step is taken, so validation data never influences training.
    For every batch the per-token loss is printed next to the batch index.

    Args:
        valid_iter: iterable of batches exposing ``src``, ``trg``,
            ``src_mask``, ``trg_mask`` and ``ntokens``.
        model: model providing ``eval``, ``forward`` and ``generator``.
        criterion: callable ``(log_probs, targets) -> summed loss``.
        transpose: unused; kept for interface compatibility.

    Returns:
        The mean of the per-batch normalised losses, or 0.0 when the
        iterator yields no batches.
    """
    model.eval()
    total = 0.0
    n_batches = 0

    with torch.no_grad():
        for i, batch in enumerate(valid_iter):
            src, trg, src_mask, trg_mask = \
                batch.src, batch.trg, batch.src_mask, batch.trg_mask
            out = model.forward(src, trg[:, :-1], src_mask, trg_mask[:, :-1, :-1])
            targets = trg[:, 1:]
            # Score one timestep at a time, mirroring the training loss,
            # but without any backward pass.
            batch_loss = 0.0
            for t in range(out.size(1)):
                step_loss = criterion(model.generator(out[:, t]), targets[:, t])
                batch_loss += float(step_loss)
            batch_loss /= float(batch.ntokens)
            total += batch_loss
            n_batches += 1
            print(i, batch_loss)

    return total / n_batches if n_batches else 0.0

In [None]:
# Build the full-size model, its optimizer schedule and the label-smoothed loss.
pad_idx = TGT.vocab.stoi["<blank>"]
model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
model_opt = get_std_opt(model)

criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
#criterion.cuda()
#for epoch in range(5):
    #train_epoch((rebatch(pad_idx, b) for b in train_iter), model, criterion, model_opt)
    #valid_epoch((rebatch(pad_idx, b) for b in valid_iter), model, criterion)
    
# To save time, only a single training pass is run here.
# NOTE(review): the original comment said this pass was over the validation
# set, but the call below iterates train_iter - confirm which was intended.
train_epoch((rebatch(pad_idx, b) for b in train_iter), model, criterion, model_opt)

## 6.利用贪心解码实现翻译，并对翻译进行评估

In [None]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one sentence, always taking the most probable next word.

    Starts from ``start_symbol`` and appends ``max_len - 1`` predicted
    tokens; decoding does not stop early at an end-of-sentence token.
    Returns the ``(1, max_len)`` tensor of token indices.
    """
    memory = model.encoder(model.src_embed(src), src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        tgt_embedded = Variable(model.tgt_embed(ys))
        tgt_mask = Variable(subsequent_mask(ys.size(1)).type_as(src_mask.data))
        out = model.decoder(tgt_embedded, memory, src_mask, tgt_mask)
        # out[:, -1] is the newest position.  ys grows by one token per
        # iteration, so this is the same as the loop's current index.
        log_probs = model.generator(out[:, -1])
        next_word = torch.max(log_probs, dim=1)[1].data[0]
        appended = torch.ones(1, 1).type_as(src.data).fill_(next_word)
        ys = torch.cat([ys, appended], dim=1)
    return ys


In [None]:
# Translate the first sentence of every validation batch with greedy decoding
# and collect the token lists for the BLEU evaluation further down.
model.eval()
list_tr = []  # decoded translations, one token list per batch
list_tg = []  # reference targets, one token list per batch
for batch in valid_iter:
    # Keep only the first sentence of the batch.
    src = batch.src.transpose(0, 1)[:1]
    src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
    out = greedy_decode(model, src, src_mask,  max_len=60, start_symbol=TGT.vocab.stoi["<s>"])
    list_trans = []
    list_tgt = []
    print("Translation:", end="\t")
    # Position 0 is the start symbol; stop at the end-of-sentence marker.
    # A dedicated index name avoids rebinding an enclosing loop variable.
    for pos in range(1, out.size(1)):
        sym = TGT.vocab.itos[out[0, pos]]
        if sym == "</s>": break
        list_trans.append(sym)
        print(sym, end =" ")
    list_tr.append(list_trans)
    print()
    print("Target:", end="\t")
    for pos in range(1, batch.trg.size(0)):
        sym = TGT.vocab.itos[batch.trg.data[pos, 0]]
        if sym == "</s>": break
        list_tgt.append(sym)
        print(sym, end =" ")
    list_tg.append(list_tgt)
    print()

In [None]:
# Debugging: inspect the tensors that greedy decoding starts from.
for i, batch in enumerate(valid_iter):
    src = batch.src.transpose(0, 1)[:1]  # only the first sentence is tested here
    src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
    #print(src.size())
    #print(src_mask)
    # Initial decoder input: a 1x1 tensor holding the start symbol, cast to src's type.
    ys = torch.ones(1, 1).fill_(TGT.vocab.stoi["<s>"]).type_as(src.data)
    tgtmask = Variable(subsequent_mask(ys.size(1)).type_as(src_mask.data))
    #.type_as(src.data)
    #
    print(ys.type())
    print(src_mask.size())

### 利用bleu对翻译进行评价

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Score every decoded sentence against its single reference with sentence-level BLEU.
for i, candidate in enumerate(list_tr):
    reference = [list_tg[i]]
    score = sentence_bleu(reference, candidate)
    print(score)