# Transformer 的 PyTorch 实现

## 数据预处理
使用 Transformer 执行文本翻译任务，数据集选用英语—法语平行语料（源语言为英语，目标语言为法语）。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from batch import create_masks
from process import *
import numpy as np
import time

# Data: locations of the parallel corpus and preprocessing settings.
# Source-side (English) and target-side (French) text files.
src_file = 'data/english.txt'
trg_file = 'data/french.txt'
# Language model names handed to create_fields
# (presumably spaCy pipelines used for tokenisation — confirm in process.py).
src_lang = 'en_core_web_sm'
trg_lang = 'fr_core_news_sm'
# Limits forwarded to create_dataset below.
# NOTE(review): presumably the maximum sentence length and the batch size;
# the exact units (sentences vs. tokens) are defined in process.py — confirm.
max_strlen = 80
batchsize = 1500
# src_data: list of strings holding every source-language (English) sentence
# (154883 noted by the original author, presumably the sentence count).
src_data, trg_data = read_data(src_file, trg_file)
# Field objects for each language; their `.vocab` attribute is read below.
EN_TEXT, FR_TEXT = create_fields(src_lang, trg_lang)
# Training iterator plus the source/target padding values returned by create_dataset.
train_iter, src_pad, trg_pad = create_dataset(src_data, trg_data, EN_TEXT, FR_TEXT, max_strlen, batchsize) # , 1 , 1
# Vocabulary sizes (number of distinct tokens) on each side;
# the original author recorded 13724 for the source and 23469 for the target.
src_vocab = len(EN_TEXT.vocab)
trg_vocab = len(FR_TEXT.vocab)


loading spacy tokenizers...
creating dataset and iterator... 
1090
第一个batch的源语言（索引）:
tensor([[  13,   15,   73,  ...,    3,   13,   13],
        [  65,   67,   26,  ...,   20,   16,   51],
        [  88,    8,    4,  ...,    5,  298,  801],
        ...,
        [  94,    5,   79,  ...,    8,  293,   85],
        [  47, 3519, 2640,  ...,  653,  549,  607],
        [   2,    2,    7,  ...,    2,    2,    2]])
第一个batch的目标语言（索引）:
tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [   11,    24,    83,  ...,     5,    36,    11],
        [   33,    18,    57,  ...,   145,    10,  1309],
        ...,
        [  163,  4650, 13618,  ...,   658,   354,   189],
        [    4,     4,     7,  ...,     4,  1219,     4],
        [    3,     3,     3,  ...,     3,     3,     3]])


In [None]:
# Pull the first few batches out of the training iterator so the cells below
# can be smoke-tested without iterating over the whole dataset.
NUM_SAMPLE_BATCHES = 1  # raise this to keep more batches for inspection
sample_batches = []
for i, batch in enumerate(train_iter):
    sample_batches.append(batch)
    if i + 1 >= NUM_SAMPLE_BATCHES:
        break

# Show the first batch's source and target tensors (token indices).
print("第一个batch的源语言（索引）:")
print(sample_batches[0].src)
print("第一个batch的目标语言（索引）:")
print(sample_batches[0].trg)
# The tensors are indexed as [token position, sentence]: the original author
# noted a shape of torch.Size([7, 214]), i.e. 214 sentences of 7 tokens each.

# Vocab objects used to map indices back to words.  They get their own names
# so that `src_vocab` / `trg_vocab` (the integer vocabulary sizes computed in
# the data-loading cell) are not overwritten.
en_vocab = EN_TEXT.vocab
fr_vocab = FR_TEXT.vocab


def indices_to_words(indices, vocab):
    """Return the words for a sequence of token indices.

    Args:
        indices: Iterable of integer token indices (e.g. a 1-D tensor).
        vocab: Vocabulary object exposing an ``itos`` index-to-string list.

    Returns:
        A list with ``vocab.itos[idx]`` for every ``idx`` in ``indices``.
    """
    return [vocab.itos[idx] for idx in indices]


# Decode the first sentence of the first batch (column 0 of each tensor).
src_indices = sample_batches[0].src[:, 0]
trg_indices = sample_batches[0].trg[:, 0]

print("第一个batch第一句源语言（单词）:")
print(indices_to_words(src_indices, en_vocab))
print("第一个batch第一句目标语言（单词）:")
print(indices_to_words(trg_indices, fr_vocab))

第一个batch的源语言（索引）:
tensor([[  41,  191,   27,  ...,   10,   25,   11],
        [  36,   26,  322,  ...,    9,   30,   14],
        [  44,  132,  266,  ...,   33,   24,   29],
        [2497,  229, 5832,  ...,  650,  466,  372],
        [   2,    2,    2,  ...,    2,    7,    7]])
第一个batch的目标语言（索引）:
torch.Size([7, 214])
第一个batch第一句源语言（单词）:
['they', "'re", 'all', 'terrified', '.']
第一个batch第一句目标语言（单词）:
['<sos>', 'elles', 'sont', 'toutes', 'terrifiées', '.', '<eos>']


## 模型参数

In [7]:
# Model hyperparameters.
# NOTE(review): the meanings below are inferred from the names and the usual
# Transformer configuration; confirm against the cell that builds the model.
d_model = 512  # presumably the embedding / hidden width
heads = 8  # presumably the number of attention heads
N = 6  # presumably the number of stacked encoder/decoder layers
dropout = 0.1  # presumably the dropout probability

## Embedding

In [10]:
class Embedding(nn.Module):
    """Token-embedding layer mapping integer ids to ``d_model``-wide vectors.

    A thin wrapper around ``nn.Embedding`` that also records ``d_model``.
    """

    def __init__(self, vocab_size, d_model) -> None:
        """Create the lookup table.

        Args:
            vocab_size: Number of rows in the embedding table.
            d_model: Width of each embedding vector.
        """
        super(Embedding, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )

    def forward(self, x):
        """Return the embedding vector for every index in ``x``."""
        token_vectors = self.embed(x)
        return token_vectors

## Positional Encoding
![positional encoding](./picture/image1.png)

In [8]:
import math
from torch.autograd import Variable
class positionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    For position ``pos`` and even feature index ``i`` the table stores
    ``sin(pos / 10000 ** (i / d_model))`` in column ``i`` and the matching
    cosine in column ``i + 1`` (when that column exists).  The table is built
    once and registered as the non-trainable buffer ``pe`` with shape
    ``(1, max_seq_len, d_model)``.

    Args:
        d_model: Width of the embedding vectors.
        max_seq_len: Number of positions to precompute.
        dropout: Dropout probability applied to the output of ``forward``.
    """

    def __init__(self, d_model, max_seq_len=80, dropout=0.1) -> None:
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # A buffer follows the module across devices and is saved in the
        # state dict, but is never updated by the optimizer.
        self.register_buffer('pe', self._build_table(max_seq_len, d_model))

    @staticmethod
    def _build_table(max_seq_len, d_model):
        """Return the ``(1, max_seq_len, d_model)`` sinusoidal table."""
        # Compute the angles in float64 and round once when writing into the
        # default-dtype table.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        even_idx = torch.arange(0, d_model, 2, dtype=torch.float64)
        # exp(-ln(10000) * i / d_model) == 1 / 10000 ** (i / d_model)
        inv_freq = torch.exp(-math.log(10000.0) * even_idx / d_model)
        angles = positions * inv_freq

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles).to(pe.dtype)
        # With an odd d_model the last even column has no cosine partner.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2]).to(pe.dtype)
        return pe.unsqueeze(0)

    def forward(self, x):
        """Scale the embeddings, add the position table, then apply dropout.

        Args:
            x: Embedded tokens; dimension 1 is used as the sequence length and
                the last dimension must be broadcast-compatible with
                ``d_model``.

        Returns:
            ``dropout(x * sqrt(d_model) + pe[:, :seq_len])``.

        Raises:
            RuntimeError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        max_seq_len = self.pe.size(1)
        if seq_len > max_seq_len:
            raise RuntimeError(
                f"sequence length {seq_len} exceeds max_seq_len {max_seq_len} "
                "of the positional encoding table"
            )
        # Make the word embeddings relatively larger than the position signal.
        x = x * math.sqrt(self.d_model)
        # Add the position constants for the first seq_len positions.
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)