In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

<torch._C.Generator at 0x254a03578b0>

In [3]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [4]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

#text : str " Can be called before training to pre-process gold data."
# ['.',
#  'data',
#  'gold',
#  'process',
#  '-',
#  'pre',
#  'to',
#  'training',
#  'before',
#  'called',
#  'be',
#  'Can']

In [5]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
# 字段：SRC (source) 

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
# 字段：TRG (target)

In [6]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))
# 这里都是原始数据，还没有转换为索引，也没有pad
# exts ：指定来源和目标语言
# fields ：指定源和目标使用哪个字段
# train_data 是从torchtext创建的

In [7]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [8]:
# 一个样本，包含src和trg字段
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [9]:
# torchtext，创建词典并赋予对应词索引，出现次数小于2次的，都标记为<unk>
SRC.build_vocab(train_data, min_freq = 2)# 从train_data数据集中为SRC字段创建Vocab 对象
TRG.build_vocab(train_data, min_freq = 2)# 从train_data数据集中为TRC字段创建Vocab 对象
# VOCAB对象：
# Vocab.vocab.freqs:collections.counter对象，每个词和对应的频数
# Vocab.vocab.stoi:collections.defaultdict 实例，每个词对应的词典索引

In [10]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7853
Unique tokens in target (en) vocabulary: 5893


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)
# 这里创建数据集时，使用了索引，并进行了pad，但是Bucket之间的长度可能不一致
# 当source、target使用batch时，要求各自取的样本中，数据长度一样，source和target之间的长度可以不同
# BucketIterator可以高效做到这件事情，可以计算并使用最少的填充量
# 染回一个batch迭代器：例如
# 批次1，src size为29*128,trg size为30*128
# 批次2，src size为32*128,trg size为26*128
# 这里的数据还是列数为数据个数，进入模型中进行计算时，一般变为行数为数据个数

In [13]:
# Encoder(input_dim=7853, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
class Encoder(nn.Module):
    
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        #Encoder中不需要y输出，只需要最后时刻的h和c
        
        self.hid_dim = hid_dim # 隐藏层单元个数512
        self.n_layers = n_layers # 隐藏层层数2
        
        self.embedding = nn.Embedding(input_dim, emb_dim) # (7853,256)
        # 输入到RNN的为词嵌入向量
        # input_dim:基于词典的one-hot向量长度，emb_dim:词嵌入层的向量维度
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout) # (256,512,2)
        # 这里的dropout是层与层之间的，不是时间步与时间步之间的
        # 2层t+1 cell接收2层t cell的h,c和1层t+1 cell的h
        # emb_dim:词嵌入向量大小，hid_dim：隐藏层大小，n_layers：深度
        
        self.dropout = nn.Dropout(dropout) # x到embedding层的dropout
        
    def forward(self, src):
        
        # src = [src len, batch size]
        # batch1 src=torch.Size([24, 128])
        # 一个样本一次性输入24个字符索引
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]，[24,128,256]，
        # embedded[0].shape = torch.Size([128, 256]) ,第一个字符位置上(128行，256列)，128个样本的词向量
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell # 2层的最后一个时刻的隐藏状态和细胞状态

In [14]:
# Decoder(output_dim=5893, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim # trg词典大小
        self.hid_dim = hid_dim # decoder中lstm的隐藏层，和encoder中可以不一样
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # output_dim：trg的词典大小
        # emb_dim：Decoder的词嵌入向量大小，可以和encoder的不一样
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        # hid_dim:是第二层的h，不使用c
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0) # torch.nn只支持小批量，如果只有一个样本，要用unsqueeze添加假的批量维度
        # 因为decoder中，样本字符是一个一个输入的
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0)) # 同样，要从批量压缩为单个样本
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        # src为train_iterator中每个batch 的src
        # batch1 src=torch.Size([24, 128])
        # batch2 src=torch.Size([29, 128])
        # btach3 src=torch.Size([33, 128])
        #src = [src len, batch size]
        
        # trg为train_iterator中每个batch 的trg
        # batch1 trg=torch.Size([30, 128])
        # batch2 trg=torch.Size([32, 128])
        # batch3 trg=torch.Size([29, 128])
        #trg = [trg len, batch size]
        
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time

        batch_size = trg.shape[1] # 128
        trg_len = trg.shape[0] # 30
        trg_vocab_size = self.decoder.output_dim # 5893
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        # 一次output一个字符的批次结果[128,5893],128个样本的第一个字符的预测
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:] # 一开始是trg的第一个字符<SOS>的index,长度为batch长度
        #tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        #2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        #2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1 
        
        return outputs

In [16]:
INPUT_DIM = len(SRC.vocab) # 
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)
# src = batch.src
# trg = batch.trg
# model(src, trg)

In [17]:
#  -0.08 到+0.08 均匀分布初始化
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights) #model.apply,应用init_weights 初始化model中的每个子模块

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [31]:
for i in model.named_parameters():
    i[0]
    i[1].shape

'encoder.embedding.weight'

torch.Size([7853, 256])

'encoder.rnn.weight_ih_l0'

torch.Size([2048, 256])

'encoder.rnn.weight_hh_l0'

torch.Size([2048, 512])

'encoder.rnn.bias_ih_l0'

torch.Size([2048])

'encoder.rnn.bias_hh_l0'

torch.Size([2048])

'encoder.rnn.weight_ih_l1'

torch.Size([2048, 512])

'encoder.rnn.weight_hh_l1'

torch.Size([2048, 512])

'encoder.rnn.bias_ih_l1'

torch.Size([2048])

'encoder.rnn.bias_hh_l1'

torch.Size([2048])

'decoder.embedding.weight'

torch.Size([5893, 256])

'decoder.rnn.weight_ih_l0'

torch.Size([2048, 256])

'decoder.rnn.weight_hh_l0'

torch.Size([2048, 512])

'decoder.rnn.bias_ih_l0'

torch.Size([2048])

'decoder.rnn.bias_hh_l0'

torch.Size([2048])

'decoder.rnn.weight_ih_l1'

torch.Size([2048, 512])

'decoder.rnn.weight_hh_l1'

torch.Size([2048, 512])

'decoder.rnn.bias_ih_l1'

torch.Size([2048])

'decoder.rnn.bias_hh_l1'

torch.Size([2048])

'decoder.fc_out.weight'

torch.Size([5893, 512])

'decoder.fc_out.bias'

torch.Size([5893])

In [18]:
# 模型中可训练参数的数量
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,898,501 trainable parameters


In [19]:
# 使用adam优化器
optimizer = optim.Adam(model.parameters()) # 把参数放进优化器中

In [20]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [21]:
def train(model, iterator, optimizer, criterion, clip):
    # model.train()指定当前模型是在训练，启用 BatchNormalization 和 Dropout，保证BN层(Batch Normalization）用每一批数据的均值和方差
    # model.eval()指定当前模型是在验证，不启用 BatchNormalization 和 Dropout是，保证BN用全部训练数据的均值和方差
    # 对于Dropout，model.train()是随机取一部分网络连接来训练更新参数，model.eval()是利用到了所有网络连接

    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator): # iterator为train_iterator
        
        src = batch.src 
#         batch1 src=torch.Size([24, 128])
#         batch2 src=torch.Size([29, 128])
#         btach3 src=torch.Size([33, 128])
#         ......
        trg = batch.trg
#         batch1 trg=torch.Size([30, 128])
#         batch2 trg=torch.Size([32, 128])
#         batch3 trg=torch.Size([29, 128])
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim) # 因为decoder的output是从1开始存储的
        # view之后output = [(trg len - 1) * batch size, output dim],相当于根据字符进行行的堆叠，方便循环迭代
        trg = trg[1:].view(-1) # 在计算损失的时候，不会用到开始的<sos>字符，根据字符进行堆叠
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
    
        loss = criterion(output, trg)
        # output行对应trg中的数
        
        loss.backward() # 计算各个参数的梯度
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step() # 更新梯度
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator) # 每个批次的平均损失

In [22]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [68]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 56s
	Train Loss: 5.062 | Train PPL: 157.911
	 Val. Loss: 4.963 |  Val. PPL: 142.978
Epoch: 02 | Time: 0m 53s
	Train Loss: 4.528 | Train PPL:  92.549
	 Val. Loss: 4.850 |  Val. PPL: 127.753
Epoch: 03 | Time: 0m 53s
	Train Loss: 4.220 | Train PPL:  68.042
	 Val. Loss: 4.681 |  Val. PPL: 107.842
Epoch: 04 | Time: 0m 53s
	Train Loss: 4.004 | Train PPL:  54.817
	 Val. Loss: 4.518 |  Val. PPL:  91.673
Epoch: 05 | Time: 0m 53s
	Train Loss: 3.834 | Train PPL:  46.237
	 Val. Loss: 4.305 |  Val. PPL:  74.072
Epoch: 06 | Time: 0m 52s
	Train Loss: 3.663 | Train PPL:  38.996
	 Val. Loss: 4.313 |  Val. PPL:  74.649
Epoch: 07 | Time: 0m 52s
	Train Loss: 3.565 | Train PPL:  35.323
	 Val. Loss: 4.143 |  Val. PPL:  63.014
Epoch: 08 | Time: 0m 52s
	Train Loss: 3.414 | Train PPL:  30.378
	 Val. Loss: 4.125 |  Val. PPL:  61.845
Epoch: 09 | Time: 0m 52s
	Train Loss: 3.288 | Train PPL:  26.782
	 Val. Loss: 4.046 |  Val. PPL:  57.187
Epoch: 10 | Time: 0m 52s
	Train Loss: 3.163 | Train PPL