In [2]:
#导入基本包
import random
import math
import os
import numpy as np
import collections
#导入模型计算基本包
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import BertTokenizer
#数据及模型处理
from torch.utils.data import Dataset, DataLoader, random_split
from d2l import torch as d2l
'''
#设置环境变量免责声明
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
'''

"\n#设置环境变量免责声明\nos.environ['KMP_DUPLICATE_LIB_OK'] = 'True'\n"

In [31]:
# Training configuration
batch_size, num_epochs = 20, 50  # mini-batch size and number of training epochs (no learning rate is set here)
data_dir = './predata'  # directory holding the pre-processed data

# Random seed passed to setup_seed()
seed = 0

# Maximum sentence length (also sizes the positional-encoding table and bounds greedy decoding)
max_len = 200
# Create the BERT tokenizer, cached under ./tokenizer.
# NOTE(review): the captured output of this cell shows a timed-out HEAD request to
# huggingface.co, so loading appears to attempt a network call — confirm offline behaviour.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir='./tokenizer',
    force_download=False)
# Model hyper-parameters: number of layer applications, model width, attention heads, feed-forward width
num_layers, d_model, heads, middle_dim = 6, 512, 8, 2048

# Token -> id vocabulary taken from the tokenizer (assigned again, identically, in a later cell)
word_map = tokenizer.get_vocab()

# Controls how often train() prints progress within an epoch
out_batch = 10

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-chinese/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001C20D3BF990>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 4f40bc0e-578e-4039-96b6-e5f7f235c02d)')' thrown while requesting HEAD https://huggingface.co/bert-base-chinese/resolve/main/tokenizer_config.json


In [4]:
def setup_seed(seed):
    """Seed every random number generator used by the notebook and switch PyTorch to deterministic mode.

    Args:
        seed: integer seed applied to NumPy, `random`, and PyTorch (CPU and CUDA).
    """
    # NumPy and the standard-library generator are seeded first.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

    os.environ.update({
        # Disable hash randomisation.
        'PYTHONHASHSEED': str(seed),
        # Required on CUDA 10.2+ for reproducible cuBLAS results.
        'CUBLAS_WORKSPACE_CONFIG': ':4096:8',
    })

    # CUDA generators (current device, then all devices for multi-GPU runs), then the global generator.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    # Ask PyTorch to use deterministic algorithms only.
    torch.use_deterministic_algorithms(True)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True   # always return the same convolution algorithm
    cudnn.enabled = False        # turn cuDNN off entirely
    cudnn.benchmark = False      # no auto-tuning; True would search for the fastest algorithm per configuration

# Apply the seed from the configuration cell before any data splitting or model construction.
setup_seed(seed)

In [5]:
# Load the encoded pairs: one Python expression per line of pairs_encoded.txt (GB18030 text).
# NOTE(review): eval() executes whatever is in the file. If each line is a plain literal
# (e.g. nested lists of ints), ast.literal_eval would be a safe replacement — confirm the
# file format before switching.
with open(data_dir+'/pairs_encoded.txt', 'r', encoding='gb18030') as file:
    pairs = []
    for line in file:
        pair = eval(line)
        pairs.append(pair)

In [6]:
# Token -> id vocabulary from the tokenizer (same assignment as in the configuration cell).
word_map = tokenizer.get_vocab()
# Report the vocabulary size.
print("Total words are: {}".format(len(word_map)))

Total words are: 21128


In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of (question, reply) token-id pairs.

    The base class is spelled out as `torch.utils.data.Dataset` because this class
    rebinds the name `Dataset`: with `class Dataset(Dataset)` a second execution of the
    cell would inherit from the previous definition of this very class.

    Args:
        data: optional sequence whose items are indexable as ``item[0]`` (question ids)
            and ``item[1]`` (reply ids). When omitted, the notebook-level ``pairs``
            list is used, which is the original behaviour.
    """

    def __init__(self, data=None):
        # Only look up the global `pairs` when no explicit data is supplied.
        self.pairs = pairs if data is None else data
        self.dataset_size = len(self.pairs)

    def __getitem__(self, i):
        """Return the i-th pair as two LongTensors: (question, reply)."""
        question = torch.LongTensor(self.pairs[i][0])
        reply = torch.LongTensor(self.pairs[i][1])

        return question, reply

    def __len__(self):
        """Number of pairs held by the dataset."""
        return self.dataset_size

In [8]:
dataset = Dataset()
# Size of the training split: 80% of the samples, rounded down
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Randomly split the dataset into a training subset and a test subset
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [9]:
# Training batches: shuffled, delivered in pinned host memory.
train_iter = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)
# Test batches are built with exactly the same settings.
test_iter = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)

In [15]:
def create_masks(question, reply_input, reply_target):
    """Build the padding and causal masks for one batch.

    Token id 0 is treated as padding. All masks are created on the device of the
    tensor they are derived from, so the function does not depend on a global
    `device` and no host-to-device copy of the causal mask is needed per batch.

    Args:
        question: integer tensor of question token ids, (batch_size, question_len).
        reply_input: integer tensor of decoder input ids, (batch_size, reply_len).
        reply_target: integer tensor of decoder target ids, (batch_size, reply_len).

    Returns:
        Tuple of boolean tensors:
        question_mask (batch_size, 1, 1, question_len),
        reply_input_mask (batch_size, 1, reply_len, reply_len) combining padding and
        the lower-triangular "no peeking ahead" constraint,
        reply_target_mask (batch_size, reply_len).
    """

    def subsequent_mask(size, device):
        # Lower-triangular matrix: position i may attend to positions <= i.
        return torch.ones(size, size, device=device).tril().bool().unsqueeze(0)

    question_mask = (question != 0).unsqueeze(1).unsqueeze(1)   # (batch_size, 1, 1, max_words)

    reply_input_mask = (reply_input != 0).unsqueeze(1)          # (batch_size, 1, max_words)
    reply_input_mask = reply_input_mask & subsequent_mask(reply_input.size(-1), reply_input.device)
    reply_input_mask = reply_input_mask.unsqueeze(1)            # (batch_size, 1, max_words, max_words)
    reply_target_mask = reply_target != 0                       # (batch_size, max_words)

    return question_mask, reply_input_mask, reply_target_mask

In [16]:
class Embeddings(nn.Module):
    """
    Implements embeddings of the words and adds their positional encodings.

    Besides the per-position table `pe`, a second table `te` with one row per layer
    index is added, so the same module can be re-applied before every layer pass.

    Both tables are registered as non-persistent buffers: they follow the module
    through `.to(device)` and are left out of `state_dict()`, and the class no
    longer depends on a notebook-level `device` variable.

    Args:
        vocab_size: number of rows of the token embedding.
        max_len: number of positions in the positional table.
        d_model: embedding width.
        num_layers: number of rows in the per-layer table.
    """
    def __init__(self, vocab_size, max_len, d_model, num_layers):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size, d_model)
        # (1, max_len, d_model)
        self.register_buffer('pe', self.create_positinal_encoding(max_len, self.d_model), persistent=False)
        # (1, num_layers, d_model)
        self.register_buffer('te', self.create_positinal_encoding(num_layers, self.d_model), persistent=False)

    def create_positinal_encoding(self, max_len, d_model):
        """Return a (1, max_len, d_model) sinusoidal table (method name kept as-is for callers).

        The values are computed in Python and converted to a tensor once, instead of
        issuing one tensor write per element.

        NOTE(review): the exponent uses `2 * i` although `i` already advances in steps
        of 2, and the cosine term uses `i + 1`. This differs from the usual
        sin/cos formulation but is kept unchanged so existing results stay comparable.
        """
        rows = []
        for pos in range(max_len):   # for each position of the word
            row = [0.0] * d_model
            for i in range(0, d_model, 2):   # for each pair of dimensions of the position
                row[i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                if i + 1 < d_model:          # an odd d_model has no partner for the last sine slot
                    row[i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
            rows.append(row)
        table = torch.tensor(rows, dtype=torch.get_default_dtype()).reshape(max_len, d_model)
        return table.unsqueeze(0)   # include the batch size

    def forward(self, embedding, layer_idx):
        """Embed (on the first pass only) and add position and layer encodings.

        Args:
            embedding: token ids (batch_size, seq_len) when `layer_idx == 0`; otherwise
                the hidden states (batch_size, seq_len, d_model) of the previous pass.
            layer_idx: index of the pass, selects the row of `te`.

        Returns:
            Tensor (batch_size, seq_len, d_model) after dropout.
        """
        if layer_idx == 0:
            embedding = self.embed(embedding) * math.sqrt(self.d_model)
        # Out-of-place additions: the tensor handed in by the caller is never modified.
        embedding = embedding + self.pe[:, :embedding.size(1)]      # broadcast over the batch
        embedding = embedding + self.te[:, layer_idx, :].unsqueeze(1)  # broadcast over batch and positions
        embedding = self.dropout(embedding)
        return embedding

In [17]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with dropout on the attention weights.

    Args:
        heads: number of attention heads; must divide `d_model`.
        d_model: width of the inputs and of the output.
    """

    def __init__(self, heads, d_model):
        super().__init__()
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """(batch, len, d_model) -> (batch, heads, len, d_k)."""
        per_head = projected.view(projected.shape[0], -1, self.heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask: broadcastable to (batch_size, heads, query_len, key_len); positions
        where `mask == 0` receive a score of -1e9 before the softmax.
        Returns a tensor of shape (batch_size, query_len, d_model).
        """
        # Project, then move the head axis in front of the sequence axis.
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # (batch, h, q_len, d_k) x (batch, h, d_k, k_len) -> (batch, h, q_len, k_len)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(F.softmax(scores, dim=-1))

        # (batch, h, q_len, k_len) x (batch, h, k_len, d_k) -> (batch, h, q_len, d_k)
        context = torch.matmul(weights, v)
        # Merge the heads back: (batch, q_len, h * d_k)
        merged = context.transpose(1, 2).contiguous().view(context.shape[0], -1, self.heads * self.d_k)
        return self.concat(merged)

In [18]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output width.
        middle_dim: width of the hidden layer.
    """

    def __init__(self, d_model, middle_dim):
        super().__init__()

        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply the block to `x` (last dimension `d_model`) and return a tensor of the same shape."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)

In [19]:
class EncoderLayer(nn.Module):
    """Encoder layer: self-attention and feed-forward, each followed by residual + LayerNorm.

    A single LayerNorm instance (and a single Dropout) is shared by both sub-layers.

    Args:
        d_model: model width.
        heads: number of attention heads.
        middle_dim: hidden width of the feed-forward block.
    """

    def __init__(self, d_model, heads, middle_dim):
        super().__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model, middle_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        """Run one encoder pass over `embeddings` using attention mask `mask`."""
        # Sub-layer 1: self-attention, dropout, residual connection, normalisation.
        attended = self.self_multihead(embeddings, embeddings, embeddings, mask)
        attended = self.dropout(attended)
        normed_attention = self.layernorm(attended + embeddings)

        # Sub-layer 2: feed-forward, dropout, residual connection, normalisation.
        transformed = self.feed_forward(normed_attention)
        transformed = self.dropout(transformed)
        return self.layernorm(transformed + normed_attention)

In [20]:
class DecoderLayer(nn.Module):
    """Decoder layer: masked self-attention, encoder-decoder attention, feed-forward.

    Every sub-layer is followed by dropout, a residual connection and the one shared
    LayerNorm instance.

    Args:
        d_model: model width.
        heads: number of attention heads.
        middle_dim: hidden width of the feed-forward block.
    """

    def __init__(self, d_model, heads, middle_dim):
        super().__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model, middle_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, encoded, src_mask, target_mask):
        """Run one decoder pass.

        Args:
            embeddings: decoder-side hidden states.
            encoded: encoder output attended to by the second sub-layer.
            src_mask: mask applied in the encoder-decoder attention.
            target_mask: mask applied in the decoder self-attention.
        """
        # Sub-layer 1: self-attention over the target sequence.
        self_attended = self.dropout(
            self.self_multihead(embeddings, embeddings, embeddings, target_mask))
        query = self.layernorm(self_attended + embeddings)

        # Sub-layer 2: attention over the encoder output.
        cross_attended = self.dropout(
            self.src_multihead(query, encoded, encoded, src_mask))
        interacted = self.layernorm(cross_attended + query)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(interacted))
        return self.layernorm(transformed + interacted)

In [21]:
class Transformer(nn.Module):
    """Encoder-decoder model that re-applies one encoder layer and one decoder layer `num_layers` times.

    The same `Embeddings` module is called before every pass; it embeds the token ids
    on pass 0 and adds position and layer-index encodings on every pass.

    Args:
        word_map: vocabulary mapping; only its length is used (vocabulary size).
        max_len: number of positions supported by the positional table.
        d_model: model width.
        heads: number of attention heads.
        middle_dim: hidden width of the feed-forward blocks.
        num_layers: how many times the encoder / decoder layer is applied.
    """

    def __init__(self, word_map, max_len, d_model=512, heads=8, middle_dim=2048, num_layers=6):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.vocab_size = len(word_map)
        self.embed = Embeddings(self.vocab_size, max_len, d_model, num_layers)
        self.encoder = EncoderLayer(d_model, heads, middle_dim)
        self.decoder = DecoderLayer(d_model, heads, middle_dim)
        self.logit = nn.Linear(d_model, self.vocab_size)

    def encode(self, src_embeddings, src_mask):
        """Encode source token ids; returns the hidden states after the last pass."""
        hidden = src_embeddings
        for layer_idx in range(self.num_layers):
            hidden = self.encoder(self.embed(hidden, layer_idx), src_mask)
        return hidden

    def decode(self, tgt_embeddings, target_mask, src_embeddings, src_mask):
        """Decode target token ids against the encoder output `src_embeddings`."""
        hidden = tgt_embeddings
        for layer_idx in range(self.num_layers):
            hidden = self.decoder(self.embed(hidden, layer_idx), src_embeddings, src_mask, target_mask)
        return hidden

    def forward(self, src_words, src_mask, target_words, target_mask):
        """Return log-probabilities over the vocabulary for every target position."""
        memory = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, memory, src_mask)
        logits = self.logit(decoded)
        return F.log_softmax(logits, dim=2)

In [22]:
class AdamWarmup:
    """Wraps an optimizer and sets its learning rate from a warm-up / inverse-square-root schedule.

    Args:
        model_size: model width used to scale the learning rate.
        warmup_steps: number of steps over which the rate grows linearly.
        optimizer: wrapped optimizer; must expose `param_groups` and `step()`.
    """

    def __init__(self, model_size, warmup_steps, optimizer):
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0

    def get_lr(self):
        """Learning rate for `self.current_step`."""
        step = self.current_step
        decay_term = step ** (-0.5)
        warmup_term = step * self.warmup_steps ** (-1.5)
        return self.model_size ** (-0.5) * min(decay_term, warmup_term)

    def step(self):
        """Advance the step counter, write the new rate into every param group, then step the optimizer."""
        self.current_step += 1
        new_lr = self.get_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr
        # Remember the rate that was just applied.
        self.lr = new_lr
        self.optimizer.step()

In [23]:
class LossWithLS(nn.Module):
    """KL-divergence loss against label-smoothed targets, averaged over unmasked positions.

    Args:
        size: number of classes (vocabulary size) used to spread the smoothing mass.
        smooth: total probability mass moved away from the true class.
    """

    def __init__(self, size, smooth):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='none')
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size

    def forward(self, prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size), log-probabilities
        target and mask of shape: (batch_size, max_words)
        Returns the summed per-token loss divided by the number of unmasked tokens.
        """
        flat_pred = prediction.view(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        flat_target = target.contiguous().view(-1)             # (batch_size * max_words)
        flat_mask = mask.float().view(-1)                      # (batch_size * max_words)

        # Smoothed distribution: uniform mass everywhere, `confidence` on the true class.
        smoothed = torch.full_like(flat_pred.data, self.smooth / (self.size - 1))
        smoothed.scatter_(1, flat_target.data.unsqueeze(1), self.confidence)

        per_token = self.criterion(flat_pred, smoothed).sum(1)  # (batch_size * max_words)
        return (per_token * flat_mask).sum() / flat_mask.sum()

In [24]:
def evaluate(transformer, question, question_mask):
    """Greedy-decode a reply for a single question.

    Relies on the notebook-level `word_map`, `device` and `max_len`.

    The model is put in eval mode for the duration of the call and its previous
    train/eval mode is restored afterwards, so calling this from inside a training
    loop no longer leaves dropout disabled. Decoding runs without gradient tracking.

    Args:
        transformer: model exposing `encode`, `decode` and `logit`.
        question: token ids of one question, shape (1, question_len), on `device`.
        question_mask: padding mask for `question`.

    Returns:
        The predicted tokens joined by single spaces, without the leading [CLS];
        decoding stops before [SEP] or after `max_len - 1` steps.
    """
    rev_word_map = {v: k for k, v in word_map.items()}
    start_token = word_map['[CLS]']
    end_token = word_map['[SEP]']

    was_training = transformer.training
    transformer.eval()
    try:
        with torch.no_grad():
            encoded = transformer.encode(question, question_mask)
            words = torch.LongTensor([[start_token]]).to(device)

            for step in range(max_len - 1):
                size = words.shape[1]
                # Lower-triangular causal mask, built directly on the target device.
                target_mask = torch.tril(torch.ones(size, size, dtype=torch.uint8, device=device))
                target_mask = target_mask.unsqueeze(0).unsqueeze(0)
                decoded = transformer.decode(words, target_mask, encoded, question_mask)
                predictions = transformer.logit(decoded[:, -1])
                _, next_word = torch.max(predictions, dim = 1)
                next_word = next_word.item()
                if next_word == end_token:
                    break
                words = torch.cat([words, torch.LongTensor([[next_word]]).to(device)], dim = 1)   # (1,step+2)
    finally:
        # Hand the model back in the mode the caller had it in.
        transformer.train(was_training)

    # Construct Sentence: `words` is always (1, n) here, so drop the batch axis.
    sen_idx = [w for w in words.squeeze(0).tolist() if w != start_token]
    sentence = ' '.join(rev_word_map[idx] for idx in sen_idx)

    return sentence

In [25]:
def bleu(question, reply, k, transformer):
    """BLEU score (up to `k`-grams) of the greedy-decoded reply against the reference reply.

    Relies on the notebook-level `device`, `word_map`, `tokenizer` and `evaluate`.

    Fixes relative to the previous version:
      * `evaluate` returns one space-joined string; it is split back into tokens, so
        lengths and n-grams are counted on tokens rather than on characters.
      * [PAD], [CLS] and [SEP] are removed from the reference, matching the prediction,
        which never contains [CLS] and stops before [SEP].
      * An empty prediction scores 0.0 instead of raising ZeroDivisionError.

    Args:
        question: token ids of one question (CPU LongTensor or list of ints).
        reply: tensor of reference reply token ids.
        k: maximum n-gram order.
        transformer: model passed on to `evaluate`.

    Returns:
        float BLEU score.
    """
    question = torch.LongTensor(question).to(device).unsqueeze(0)
    question_mask = (question!=0).to(device).unsqueeze(1).unsqueeze(1)
    pred_tokens = evaluate(transformer, question, question_mask).split()

    special_ids = {word_map.get('[PAD]', 0), word_map['[CLS]'], word_map['[SEP]']}
    label_ids = [idx for idx in reply.tolist() if idx not in special_ids]
    label_tokens = tokenizer.convert_ids_to_tokens(label_ids)

    len_pred, len_label = len(pred_tokens), len(label_tokens)
    if len_pred == 0:
        return 0.0

    # Brevity penalty: predictions shorter than the reference are penalised.
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, min(k, len_pred) + 1):
        num_matches, label_subs = 0, collections.defaultdict(int)
        for i in range(len_label - n + 1):
            label_subs[' '.join(label_tokens[i: i + n])] += 1
        for i in range(len_pred - n + 1):
            ngram = ' '.join(pred_tokens[i: i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                label_subs[ngram] -= 1   # each reference n-gram can be matched once
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score

In [26]:
def accuracy(transformer, question, reply, k):
    """Mean BLEU score over one batch.

    Each question is moved to the CPU and scored against the reply at the same
    index with `bleu`; the scores are summed and divided by the batch length.
    """
    num_samples = len(question)
    scores = (
        bleu(question[idx].to('cpu'), reply[idx], k, transformer)
        for idx in range(num_samples)
    )
    return sum(scores, 0.0) / num_samples

In [27]:
def evaluate_accuracy_gpu(model, data_iter, k, device=None):
    """Average per-sample BLEU of `model` over every batch of `data_iter`.

    Fixes relative to the previous version:
      * the loop unpacked `i, (question, reply)` without `enumerate`, which fails on
        the (question, reply) batches the loader yields;
      * the `model` argument is scored, not the notebook-level `transformer`;
      * `accuracy` returns a batch mean, so it is weighted by the batch size before
        being divided by the total number of samples.

    Args:
        model: model to score; switched to eval mode when it is an `nn.Module`.
        data_iter: iterable yielding (question, reply) batches.
        k: maximum n-gram order forwarded to `accuracy`.
        device: accepted for backward compatibility; not used by this function.

    Returns:
        float mean BLEU, or 0.0 when `data_iter` yields no samples.
    """
    if isinstance(model, nn.Module):
        model.eval()
    total_score, num_samples = 0.0, 0
    with torch.no_grad():
        for question, reply in data_iter:
            batch = question.shape[0]
            total_score += accuracy(model, question, reply, k) * batch
            num_samples += batch
    return total_score / num_samples if num_samples else 0.0

In [28]:
# Compute device chosen by d2l.try_gpu()
device = d2l.try_gpu()
# Model built from the hyper-parameters in the configuration cell
transformer = Transformer(word_map = word_map, max_len = max_len, d_model = d_model, heads = heads, middle_dim = middle_dim, num_layers = num_layers)
# Adam starts at lr=0; AdamWarmup overwrites the learning rate on every step
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
# Label-smoothed loss over the vocabulary with smoothing mass 0.2
criterion = LossWithLS(len(word_map), 0.2)

k = 2  # maximum n-gram order for BLEU (options: 1, 2, 3, 4)

In [45]:
def train(model, train_iter, test_iter, num_epochs, loss, optimizer, device, k, out_batch):
    """Train `model`, print running loss / BLEU, evaluate after each epoch and save a checkpoint.

    Fixes relative to the previous version:
      * the `model` argument is used throughout instead of the notebook-level `transformer`;
      * the per-batch BLEU mean is weighted by the batch size before being averaged
        over samples, like the loss;
      * the model is switched back to train mode after the per-batch BLEU decoding;
      * the logging interval can no longer be zero (short loaders or `out_batch <= 1`
        used to raise ZeroDivisionError);
      * the checkpoint directory is created up front, so saving cannot fail at the
        end of the first epoch.

    Args:
        model: network to train; re-initialised with Xavier-uniform weights on entry.
        train_iter: loader yielding (question, reply) training batches.
        test_iter: loader scored with `evaluate_accuracy_gpu` after every epoch.
        num_epochs: number of passes over `train_iter`.
        loss: callable `(log_probs, target, target_mask) -> scalar tensor`.
        optimizer: wrapper exposing `.optimizer.zero_grad()` and `.step()`.
        device: device the model and the batches are moved to.
        k: maximum n-gram order for BLEU.
        out_batch: approximate number of progress lines per epoch.
    """
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    print('training on', device)
    model.to(device)
    os.makedirs('./checkpoint', exist_ok=True)
    timer, num_batches = d2l.Timer(), len(train_iter)
    log_every = max(1, (num_batches - 2) // max(1, out_batch - 1))
    train_l = train_acc = test_acc = float('nan')
    metric = d2l.Accumulator(3)
    for epoch in range(num_epochs):
        # Sum of loss, sum of BLEU, number of samples
        metric = d2l.Accumulator(3)
        model.train()
        for i, (question, reply) in enumerate(train_iter):
            timer.start()
            optimizer.optimizer.zero_grad()

            # Move to device
            question = question.to(device)
            reply = reply.to(device)

            # Prepare Target Data: the decoder input drops the last token, the target drops the first
            reply_input = reply[:, :-1]
            reply_target = reply[:, 1:]

            # Create mask and add dimensions
            question_mask, reply_input_mask, reply_target_mask = create_masks(question, reply_input, reply_target)

            out = model(question, question_mask, reply_input, reply_input_mask)
            l = loss(out, reply_target, reply_target_mask)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                batch = question.shape[0]
                # NOTE: BLEU greedy-decodes every sample of the batch, which is costly.
                metric.add(l.item() * batch, accuracy(model, question, reply, k) * batch, batch)
            # BLEU decoding may leave the model in eval mode; make sure dropout is active again.
            model.train()
            timer.stop()
            if (i != 0 and i % log_every == 0) or i == num_batches - 1:
                train_l = metric[0] / metric[2]
                train_acc = metric[1] / metric[2]
                print("Epoch [{}][{}/{}]\ttrain loss: {:.3f}\ttrain accuracy: {:.3f}".format(epoch, i, len(train_iter), train_l, train_acc))
        test_acc = evaluate_accuracy_gpu(model, test_iter, k)
        print("Epoch [{}]\ttest accuracy: {:.3f}".format(epoch, test_acc))

        torch.save(model, './checkpoint/checkpoint_' + str(epoch) + '.pt')

    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    elapsed = timer.sum()
    if elapsed > 0:
        print(f'{metric[2] * num_epochs / elapsed:.1f} examples/sec '
              f'on {str(device)}')

In [46]:
# Launch training with the objects built in the cells above.
train(transformer, train_iter, test_iter, num_epochs, criterion, transformer_optimizer, device, k, out_batch)

training on cuda:0


KeyboardInterrupt: 

In [None]:
%reset