In [4]:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# 下面的text 是一个不同样本呢以\n 分割开的字符串
text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)

# 去除特殊字符
sentences = re.sub('[.,!?\\-]','', text.lower()).split('\n')
word_list = list(set(' '.join(sentences).split(' ')))  # ['hello', 'how', 'are', 'you',...]
word2idx = {'[PAD]':0, '[CLS]':1, '[SEP]':2, '[MASK]':3}
for i, w in enumerate(word_list):
    word2idx[w] = i + 4
idx2word = {i: w for i, w in enumerate(word2idx)}
vocab_size = len(word2idx)

token_list = list()
for sentence in sentences:
    arr = [word2idx[s] for s in sentence.split()]
    token_list.append(arr)  # 获取每个样本的id 表示

In [5]:
# BERT Parameters
maxlen = 30        # 一条样本的最大长度设置为30
batch_size = 6     #
max_pred = 5       # 由于样本长度不长，我们每个句子最多mask 5个token
n_layers = 6      
n_heads = 12
d_model = 768      # embedding 的维度
d_ff = 768*4       # 4*d_model, FeedForward dimension  特征提取的维度
d_k = d_v = 64     # dimension of K(=Q), V
n_segments = 2     # 拼接两条句子为一个样本

### 样本模式
- 在bert 中，两种任务MASK LM 和 NSP 任务是同时进行的
    - 每一个样本都是由两条数据拼接而成
    - 每一条样本中都随机Mask 掉 15% 的Token
    - 同时要保证样本中两条句子相邻：不相邻 = 1：1
    

In [10]:
### 数据集构造
def make_data():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        # 随机从样本sentences 中取样，拼接， sentences 与 token_list 属于原始样本与 token_id 对应
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        # 拼接两条为一条样本
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM 原文要求15% 但是如果拼接之后的样本很短，15%还不到一个token 会有问题，加上我们自己设置的最多mask 5个
        # 需要被mask 的数量
        n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15)))  
        # 可能被mask 的位置，cls, sep 这些去除掉
        candidate_mask_pos = [i for i, token in enumerate(input_ids)
                              if token != word2idx['[CLS]'] and token != word2idx['[SEP]']]
        # mask 位置是随机选择的，可以先mask 再取前n_pred 个
        shuffle(candidate_mask_pos)
        masked_tokens, masked_pos = [], [] 
        for pos in candidate_mask_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:  # 80% 的直接替换为Mask
                input_ids[pos] = word2idx['[MASK]']
            elif random() > 0.9:  # 10% 的替换为随机错误的单词，剩下10%的保持不变，可以不进行调整
                # 随机找到一个不同的单词，进行替换
                index = randint(0, vocab_size-1)
                while index < 4:  # 替换后的单词不能是CLS SEP PAD 这种
                    index = randint(0, vocab_size-1)
                input_ids[pos] = index
        
        # PAD 填充，固定每个batch 的长度
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)  # 这点不是很理解，segment_id 用于区分不同句子，这样添加不是乱了

        # PAD 填充 
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1
    
    return batch

In [11]:
batch = make_data()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
    torch.LongTensor(input_ids),  torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens),\
    torch.LongTensor(masked_pos), torch.LongTensor(isNext)

class MyDataSet(Data.Dataset):
    def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_tokens = masked_tokens
        self.masked_pos = masked_pos
        self.isNext = isNext
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]
    

loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext), batch_size=batch_size, shuffle=True)

- 注意 bert 中pos_embeded 是要参与到模型更新的，这点与transformer 不同

- batch normalization对一批样本同一纬度特征做归一化，一列身高的平均值
- 而layer normalization是对单个样本的所有维度特征做归一化

- 生成self_attn_mask 矩阵
- 定义激活函数gelu 

In [17]:
# padding mask 
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, seq_len = seq_q.size()
    pad_attn_mask = seq_q.eq(0).unsqueeze(1)  # [batch_size, 1, seq_len]
    return pad_attn_mask.expand(batch_size, seq_len, seq_len)  # [batch_size, seq_len, seq_len]

# FFN 层激活函数使用的是gelu 
def gelu(x):
    """
    Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

- 传入数据并进行编码
    - [batch_size, seq_len] --> [batch_size, seq_len, d_model]

In [18]:
# 包含字编码 + 位置编码 + 分隔句子编码
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(maxlen, d_model)
        self.seg_embed = nn.Embedding(n_segments, d_model)
        self.norm = nn.LayerNorm(d_model)  # 在最后一个维度进行标准化

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # [seq_len] --> [batch_size, seq_len]
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

- 单层Encoder BLOCK

In [19]:
# 专门计算多头注意力过程中Q,K,V 矩阵乘积  
class ScaledDotProduct(nn.Module):
    def __init__(self):
        super(ScaledDotProduct, self).__init__()
    
    def forward(self, Q, K, V, attn_mask):
        """
        Q,K,V 形状 [batch_size, n_heads, seq_q/k/v, d_k/d_k/d_v]
        attn_mask  [batch_size, n_heads, seq_q, seq_k]
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)  # [batch_size, n_heads, seq_q, seq_k]
        scores.masked_fill_(attn_mask, -1e9)
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)  # [batch_size, n_heads, seq_q, d_v]
        return context

# 多头自注意力，这里主要生成Q,K,V 矩阵
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)  # d_k == d_q 可以不等于d_v   (len_k == len_v 可以不等于len_q ，在transformer 中有enc_dec_attn 就可以不等于)
        self.fc = nn.Linear(n_heads*d_v, d_model)

    def forward(self, seq_q, seq_k, seq_v, attn_mask):
        """
        seq_q, seq_k, seq_v：[batch_size, seq_len, d_model]  在bert 中就是编码后的input
        """
        residual, batch_size = seq_q, seq_q.size(0)
        # [B,S,D] -proj-> [B,S,D_new]-proj-> [B, S, n_heads, d_k/d_v] -trans-> [B, n_heads, seq_len, d_k/d_v]
        # 根据输入的seq_q, k, v获取 Q,K,V 矩阵
        Q = self.W_Q(seq_q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size, n_heads, seq_len, d_k]
        K = self.W_K(seq_k).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size, n_heads, seq_len, d_k]
        V = self.W_V(seq_v).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch_size, n_heads, seq_len, d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # [batch_size, n_heads, seq_len, seq_len]
        # 计算context: [batch_size, n_heads, seq_len, d_v]
        context = ScaledDotProduct()(Q, K, V, attn_mask)  # [batch_size, n_heads, seq_q, d_v]
        # 经过形状变换
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads*d_v)
        output = self.fc(context)  # [batch_size, seq_q, d_model]
        return nn.LayerNorm(d_model)(output + residual)  #  [batch_size, seq_q, d_model]

# 前向全连接层
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        x：[batch_size, seq_q, d_model]  经过多头自注意力机制后的输出
        """
        return self.fc2(gelu(self.fc1(x)))

# 一个encoder_block  主要包含多头自注意力 + 前向全连接
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        """
        enc_inputs：经过编码之后的输入
        """
        # 经过多头自注意力之后的输出  [batch_size, seq_q, d_model]
        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs  #[batch_size, seq_q, d_model]

- 定义模型

In [20]:
class Bert(nn.Module):
    def __init__(self):
        super(Bert, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Dropout(0.5),
            nn.Tanh()
        )
        self.classifier = nn.Linear(d_model, 2)  # BERT中的NSP任务，判断前后是否为相邻句子
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        embed_weight = self.embedding.tok_embed.weight  # fc2 和 embedding layer 共享权重
        self.fc2 = nn.Linear(d_model, vocab_size, bias=False)
        self.fc2.weight = embed_weight

    def forward(self, input_ids, segment_ids, masked_pos):
        # 传入mask_pos 原因是，计算损失时，MLM 任务只需要计算mask位置的损失
        output = self.embedding(input_ids, segment_ids)  # [batch_size, seq_q, d_model]
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)  # [batch_szie, maxlen, maxlen]
        for layer in self.layers:
            # [batch_size, max_len, d_model]
            output = layer(output, enc_self_attn_mask)
        # ①NSP 损失准备
        # 拿出cls ，经过一层特征提取，用来后续形状变换后，进行二分类
        h_pooled = self.fc(output[:, 0])  # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled)  # [batch_size, 2]
        # ②MLM损失准备
        masked_pos = masked_pos[:,:,None].expand(-1, -1, d_model)  # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos)             # [batch_size, max_pred, d_model]
        h_masked = self.activ2(self.linear(h_masked))              # [batch_size, max_pred, d_model]
        logits_lm = self.fc2(h_masked)                             # [batch_size, max_pred, vocab_size]
        return logits_lm, logits_clsf

model = Bert()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.001)


In [21]:
### 模型训练
for epoch in range(50):
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
        logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
        # 计算MLM 损失
        loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1))
        # 计算NSP损失
        loss_clsf = criterion(logits_clsf, isNext)
        loss = loss_lm + loss_clsf
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0001 loss = 4.590352
Epoch: 0002 loss = 2.455920
Epoch: 0003 loss = 1.908595
Epoch: 0004 loss = 1.635944
Epoch: 0005 loss = 1.392686
Epoch: 0006 loss = 1.201845
Epoch: 0007 loss = 1.032200
Epoch: 0008 loss = 0.921432
Epoch: 0009 loss = 0.899910
Epoch: 0010 loss = 0.831523
Epoch: 0011 loss = 0.783036
Epoch: 0012 loss = 0.821681
Epoch: 0013 loss = 0.794956
Epoch: 0014 loss = 0.741466
Epoch: 0015 loss = 0.794941
Epoch: 0016 loss = 0.767450
Epoch: 0017 loss = 0.725908
Epoch: 0018 loss = 0.787155
Epoch: 0019 loss = 0.759007
Epoch: 0020 loss = 0.695979
Epoch: 0021 loss = 0.732249
Epoch: 0022 loss = 0.754515
Epoch: 0023 loss = 0.742332
Epoch: 0024 loss = 0.738854
Epoch: 0025 loss = 0.697506
Epoch: 0026 loss = 0.713531
Epoch: 0027 loss = 0.726564
Epoch: 0028 loss = 0.692116
Epoch: 0029 loss = 0.718726
Epoch: 0030 loss = 0.723593
Epoch: 0031 loss = 0.703812
Epoch: 0032 loss = 0.749539
Epoch: 0033 loss = 0.709716
Epoch: 0034 loss = 0.691878
Epoch: 0035 loss = 0.682202
Epoch: 0036 loss = 0

In [34]:
# 随便拿条数据，验证模型
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]
print(text)
print('================================')
print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])
logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), \
                 torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
# MLM 位置预测
logits_lm = logits_lm.max(2)[1][0].numpy()
print('================================')
print('masked tokens list : ',[pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

# NSP预测
print('================================')
logits_clsf = logits_clsf.max(-1)[1].numpy()[0]
print(logits_clsf)
print('isNext : ', isNext)
print('predict isNext : ',True if logits_clsf else False)

Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thank you Romeo
Where are you going today?
I am going shopping. What about you?
I am going to visit my grandmother. she is not very well
['[CLS]', 'oh', '[MASK]', 'juliet', '[SEP]', 'oh', 'congratulations', 'juliet', '[SEP]']
masked tokens list :  [6]
predict masked tokens list :  [6]
0
isNext :  False
predict isNext :  False
