In [1]:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data


In [2]:
text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)

In [51]:
#去掉所有句子的标点符号
sentences = re.sub("[.,?!]",'',text.lower()).split('\n')
#所有词的列表
word_list = list(set(" ".join(sentences).split()))
#生成带索引的词字典，并添加上四种标志词
word2idx = {'[PAD]':0,'[CLS]':1,'[SEP]':2,'[MASK]':3}
for i,w in enumerate(word_list):
    word2idx[w] = i+4
# 对换word2idx的key和value->idx2word
idx2word = dict(zip(word2idx.values(),word2idx.keys()))
# 上面的两个字典的大小
vocab_size = len(word2idx)

# token是所有句子的idx表示，每个句子一个list，所有句子组成一个大list
token_list = list()
for sentence in sentences:
    #每句话的idx表示
    idx_sentence = [word2idx[s] for s in sentence.split()]
    token_list.append(idx_sentence)
    

In [52]:
# BERT Parameters

# maxlen表示同一个batch中的所有句子都由30个token组成，不够的补PAD
# （这里我实现的方式比较粗暴，直接固定所有batch中的所有句子都为30）
maxlen = 30
batch_size = 6
# max_pred表示最多需要预测多少个单词，即BERT中的完形填空任务
max_pred = 5 
# n_layers表示Encoder Layer的数量
n_layers = 6
n_heads = 12
# d_model表示Token Embeddings、Segment Embeddings、Position Embeddings的维度
d_model = 768
# d_ff表示Encoder Layer中全连接层的维度
d_ff = 768*4 
d_k = d_v = 64  # dimension of K(=Q), V
# n_segments表示Decoder input由几句话组成
n_segments = 2


In [64]:
def make_data():
    batch = []
    positive = negative = 0
    while positive!=batch_size/2 or negative!=batch_size/2:
        #从sentences中随机取两句话的序号
        tokens_a_index,tokens_b_index = randrange(len(sentences)),randrange(len(sentences))
        #取对应的句子的token（idx_list）
        tokens_a,tokens_b = token_list[tokens_a_index],token_list[tokens_b_index]
        #将两个句子合并，并且加上开头结尾符号和中间分隔符，得到一个list，此为input_ids
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]
        #segment_ids表示句子前后顺序
        segment_ids = [0]*(1+len(tokens_a)+1)+[1]*(len(tokens_b)+1)
        
        #n_pred(编码器需要预测的也就是被遮盖住的词的个数)按原文要求取单个训练例子（两个句子结合的句子）长度的15%遮盖住，限定必须小于max_pred
        n_pred = min(max_pred,max(1,int(len(input_ids)*0.15)))
        #指代了真实单词的位置，也就是可以被mask的单词的位置
        cand_masked_pos = [i for i,token in enumerate(input_ids)
                         if token!=word2idx['[CLS]'] and token!=word2idx['[SEP]']]
        #将这个句子的所有单词的位置都打散
        shuffle(cand_masked_pos)
        #被遮住的词和被遮住的词的位置
        #注：masked_tokens是原词，input_ids中对应位置的是替换后的词
        masked_tokens, masked_pos = [],[]
        #取前n_pred个词，因为已经打散了，这就相当于随机去了n_pred个
        for pos in cand_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random()<0.8:#80%的概率-----MASK掉
                input_ids[pos] = word2idx['[MASK]']
            elif random()>0.9:#10%的概率-----随便替换一个总词典中的一个词
                index = randint(4,vocab_size-1)#不能取前四个标志词
                input_ids[pos] = index
        
        #总句子不够长度的位置补[PAD]
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0]*n_pad)
        segment_ids.extend([0]*n_pad)
        
        #前面有写到n_pred<max_pred
        #E.g. input_ids = [1,8, 36, 27, 13, 39, 33, 34,2,39, 33, 35, 26, 30, 38, 17, 5, 22, 16, 6, 12,2]
        #     (shuffle)cand_maked_pos = [4, 17, 13, 5, 6, 14, 2, 15, 18, 20, 1, 3, 19, 7, 12, 16, 11, 9, 10]
        #     len:19  ;  maxlen:30  ;   n_pred = 19*0.15 = 2.85 = 2  ;  max_pred = 5
        #     masked_pos:[4,17]  masked_tokens:[13,22]
        #     (masked)input_ids:[4->4, 17->4, 13, 5, 6, 14, 2, 15, 18, 20, 1, 3, 19, 7, 12, 16, 11, 9, 10]
        #     masked_pos(补零):[4, 17, 0, 0, 0] masked_tokens(补零):[13,22,0,0,0]
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0]*n_pad)
            masked_pos.extend([0]*n_pad)
        
        #tokens_a_index + 1 == tokens_b_index:表示两个句子相邻，是上下文关系
        #positive：表示随机抽到的句子对中，两个句子相邻，的次数
        #batch的最后一个参数True表示，两个句子相邻
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
            
    return batch
        
    

In [156]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
#转换为torch张量
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
    torch.LongTensor(input_ids),  torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens),\
    torch.LongTensor(masked_pos), torch.LongTensor(isNext)

In [157]:
input_ids[0], segment_ids[0], masked_tokens[0], masked_pos[0], isNext[0]

(tensor([ 1, 39, 33, 35,  3, 31, 29, 13,  2,  3, 33, 35, 11, 31, 29, 13,  2,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 tensor([11, 39,  0,  0,  0]),
 tensor([4, 9, 0, 0, 0]),
 tensor(0))

In [158]:
class MyDataSet(Data.Dataset):
  def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
    self.input_ids = input_ids
    self.segment_ids = segment_ids
    self.masked_tokens = masked_tokens
    self.masked_pos = masked_pos
    self.isNext = isNext
  
  def __len__(self):
    return len(self.input_ids)
  
  def __getitem__(self, idx):
    return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]

loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext), batch_size, True)

In [213]:
#seq_q(=inputs_idx)扩展成[batch_size,seq_len,seq_len],所有值等于0的变成True，值等于0表示PAD填充的
def get_attn_pad_mask(seq_q,seq_k):
    batch_size,seq_len = seq_q.size()
#     print(batch_size,seq_len)
    pad_attn_mask = seq_q.data.eq(0).unsqueeze(1)
    return pad_attn_mask.expand(batch_size,seq_len,seq_len)

def gelu(x):
    return x*0.5*(1.0+torch.erf(x/math.sqrt(2.0)))

In [214]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        #定义超参数
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)
    
    def forward(self,x,seg):     
        #input_x：是一个batch=6个句子对，每个句子对的所有词的idx组成一个list，即二维
        #input_pos：每个句子对的位置索引list，没有传入，在下面这三行生成
        #下面这三行相当于把pos(input)扩展成和input_x,input_segment一样的shape
        
        #第二维大小=maxlen=30
        seq_len = x.size(1)
        #取这个batch中所有位置索引[0,1,2...,29]
        pos = torch.arange(seq_len, dtype=torch.long)
        #unsqueeze在第一个维度前增加一个维度，E.g.shape[1,2]->shape[1,1,2]
        #expand_as将pos扩展成和x一样的shape
        pos = pos.unsqueeze(0).expand_as(x)
        
        #调用自带的embedding函数，分别对input_x,input_pos,input_seg生成三个embed值，累加
        embedding = self.tok_embed(x)+self.pos_embed(pos)+self.seg_embed(seg)
        #归一化
        return self.norm(embedding)
        

In [215]:
i = 0
for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
    if i==1:
        break
#     print("input_ids:",input_ids)
#     print("\nsegment_ids:",segment_ids)
#     print("\nmasked_tokens:",masked_tokens)
#     print("\nmasked_pos:",masked_pos)
#     print("\nisNext:",isNext)
    i+=1
    
    seq_len = input_ids.size(1)#第二维大小=30
    pos = torch.arange(seq_len, dtype=torch.long)
    print(pos)
    print(pos.unsqueeze(0))
    print(input_ids)
    print(pos.unsqueeze(0).expand_as(input_ids))
    
    tok_embed = nn.Embedding(vocab_size, d_model)
    print("tok_embed:",tok_embed(input_ids))

    pos = pos.unsqueeze(0).expand_as(input_ids)
    pos_embed = nn.Embedding(maxlen, d_model)
    print("pos_embed:",pos_embed(pos))

    seg_embed = nn.Embedding(n_segments, d_model)
    print("seg_embed:",seg_embed(segment_ids))

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])
tensor([[ 1,  8, 36, 27, 13, 39, 33, 34,  2, 21, 38, 18, 14, 10, 23,  3,  2,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 28, 13, 34,  2,  4, 27, 13, 35, 20,  2,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  4, 27, 13,  3,  3,  2, 21, 38, 18, 14, 10, 23,  7,  2,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 39,  3, 35, 11, 31, 29, 13,  2, 39, 33, 35,  3, 30, 38, 17,  5, 22,
         16,  3, 12,  2,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 39, 33, 35,  3, 31, 29, 13,  2,  3, 33, 35, 11, 31, 29, 13,  2,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  9,  3, 15,  2,

In [216]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size, n_heads, seq_len, seq_len]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context

In [217]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention,self).__init__()
        self.Model_W_Q = nn.Linear(d_model,d_k * n_heads)
        self.Model_W_K = nn.Linear(d_model,d_k * n_heads)#d_k==d_q
        self.Model_W_V = nn.Linear(d_model,d_v * n_heads)
    
    def forward(self,Q,K,V,attn_mask):
        #self-attention:Q=K=V
        residual, batch_size = Q, Q.size(0)
        
        q_s = self.Model_W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size, n_heads, seq_len, d_k]
        k_s = self.Model_W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size, n_heads, seq_len, d_k]
        v_s = self.Model_W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size, n_heads, seq_len, d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]

        # context: [batch_size, n_heads, seq_len, d_v], attn: [batch_size, n_heads, seq_len, seq_len]
        context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size, seq_len, n_heads, d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual) # output: [batch_size, seq_len, d_model]

        

In [218]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet,self).__init__()
        self.fc1 = nn.Linear(d_model,d_ff)
        self.fc2 = nn.Linear(d_ff,d_model)
    
    def forward(self, x):
        # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
        return self.fc2(gelu(self.fc1(x)))

In [219]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer,self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()
    
    def forward(self,enc_inputs,enc_self_attn_mask):
        enc_outputs = self.enc_self_attn(enc_inputs,enc_inputs,enc_inputs,enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, seq_len, d_model]
        return enc_outputs

In [220]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT,self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        #nn.Sequential(API):一个有序的容器，神经网络模块将按照在传入构造器的顺序依次被添加到计算图中执行，同时以神经网络模块为元素的有序字典也可以作为传入参数。
        self.fc = nn.Sequential(
            nn.Linear(d_model,d_model),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(d_model,2)
        self.linear = nn.Linear(d_model,d_model)
        self.activ2 = gelu
        embed_weight = self.embedding.tok_embed.weight
        self.fc2 = nn.Linear(d_model,vocab_size,bias=False)
        self.fc2.weight = embed_weight
    
    def forward(self,input_ids,segment_ids,masked_pos):
        output = self.embedding(input_ids,segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids,input_ids)
        for layer in self.layers:
            output = layer(output,enc_self_attn_mask)
        
        h_pooled = self.fc(output[:, 0]) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2] predict isNext

        masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked = self.activ2(self.linear(h_masked)) # [batch_size, max_pred, d_model]
        logits_lm = self.fc2(h_masked) # [batch_size, max_pred, vocab_size]
        return logits_lm, logits_clsf
        

In [221]:
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.001)

In [222]:
for epoch in range(180):
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
      logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
      loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1)) # for masked LM
      loss_lm = (loss_lm.float()).mean()
      loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
      loss = loss_lm + loss_clsf
      if (epoch + 1) % 10 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


Epoch: 0010 loss = 1.374374
Epoch: 0020 loss = 1.003074
Epoch: 0030 loss = 0.881680
Epoch: 0040 loss = 0.843361
Epoch: 0050 loss = 0.823525
Epoch: 0060 loss = 0.805731
Epoch: 0070 loss = 0.837577
Epoch: 0080 loss = 0.847171
Epoch: 0090 loss = 0.815844
Epoch: 0100 loss = 0.822766
Epoch: 0110 loss = 0.787375
Epoch: 0120 loss = 0.821672
Epoch: 0130 loss = 0.818850
Epoch: 0140 loss = 0.810293
Epoch: 0150 loss = 0.829645
Epoch: 0160 loss = 0.795248
Epoch: 0170 loss = 0.822731
Epoch: 0180 loss = 0.800113
