In [1]:
import torch
from torch import nn
from d2l import torch as d2l
import pandas as pd

In [3]:
def get_tokens_and_segment(token_a, token_b=None):
    """获取输入序列的词元及其片段索引"""
    tokens = ['<cls>'] + token_a + ['<sep>']
    segments = [0]*(len(tokens_a)+2)
    if token_b is not None:
        tokens += tokens_b + ['<sep>']
        segments += [1]*(len(tokens)+1)
    return tokens, segments

In [9]:
class BERTEncoder(nn.Module):
    "Bert编码器：输入序列的嵌入是词元嵌入、片段嵌入和位置嵌入的和"
    def __init__(self,vocab_size,num_hiddens,norm_shape,ffn_num_input,
                ffn_num_hiddens,num_heads, num_layers,dropout,
                 max_len = 1000, key_size=768,query_size=768,value_size=768,**kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.token_embedding = nn.Embedding(vocab_size,num_hiddens)
        self.segment_embedding = nn.Embedding(2,num_hiddens)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module(f"{i}",d2l.EncoderBlock(
                key_size,query_size,value_size,num_hiddens,norm_shape,
                ffn_num_input,ffn_num_hiddens,num_heads,dropout,True))
        # 在BERT中，位置嵌入是可学习的，因此我们创建一个足够长的位置嵌入参数
        self.pos_embedding = nn.Parameter(torch.randn(1,max_len,num_hiddens))
        
    def forward(self,tokens, segments, valid_lens):
        # 在以下代码段中，X的形状保持不变：（批量大小，最大序列长度，num_hiddens）
        X = self.token_embedding(tokens) + self.segment_embedding(segments)
        X = X + self.pos_embedding.data[:, :X.shape[1],:]
        for blk in self.blks:
            X = blk(X, valid_lens)
        return X
                                 

In [25]:
#
vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024, 4
norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2
encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input,
                      ffn_num_hiddens, num_heads, num_layers, dropout)

#
tokens = torch.randint(0,vocab_size,(2,8))
segments = torch.tensor([[0,0,0,0,1,1,1,1],
                        [0,0,0,1,1,1,1,1]])
encoded_X = encoder(tokens,segments,None)
encoded_X.shape

torch.Size([2, 8, 768])

# 预训练任务
## 1.掩蔽语言模型（Masked Language Modeling）

In [26]:
class MaskLM(nn.Module):
    """Bert的掩码语言模型任务"""
    def __init__(self, vocab_size, num_hiddens,num_inputs=768,**kwargs):
        super(MaskLM,self).__init__(**kwargs)
        self.mlp = nn.Sequential(nn.Linear(num_inputs,num_hiddens),
                                nn.ReLU(),
                                 nn.LayerNorm(num_hiddens),
                                nn.Linear(num_hiddens,vocab_size))
        
    def forward(self, X, pred_positions):
        num_pred_positions = pred_positions.shape[1]
        print(num_pred_positions)
        pred_positions = pred_positions.reshape(-1)
        batch_size = X.shape[0]
        print(pred_positions)
        batch_idx = torch.arange(0,batch_size)
        # 假设batch_size=2，num_pred_positions=3
        # 那么batch_idx是np.array（[0,0,0,1,1,1]）
        batch_idx = torch.repeat_interleave(batch_idx, num_pred_positions)
        masked_X = X[batch_idx, pred_positions]
        print(masked_X.shape)
        masked_X = masked_X.reshape(batch_size, num_pred_positions,-1)
        print(masked_X.shape)
        mlm_Y_hat = self.mlp(masked_X)
        return mlm_Y_hat
    
mlm = MaskLM(vocab_size, num_hiddens)
mlm_positions = torch.tensor([[1, 5, 2], [6, 1, 5]])
mlm_Y_hat = mlm(encoded_X, mlm_positions)
mlm_Y_hat.shape

3
tensor([1, 5, 2, 6, 1, 5])
torch.Size([6, 768])
torch.Size([2, 3, 768])


torch.Size([2, 3, 10000])

In [27]:
# 计算真实标签mlm_Y与预测词元mlm_Y_hat之间的交叉熵损失
mlm_Y = torch.tensor([[7, 8, 9], [10, 20, 30]])
loss = nn.CrossEntropyLoss(reduction='none')
mlm_l = loss(mlm_Y_hat.reshape((-1, vocab_size)), mlm_Y.reshape(-1))
mlm_l.shape

torch.Size([6])

# 下一句预测（Next Sentence Prediction）

In [28]:
class NextSentencePred(nn.Module):
    """Bert的下一句预测任务,返回每个BERT输入序列的二分类预测"""
    def __init__(self, num_inputs, **kwargs):
        super(NextSentencePred,self).__init__(**kwargs)
        self.output = nn.Linear(num_inputs,2)
        
    def forward(self, X):
        # X的形状：(batchsize,num_hiddens)
        return self.output(X)

In [30]:
encoded_X = torch.flatten(encoded_X, start_dim=1)
print(encoded_X.shape)
# NSP的输入形状:(batchsize，num_hiddens)
nsp = NextSentencePred(encoded_X.shape[-1])
nsp_Y_hat = nsp(encoded_X)
nsp_Y_hat.shape

torch.Size([2, 6144])


torch.Size([2, 2])

In [31]:
# 计算两个二元分类的交叉熵损失
nsp_y = torch.tensor([0, 1])
nsp_l = loss(nsp_Y_hat, nsp_y)
nsp_l.shape

torch.Size([2])

# 整合代码
预训练BERT时，最终的损失函数是掩蔽语言模型损失函数和下一句预测损失函数的线性组合

In [40]:
class BERTModel(nn.Module):
    """Bert模型"""
    def __init__(self, vocab_size, num_hiddens, norm_shape,ffn_num_input,
                ffn_num_hiddens, num_heads, num_layers,dropout,
                max_len = 1000, key_size=768,query_size=768, value_size=768,
                 hid_in_features=768, mlm_in_features=768, nsp_in_features=768):
        super(BERTModel,self).__init__()
        self.encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input,
                      ffn_num_hiddens, num_heads, num_layers, dropout,max_len=max_len,
                                  key_size=key_size,query_size=query_size,value_size=value)
        self.hidden = nn.Sequential(nn.Linear(hid_in_features,num_hiddens),
                                   nn.Tanh())
        self.mlm = MaskLM(vocab_size, num_hiddens, mlm_in_features)
        self.nsp = NextSentencePred(nsp_in_features)
        
    def forward(self, tokens, segments, valid_lens = None,
               pred_positions = None):
        encoded_X = self.encoder(tokens, segments, valid_lens)
        if pred_positions is not None:
            mlm_Y_hat = self.mlm(encoded_X, pred_positions)
        else:
            mlm_Y_hat = None
        # 用于下一句预测的多层感知机分类器的隐藏层，0是“<cls>”标记的索引
        nsp_Y_hat = self.nsp(self.hidden(encoder_X[:,0,:]))
        return encoded_X, mlm_Y_hat, nsp_Y_hat
        

# 微调BERT——自然语言推断
## 1. 加载预训练的BERT

In [2]:
import json
import multiprocessing
import os
import torch
from torch import nn
from d2l import torch as d2l

In [3]:
# 加载预训练的BERT
# “bert.base”与原始的BERT基础模型一样大，需要大量的计算资源才能进行微调，
# “bert.small”是一个小版本

d2l.DATA_HUB['bert.base'] = (d2l.DATA_URL + 'bert.base.torch.zip',
                             '225d66f04cae318b841a13d32af3acc165f253ac')
d2l.DATA_HUB['bert.small'] = (d2l.DATA_URL + 'bert.small.torch.zip',
                              'c72329e68a732bef0452e4b96a1c341c8910f81f')

In [4]:
d2l.DATA_HUB['bert.small']

('http://d2l-data.s3-accelerate.amazonaws.com/bert.small.torch.zip',
 'c72329e68a732bef0452e4b96a1c341c8910f81f')

In [5]:
def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                         num_heads, num_layers, dropout, max_len, device):
    data_dir = d2l.download_extract(pretrained_model)
    # 定义空词表以加载预定义词表
    vocab = d2l.Vocab()
    vocab.idx_to_token = json.load(open(os.path.join(data_dir,
                                                     'vocab.json')))
    vocab.token_to_idx = {token:idx 
                          for idx, token in enumerate(vocab.idx_to_token)}
    bert = d2l.BERTModel(len(vocab), num_hiddens, norm_shape = [256],
                        ffn_num_input=256, ffn_num_hiddens=ffn_num_hiddens,
                        num_heads=4, num_layers=2, dropout=0.2,
                        max_len = max_len, key_size=256, query_size=256,
                        value_size=256, hid_in_features=256,
                        mlm_in_features=256, nsp_in_features=256)
    # 加载预训练BERT参数
    bert.load_state_dict(torch.load(os.path.join(data_dir,
                                                 'pretrained.params')))
    return bert, vocab

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert, vocab = load_pretrained_model('bert.small', num_hiddens=256, 
                                    ffn_num_hiddens=512, num_heads=4,
                                   num_layers=2,dropout=0.1,max_len=512,device=device)

## 2. 微调BERT的数据集

In [18]:
class SNLIBERTDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, max_len, vocab=None):
        all_premise_hypothesis_tokens = [[
            p_tokens, h_tokens] for p_tokens, h_tokens in zip(
            *[d2l.tokenize([s.lower() for s in sentences])
              for sentences in dataset[:2]])]

        self.labels = torch.tensor(dataset[2])
        self.vocab = vocab
        self.max_len = max_len
        (self.all_token_ids, self.all_segments,
         self.valid_lens) = self._preprocess(all_premise_hypothesis_tokens)
        print('read ' + str(len(self.all_token_ids)) + ' examples')

    def _preprocess(self, all_premise_hypothesis_tokens):
#         pool = multiprocessing.Pool(4)  # 使用4个进程
#         out = pool.map(self._mp_worker, all_premise_hypothesis_tokens)
#         all_token_ids = [
#             token_ids for token_ids, segments, valid_len in out]
#         all_segments = [segments for token_ids, segments, valid_len in out]
#         valid_lens = [valid_len for token_ids, segments, valid_len in out]
        all_token_ids ,ll_segments,valid_lens =\
            self._mp_worker(all_premise_hypothesis_tokens)
        return (torch.tensor(all_token_ids, dtype=torch.long),
                torch.tensor(all_segments, dtype=torch.long),
                torch.tensor(valid_lens))

    def _mp_worker(self, premise_hypothesis_tokens):
#         p_tokens, h_tokens = premise_hypothesis_tokens
        p_tokens = [pre_hyp[0] for pre_hyp in premise_hypothesis_tokens]
        h_tokens = [pre_hyp[1] for pre_hyp in premise_hypothesis_tokens]
        self._truncate_pair_of_tokens(p_tokens, h_tokens)
        tokens, segments = d2l.get_tokens_and_segments(p_tokens, h_tokens)
        token_ids = self.vocab[tokens] + [self.vocab['<pad>']] \
                             * (self.max_len - len(tokens))
        segments = segments + [0] * (self.max_len - len(segments))
        valid_len = len(tokens)
        return token_ids, segments, valid_len

    def _truncate_pair_of_tokens(self, p_tokens, h_tokens):
        # 为BERT输入中的'<CLS>'、'<SEP>'和'<SEP>'词元保留位置
        while len(p_tokens) + len(h_tokens) > self.max_len - 3:
            if len(p_tokens) > len(h_tokens):
                p_tokens.pop()
            else:
                h_tokens.pop()

    def __getitem__(self, idx):
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx]), self.labels[idx]

    def __len__(self):
        return len(self.all_token_ids)

In [19]:
# 预处理
def preprocess_nmt(text):
    '''在字符与标点符号之间添加空格'''
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '
    #用空格代替不间断空格（non-breaking space）\xa0 是不间断空白符 
    text = text.replace('\u202f',' ').replace('\xa0',' ').lower()
    #在单词和标点符号之间插入空格
    out = [' ' + char if i > 0 and no_space(char,text[i-1]) 
          else char for i, char in enumerate(text)]
    return ''.join(out)

def read_snli(data_dir, is_train):
    """将SNLI数据集解析为前提、假设和标签"""
    label_set = {'entailment':0, 'contradiction':1 ,'neutral':2}
    file_name = os.path.join(data_dir,'snli_1.0_train.csv'
                            if is_train else 'snli_1.0_test.csv')
    
    data = pd.read_csv(file_name)[['gold_label','sentence1','sentence2']]
    data = data[data['gold_label']!='-']
    premises = data['sentence1'].map(preprocess_nmt).values
    hypotheses = data['sentence2'].astype(str).map(preprocess_nmt).values
    labels = data['gold_label'].map(label_set).values
    return premises, hypotheses, labels


is_train = True
data_dir = 'archive'
batch_size, max_len = 256, 64
train_data = read_snli(data_dir, is_train=True)
test_data = read_snli(data_dir, is_train=False)

train_set = SNLIBERTDataset(train_data, max_len, vocab)
test_set = SNLIBERTDataset(test_data, max_len, vocab)
# train_iter = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True)
# test_iter = torch.utils.data.DataLoader(test_set, batch_size, shuffle=False)

TypeError: an integer is required (got type list)

## 3. 微调BERT

In [22]:
class BERTClassifier(nn.Module):
    def __init__(sefl, bert):
        super(BERTClassifier, self).__init__()
        self.encoder = bert.encoder
        self.hidden = bert.hidden
        self.output = nn.Linear(256, 3)
        
    def forward(self, inputs):
        tokens_X, segments_X, valid_lens_X = inputs
        encoded_X = self.encoder(tokens_X, segments_X, valid_lens_X)
        return self.output(self.hidden(encoded_X[:,0,:]))
    

In [21]:
net = BERTClassifier(bert)
lr, num_epochs = 1e-4, 5
trainer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss(reduction='none')
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,device)

[1, 2]