In [5]:
import torch
from transformers import BertForMaskedLM, BertTokenizer
import torch.nn as nn
import transformers as tfs
from transformers.models.bert.modeling_bert import BertEmbeddings, BertEncoder, BertOnlyMLMHead

In [2]:
class BertMLM1(nn.Module):
    """Masked-language model that simply wraps a pretrained ``BertForMaskedLM``.

    The downloaded pretrained checkpoint is used to initialise the MLM model.
    Conceptually the wrapped model has three parts: (1) the embedding layer,
    (2) the stack of encoder layers (12 for a base-sized BERT) and (3) the
    linear classification head ``cls``. Different downstream tasks usually
    replace only the third part. MLM outputs a probability for every
    vocabulary entry at every position, so it can be viewed as a
    classification task.
    """

    def __init__(self, bert_path):
        """Load the pretrained model found at ``bert_path``.

        Args:
            bert_path: Value forwarded to ``BertForMaskedLM.from_pretrained``.
        """
        super(BertMLM1, self).__init__()
        # Load the pretrained BERT and use its parameters to initialise the MLM.
        self.bert = BertForMaskedLM.from_pretrained(bert_path)

    def forward(self, input_ids, input_tyi, input_attention_mask):
        """Return the MLM logits for a batch.

        Per the original notes, all three inputs have shape
        ``[batch_size, max_len]`` and are produced by ``BertTokenizer``.

        Args:
            input_ids: Token ids.
            input_tyi: Token type (segment) ids.
            input_attention_mask: Attention mask.

        Returns:
            The ``'logits'`` entry of the wrapped model's output.
        """
        # The wrapped model names this argument ``token_type_ids``; passing it
        # under any other keyword means the segment ids are rejected or ignored.
        out = self.bert(
            input_ids=input_ids,
            token_type_ids=input_tyi,
            attention_mask=input_attention_mask,
        )
        return out['logits']


class BertMLM2(nn.Module):
    """MLM assembled from the three BERT building blocks.

    The model is built from ``BertEmbeddings``, ``BertEncoder`` and
    ``BertOnlyMLMHead`` using only the configuration found at ``bert_path``;
    no pretrained weights are copied in by this class.
    """

    def __init__(self, bert_path):
        """Build the three sub-modules from the configuration at ``bert_path``.

        Args:
            bert_path: Value forwarded to ``AutoConfig.from_pretrained``. Per
                the original notes this is the model folder (the one holding
                ``config.json``), not the weight file itself.
        """
        super(BertMLM2, self).__init__()
        # Load the hyper-parameters.
        config = tfs.AutoConfig.from_pretrained(bert_path)
        self.embedding = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.cls = BertOnlyMLMHead(config)

    @staticmethod
    def _to_additive_mask(attention_mask, dtype):
        """Turn a 0/1 keep-mask into the additive mask the encoder consumes.

        Accepts ``[batch_size, max_len]`` or ``[batch_size, 1, 1, max_len]``
        and returns a ``[batch_size, 1, 1, max_len]`` tensor holding ``0`` at
        positions to attend to and the most negative value of ``dtype`` at
        positions to ignore.
        """
        if attention_mask.dim() == 4:
            # Collapse the two singleton axes so both layouts share one path.
            attention_mask = attention_mask[:, 0, 0, :]
        keep = attention_mask[:, None, None, :].to(dtype=dtype)
        return (1.0 - keep) * torch.finfo(dtype).min

    def forward(self, input_ids, input_tyi, input_attention_mask):
        """Return vocabulary logits for every position.

        Args:
            input_ids: Token ids.
            input_tyi: Token type (segment) ids.
            input_attention_mask: 0/1 mask (1 = real token), shaped
                ``[batch_size, max_len]`` or ``[batch_size, 1, 1, max_len]``.

        Returns:
            The output of the MLM head. Per the original notes its shape is
            ``[batch_size, max_len, vocab_size]``: for each of the
            ``batch_size`` sentences and each of its ``max_len`` tokens, a
            score for every entry of the vocabulary.
        """
        embeddings = self.embedding(input_ids=input_ids, token_type_ids=input_tyi)
        # The encoder adds the mask to the raw attention scores, so it needs a
        # broadcastable additive mask rather than the tokenizer's 0/1 mask.
        additive_mask = self._to_additive_mask(input_attention_mask, embeddings.dtype)

        encoder_out = self.encoder(hidden_states=embeddings, attention_mask=additive_mask)
        out_bert = encoder_out[0]
        cls_out = self.cls(out_bert)
        return cls_out

In [8]:
# Compare the parameter names of the full pretrained model with those of the
# three stand-alone sub-modules used by BertMLM2.
bert_path = 'bert-base-chinese'
bert_model = BertForMaskedLM.from_pretrained(bert_path)
print("下面是bert模型可供调用查看的参数名")
print(bert_model.state_dict().keys())

mlm_model = BertMLM2(bert_path)
for header, module in (
    ("下面是BertEmbeddings模型可供调用查看的参数名", mlm_model.embedding),
    ("下面是BertEncoder模型可供调用查看的参数名", mlm_model.encoder),
    ("下面是BertOnlyMLMHead模型可供调用查看的参数名", mlm_model.cls),
):
    print(header)
    print(module.state_dict().keys())
# The full model's parameter names differ from those of the three sub-modules,
# so the names have to be rewritten before the values can be copied across.

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


下面是bert模型可供调用查看的参数名
odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.ou

In [9]:
# Drop the first j dot-separated prefixes from a key.
def rm_forward(key_str, j):
    """Return ``key_str`` with its first ``j`` dot-separated components removed.

    Args:
        key_str: Dotted name such as ``'bert.embeddings.LayerNorm.weight'``.
        j: Number of leading components to discard.

    Returns:
        The remaining components re-joined with ``'.'``; an empty string when
        ``j`` is at least the number of components.
    """
    remaining_parts = key_str.split('.')[j:]
    return '.'.join(remaining_parts)

class BertMLM(nn.Module):
    """MLM built from separate BERT sub-modules, initialised with pretrained weights.

    ``BertEmbeddings``, ``BertEncoder`` and ``BertOnlyMLMHead`` are created
    from the configuration and then filled with the matching tensors of a
    pretrained ``BertForMaskedLM``.
    """

    def __init__(self, bert_path, device=None):
        """Build the sub-modules and copy the pretrained weights into them.

        Args:
            bert_path: Value forwarded to ``AutoConfig.from_pretrained`` and
                ``BertForMaskedLM.from_pretrained``.
            device: Device to place the model on. ``None`` (default) selects
                ``'cuda'`` when available and ``'cpu'`` otherwise.

        Raises:
            KeyError: If a pretrained parameter name has no destination in
                the three sub-modules.
        """
        super(BertMLM, self).__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        config = tfs.AutoConfig.from_pretrained(bert_path)
        self.embedding = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.cls = BertOnlyMLMHead(config)
        # Only needed as the source of the weights; kept as an attribute so
        # existing code that accesses ``.bert`` keeps working.
        self.bert = BertForMaskedLM.from_pretrained(bert_path)

        # Route every pretrained tensor by its name prefix instead of by its
        # position in the state dict: the number and order of entries is an
        # implementation detail of the installed transformers release.
        # Each entry: (prefix, number of leading name components to strip,
        # destination state dict - built once, its tensors share storage with
        # the sub-module's parameters so copy_ updates them in place).
        routes = (
            ('bert.embeddings.', 2, self.embedding.state_dict()),
            ('bert.encoder.', 2, self.encoder.state_dict()),
            ('cls.', 1, self.cls.state_dict()),
        )
        with torch.no_grad():
            for key, value in self.bert.state_dict().items():
                for prefix, depth, target in routes:
                    if key.startswith(prefix):
                        target[rm_forward(key, depth)].copy_(value)
                        break
                else:
                    raise KeyError('no destination for pretrained parameter: ' + key)

        self.to(device)
        # Sub-modules constructed directly start in training mode (dropout
        # active); switch to eval so inference is deterministic. Call
        # ``.train()`` explicitly before fine-tuning.
        self.eval()

    @staticmethod
    def _to_additive_mask(attention_mask, dtype):
        """Turn a 0/1 keep-mask into the additive mask the encoder consumes.

        Accepts ``[batch_size, max_len]`` or ``[batch_size, 1, 1, max_len]``
        and returns a ``[batch_size, 1, 1, max_len]`` tensor holding ``0`` at
        positions to attend to and the most negative value of ``dtype`` at
        positions to ignore.
        """
        if attention_mask.dim() == 4:
            attention_mask = attention_mask[:, 0, 0, :]
        keep = attention_mask[:, None, None, :].to(dtype=dtype)
        return (1.0 - keep) * torch.finfo(dtype).min

    def forward(self, input_ids, input_tyi, input_attention_mask, mask_ids):
        """Return vocabulary logits at the masked positions of the first sentence.

        Args:
            input_ids: Token ids.
            input_tyi: Token type (segment) ids.
            input_attention_mask: 0/1 mask (1 = real token), shaped
                ``[batch_size, max_len]`` or ``[batch_size, 1, 1, max_len]``.
            mask_ids: Positions to read the predictions from.

        Returns:
            ``cls_out[0][mask_ids]``: the MLM-head output of the first batch
            element, indexed by ``mask_ids``. Per the original notes the full
            head output has shape ``[batch_size, max_len, vocab_size]``.
        """
        embeddings = self.embedding(input_ids=input_ids, token_type_ids=input_tyi)
        # The encoder adds the mask to the raw attention scores, so it needs a
        # broadcastable additive mask rather than the tokenizer's 0/1 mask.
        additive_mask = self._to_additive_mask(input_attention_mask, embeddings.dtype)

        encoder_out = self.encoder(hidden_states=embeddings, attention_mask=additive_mask)
        out_bert = encoder_out[0]
        cls_out = self.cls(out_bert)
        return cls_out[0][mask_ids]

In [10]:
# Demo: mask two characters of a sentence and print the model's top-10
# candidates for each masked position.
tokenizer = BertTokenizer.from_pretrained(bert_path)
text = "[CLS]中国的首都是北京[SEP]"
tokenized_text = tokenizer.tokenize(text)  # splits the sentence into a list of tokens
word_to_mask_ids = [7, 8]  # positions of the tokens to replace with [MASK]
for mask_pos in word_to_mask_ids:  # avoid shadowing the builtin `id`
    tokenized_text[mask_pos] = '[MASK]'
print(tokenized_text)

# Map tokens to vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Single-sentence input: every token is segment 0. Sized from the actual
# token count so the cell keeps working when `text` changes.
segment_ids = [0] * len(indexed_tokens)

bert_mlm = BertMLM(bert_path)
bert_mlm.eval()  # disable dropout so the predictions are deterministic
# Put the inputs wherever the model's weights live instead of assuming a GPU.
device = next(bert_mlm.parameters()).device

token_tensor = torch.tensor([indexed_tokens]).to(device)
segment_tensor = torch.tensor([segment_ids]).to(device)
attention_mask = torch.ones_like(token_tensor)  # no padding: attend to every token

with torch.no_grad():  # inference only, no autograd graph needed
    cls_out = bert_mlm(token_tensor, segment_tensor, attention_mask, word_to_mask_ids)
prediction_ids = torch.topk(cls_out, 10, sorted=True).indices.cpu().numpy()
print(prediction_ids)
prediction = [tokenizer.convert_ids_to_tokens(row) for row in prediction_ids]
print(prediction)

['[CLS]', '中', '国', '的', '首', '都', '是', '[MASK]', '[MASK]', '[SEP]']


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[ 784 6443 1525 1567 2124 3862 8106  776 2255  862]
 [8043  511 8013  136 8024 1408 8038 1450 8106  720]]
[['什', '谁', '哪', '啥', '它', '海', '...', '京', '山', '何'], ['？', '。', '！', '?', '，', '吗', '：', '呢', '...', '么']]
