#### 掩码语言模型（Masked Language Model，MLM）是 BERT 模型的核心预训练任务。通过在输入文本中随机遮掩部分词语并让模型预测被遮掩的词，模型能够有效地学习词语的上下文语义。

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

In [2]:
# Multi-head self-attention module
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding is split into ``heads`` chunks of ``head_dim`` features.
    A single bias-free ``head_dim -> head_dim`` projection per role
    (values / keys / queries) is applied to every head, and the concatenated
    heads go through a final linear layer.
    """

    def __init__(self, embedding_size: int, heads: int):
        """
        Args:
            embedding_size: dimension of the embedding vectors
            heads: number of attention heads

        Raises:
            AssertionError: if ``embedding_size`` is not divisible by ``heads``.
        """
        super(SelfAttention, self).__init__()
        self.embedding_size = embedding_size
        self.heads = heads
        self.head_dim = embedding_size // heads
        assert (
            self.head_dim * heads == embedding_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embedding_size)

    def forward(self, values, keys, query, mask):
        """
        Args:
            values: shape (N, value_len, embedding_size)
            keys: shape (N, key_len, embedding_size)
            query: shape (N, query_len, embedding_size)
            mask: ``None`` (attend everywhere) or a tensor in which 0 marks
                positions that must not be attended to. Accepted shapes:
                (N, key_len), (N, query_len, key_len), or any 4-D shape that
                broadcasts against (N, heads, query_len, key_len).
        Returns:
            out: shape (N, query_len, embedding_size)
        Raises:
            ValueError: if ``mask`` is a tensor whose rank is not 2, 3 or 4.
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into heads, then project each head.
        values = self.values(values.reshape(N, value_len, self.heads, self.head_dim))
        keys = self.keys(keys.reshape(N, key_len, self.heads, self.head_dim))
        queries = self.queries(query.reshape(N, query_len, self.heads, self.head_dim))

        # Attention scores. The dot product runs over head_dim features, so the
        # scale factor is sqrt(head_dim), not sqrt(embedding_size).
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys) / (self.head_dim ** 0.5)

        if mask is not None:
            # Bring the mask to a rank-4 shape that broadcasts over heads
            # (and over query positions for a plain padding mask).
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(2)  # (N, 1, 1, key_len)
            elif mask.dim() == 3:
                mask = mask.unsqueeze(1)  # (N, 1, query_len, key_len)
            elif mask.dim() != 4:
                raise ValueError(
                    f"mask must have 2, 3 or 4 dimensions, got {mask.dim()}"
                )
            # Use the most negative value representable in the score dtype so
            # the fill also works for reduced-precision scores.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        attention = torch.softmax(energy, dim=-1)

        # Weighted sum of the values, then merge the heads back together.
        out = torch.einsum("nhqk,nkhd->nqhd", attention, values).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out
    

# Position-wise feed-forward module
class FeedForward(nn.Module):
    """Two linear layers with a ReLU in between, applied to the last dimension."""

    def __init__(self, embedding_size: int, forward_expansion: int):
        """
        Args:
            embedding_size: dimension of the embedding vectors
            forward_expansion: multiplier giving the hidden width
                (``forward_expansion * embedding_size``)
        """
        super().__init__()
        hidden_size = forward_expansion * embedding_size
        # Expand to the hidden width, then project back to the embedding width.
        self.fc1 = nn.Linear(embedding_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, embedding_size)

    def forward(self, x):
        """
        Args:
            x: shape (N, seq_len, embedding_size)
        Returns:
            out: shape (N, seq_len, embedding_size)
        """
        return self.fc2(F.relu(self.fc1(x)))
    

# Transformer encoder block
class TransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward network.

    Each sub-layer is wrapped as ``dropout(layer_norm(sublayer(h) + h))``.
    """

    def __init__(
        self,
        embedding_size: int,
        heads: int,
        forward_expansion: int,
        dropout: float=0.1,
    ):
        """
        Args:
            embedding_size: dimension of the embedding vectors
            heads: number of attention heads
            forward_expansion: hidden-width multiplier of the feed-forward network
            dropout: dropout probability
        """
        super().__init__()
        self.attention = SelfAttention(embedding_size, heads)
        self.norm1 = nn.LayerNorm(embedding_size)
        self.norm2 = nn.LayerNorm(embedding_size)
        self.feed_forward = FeedForward(embedding_size, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """
        Args:
            x: shape (N, seq_len, embedding_size); used as values, keys and
                query of the attention layer
            mask: attention mask, forwarded unchanged to ``SelfAttention.forward``
        Returns:
            out: shape (N, seq_len, embedding_size)
        """
        # Attention sub-layer: residual connection, layer norm, then dropout.
        attended = self.attention(x, x, x, mask)
        hidden = self.dropout(self.norm1(attended + x))
        # Feed-forward sub-layer; its residual uses the already dropped-out
        # activations from the attention sub-layer.
        projected = self.feed_forward(hidden)
        return self.dropout(self.norm2(projected + hidden))
    

# BERT encoder
class BERTEncoder(nn.Module):
    """A stack of ``num_layers`` identically configured ``TransformerBlock`` layers."""

    def __init__(self, embedding_size: int, heads: int, forward_expansion: int, num_layers: int, dropout: float=0.1):
        """
        Args:
            embedding_size: dimension of the embedding vectors
            heads: number of attention heads
            forward_expansion: hidden-width multiplier of the feed-forward network
            num_layers: number of Transformer blocks
            dropout: dropout probability
        """
        super().__init__()
        blocks = [
            TransformerBlock(
                embedding_size=embedding_size,
                heads=heads,
                forward_expansion=forward_expansion,
                dropout=dropout,
            )
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, mask):
        """
        Args:
            x: shape (N, seq_len, embedding_size)
            mask: attention mask, handed to every block unchanged
        Returns:
            out: shape (N, seq_len, embedding_size)
        """
        hidden = x
        # Each block consumes the previous block's output.
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden

In [3]:
# Build the inputs and labels for the masked-language-model task
def create_masked_lm_data(inputs, vocab_size, mask_token_id, pad_token_id, mlm_probability=0.15):
    """Corrupt a batch of token ids for MLM training.

    Every non-padding position is selected with probability ``mlm_probability``.
    A selected position is replaced by ``mask_token_id`` (80%), replaced by a
    random id in ``[0, vocab_size - 1]`` (10%), or left unchanged (10%).
    ``inputs`` itself is not modified. Randomness comes from Python's
    ``random`` module.

    Args:
        inputs: shape (N, seq_len), token ids
        vocab_size: size of the vocabulary
        mask_token_id: id of the mask token
        pad_token_id: id of the padding token
        mlm_probability: probability of selecting a position
    Returns:
        inputs_masked: shape (N, seq_len), the corrupted token ids
        labels: shape (N, seq_len), original id at selected positions and
            -100 everywhere else (padding included)
    """
    ignore_label = -100
    corrupted = inputs.clone()
    targets = inputs.clone()
    num_rows, num_cols = inputs.shape[0], inputs.shape[1]

    for row in range(num_rows):
        for col in range(num_cols):
            # Padding is never selected and never contributes to the loss.
            if inputs[row, col] == pad_token_id:
                targets[row, col] = ignore_label
                continue

            draw = random.random()
            if not draw < mlm_probability:
                # Position not selected: excluded from the loss.
                targets[row, col] = ignore_label
                continue

            # Rescale the draw to [0, 1) to pick one of the three treatments.
            choice = draw / mlm_probability
            if choice < 0.8:
                corrupted[row, col] = mask_token_id
            elif choice < 0.9:
                corrupted[row, col] = random.randint(0, vocab_size - 1)
            # Otherwise the token is kept; the label already holds the original id.

    return corrupted, targets

    # labels = inputs.clone()
    # probability_matrix = torch.full(labels.shape, mlm_probability)
    # special_tokens_mask = (inputs == pad_token_id)
    # probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    # masked_indices = torch.bernoulli(probability_matrix).bool()
    # labels[~masked_indices] = -100  # 只计算掩码位置的损失

    # # 80%的时间替换为掩码标记
    # indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    # inputs[indices_replaced] = mask_token_id

    # # 10%的时间替换为随机标记
    # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    # random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    # inputs[indices_random] = random_words[indices_random]

    # # 剩下的10%保持不变

    # return inputs, labels

In [4]:
# Masked-language-model task
class MLMTask(nn.Module):
    """Token embedding -> ``BERTEncoder`` -> linear projection onto the vocabulary.

    Only a token embedding table is used; no positional or segment
    embeddings are added before the encoder.
    """

    def __init__(self, embedding_size: int, vocab_size: int, heads: int, forward_expansion: int, num_layers: int, dropout: float=0.1):
        """
        Args:
            embedding_size: dimension of the embedding vectors
            vocab_size: size of the vocabulary
            heads: number of attention heads
            forward_expansion: hidden-width multiplier of the feed-forward network
            num_layers: number of Transformer blocks in the encoder
            dropout: dropout probability
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        encoder_config = dict(
            embedding_size=embedding_size,
            heads=heads,
            forward_expansion=forward_expansion,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.bert_encoder = BERTEncoder(**encoder_config)
        # Output head: one logit per vocabulary entry at every position.
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, x, mask):
        """
        Args:
            x: shape (N, seq_len), token ids to look up in the embedding table
            mask: attention mask, forwarded unchanged to the encoder
        Returns:
            out: shape (N, seq_len, vocab_size), unnormalised logits
        """
        token_vectors = self.embedding(x)
        hidden_states = self.bert_encoder(token_vectors, mask)
        return self.fc(hidden_states)

In [5]:
# Hyperparameters and special-token ids for the toy MLM run below
vocab_size = 30522  # number of token ids; also the width of the output logits
embedding_size = 768
num_layers = 12  # number of stacked TransformerBlock layers
heads = 12  # 768 / 12 = 64 features per attention head
forward_expansion = 4  # feed-forward hidden width = 4 * embedding_size
dropout = 0.1
mask_token_id = 103  # id written in place of masked tokens
pad_token_id = 0  # id treated as padding by create_masked_lm_data
sequence_length = 128
batch_size = 2

In [6]:
# Preview: a (2, 5) tensor of random token ids drawn from [0, vocab_size)
torch.randint(0, vocab_size, (2, 5))

tensor([[ 9064, 21771,  5708,  1027, 15083],
        [ 6383, 10097, 22037,  7989, 15117]])

In [7]:
# Build the MLM model
mlm_model = MLMTask(
    embedding_size=embedding_size,
    vocab_size=vocab_size,
    heads=heads,
    forward_expansion=forward_expansion,
    num_layers=num_layers,
    dropout=dropout,
)
# Random token ids as stand-in input; id 0 (pad_token_id) can occur by chance
inputs = torch.randint(0, vocab_size, (batch_size, sequence_length))  # shape (batch_size, sequence_length)
# Attention mask of ones: every position is visible
mask_for_attention = torch.ones((batch_size, sequence_length)) # shape (batch_size, sequence_length)
# Corrupt the inputs and build the MLM labels (-100 at positions excluded from the loss)
inputs_masked, labels = create_masked_lm_data(inputs, vocab_size, mask_token_id, pad_token_id)
# Forward pass
logits = mlm_model(inputs_masked, mask_for_attention)
# Loss: CrossEntropyLoss skips targets equal to its default ignore_index (-100)
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(logits.view(-1, vocab_size), labels.view(-1))
print(f"Loss: {loss.item()}")
print("模型输出形状:", logits.shape)  # expected: (batch_size, sequence_length, vocab_size)
print("标签形状:", labels.shape)  # expected: (batch_size, sequence_length)

Loss: 10.618671417236328
模型输出形状: torch.Size([2, 128, 30522])
标签形状: torch.Size([2, 128])


In [15]:
# Download the google-bert/bert-base-uncased files via ModelScope into ./bert-base-uncased
from modelscope import snapshot_download
model_dir = snapshot_download('google-bert/bert-base-uncased', local_dir='./bert-base-uncased')

2026-01-27 14:57:04,870 - modelscope - INFO - Not logged-in, you can login for uploadingor accessing controlled entities.


Downloading Model from https://www.modelscope.cn to directory: /data/nvme1n1p1/zhangningboo/workspace/tsinghua-lm-books/从零构建大模型-算法训练与微调/第三章BERT模型核心实现与预训练/bert-base-uncased


In [16]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
# Load tokenizer and masked-LM model from the local ./bert-base-uncased directory
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')
model = BertForMaskedLM.from_pretrained('./bert-base-uncased')
# Sentence containing a single [MASK] placeholder
text = "In the context of global economic growth, the demand for technology companies' [MASK] is increasing rapidly. "
input_ids = tokenizer.encode(text, return_tensors='pt')
print(input_ids)

# Column index (or indices) of the [MASK] token within the encoded sequence
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Forward pass without gradient tracking to get per-position vocabulary logits
with torch.no_grad():
    output = model(input_ids)
logits = output.logits
# Take the logits at the [MASK] position and keep the 5 highest-scoring token ids;
# .indices[0] uses only the first [MASK] if the sentence contains several
mask_token_logits = logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(f"Predicted token: {tokenizer.decode([token])}")
# Decode the same 5 ids into strings and print them as one list
# (the candidates are only listed; none is inserted back into the sentence)
predicted_tokens = [tokenizer.decode([token]) for token in top_5_tokens]
print("Predicted tokens for [MASK]:", predicted_tokens)

Loading weights: 100%|██████████| 202/202 [00:00<00:00, 411.44it/s, Materializing param=cls.predictions.transform.dense.weight]                 
BertForMaskedLM LOAD REPORT from: ./bert-base-uncased
Key                         | Status     |  | 
----------------------------+------------+--+-
bert.pooler.dense.bias      | UNEXPECTED |  | 
cls.seq_relationship.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight | UNEXPECTED |  | 
bert.pooler.dense.weight    | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tensor([[ 101, 1999, 1996, 6123, 1997, 3795, 3171, 3930, 1010, 1996, 5157, 2005,
         2974, 3316, 1005,  103, 2003, 4852, 5901, 1012,  102]])
Predicted token: products
Predicted token: services
Predicted token: technology
Predicted token: solutions
Predicted token: equipment
Predicted tokens for [MASK]: ['products', 'services', 'technology', 'solutions', 'equipment']
