### BERT代码实现

基于huggingface/Transformers库(3.1.0版本)中的pytorch版本BERT实现，我来实现自己的BERT模型

In [55]:
import pdb
import math
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import nn
import torch.nn.functional as F
from transformers import (BertConfig,
                          BertTokenizer,
                         )

#### 首先引入transformers中的Config和Tokenizer

In [3]:
# Local directory holding the pretrained bert-base-cased checkpoint.
pretrained_model_name_or_path = '/dfsdata2/yucc1_data/models/huggingface/bert-base-cased'
# NOTE(review): the original comment here said "12 labels", but num_labels below is 6 - confirm which is intended.
# pdb.set_trace()
config = BertConfig.from_pretrained(pretrained_model_name_or_path,
                                    num_labels=6,)
# NOTE(review): `config` is forwarded to the tokenizer loader as an extra keyword;
# whether the tokenizer uses it is not visible from this file.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                         config=config)

#### 这一部分是为了标准化输出

In [85]:
@dataclass
class BaseModelOutput:
    """
    Base container for encoder outputs; hidden_states and attentions are optional.

    last_hidden_state: output of the final layer, (batch_size, seq_length, hidden_size), e.g. (64, 128, 768)
    hidden_states: tuple of (num_hidden_layers + 1) tensors (13 for a 12-layer model):
        the embedding output followed by the output of every layer
    attentions: one tensor per layer, each (batch_size, num_heads, seq_length, seq_length),
        e.g. 12 * (64, 12, 128, 128)
    """
    
    last_hidden_state: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None    

In [119]:
@dataclass
class BaseModelOutputWithPooling:
    """
    Base container for the outputs of the BERT model.

    last_hidden_state: output of the final layer, (batch_size, seq_length, hidden_size), e.g. (64, 128, 768)
    pooler_output: the final-layer vector of the first ([CLS]) token after a
        (hidden_size, hidden_size) projection, (batch_size, hidden_size), e.g. (64, 768)
    hidden_states: tuple of (num_hidden_layers + 1) tensors (13 for a 12-layer model):
        the embedding output followed by the output of every layer
    attentions: one tensor per layer, each (batch_size, num_heads, seq_length, seq_length),
        e.g. 12 * (64, 12, 128, 128)
    """
    
    last_hidden_state: torch.FloatTensor
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

#### BERT模型的构建

BERT模型由三部分构成：embedding、encoder、pooler。其中，encoder由多层layer堆叠而成，每个layer包含attention层和全连接（前馈）层。

In [4]:
# Layer-normalisation module used by every block below (plain alias of nn.LayerNorm).
BertLayerNorm = nn.LayerNorm

In [5]:
# Embedding layer: sum the three embeddings, then LayerNorm, then dropout.
class BertEmbeddings(nn.Module):
    """Build the input representation from word, position and token-type embeddings."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size

        # Three lookup tables whose outputs are added together.
        self.word_embeddings = nn.Embedding(config.vocab_size, hidden)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden)
        self.type_embeddings = nn.Embedding(config.type_vocab_size, hidden)

        # Normalisation and regularisation applied to the sum.
        self.LayerNorm = BertLayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Default position ids 0..max_position_embeddings-1, shape (1, max_position_embeddings).
        default_positions = torch.arange(config.max_position_embeddings).expand(1, -1)
        self.register_buffer('position_ids', default_positions)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, input_embeds=None):
        """
        Args:
            input_ids: (batch, seq_length) token ids; ignored when input_embeds is given.
            position_ids: (batch, seq_length) or None; defaults to 0..seq_length-1.
            token_type_ids: (batch, seq_length) or None; defaults to all zeros.
            input_embeds: optional (batch, seq_length, hidden_size) tensor used in
                place of the word-embedding lookup.

        Returns:
            (batch, seq_length, hidden_size) tensor after LayerNorm and dropout.
        """
        use_embeds = input_embeds is not None
        input_shape = input_embeds.shape[:-1] if use_embeds else input_ids.shape
        # Unpacking also enforces that the leading shape is exactly (batch, seq_length).
        _, seq_length = input_shape

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # Caller-supplied embeddings take precedence over the id lookup.
        word_embeds = input_embeds if use_embeds else self.word_embeddings(input_ids)
        summed = (
            word_embeds
            + self.position_embeddings(position_ids)
            + self.type_embeddings(token_type_ids)
        )
        return self.dropout(self.LayerNorm(summed))

In [44]:
class BertSelfAttention(nn.Module):
    """
    Multi-head scaled dot-product self-attention.

    Projects the input into query/key/value, splits them into
    ``num_attention_heads`` heads, computes softmax(QK^T / sqrt(head_size) + mask)
    and applies the resulting weights to the values.
    """

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, 'embedding_size'):
            raise ValueError(
                f'The hidden size {config.hidden_size} is not a multiple of the number of attention '
                f'heads ({config.num_attention_heads})'
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Each projection is (hidden_size, all_head_size), e.g. (768, 768).
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """
        Split the last dimension into heads and move the head axis forward.

        (batch, seq_length, all_head_size) -> (batch, num_heads, seq_length, head_size),
        e.g. (64, 128, 768) -> (64, 12, 128, 64).
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
    ):
        """
        Args:
            hidden_states: (batch, seq_length, hidden_size), e.g. (64, 128, 768).
            attention_mask: optional additive mask broadcastable to
                (batch, num_heads, seq_length, seq_length), e.g. (64, 1, 1, 128);
                it is added to the raw scores before the softmax.
            output_attentions: when true, also return the attention probabilities.

        Returns:
            ``(context_layer,)`` or ``(context_layer, attention_probs)`` where
            context_layer is (batch, seq_length, all_head_size) and
            attention_probs is (batch, num_heads, seq_length, seq_length).
        """
        # The arguments are used as given; they are not replaced by module-level
        # variables left over from interactive debugging.

        # (batch, num_heads, seq_length, head_size), e.g. (64, 12, 128, 64)
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Q @ K^T -> (batch, num_heads, seq_length, seq_length), e.g. (64, 12, 128, 128)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        # Scale by sqrt(head_size), e.g. sqrt(64) = 8.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalise the scores into probabilities over the key positions.
        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        # Weighted sum of values: (batch, num_heads, seq_length, head_size)
        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the heads back: (batch, seq_length, num_heads, head_size) -> (batch, seq_length, all_head_size)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return (context_layer, attention_probs) if output_attentions else (context_layer,)

In [45]:
class BertSelfOutput(nn.Module):
    """
    Output projection that follows self-attention: dense -> dropout -> residual add -> LayerNorm.

    Together with BertSelfAttention this forms the attention sub-layer.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = BertLayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        Args:
            hidden_states: attention output, (batch_size, seq_length, hidden_size), e.g. (64, 128, 768).
            input_tensor: residual branch with the same shape.

        Returns:
            Normalised sum of the projected hidden_states and input_tensor.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

In [46]:
class BertAttention(nn.Module):
    """Attention sub-layer: self-attention followed by its output projection and residual."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
    ):
        """
        Returns:
            ``(attention_output,)``, followed by whatever extra items the
            self-attention module returned (the attention weights when
            output_attentions is true).
        """
        # First element is the context; anything after it is passed through untouched.
        context, *extras = self.self(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        # Dense projection + residual connection against the sub-layer input.
        attention_output = self.output(context, hidden_states)
        return (attention_output, *extras)

In [47]:
class BertIntermediate(nn.Module):
    """
    First half of the feed-forward sub-layer that sits on top of attention:
    a dense expansion to ``intermediate_size`` followed by the activation.
    """

    # Activation names accepted when config.hidden_act is given as a string.
    _ACTIVATIONS = {
        'gelu': F.gelu,
        'relu': F.relu,
        'tanh': torch.tanh,
        'sigmoid': torch.sigmoid,
    }

    def __init__(self, config):
        """
        Raises:
            ValueError: if config.hidden_act is a string that is not a known
                activation name.
        """
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

        activation = config.hidden_act
        if isinstance(activation, str):
            # Resolve the name now; storing the raw string would only fail
            # later, when forward() tries to call it.
            if activation not in self._ACTIVATIONS:
                raise ValueError(
                    f'Unsupported hidden_act {activation!r}; expected a callable or one of '
                    f'{sorted(self._ACTIVATIONS)}'
                )
            activation = self._ACTIVATIONS[activation]
        self.intermediate_act_fn = activation

    def forward(self, hidden_states):
        """Map (..., hidden_size) to (..., intermediate_size) and apply the activation."""
        hidden_states = self.dense(hidden_states)
        return self.intermediate_act_fn(hidden_states)

In [48]:
class BertOutput(nn.Module):
    """
    Second half of the feed-forward sub-layer: project back from
    ``intermediate_size`` to ``hidden_size``, then dropout, residual add and LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = BertLayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        Args:
            hidden_states: (..., intermediate_size) activations to project back.
            input_tensor: (..., hidden_size) residual branch.

        Returns:
            Normalised sum of the projection and the residual.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

In [49]:
class BertLayer(nn.Module):
    """One encoder layer: attention sub-layer followed by the feed-forward sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
    ):
        """
        Returns:
            ``(layer_output,)`` or ``(layer_output, attention_probs)``; layer_output is
            (batch_size, seq_length, hidden_size), e.g. (64, 128, 768). The second
            item is present only when the attention module returned it
            (output_attentions true).
        """
        # The attention output always comes first; extras hold the attention
        # probabilities when they were requested.
        attention_output, *attention_extras = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )

        # Feed-forward: expand, then project back with a residual on the attention output.
        expanded = self.intermediate(attention_output)
        layer_output = self.output(expanded, attention_output)

        return (layer_output, *attention_extras)

In [81]:
class BertEncoder(nn.Module):
    """
    Encoder stage of BERT (the model is embeddings -> encoder -> pooler).

    Holds ``config.num_hidden_layers`` BertLayer modules and runs them in sequence.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # nn.ModuleList registers every layer as a sub-module.
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=False,
    ):
        """
        Args:
            hidden_states: embedding output, (batch_size, seq_length, hidden_size).
            attention_mask: additive mask handed to every layer unchanged.
            output_attentions: collect each layer's attention probabilities.
            output_hidden_states: collect the input of every layer plus the final output
                (num_hidden_layers + 1 tensors in total).
            return_dict: return a BaseModelOutput instead of a plain tuple.

        Returns:
            With return_dict false, a tuple of the non-None items among
            (last hidden state, all hidden states, all attentions); otherwise a
            BaseModelOutput carrying the same three values.
        """
        state_trace = [] if output_hidden_states else None
        attention_trace = [] if output_attentions else None

        for block in self.layer:
            if output_hidden_states:
                state_trace.append(hidden_states)

            # block returns (hidden_states,) or (hidden_states, attention_probs)
            block_out = block(
                hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = block_out[0]
            if output_attentions:
                attention_trace.append(block_out[1])

        # The output of the last layer closes the list of hidden states.
        if output_hidden_states:
            state_trace.append(hidden_states)

        all_hidden_states = tuple(state_trace) if output_hidden_states else None
        all_attentions = tuple(attention_trace) if output_attentions else None

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
                attentions=all_attentions,
            )
        # Tuple form: the last hidden state is always present, the others only if collected.
        candidates = (hidden_states, all_hidden_states, all_attentions)
        return tuple(item for item in candidates if item is not None)

In [84]:
class BertPooler(nn.Module):
    """
    Pooler stage of BERT (embeddings -> encoder -> pooler).

    Takes the hidden state of the first token, applies a
    (hidden_size, hidden_size) dense layer, e.g. (768, 768), and a tanh.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return tanh(dense(hidden_states[:, 0])), shape (batch_size, hidden_size)."""
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))

In [113]:
class BertModel(nn.Module):
    """
    BERT backbone made of three stages: embeddings -> encoder -> pooler.

    The encoder is a stack of layers; each layer has an attention sub-layer and
    a feed-forward sub-layer.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        input_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Run the full model.

        Args:
            input_ids: (batch_size, seq_length) token ids. Mutually exclusive with
                input_embeds; exactly one of the two must be given.
            attention_mask: (batch_size, seq_length) or
                (batch_size, from_seq_length, to_seq_length); 1.0 marks positions to
                attend to and 0.0 marks masked positions. Defaults to all ones.
            token_type_ids: (batch_size, seq_length); defaults to all zeros.
            position_ids: forwarded to the embedding layer unchanged.
            input_embeds: (batch_size, seq_length, hidden_size) precomputed embeddings.
            output_attentions: return every layer's attention probabilities;
                None falls back to config.output_attentions.
            output_hidden_states: return the hidden states of all layers;
                None falls back to config.output_hidden_states.
            return_dict: return a BaseModelOutputWithPooling instead of a tuple;
                None falls back to config.use_return_dict.

        Returns:
            A BaseModelOutputWithPooling, or the tuple
            ``(sequence_output, pooler_output, *optional encoder extras)``.

        Raises:
            ValueError: if both or neither of input_ids / input_embeds are given,
                or if attention_mask is neither 2-D nor 3-D.
        """
        # Output switches: an explicit argument wins over the config default.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Work out the (batch_size, seq_length) shape and the device from
        # whichever input was actually supplied.
        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        elif input_embeds is not None:
            input_shape = input_embeds.size()[:-1]
            device = input_embeds.device
        else:
            raise ValueError('You have to specify either input_ids or input_embeds')

        # Defaults: attend everywhere, single segment.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Insert broadcast axes so the mask lines up with attention scores of shape
        # (batch_size, num_heads, from_seq_length, to_seq_length), e.g. (64, 1, 1, 128).
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f'wrong shape for input_ids (shape {input_shape})'
                f'or attention_mask (shape {attention_mask.shape})'
            )
        # Turn the 1/0 mask into an additive one: 0 where attention is allowed and
        # -10000 where it is masked, so adding it to the pre-softmax scores
        # suppresses the masked positions.
        extended_attention_mask = extended_attention_mask.to(device)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Stage 1: embeddings
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            input_embeds=input_embeds,
        )
        # Stage 2: encoder
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Stage 3: pooler
        sequence_output = encoder_outputs[0] if not return_dict else encoder_outputs.last_hidden_state
        pooler_output = self.pooler(sequence_output)

        if not return_dict:
            return (sequence_output, pooler_output) + encoder_outputs[1:]
        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def init_weights(self):
        """Apply ``_init_weights`` to this module and every sub-module."""
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """
        Initialise one sub-module in place.

        Linear and Embedding weights are drawn from N(0, config.initializer_range);
        LayerNorm is reset to weight 1 / bias 0; Linear biases are zeroed.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            # LayerNorm carries a learnable scale (weight) and shift (bias).
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

#### BERT接入各种head

基于上面的BertModel，在其之上接入各种head，既可以完成MLM、NSP等预训练任务，也可以完成文本分类、NER等下游任务。

In [163]:
class BertOnlyMLMHead(nn.Module):
    """
    Masked-language-modelling head used on top of the base BERT model:
    dense -> activation -> LayerNorm -> linear decoder over the vocabulary.
    """

    # Activation names accepted when config.hidden_act is given as a string.
    _ACTIVATIONS = {
        'gelu': F.gelu,
        'relu': F.relu,
        'tanh': torch.tanh,
        'sigmoid': torch.sigmoid,
    }

    def __init__(self, config):
        """
        Raises:
            ValueError: if config.hidden_act is a string that is not a known
                activation name.
        """
        super().__init__()
        # Transform: dense + activation + LayerNorm
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        activation = config.hidden_act
        if isinstance(activation, str):
            # Resolve the name now; storing the raw string would only fail
            # later, when forward() tries to call it.
            if activation not in self._ACTIVATIONS:
                raise ValueError(
                    f'Unsupported hidden_act {activation!r}; expected a callable or one of '
                    f'{sorted(self._ACTIVATIONS)}'
                )
            activation = self._ACTIVATIONS[activation]
        self.transform_act_fn = activation
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Decoder: the linear layer is created without a bias, and a separately
        # registered bias parameter is then attached to it, so `self.bias` and
        # `self.decoder.bias` are the same tensor.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Map (..., hidden_size) hidden states to (..., vocab_size) prediction scores."""
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return self.decoder(hidden_states)

In [164]:
class BertOnlyNSPHead(nn.Module):
    """Next-sentence-prediction head used on top of the base BERT model."""

    def __init__(self, config):
        super().__init__()
        # Binary classifier over the pooled output.
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        """Map (batch_size, hidden_size) pooled output to (batch_size, 2) scores."""
        return self.seq_relationship(pooled_output)

In [165]:
class BertPreTrainingHeads(nn.Module):
    """Pre-training heads: the MLM head plus a two-way NSP classifier."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertOnlyMLMHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        """
        Args:
            sequence_output: per-token hidden states fed to the MLM head.
            pooled_output: pooled vector fed to the NSP classifier.

        Returns:
            ``(prediction_scores, seq_relationship_score)``.
        """
        return self.predictions(sequence_output), self.seq_relationship(pooled_output)

In [133]:
class BertForMaskedLM(nn.Module):
    """
    Placeholder for the masked-language-modelling (MLM) task model.

    Not implemented yet: the class defines no layers and no forward pass.
    Planned behaviour per the original notes: predict when no labels are
    given; when labels are given, also return a result for them
    (presumably the loss - TODO confirm).
    """
    

In [134]:
class BertForNextSentencePrediction(nn.Module):
    """
    Placeholder for the next-sentence-prediction (NSP) task model.

    Not implemented yet: the class defines no layers and no forward pass.
    """

In [None]:
class BertForPreTraining(nn.Module):
    """
    Pre-training model: the BERT backbone with both the MLM and NSP heads.

    NOTE(review): the original cell was left unfinished (it referenced an
    undefined name and never initialised the nn.Module base). The composition
    below mirrors what the demo cells in this notebook do by hand with
    BertModel and BertPreTrainingHeads - confirm it matches the intended design.
    """

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        input_embeds=None,
    ):
        """
        Run the backbone and both heads.

        The input arguments are forwarded to BertModel unchanged.

        Returns:
            ``(prediction_scores, seq_relationship_score)`` as produced by
            BertPreTrainingHeads.
        """
        backbone_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            input_embeds=input_embeds,
            return_dict=True,
        )
        return self.cls(backbone_out.last_hidden_state, backbone_out.pooler_output)

In [135]:
class BertForSequenceClassification(nn.Module):
    """
    Placeholder for the text (sequence) classification model.

    Not implemented yet: the class defines no layers and no forward pass.
    """

In [None]:
class BertForTokenClassification(nn.Module):
    """
    Placeholder for the token classification / sequence labelling model (e.g. NER).

    Not implemented yet: the class defines no layers and no forward pass.
    """

In [167]:
# Instantiate the backbone and the pre-training heads from the same config.
bert = BertModel(config)
clss = BertPreTrainingHeads(config)

In [168]:
# Toy batch: 2 sequences of 4 tokens, explicit positions 0..3, single segment.
input_ids = torch.tensor([[1, 5, 3, 1], [2, 1, 2, 1]])
position_ids = torch.tensor([[0, 1, 2, 3], [0, 1, 2, 3]], dtype=torch.long)
token_type_ids = torch.tensor([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=torch.long)

# Kept in a variable because the unpacking cell below branches on it.
return_dict = True

# Forward pass requesting attentions and all hidden states.
bert_outputs = bert(
    input_ids=input_ids,
    position_ids=position_ids,
    token_type_ids=token_type_ids,
    output_attentions=True,
    output_hidden_states=True,
    return_dict=return_dict,
)

In [169]:
# bert_outputs holds: last_hidden_state, pooler_output, hidden_states, attentions.
# The first two are always present; the last two depend on the output flags.
if return_dict:
    sequence_output, pooled_output = bert_outputs.last_hidden_state, bert_outputs.pooler_output
else:
    sequence_output, pooled_output = bert_outputs[:2]

In [170]:
# Scores of the two pre-training tasks (MLM and NSP).
prediction_scores, seq_relationship_score = clss(sequence_output, pooled_output)

In [173]:
# Example targets: `labels` has the same values and shape as input_ids above;
# `next_sentence_label` holds one class id per sequence.
labels = torch.tensor([[1, 5, 3, 1], [2, 1, 2, 1]])
next_sentence_label = torch.tensor([0, 0])

In [174]:
labels

tensor([[1, 5, 3, 1],
        [2, 1, 2, 1]])

In [176]:
labels.view(-1)

tensor([1, 5, 3, 1, 2, 1, 2, 1])

In [63]:
# Scratch: stand-alone encoder for shape experiments.
x = BertEncoder(config)

In [60]:
# Scratch: stand-alone single layer.
# NOTE(review): in file order this rebinds `x`, replacing the BertEncoder assigned in the previous cell.
x = BertLayer(config)

In [74]:
# NOTE(review): the output_hidden_states / return_dict keywords match BertEncoder.forward,
# not BertLayer.forward, so this call only works when `x` is the BertEncoder instance.
# NOTE(review): `embedding_output` and `extended_attention_mask` are not assigned at
# module level anywhere in this file - presumably left over from a debugging session.
y = x(
    hidden_states=embedding_output,
    attention_mask=extended_attention_mask,
    output_attentions=True,
    output_hidden_states=True,
    return_dict=True,
)

In [75]:
y.last_hidden_state.shape

torch.Size([2, 4, 768])

In [76]:
len(y.hidden_states)

13

In [77]:
y.hidden_states[5].shape

torch.Size([2, 4, 768])

In [79]:
y.attentions[0].shape

torch.Size([2, 12, 4, 4])

In [62]:
y[0].shape

torch.Size([2, 4, 768])

In [130]:
isinstance(config.hidden_act, str)

True

In [132]:
config.hidden_act

'gelu'

In [125]:
# Scratch: run a stand-alone attention sub-layer.
# NOTE(review): `embedding_output`, `extended_attention_mask` and `output_attentions`
# are not assigned at module level anywhere in this file - presumably left over
# from a debugging session.
attention = BertAttention(config)
self_outputs = attention(
    hidden_states=embedding_output,
    attention_mask=extended_attention_mask,
    output_attentions=output_attentions,
)

In [128]:
self_outputs[0].shape

torch.Size([2, 4, 768])

In [None]:
self.

In [111]:
x.shape

torch.Size([2, 4, 768])