In [1]:
from transformers import AutoConfig,AutoModel,AutoTokenizer,AdamW,get_linear_schedule_with_warmup,logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset,SequentialSampler,RandomSampler,DataLoader

In [2]:
# Name of the pretrained checkpoint used by every cell below.
# Alternative checkpoint that was tried: "roberta-large"
MODEL_NAME = "bert-base-chinese"


## config

In [3]:
# Load the configuration object that belongs to the pretrained checkpoint MODEL_NAME.
config = AutoConfig.from_pretrained(MODEL_NAME)

In [4]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

## tokenizer

参考文档：https://huggingface.co/transformers/v4.6.0/main_classes/tokenizer.html

In [5]:
# Load the tokenizer for MODEL_NAME and display its summary (vocab size, special tokens, ...).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

一些特殊符号：['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [6]:
tokenizer.all_special_ids

[100, 102, 0, 101, 103]

In [7]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [8]:
# tokenizer.vocab

In [9]:
# 词汇表大小
tokenizer.vocab_size

21128

### 将文本转为词汇表id

- 方法1 encode
```
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.

        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """
```

In [10]:
# Sample sentence used throughout the notebook.
text="我在北京工作"
# encode() returns a plain list of vocabulary ids; special tokens are added
# because add_special_tokens defaults to True.
token_ids=tokenizer.encode(text)
token_ids

[101, 2769, 1762, 1266, 776, 2339, 868, 102]

In [11]:
type(token_ids)

list

In [12]:
# Map the ids back to their token strings.
tokenizer.convert_ids_to_tokens(token_ids)

['[CLS]', '我', '在', '北', '京', '工', '作', '[SEP]']

padding的模式

```
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
```

In [13]:
# Pass extra arguments. padding=True means "pad to the longest sequence in the batch",
# so a single sequence is returned unpadded even though max_length=30 is given.
token_ids=tokenizer.encode(text,padding=True,max_length=30,add_special_tokens=True)
token_ids

[101, 2769, 1762, 1266, 776, 2339, 868, 102]

In [14]:
# Pass extra arguments. padding="max_length" pads the ids up to max_length (30 here).
token_ids=tokenizer.encode(text,padding="max_length",max_length=30,add_special_tokens=True)
token_ids

[101,
 2769,
 1762,
 1266,
 776,
 2339,
 868,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [15]:
# Same call as before, but return_tensors='pt' returns a PyTorch tensor with a
# leading batch dimension instead of a Python list.
token_ids=tokenizer.encode(text,padding="max_length",max_length=30,add_special_tokens=True,return_tensors='pt')
token_ids

tensor([[ 101, 2769, 1762, 1266,  776, 2339,  868,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]])

- 方法2 encode_plus
```
def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
 ```

In [16]:
# Encode the sample sentence into model-ready tensors (input_ids, token_type_ids,
# attention_mask), each padded to exactly max_length positions.
# truncation=True makes max_length a hard upper bound: without it, a text longer
# than 30 tokens would be returned at its full length and the fixed sequence
# length assumed by the following cells would no longer hold.
token_ids = tokenizer.encode_plus(
    text,
    padding="max_length",
    truncation=True,
    max_length=30,
    add_special_tokens=True,
    return_tensors='pt',
    return_token_type_ids=True,
    return_attention_mask=True,
)
token_ids

{'input_ids': tensor([[ 101, 2769, 1762, 1266,  776, 2339,  868,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])}

## Model

In [17]:
# Load the pretrained encoder weights. AutoModel builds the bare BertModel here,
# so the checkpoint's pretraining-head weights (cls.*) are reported as unused.
model=AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [19]:
# Run the encoder on the padded sample. The tensors are bound by keyword so each
# one reaches the parameter it is meant for; token_type_ids is left at its default.
outputs = model(
    input_ids=token_ids['input_ids'],
    attention_mask=token_ids['attention_mask'],
)


In [20]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2670, -0.0858,  0.2122,  ..., -0.0070,  0.9425, -0.3466],
         [ 0.5193, -0.3700,  0.4482,  ..., -1.0237,  0.7864, -0.1775],
         [-0.1792, -0.7018,  1.0653,  ..., -0.3034,  1.0692,  0.0429],
         ...,
         [-0.2591,  0.0598,  0.3403,  ..., -0.1995,  0.1566, -0.3007],
         [-0.2168,  0.0471,  0.3638,  ..., -0.2013,  0.2269, -0.3189],
         [-0.2386, -0.0272,  0.2252,  ..., -0.0456, -0.0596, -0.2200]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[ 0.9986,  0.9999,  0.9988,  0.9545, -0.6417,  0.5586,  0.3451,  0.6832,
          0.9936, -0.9965,  1.0000,  0.9999,  0.0969, -0.9015,  0.9994, -0.9996,
         -0.0634,  1.0000,  0.9828,  0.5460,  0.9992, -1.0000, -0.9602, -0.9486,
         -0.8842,  0.9878,  0.9769,  0.0949, -0.9995,  0.9895,  0.9659,  0.9994,
          0.9980, -0.9999, -0.9976,  0.5098, -0.7977,  0.9948, -0.7914, -0.9849,
         -0.9965, -0.5981,  0.385

In [21]:
# Index 0 of the model output is last_hidden_state; keep a handle and show its shape.
last_hidden_state = outputs[0]
last_hidden_state.shape

torch.Size([1, 30, 768])

In [22]:
outputs[1].shape # pooler_output: pooled output for the whole sentence

torch.Size([1, 768])

In [23]:
# Hidden state at position 0, i.e. the embedding of the leading [CLS] token.
cls_embeddings = last_hidden_state[:, 0]
cls_embeddings.shape

torch.Size([1, 768])

## 对Bert输出进行变换

In [24]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [25]:
# Ask the model to also return the hidden states of every layer.
# NOTE: this mutates `config` in place, so the config printed above is now stale.
config.update({'output_hidden_states': True})

In [26]:
# Reload the encoder with the updated config so the forward pass also returns
# the per-layer hidden states.
model = AutoModel.from_pretrained(MODEL_NAME, config=config)

# Bind every tensor by keyword. The second positional parameter of the model's
# forward() is attention_mask, so passing token_type_ids positionally would feed
# an all-zero tensor as the attention mask and leave the padding unmasked.
outputs = model(
    input_ids=token_ids['input_ids'],
    attention_mask=token_ids['attention_mask'],
    token_type_ids=token_ids['token_type_ids'],
)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [28]:
outputs['last_hidden_state'].shape

torch.Size([1, 30, 768])

In [29]:
outputs['pooler_output'].shape

torch.Size([1, 768])

In [30]:
len(outputs['hidden_states'])

13

In [31]:
outputs['hidden_states'][-1].shape

torch.Size([1, 30, 768])

## 更改1：拼接最后四层隐藏状态，再做注意力池化并与 CLS 向量拼接

In [32]:
# Stack the tuple of per-layer hidden states along a new leading axis.
all_hidden_states = torch.stack(outputs['hidden_states'], dim=0)
all_hidden_states.shape

torch.Size([13, 1, 30, 768])

In [33]:
# Concatenate the last four stacked entries (last one first) along the feature
# axis, which quadruples the feature width.
cat_over_last_layers = torch.cat(
    [all_hidden_states[-offset] for offset in (1, 2, 3, 4)],
    dim=-1,
)
cat_over_last_layers.shape

torch.Size([1, 30, 3072])

In [34]:
class AttentionHead(nn.Module):
    """Additive-attention pooling over the sequence axis.

    Every position is scored with ``V(tanh(W(x)))``. The scores are
    softmax-normalised over dim 1 and used as weights for a sum of the input
    features, giving one vector per sequence.

    NOTE(review): this class is declared in more than one cell of the notebook;
    the later definition shadows the earlier one, so keep the copies in sync.

    Args:
        h_size: Size of the last dimension of the incoming features.
        hidden_dim: Width of the intermediate scoring projection.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features, attention_mask=None):
        """Pool ``features`` into one vector per sequence.

        Args:
            features: Tensor of shape (batch, seq_len, h_size).
            attention_mask: Optional (batch, seq_len) tensor that is non-zero
                for real tokens and zero for padding. When omitted, every
                position takes part in the softmax (the original behaviour).

        Returns:
            Tensor of shape (batch, h_size).
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)  # (batch, seq_len, 1)
        if attention_mask is not None:
            # Push padded positions to the most negative finite value so they get
            # zero weight. A finite fill (rather than -inf) keeps a fully masked
            # row from turning into NaN.
            padding = attention_mask.unsqueeze(-1) == 0
            score = score.masked_fill(padding, torch.finfo(score.dtype).min)
        attention_weights = torch.softmax(score, dim=1)
        return torch.sum(attention_weights * features, dim=1)
# The head consumes four concatenated layers, so its input width is 4 * hidden_size.
h_size = config.hidden_size
head = AttentionHead(4 * h_size)

In [35]:
# Attention-weighted pooling over all positions of the concatenated layers.
head_logits = head(cat_over_last_layers)
# Position 0 of the same features (the [CLS] slot).
cls_pooling = cat_over_last_layers[:, 0]

In [36]:
cls_pooling.shape

torch.Size([1, 3072])

In [37]:
head_logits.shape

torch.Size([1, 3072])

In [38]:
torch.cat([head_logits, cls_pooling], -1).shape

torch.Size([1, 6144])

## 更改2：在 last_hidden_state 上组合注意力池化与平均池化

In [39]:
class AttentionHead(nn.Module):
    """Additive-attention pooling over the sequence axis.

    Every position is scored with ``V(tanh(W(x)))``. The scores are
    softmax-normalised over dim 1 and used as weights for a sum of the input
    features, giving one vector per sequence.

    NOTE(review): this class is declared in more than one cell of the notebook;
    the later definition shadows the earlier one, so keep the copies in sync.

    Args:
        h_size: Size of the last dimension of the incoming features.
        hidden_dim: Width of the intermediate scoring projection.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features, attention_mask=None):
        """Pool ``features`` into one vector per sequence.

        Args:
            features: Tensor of shape (batch, seq_len, h_size).
            attention_mask: Optional (batch, seq_len) tensor that is non-zero
                for real tokens and zero for padding. When omitted, every
                position takes part in the softmax (the original behaviour).

        Returns:
            Tensor of shape (batch, h_size).
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)  # (batch, seq_len, 1)
        if attention_mask is not None:
            # Push padded positions to the most negative finite value so they get
            # zero weight. A finite fill (rather than -inf) keeps a fully masked
            # row from turning into NaN.
            padding = attention_mask.unsqueeze(-1) == 0
            score = score.masked_fill(padding, torch.finfo(score.dtype).min)
        attention_weights = torch.softmax(score, dim=1)
        return torch.sum(attention_weights * features, dim=1)

In [40]:
class MeanPoolingHead(nn.Module):
    """Mask-aware mean pooling over the sequence axis.

    Averages the hidden states of the positions whose attention-mask entry is
    non-zero, so padding does not dilute the sentence vector. The head has no
    learnable parameters.

    Args:
        h_size: Accepted for signature compatibility; not used.
        hidden_dim: Accepted for signature compatibility; not used.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        # No projection layers are created here: forward() is a pure masked
        # average, so any Linear layers would be dead parameters that never
        # receive a gradient.

    def forward(self, last_hidden_state, attention_mask):
        """Return the masked mean of ``last_hidden_state`` over dim 1.

        Args:
            last_hidden_state: Tensor of shape (batch, seq_len, hidden).
            attention_mask: Tensor of shape (batch, seq_len); 1 for real tokens,
                0 for padding.

        Returns:
            Tensor of shape (batch, hidden).
        """
        # Broadcast the mask over the feature axis so it can gate every feature.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
        # Clamp the token count so a fully masked row yields zeros instead of
        # a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        return sum_embeddings / sum_mask


In [41]:
class CLRPModel(nn.Module):
    """Transformer encoder followed by two pooled views and a linear output layer.

    The encoder's last hidden state is pooled twice, once with ``AttentionHead``
    and once with ``MeanPoolingHead``. The two vectors are concatenated and
    projected to a single value per sequence.

    Args:
        transformer: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` and must return an object exposing
            ``last_hidden_state``.
        config: Object exposing ``hidden_size`` (the encoder's feature width).
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        self.attention_head = AttentionHead(self.h_size)
        self.mean_pooling_head = MeanPoolingHead(self.h_size)

        # Both pooled vectors are concatenated before the projection, so the
        # output layer sees 2 * h_size features.
        self.linear = nn.Linear(self.h_size * 2, 1)

    def forward(self, input_ids, attention_mask):
        """Return one value per sequence.

        Args:
            input_ids: Token ids, shape (batch, seq_len).
            attention_mask: Mask of the same shape; 1 for real tokens, 0 for padding.

        Returns:
            Tensor of shape (batch, 1).
        """
        transformer_out = self.transformer(input_ids, attention_mask=attention_mask)
        hidden = transformer_out.last_hidden_state
        att_out = self.attention_head(hidden)
        mp_out = self.mean_pooling_head(hidden, attention_mask)

        # Combine the two pooled views; previously neither reached the output layer.
        x = torch.cat([att_out, mp_out], dim=-1)
        return self.linear(x)


In [42]:
# Attention mask of the encoded sample (1 for real tokens, 0 for padding).
attention_mask=token_ids['attention_mask']

In [43]:
# Build both pooling heads for the encoder's hidden size.
attention_head = AttentionHead(h_size=config.hidden_size)
mean_pooling_head = MeanPoolingHead(h_size=config.hidden_size)

# Pool the final hidden state two ways: attention-weighted sum and masked mean.
att_out = attention_head(outputs['last_hidden_state'])
mp_out = mean_pooling_head(outputs['last_hidden_state'], attention_mask)

In [44]:
att_out.shape

torch.Size([1, 768])

In [45]:
mp_out.shape

torch.Size([1, 768])

In [46]:
torch.cat([att_out, mp_out], -1).shape

torch.Size([1, 1536])