In [1]:
import transformers
import pprint
import torch

# §2.1 预处理/模型推理/后处理
`AutoTokenizer`/`AutoModel`/`torch.nn.functional`

In [2]:
# Preprocessing: load the tokenizer that belongs to the SST-2 sentiment checkpoint.
# `device_map` is a model-loading option and has no effect on a tokenizer, so it is
# not passed here; the encoded batch is moved to the GPU explicitly below.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
raw_text = [
    "I love you",
    "I hate you"
]
# `return_tensors` selects the framework of the returned tensors:
#   "pt" -> PyTorch, "tf" -> TensorFlow, "np" -> NumPy
# `.to("cuda:0")` moves every tensor of the encoded batch onto the first GPU.
token = tokenizer(raw_text, padding=True, truncation=True, return_tensors="pt").to("cuda:0")

pprint.pprint(token)


{'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]], device='cuda:0'),
 'input_ids': tensor([[ 101, 1045, 2293, 2017,  102],
        [ 101, 1045, 5223, 2017,  102]], device='cuda:0')}




In [3]:
# Model inference with the bare encoder: AutoModel returns hidden states only,
# no classification scores.
model = transformers.AutoModel.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = model.to("cuda:0")
feature = model(**token)

# last_hidden_state is batch_size x sequence_length x hidden_size (encoded_size)
pprint.pprint(dict(
    feature=feature,
    feature_shape=feature.last_hidden_state.shape,
))

{'feature': BaseModelOutput(last_hidden_state=tensor([[[ 0.5110,  0.4176, -0.0830,  ...,  0.4443,  0.8320, -0.3374],
         [ 0.6341,  0.4539, -0.0347,  ...,  0.3301,  0.8103, -0.2437],
         [ 0.8464,  0.5750,  0.1138,  ...,  0.2452,  0.7705, -0.3324],
         [ 0.5107,  0.3850,  0.1263,  ...,  0.2038,  0.8559, -0.3721],
         [ 1.2287,  0.2805,  0.3941,  ...,  0.6246,  0.4263, -0.8306]],

        [[-0.3473,  0.8337, -0.4560,  ..., -0.2185, -0.7385, -0.0927],
         [-0.1072,  1.1280, -0.4087,  ..., -0.4267, -0.4758,  0.1905],
         [-0.0195,  1.0290, -0.4657,  ..., -0.4203, -0.5142,  0.1062],
         [-0.3486,  0.6474, -0.3795,  ..., -0.1535, -0.6559, -0.2947],
         [ 0.1300,  0.2511, -0.3323,  ..., -0.0435, -0.5050, -0.2333]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>),
                            hidden_states=None,
                            attentions=None),
 'feature_shape': torch.Size([2, 5, 768])}


In [4]:
# Post-processing: the sequence-classification model emits one logit per label;
# softmax over the last dimension turns the logits into probabilities, and
# `model.config.id2label` maps each column index to its label name.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
model = model.to("cuda:0")
feature = model(**token)
pprint.pprint(dict(
    feature=feature,
    logits_shape=feature.logits.shape,
    softmax_probility=torch.nn.functional.softmax(feature.logits, dim=-1),
    model_label_map=model.config.id2label,
))

{'feature': SequenceClassifierOutput(loss=None,
                                     logits=tensor([[-4.2756,  4.6393],
        [ 3.8724, -3.1543]], device='cuda:0', grad_fn=<AddmmBackward0>),
                                     hidden_states=None,
                                     attentions=None),
 'logits_shape': torch.Size([2, 2]),
 'model_label_map': {0: 'NEGATIVE', 1: 'POSITIVE'},
 'softmax_probility': tensor([[1.3436e-04, 9.9987e-01],
        [9.9911e-01, 8.8707e-04]], device='cuda:0', grad_fn=<SoftmaxBackward0>)}


# §2.2 模型

In [5]:
# Create a BertModel with randomly initialised parameters:
# the model is built from a default BertConfig, no checkpoint is loaded.
config = transformers.BertConfig()
model = transformers.BertModel(config)
model = model.to("cuda:0")
pprint.pprint(dict(config=config, model=model))

{'config': BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
,
 'model': BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): B

In [6]:
# Create a BertModel with pretrained parameters loaded from the "bert-base-cased" checkpoint.
model = transformers.BertModel.from_pretrained("bert-base-cased")
pprint.pprint(dict(model=model))

{'model': BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inp

# §2.3 分词器(Tokenizer)

In [7]:
# Structure of the tokenizer output
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
sentence = "Using a Transformer network is simple"

# Approach 1: split the str into list[str] first, then map the tokens to list[int]
token_list = tokenizer.tokenize(sentence)
token_id = tokenizer.convert_tokens_to_ids(token_list)

# Approach 2: do everything in one step via tokenizer.__call__(str),
# then decode the ids back into text to inspect what was encoded
token_info = tokenizer(sentence)
raw_str = tokenizer.decode(token_info["input_ids"])

pprint.pprint(dict(
    token_list=token_list,
    token_id=token_id,
    raw_str=raw_str,
))

{'raw_str': '[CLS] Using a Transformer network is simple [SEP]',
 'token_id': [7993, 170, 13809, 23763, 2443, 1110, 3014],
 'token_list': ['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']}




In [8]:
# Combining a tokenizer with a model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint)
model = model.to("cuda:0")

sequence = "I've been waiting for a HuggingFace course my whole life."

token_list: list[str] = tokenizer.tokenize(sequence)
token_id = tokenizer.convert_tokens_to_ids(token_list)
# model() accepts a batch_size x sequence_length tensor, so a leading batch
# dimension is added to the flat list of ids before moving it to the GPU.
token_id_tensor: torch.Tensor = torch.tensor(token_id).unsqueeze(0).to("cuda:0")

output = model(token_id_tensor)
pprint.pprint(dict(output=output))

{'output': SequenceClassifierOutput(loss=None,
                                    logits=tensor([[-2.7276,  2.8789]], device='cuda:0', grad_fn=<AddmmBackward0>),
                                    hidden_states=None,
                                    attentions=None)}


In [9]:
# Padding sequences of unequal length + attention mask
# Relies on `tokenizer` from the previous cell for `pad_token_id`.
model = transformers.AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = model.to("cuda:0")

# Inputs for the four experiments, built up front so the model calls below read cleanly.
ids_unpadded = torch.tensor([[200, 200]]).cuda()
ids_zero_padded = torch.tensor([[200, 200, 0]]).cuda()
ids_pad_token = torch.tensor([[200, 200, tokenizer.pad_token_id]]).cuda()
ids_batch = torch.tensor(
    [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
).cuda()
# NOTE(review): row 0 holds three real tokens yet its mask also hides the third
# position; presumably this is deliberate (to show masked positions are ignored
# whatever they contain) -- confirm, otherwise row 0 should be [1, 1, 1].
mask_batch = torch.tensor([[1, 1, 0], [1, 1, 0]]).cuda()

# The first three calls pass no attention_mask; only the last one masks the third position.
pprint.pprint({
    "unpadding": model(ids_unpadded),
    "manual_padding": model(ids_zero_padded),
    "contextual_padding": model(ids_pad_token),
    "contextual_masked_padding": model(ids_batch, attention_mask=mask_batch),
})

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


{'contextual_masked_padding': SequenceClassifierOutput(loss=None,
                                                       logits=tensor([[ 0.5803, -0.4125],
        [ 0.5803, -0.4125]], device='cuda:0', grad_fn=<AddmmBackward0>),
                                                       hidden_states=None,
                                                       attentions=None),
 'contextual_padding': SequenceClassifierOutput(loss=None,
                                                logits=tensor([[ 1.3373, -1.2163]], device='cuda:0', grad_fn=<AddmmBackward0>),
                                                hidden_states=None,
                                                attentions=None),
 'manual_padding': SequenceClassifierOutput(loss=None,
                                            logits=tensor([[ 1.3373, -1.2163]], device='cuda:0', grad_fn=<AddmmBackward0>),
                                            hidden_states=None,
                                            attentions=None

In [10]:
# Tokenizer padding strategies

tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sequences = ["this is a word " * repeat for repeat in (10, 5, 2)]

# "longest": pad to the length of the longest sequence in the batch
token_info_1 = tokenizer(sequences, padding="longest", return_tensors="pt")
# "max_length" without max_length: pad to the tokenizer model's maximum length
# (DistilBERT is used here, so 512)
token_info_2 = tokenizer(sequences, padding="max_length", return_tensors="pt")
# "max_length" with max_length: pad to a manually specified length
token_info_3 = tokenizer(sequences, padding="max_length", max_length=1024, return_tensors="pt")

pprint.pprint({
    label: encoded["input_ids"].shape
    for label, encoded in (
        ("token_info_1", token_info_1),
        ("token_info_2", token_info_2),
        ("token_info_3", token_info_3),
    )
})

{'token_info_1': torch.Size([3, 42]),
 'token_info_2': torch.Size([3, 512]),
 'token_info_3': torch.Size([3, 1024])}


In [11]:
# Tokenizer sequence truncation

tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sequences = ["this is a word " * repeat for repeat in (10, 5, 2)]

# truncation=True without max_length: truncate to the tokenizer model's maximum
# length (DistilBERT is used here, so 512)
token_info_1 = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# truncation=True with max_length: truncate to a manually specified length
token_info_2 = tokenizer(sequences, padding=True, truncation=True, max_length=32, return_tensors="pt")

pprint.pprint({
    label: encoded["input_ids"].shape
    for label, encoded in (
        ("token_info_1", token_info_1),
        ("token_info_2", token_info_2),
    )
})

{'token_info_1': torch.Size([3, 42]), 'token_info_2': torch.Size([3, 32])}


In [12]:
# Tokenizer.__call__() adds special tokens at the start and the end of the
# sequence, whereas Tokenizer.tokenize() does not.

tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
raw_text = "Hello, I'm here!"

# Encode the same text along both paths, then decode each id list back to text to compare.
ids_via_call = tokenizer(raw_text)["input_ids"]
ids_via_tokenize = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))

pprint.pprint({
    "Tokenizer.__call__()": tokenizer.decode(ids_via_call),
    "Tokenizer.tokenize()": tokenizer.decode(ids_via_tokenize),
})


{'Tokenizer.__call__()': "[CLS] hello, i'm here! [SEP]",
 'Tokenizer.tokenize()': "hello, i'm here!"}
