## 分词背后的原理

In [9]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch

# Load the tokenizer and the sequence-classification model from the same
# checkpoint.
checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Step 1 - split the raw text into sub-word tokens:
# ['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face',
#  'course', 'my', 'whole', 'life', '.']
tokens = tokenizer.tokenize(sequence)

# Step 2 - map every token to its vocabulary ID.
# NOTE: this manual path only converts the tokens it is given; the special
# tokens [CLS]/[SEP] (IDs 101/102) that a direct `tokenizer(sequence)` call
# inserts are NOT added here, so the logits differ from that call's result.
ids = tokenizer.convert_tokens_to_ids(tokens)

# Step 3 - models only accept batched input. Even a single text has to be
# wrapped into a batch holding one sample, hence the extra leading dimension.
# More commonly a batch contains several texts:
# batched_ids = [ids, ids, ids, ...]
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:\n", input_ids)

# Step 4 - run the model and show the raw (un-normalised) class scores.
output = model(input_ids=input_ids)
print("Logits:\n", output.logits)

Input IDs:
 tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits:
 tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


## 实际应用中，我们应该直接使用分词器

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import torch.nn.functional as F

# Load the tokenizer and the sequence-classification model from the same
# checkpoint.
checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Calling the tokenizer directly does everything in one step: tokenization,
# conversion to IDs, insertion of the special tokens ([CLS]=101, [SEP]=102)
# and creation of the attention mask, returned as PyTorch tensors.
# padding=True pads every sequence in a batch to the same length; with a
# single sequence, as here, there is nothing to pad, so it has no effect.
inputs = tokenizer(sequence, return_tensors='pt', padding=True)
print("inputs", inputs)

# `inputs` is a mapping (input_ids, attention_mask); unpack it into keyword
# arguments for the model.
output = model(**inputs)
print("output", output)

# output.logits is tensor([[-1.5607, 1.6123]], grad_fn=<AddmmBackward0>):
# one row per input sequence, one raw score per class.
#   -1.5607 is the score of the first class,
#    1.6123 is the score of the second class.
# (Which label each index stands for is defined by the checkpoint -
#  presumably NEGATIVE/POSITIVE here; confirm with model.config.id2label.)
#
# Logits are not probabilities. Applying softmax over the class dimension
# maps them to values between 0 and 1 that sum to 1, which are much easier
# to interpret as per-class prediction probabilities.
probs = F.softmax(output.logits, dim=-1)
print(probs)



inputs {'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
output SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[0.0402, 0.9598]], grad_fn=<SoftmaxBackward0>)


## 句子对的处理

In [20]:
from transformers import AutoTokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Two parallel lists: the i-th entry of each list forms the i-th sentence pair.
sentence1_list = [
    "This is the first sentence 1.",
    "second sentence 1.",
    "test1",
]
sentence2_list = [
    "This is the first sentence 2.",
    "second sentence 2.",
    "test2",
]

# Sentence pairs are built position by position: the first sentence of
# sentence1_list is paired with the first sentence of sentence2_list, the
# second with the second, and so on. Each pair is encoded as
# [CLS] sentence1 [SEP] sentence2 [SEP]; token_type_ids is 0 for the first
# segment and 1 for the second, and attention_mask is 0 on padding positions.
inputs = tokenizer(
    text=sentence1_list,
    text_pair=sentence2_list,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

print(inputs)

# torch.Size([3, 17]): three sentence pairs, each padded to the length of the
# longest encoded pair (17 tokens, padding included).
print(inputs["input_ids"].shape)

{'input_ids': tensor([[ 101, 2023, 2003, 1996, 2034, 6251, 1015, 1012,  102, 2023, 2003, 1996,
         2034, 6251, 1016, 1012,  102],
        [ 101, 2117, 6251, 1015, 1012,  102, 2117, 6251, 1016, 1012,  102,    0,
            0,    0,    0,    0,    0],
        [ 101, 3231, 2487,  102, 3231, 2475,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
torch.Size([3, 17])
