### Models expect a batch of inputs

模型期望接收成批（batched）的输入：即使只有一个句子，输入张量也必须带有 batch 维度。

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Build the tokenizer and the sequence-classification model from the same
# checkpoint name so that the vocabulary ids match what the model expects.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual tokenization in two steps: text -> tokens -> vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# `ids` is passed as-is (not wrapped in an outer list), so the tensor has no
# batch dimension.
input_ids = torch.tensor(ids)
# This line will fail: the model expects a batch of sequences, not a single
# un-batched sequence. It is left commented out so the cell still runs.
# model(input_ids)



观察可以发现 tokenizer 的输出比我们之前的输入要多出一个维度

In [3]:
# Call the tokenizer directly and ask for PyTorch tensors; the printed
# input_ids are nested one level deeper than a flat list of ids.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same checkpoint, tokenizer, and model setup as in the failing example.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual tokenization: text -> tokens -> vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# Wrapping `ids` in an outer list adds the batch dimension (a batch of one).
input_ids = torch.tensor([ids])
# With the batch dimension present, this call now succeeds.
output = model(input_ids)

# Show only the classification logits from the model output.
print(output["logits"])

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [5]:
# Build a batch that contains the same id sequence twice.
batched_ids = [ids, ids]
input_ids = torch.tensor(batched_ids)
output = model(input_ids)

# Print the full model output; the two rows of logits come from identical inputs.
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


### Padding the inputs

In [8]:
# A ragged batch: the two rows have different lengths (3 vs. 2).
batched_ids = [
    [200, 200, 200],
    [200, 200]
]
# torch.tensor cannot build a rectangular tensor from rows of unequal length.
# The error is the demonstration here, so catch and display it instead of
# letting the exception stop a "Restart Kernel -> Run All".
try:
    torch.tensor(batched_ids)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 3 at dim 1 (got 2)

各行长度不一致的嵌套列表（不规则的 list of lists）无法转换为 tensor，因为 tensor 必须是矩形的。因此我们使用 padding token 把 token 较少的句子填充到与最长句子相同的长度。

In [7]:
# Placeholder id used to fill the shorter row (illustrative value).
padding_id = 100

# Both rows now have three entries, so the batch is rectangular:
# the second row is two real ids followed by one padding id.
batched_ids = [[200] * 3, [200] * 2 + [padding_id]]

使用 padding token

In [10]:
from transformers import AutoModelForSequenceClassification

# Load the classification model from the checkpoint name defined earlier.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Two single-sequence batches: one of length 3 and one of length 2.
seq1_ids = [[200,200,200]]
seq2_ids = [[200,200]]
# The same two sequences in one batch; the shorter row is filled up with the
# tokenizer's pad token id so that both rows have length 3.
bached_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id]
]

# Compare the outputs: each sequence on its own vs. both in the padded batch.
# No attention mask is passed in any of these calls.
print(model(torch.tensor(seq1_ids)))
print(model(torch.tensor(seq2_ids)))
print(model(torch.tensor(bached_ids)))

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


我们发现 seq2 的输出和 bached_ids 中的第二条句子的输出不一致。这是因为模型的attention layer把padding token也当作了上下文的一部分。因此我们需要使用 attention mask来告诉attention哪些token是不用考虑的。

### Attention mask

attention mask 是与 input IDs 形状完全相同的张量，每个元素取值为 0 或 1：1 表示对应位置的 token 需要被 attention 关注，0 表示该位置应被忽略（例如 padding token）。

In [12]:
# Padded batch: the second row is one token shorter and is filled up with the
# tokenizer's pad token id.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Same shape as batched_ids: 1 = attend to this position, 0 = ignore it.
# Only the padding position in the second row is masked out.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# Feed the batch defined in this cell (not a variable left over from an
# earlier cell) together with its mask, so the cell is self-consistent.
output = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(output['logits'])

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


 Try it out! Apply the tokenization manually on the two sentences used in section 2 (“I’ve been waiting for a HuggingFace course my whole life.” and “I hate this so much!”). Pass them through the model and check that you get the same logits as in section 2. Now batch them together using the padding token, then create the proper attention mask. Check that you obtain the same results when going through the model!

In [32]:
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Tokenize each sentence by hand. add_special_tokens=False keeps the special
# tokens out so the raw ids are easier to compare.
seq1_ids, seq2_ids = (
    tokenizer.encode(text, add_special_tokens=False) for text in sentences
)

print("句子1的token IDs:", seq1_ids)
print("句子2的token IDs:", seq2_ids)

句子1的token IDs: [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
句子2的token IDs: [1045, 5223, 2023, 2061, 2172, 999]


In [28]:
# Length of the longer sequence; both rows are padded up to it.
max_len = max(map(len, (seq1_ids, seq2_ids)))

# Right-pad each sequence with the tokenizer's pad token id (the longer one
# receives zero padding entries and is left unchanged in content).
seq1_ids, seq2_ids = (
    row + [tokenizer.pad_token_id] * (max_len - len(row))
    for row in (seq1_ids, seq2_ids)
)

# Stack the two equally long rows into the input tensor.
input_ids = torch.tensor([seq1_ids, seq2_ids])

# Attention mask: 0 where the id equals the pad token id, 1 everywhere else.
attention_mask = input_ids.ne(tokenizer.pad_token_id).int()

print("\n手动构造的input_ids:\n", input_ids)
print("手动构造的attention_mask:\n", attention_mask)


手动构造的input_ids:
 tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  5223,  2023,  2061,  2172,   999,     0,     0,     0,     0,
             0,     0,     0,     0]])
手动构造的attention_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int32)


In [29]:
# Forward pass on the manually padded batch, passing the attention mask so the
# padding positions are ignored; prints the full model output.
print(model(input_ids, attention_mask=attention_mask))

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
