In [28]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# Two explicit steps: split every sentence into tokens, then look up the
# vocabulary id of each token. No special tokens are added on this path.
tokens = list(map(tokenizer.tokenize, sentences))
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))

# One line of ids per sentence, followed by the token lists themselves.
for sequence_ids in ids:
    print(sequence_ids)
print(tokens)

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 1012]
[['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.'], ['i', 'hate', 'this', '.']]


In [2]:
import torch

# A tensor must be rectangular, but the two id lists have different lengths,
# so building one directly from `ids` fails. The error is the point of this
# cell, so it is caught and shown rather than left to abort a full
# "Restart & Run All".
try:
    input_ids = torch.tensor(ids)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 14 at dim 1 (got 4)

In [3]:
# Hence, we need to pad the shorter sequence with zeros on the right until it
# is as long as the longer sequence.
import numpy as np

# Shallow-copy the outer list. A plain `padded_ids = ids` would only create a
# second name for the same list, so the assignment below would also replace
# ids[1] and the unpadded sequence would no longer be available for the
# padded-vs-unpadded comparison.
padded_ids = list(ids)
len_diff = len(padded_ids[0]) - len(padded_ids[1])
padded_ids[1] = np.pad(padded_ids[1], (0, len_diff), 'constant')

print(padded_ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], array([1045, 5223, 2023, 1012,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])]


In [4]:
# The id used for padding has to be the one the model expects, so read it from
# the checkpoint's tokenizer instead of assuming a value.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token_id # this should be 0, the value used for padding above

0

In [22]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# The model takes a 2-D batch shaped (batch_size, sequence_length). A single
# sequence such as ids[0] only has the sequence_length dimension, so it is
# wrapped in an outer list to form a batch of size 1; padded_ids already holds
# both sequences and is passed as is.
for batch in ([ids[0]], [ids[1]], padded_ids):
    print(model(torch.tensor(batch)).logits)

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)
tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward0>)
tensor([[-2.7276,  2.8789],
        [ 1.5444, -1.3998]], grad_fn=<AddmmBackward0>)


We notice that ids[1] and padded_ids[1] produce different logits even though they encode the same sentence and differ only in the trailing padding ids. This is because the attention layers attend to the padding tokens as well.

### Using Attention Masks
An attention mask tells the attention layers which tokens to ignore: positions marked with 0 (e.g. padding) are ignored, while positions marked with 1 are attended to.

In [23]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# One mask entry per token: 1 keeps the token, 0 marks the padded position
# that the attention layers should ignore.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# Convert both nested lists to tensors first, then run a single forward pass.
input_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
outputs = model(input_ids=input_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [24]:
# Calling the tokenizer directly with padding=True pads the batch and returns
# the matching attention_mask alongside the input_ids, so neither has to be
# built by hand.
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

encoded_batch = tokenizer(sentences, padding=True)
print(encoded_batch)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
