In [1]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification
# Keep tensor reprs compact for the notebook: two edge items when a tensor is
# summarized, two decimal places, and lines wrapped at 75 characters.
torch.set_printoptions(
    precision=2,
    edgeitems=2,
    linewidth=75,
)

In [2]:
from transformers import AutoTokenizer

In [3]:
# Name of the pretrained checkpoint used for the tokenizer in the cells below.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer associated with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# Encode a single sentence. No return_tensors is given, so input_ids and
# attention_mask come back as plain Python lists (see the displayed output).
sequence = "I've been waiting for a HuggingFace course my whole life."
model_inputs = tokenizer(sequence)

In [5]:
# Display the encoding produced for the single sentence.
model_inputs

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

It also handles multiple sequences at a time, with no change in the API:

In [8]:
# Two sentences of different lengths, encoded together with one tokenizer call.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]
model_inputs = tokenizer(sequences)

In [18]:
# Token count of each encoded sentence; no padding was requested, so they differ.
tuple(len(ids) for ids in model_inputs["input_ids"])

(16, 6)

In [23]:
# padding="longest": both rows end up as long as the longest sentence in the batch.
model_inputs = tokenizer(
    sequences,
    return_tensors="pt",
    padding="longest",
)
tuple(row.shape for row in model_inputs["input_ids"])

(torch.Size([16]), torch.Size([16]))

In [24]:
# padding="max_length" with no explicit max_length: the recorded output shows
# both rows padded to 512 tokens for this tokenizer.
model_inputs = tokenizer(
    sequences,
    return_tensors="pt",
    padding="max_length",
)
tuple(row.shape for row in model_inputs["input_ids"])

(torch.Size([512]), torch.Size([512]))

In [25]:
# padding="max_length" with an explicit max_length: both rows are padded to 20 tokens.
model_inputs = tokenizer(
    sequences,
    return_tensors="pt",
    max_length=20,
    padding="max_length",
)
tuple(row.shape for row in model_inputs["input_ids"])

(torch.Size([20]), torch.Size([20]))

In [26]:
# Token ids of the first sentence; the trailing zeros are the padding positions.
model_inputs["input_ids"][0]

tensor([  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
        12172,  2607,  2026,  2878,  2166,  1012,   102,     0,     0,
            0,     0])

In [27]:
# Map the first row of ids back to text; special and padding tokens stay visible.
tokenizer.decode(model_inputs["input_ids"][0])

"[CLS] i've been waiting for a huggingface course my whole life. [SEP] [PAD] [PAD] [PAD] [PAD]"

In [28]:
# Same for the shorter second sentence, which carries more padding tokens.
tokenizer.decode(model_inputs["input_ids"][1])

'[CLS] so have i! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

### Wrapping up: From tokenizer to model

Now that we’ve seen all the individual steps the tokenizer object uses when applied to text, let’s see one final time how it can handle multiple sequences (padding!), very long sequences (truncation!), and multiple types of tensors with its main API:

In [29]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [30]:
# The two example sentences that are run through the model.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

# Tokenizer and sequence-classification model are both loaded from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [31]:
# Encode the batch with padding and truncation enabled, returning PyTorch tensors.
tokens = tokenizer(
    sequences,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

In [32]:
# Unpack the encoding so input_ids and attention_mask are passed as keyword arguments.
# NOTE(review): the call is not wrapped in torch.no_grad(), so the returned
# logits carry a grad_fn (visible in the recorded output).
output = model(**tokens)

In [38]:
# Show the raw model output next to the encoded batch it was computed from.
output, tokens

(SequenceClassifierOutput(loss=None, logits=tensor([[-1.56,  1.61],
         [-3.62,  3.91]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 {'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
          12172,  2607,  2026,  2878,  2166,  1012,   102],
         [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])})

In [34]:
# Turn the logits into per-class probabilities with a softmax over the last axis.
predictions = output.logits.softmax(dim=-1)
predictions

tensor([[4.02e-02, 9.60e-01],
        [5.35e-04, 9.99e-01]], grad_fn=<SoftmaxBackward0>)

In [35]:
# Pair each label name from the model config with its probability for row 0.
s1 = "First sentence: {0}: {1:.3f}, {2}: {3:.3f}".format(
    model.config.id2label[0],
    predictions[0, 0],
    model.config.id2label[1],
    predictions[0, 1],
)
s1

'First sentence: NEGATIVE: 0.040, POSITIVE: 0.960'

In [36]:
# Pair each label name from the model config with its probability for row 1.
s2 = "Second sentence: {0}: {1:.3f}, {2}: {3:.3f}".format(
    model.config.id2label[0],
    predictions[1, 0],
    model.config.id2label[1],
    predictions[1, 1],
)
s2

'Second sentence: NEGATIVE: 0.001, POSITIVE: 0.999'

In [40]:
from transformers import AutoTokenizer

# NOTE(review): this rebinds `tokenizer`, previously loaded from the DistilBERT
# checkpoint, to the "bert-base-cased" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# tokenize() returns the token strings only (a list of str in the recorded output), not ids.
result = tokenizer.tokenize("Hello!")

In [41]:
# Show the tokens produced for "Hello!".
result

['Hello', '!']