In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

text = ["I love transformers!", "NLP is a specialized field of machine learning"]
inputs = tokenizer(
    text,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
# inputs is a dictionary containing input_ids and attention_mask
print(inputs)

{'input_ids': tensor([[  101,  1045,  2293, 19081,   999,   102,     0,     0,     0,     0,
             0],
        [  101, 17953,  2361,  2003,  1037,  7772,  2492,  1997,  3698,  4083,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [3]:
inputs["input_ids"]

tensor([[  101,  1045,  2293, 19081,   999,   102,     0,     0,     0,     0,
             0],
        [  101, 17953,  2361,  2003,  1037,  7772,  2492,  1997,  3698,  4083,
           102]])

In [4]:
# Convert input IDs back to text
for i in range(len(text)):
    print(tokenizer.decode(inputs["input_ids"][i], skip_special_tokens=False))

[CLS] i love transformers! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] nlp is a specialized field of machine learning [SEP]


In [5]:
from transformers import AutoModel
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)

torch.Size([2, 11, 768])


In [6]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, num_labels)

torch.Size([2, 2])


In [7]:
print(outputs.logits)

tensor([[-4.1216,  4.4303],
        [-2.3793,  2.2820]], grad_fn=<AddmmBackward0>)


In [9]:
import torch

pred_prob = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(pred_prob)
predicted_classes = torch.argmax(pred_prob, dim=-1)
print(predicted_classes) # 1 for positive, 0 for negative

tensor([[1.9314e-04, 9.9981e-01],
        [9.3661e-03, 9.9063e-01]], grad_fn=<SoftmaxBackward0>)
tensor([1, 1])


In [10]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [11]:
from transformers import AutoModel
model = AutoModel.from_pretrained("bert-base-uncased")
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [12]:
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-uncased")

In [13]:
model.save_pretrained("./bert-base-uncased-model")

In [14]:
model = AutoModel.from_pretrained("./bert-base-uncased-model")

In [16]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded_input = tokenizer("Transformer is awesome", return_tensors="pt")
for key in encoded_input:
    print(f"{key}: {encoded_input[key]}")

input_ids: tensor([[  101, 10938,  2121,  2003, 12476,   102]])
token_type_ids: tensor([[0, 0, 0, 0, 0, 0]])
attention_mask: tensor([[1, 1, 1, 1, 1, 1]])


In [19]:
encoded_input = tokenizer([
    "Transformer is awesome", "Hugging Face is creating a tool that democratizes AI."], 
                          padding=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[  101, 10938,  2121,  2003, 12476,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 17662,  2227,  2003,  4526,  1037,  6994,  2008,  7672, 10057,
          9932,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [22]:
encoded_input = tokenizer([
    "Transformer is awesome", 
    "Hugging Face is creating a tool that democratizes AI."], 
    padding=True, 
    truncation=True, 
    max_length=3,
    return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[  101, 10938,   102],
        [  101, 17662,   102]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1]])}


In [23]:
encoded_input = tokenizer([
    "Transformer is awesome", "Hugging Face is creating a tool that democratizes AI."], 
                          padding=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[  101, 10938,  2121,  2003, 12476,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 17662,  2227,  2003,  4526,  1037,  6994,  2008,  7672, 10057,
          9932,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [26]:
encoded_input_list = encoded_input["input_ids"]
encoded_tensor = torch.tensor(encoded_input_list)
print(encoded_tensor)

tensor([[  101, 10938,  2121,  2003, 12476,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 17662,  2227,  2003,  4526,  1037,  6994,  2008,  7672, 10057,
          9932,  1012,   102]])


  encoded_tensor = torch.tensor(encoded_input_list)


In [27]:
output = model(encoded_tensor)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [29]:
tokenizd_text = "I love transformers!".split()
print(tokenizd_text)

['I', 'love', 'transformers!']


In [30]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [32]:
import pprint
pprint.pprint(tokenizer("I love transformers!"))

{'attention_mask': [1, 1, 1, 1, 1, 1],
 'input_ids': [101, 1045, 2293, 19081, 999, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0]}


In [36]:
sequence = "Hugging Face is creating a tool that democratizes AI."
tokens = tokenizer.tokenize(sequence)
print(tokens)

['hugging', 'face', 'is', 'creating', 'a', 'tool', 'that', 'democrat', '##izes', 'ai', '.']


In [37]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[17662, 2227, 2003, 4526, 1037, 6994, 2008, 7672, 10057, 9932, 1012]


In [38]:
decoded_text = tokenizer.decode(ids)
print(decoded_text)

hugging face is creating a tool that democratizes ai.


In [39]:
ids_tensor = torch.tensor([ids])
print(ids_tensor)

tensor([[17662,  2227,  2003,  4526,  1037,  6994,  2008,  7672, 10057,  9932,
          1012]])


In [41]:
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
output = model(ids_tensor)
print(f"logits: {output.logits}")

logits: tensor([[ 2.4455, -2.2021]], grad_fn=<AddmmBackward0>)


In [42]:
tokenizer.pad_token_id

0

In [43]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0]
]

outputs = model(torch.tensor(batched_ids),
                attention_mask=torch.tensor(attention_mask))
print(f"logits: {outputs.logits}")

logits: tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [45]:
sequence = "Hugging Face is creating a tool that democratizes AI."
model_inputs_1 = tokenizer(sequence)
print(model_inputs_1)

model_inputs_2 = tokenizer([sequence], padding="longest")
print(model_inputs_2)

model_inputs_3 = tokenizer([sequence], padding="max_length", max_length=5)
print(model_inputs_3)

{'input_ids': [101, 17662, 2227, 2003, 4526, 1037, 6994, 2008, 7672, 10057, 9932, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [[101, 17662, 2227, 2003, 4526, 1037, 6994, 2008, 7672, 10057, 9932, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
{'input_ids': [[101, 17662, 2227, 2003, 4526, 1037, 6994, 2008, 7672, 10057, 9932, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [48]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
pprint.pprint(model_inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [49]:
print(tokenizer.decode(model_inputs["input_ids"][0]))

[CLS] i ' ve been waiting for a huggingface course my whole life. [SEP]


In [51]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
