In [2]:
from transformers import BertTokenizer, AutoTokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



## Encoding

In [9]:
tokenizer.vocab

{'Roses': 17303,
 'hitting': 6886,
 '216': 22148,
 'phenomena': 14343,
 'cricketer': 9469,
 'Top': 3299,
 'wholly': 12907,
 'crossed': 3809,
 'Hatch': 25945,
 '##14': 17175,
 'Mitchell': 5741,
 '##hta': 26489,
 'appears': 2691,
 '##aw': 7220,
 '##leading': 28007,
 '##jana': 21026,
 'Hastings': 12446,
 'worm': 19686,
 'Tang': 10215,
 'quietly': 4432,
 'Rolling': 8782,
 'altitudes': 24604,
 '##payers': 27452,
 'Scotland': 3030,
 'rat': 11631,
 'handle': 4282,
 'ordination': 20424,
 'scrambled': 13988,
 'chords': 22098,
 'Burma': 11023,
 'DB': 24044,
 '##ga': 2571,
 'Video': 6301,
 'Inside': 7323,
 'resident': 6408,
 'Calendar': 26208,
 '##cola': 19673,
 '##ella': 7772,
 'vast': 6047,
 'dwell': 26812,
 'Koreans': 27757,
 'inspection': 11820,
 'Seymour': 13572,
 'entertainment': 5936,
 'Neptune': 23405,
 'Mariano': 25260,
 'headed': 2917,
 'subdivisions': 25238,
 '1769': 22632,
 '##pical': 15328,
 '##ffing': 17242,
 'Season': 5623,
 'collections': 6286,
 'signature': 8250,
 '##vocative': 2

In [10]:
tokenizer.vocab_size

28996

In [11]:
len(tokenizer.vocab)

28996

In [23]:
sequence = "Using a Transformer network is simple. But I think I can learn it."

In [25]:
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple', '.', 'But', 'I', 'think', 'I', 'can', 'learn', 'it', '.']


In [26]:
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014, 119, 1252, 146, 1341, 146, 1169, 3858, 1122, 119]


##### Rather than doing it this way, you can directly convert sentence to tokens and then numeber (tensors) form as -

In [27]:
tokenizer(sequence)

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 119, 1252, 146, 1341, 146, 1169, 3858, 1122, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
tokanizer("I hate this so much!")

{'input_ids': [101, 146, 4819, 1142, 1177, 1277, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

#### Check the tensor(i.e. ids) from vocab

In [19]:
tokenizer.vocab['simple']

3014

In [20]:
tokenizer.vocab['.']

119

In [22]:
tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']

(101, 102)

## Decoding

In [30]:
decoded_string = tokenizer.decode([ 7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

Using a transformer network is simple


# Handling Multiple Sequence

In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [32]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [33]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [34]:
sequence = "I've been waiting for a HuggingFace course my whole life."

In [42]:
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)
# This line will fail.
model(input_ids)

IndexError: too many indices for tensor of dimension 1

In [43]:
### The problem is that we sent a single sequence to the model, whereas Transformers models expect multiple sentences by default. 

In [44]:
# Lets try again adding few things

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)


In [45]:
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])


In [51]:
output = model(input_ids)
print(output)
print("Logits:", output.logits)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [52]:
torch.nn.Softmax(output)

Softmax(dim=SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None))

0

In [54]:
##### Lets create multiple sequences manualy-
print(tokenizer.pad_token_id)
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

0


In [56]:
sequence1_ids, sequence2_ids, batched_ids

([[200, 200, 200]], [[200, 200]], [[200, 200, 200], [200, 200, 0]])

In [57]:
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [58]:
"""There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits for the second sentence, but we’ve got completely different values!
This is because the key feature of Transformer models is attention layers that contextualize each token. 
These will take into account the padding tokens since they attend to all of the tokens of a sequence. 
To get the same result when passing individual sentences of different lengths through the model or when passing a batch with the same sentences and padding applied, 
we need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.
"""

'There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits for the second sentence, but we’ve got completely different values!\nThis is because the key feature of Transformer models is attention layers that contextualize each token. \nThese will take into account the padding tokens since they attend to all of the tokens of a sequence. \nTo get the same result when passing individual sentences of different lengths through the model or when passing a batch with the same sentences and padding applied, \nwe need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.\n'

#### Attention masks

Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).
So attention layer will only consider the tensors whose attention mask =1, Hence keeping the importance of the meachanism.

In [60]:
# Now lets do this as -

batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]
# This below step is needed
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


#### We are Manually doing this conversion and then applying model. But in general we can directly use Tokenizer and Model as below

In [61]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

In [62]:
model_inputs = tokenizer(sequences)

In [64]:
print(model_inputs) # No paddings added 

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [65]:
# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [66]:
# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [72]:
# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0]]}


In [73]:
### Returning type


# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
print(model_inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [74]:
# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
print(model_inputs)

{'input_ids': array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
        12172,  2607,  2026,  2878,  2166,  1012,   102],
       [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


## Wrapping up: From tokenizer to model

In [80]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(f"Token :- \n {tokens}")
output = model(**tokens)
output

Token :- 
 {'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [77]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I hated it!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

In [78]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.2356, -3.4542]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)