# What's happening behind the scene of Tokenization?

In [1]:
import torch

In [2]:
# Step 1: Load the model and tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

In [3]:
# Step 2: Preprocess the input
inputs = tokenizer("We liked the embedders, we were okay with encoder decoders, but we love the transformers.", return_tensors="pt")

In [4]:
inputs

{'input_ids': tensor([[  101,  2057,  4669,  1996,  7861,  8270, 13375,  1010,  2057,  2020,
          3100,  2007,  4372, 16044,  2099, 21933, 13375,  1010,  2021,  2057,
          2293,  1996, 19081,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

## How did we get from input string to these numbers?

## Tokenization

- Our inputs are text. These models work with numbers, so the first thing we need to do is to convert the text inputs into numbers.

- The tokenizer first tokenizes the inputs. This means that it splits the input string in words (or part of words, punctuation symbols, etc.) that are called tokens.

- Then, it converts these tokens into numbers. Each token is associated to an input ID, which is an integer. The same token will always be associated to the same ID.


In [30]:
tokenized_str = tokenizer.tokenize("We liked 😊 the embedders, we were okay with encoder decoders, but we love the transformers.")
print(tokenized_str)

['we', 'liked', '😊', 'the', 'embedders', ',', 'we', 'were', 'okay', 'with', 'en', '##code', '##r', 'deco', '##ders', ',', 'but', 'we', 'love', 'the', 'transformers', '.']


In [12]:
tokenized_inp_id = tokenizer.convert_tokens_to_ids(tokenized_str)
print(tokenized_inp_id)

[2057, 4669, 1996, 7861, 8270, 13375, 1010, 2057, 2020, 3100, 2007, 4372, 16044, 2099, 21933, 13375, 1010, 2021, 2057, 2293, 1996, 19081, 1012]


In [16]:
tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token, tokenizer.mask_token, tokenizer.unk_token

('[CLS]', '[SEP]', '[PAD]', '[MASK]', '[UNK]')

In [17]:
tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id, tokenizer.mask_token_id

(101, 102, 0, 103)

In [18]:
tokenized_inp_id = [tokenizer.cls_token_id] + tokenized_inp_id + [tokenizer.sep_token_id]
print(tokenized_inp_id)

[101, 2057, 4669, 1996, 7861, 8270, 13375, 1010, 2057, 2020, 3100, 2007, 4372, 16044, 2099, 21933, 13375, 1010, 2021, 2057, 2293, 1996, 19081, 1012, 102]


In [19]:
tokenizer.decode(tokenized_inp_id)

'[CLS] we liked the embedders, we were okay with encoder decoders, but we love the transformers. [SEP]'

In [20]:
inputs['input_ids']==torch.tensor(tokenized_inp_id).unsqueeze(0)

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True]])

In [21]:
inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])

In [22]:
inputs_batch2 = tokenizer(["this summer is killing me", "me too"], padding=True, return_tensors="pt")

In [23]:
inputs_batch2

{'input_ids': tensor([[ 101, 2023, 2621, 2003, 4288, 2033,  102],
        [ 101, 2033, 2205,  102,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0]])}

In [24]:
inputs_batch2['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0]])

In [25]:
tokenizer.tokenize("We liked the embedders, we were okay with encoder decoders, but we love the transformers.", return_tensors="pt")

['we',
 'liked',
 'the',
 'em',
 '##bed',
 '##ders',
 ',',
 'we',
 'were',
 'okay',
 'with',
 'en',
 '##code',
 '##r',
 'deco',
 '##ders',
 ',',
 'but',
 'we',
 'love',
 'the',
 'transformers',
 '.']

## How did it decide to split some words and not others?

- During training the model encountered some words from the training dataset. From those words it creates a vocabulary with which it can work with.

- If every encountered word is treated as a separate token, then that will lead to a very large vocabulary size, indirectly increasing the size of model.
- To avoid this, the tokenizer uses subword tokenization, which means that it splits some words into smaller parts.
- The tokenizer is trained to perform these splits in a way that minimizes the vocabulary size while maintaining the ability to reconstruct the original words.
- This is why the word "embedders" was split into "em", "##bed", "##ders".
## How to check whether my word is entirely present in vocabulary or not?
- The tokenizer has a method called get_vocab() that returns the vocabulary of the tokenizer. You can use it to check if a word is in the vocabulary or not.

- For instance, tokenizer.get_vocab().get("embedders") will return None, while tokenizer.get_vocab().get("embed") will return a number.

BERT uses WordPiece tokenization, which means that if a word is not in the vocabulary, it will be split into subwords.

In [6]:
print(tokenizer.get_vocab().get("embedders"))

print(tokenizer.get_vocab().get("transformers"))

None
19081


In [53]:
vocab = tokenizer.get_vocab()
len(vocab)

30523

In [49]:
vocab

{'##38': 22025,
 'collar': 9127,
 '##ising': 9355,
 'tribes': 6946,
 'ki': 11382,
 'holiday': 6209,
 'oppressive': 28558,
 'corps': 3650,
 'hop': 6154,
 '##va': 3567,
 '##হ': 29913,
 '##rm': 10867,
 'benches': 19571,
 '華': 1942,
 'jacket': 6598,
 '##vita': 28403,
 'sustainable': 9084,
 'ns': 24978,
 'printing': 8021,
 '##cula': 19879,
 'prc': 26141,
 'opposes': 29158,
 '##dles': 27822,
 'everyday': 10126,
 '##)': 29620,
 'broadcasters': 18706,
 'publishers': 8544,
 'rune': 23276,
 'specimen': 11375,
 '[unused432]': 437,
 'received': 2363,
 'several': 2195,
 'nouns': 19211,
 'dresses': 14464,
 'expressed': 5228,
 'dealers': 16743,
 'connor': 6720,
 'odor': 19255,
 'books': 2808,
 'netball': 25034,
 'ා': 1408,
 'contradiction': 26917,
 '[unused533]': 538,
 'spaces': 7258,
 '##bino': 21891,
 '##eral': 21673,
 'lunged': 17755,
 'basal': 15191,
 'harmonica': 16527,
 '##son': 3385,
 'schubert': 24645,
 'magdalene': 26890,
 '##ath': 8988,
 '##ulu': 20391,
 '##cton': 28312,
 'competes': 14190,

### How can we add new words into tokenizer?

If you want to add a new word to the vocabulary, you can use the add_tokens() method.

In [29]:
# Add new word into tokenizer
tokenizer.add_tokens(["embedders", "😊"])

2

In [8]:
tokenizer.get_vocab().get("embedders")

30522

In [54]:
vocab = tokenizer.get_vocab()
len(vocab)

30523

In [55]:
tokenized_str = tokenizer.tokenize("We liked the embedders, we were okay with encoder decoders, but we love the transformers.")
print(tokenized_str)

['we', 'liked', 'the', 'embedders', ',', 'we', 'were', 'okay', 'with', 'en', '##code', '##r', 'deco', '##ders', ',', 'but', 'we', 'love', 'the', 'transformers', '.']
