<a href="https://colab.research.google.com/github/yotamnahum/Mamram-Language-Modelling-Workshop/blob/main/tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title install dependencies
!pip -q install transformers[sentencepiece]

# Types of Tokenizers

In [68]:
def print_tokenization_stats(text, tokenized_text, tokenizer):
    """Print a short report comparing a text with its tokenization.

    Args:
        text: The original input; printed as-is.
        tokenized_text: The tokens produced for ``text``. It is printed and
            its ``len()`` is reported as the token count.
        tokenizer: Any object supporting ``len()``; that length is reported
            as the vocabulary size.

    Returns:
        None. The report is written to standard output.
    """
    def _report_lines():
        # Generated lazily so each value is computed right before its line
        # is printed.
        yield f"original text:  {text}\n"
        yield f"tokenized text: {tokenized_text}\n"
        yield f"length: {len(tokenized_text)}\n"
        yield f"vocabulary size: {len(tokenizer)}"

    for report_line in _report_lines():
        print(report_line)

In [56]:
# Sample sentence shared by the tokenizer cells in this notebook. It mixes
# plain words, punctuation, and a mixed-case name ("OpenAI").
text = "Using tokenizers is easy! And simple! OpenAI!"

Whitespace tokenizer

In [57]:
# str.split() with no arguments breaks on runs of whitespace only, so
# punctuation stays attached to the neighbouring word (e.g. 'easy!').
tokenized_text = text.split()
print(text, tokenized_text, sep="\n")

Using tokenizers is easy! And simple! OpenAI!
['Using', 'tokenizers', 'is', 'easy!', 'And', 'simple!', 'OpenAI!']


Naive tokenizer

In [58]:
import re

def naive_tokenizer(text):
    """Split ``text`` into alternating word and non-word chunks.

    Every returned piece is either a maximal run of word characters or a
    maximal run of non-word characters (whitespace and punctuation together),
    so joining the pieces reproduces ``text`` exactly.

    Args:
        text: The string to split.

    Returns:
        A list of substrings in their original order.
    """
    chunk_pattern = r'\w+|\W+'
    return [match.group() for match in re.finditer(chunk_pattern, text)]

# Run the regex-based splitter on the sample sentence and show the input
# followed by the resulting pieces.
tokenized_text = naive_tokenizer(text)
print(text, tokenized_text, sep="\n")


Using tokenizers is easy! And simple! OpenAI!
['Using', ' ', 'tokenizers', ' ', 'is', ' ', 'easy', '! ', 'And', ' ', 'simple', '! ', 'OpenAI', '!']


Subword tokenization (WordPiece)

In [69]:
from transformers import AutoTokenizer

# Load the tokenizer registered under the "bert-base-cased" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# tokenize() yields string tokens (not ids). In the saved output, pieces that
# continue a word carry a "##" prefix (e.g. 'token', '##izer', '##s').
tokenized_text = tokenizer.tokenize(text)
print_tokenization_stats(text, tokenized_text, tokenizer)

original text:  Using tokenizers is easy! And simple! OpenAI!

tokenized text: ['Using', 'token', '##izer', '##s', 'is', 'easy', '!', 'And', 'simple', '!', 'Open', '##A', '##I', '!']

length: 14

vocabulary size: 28996


Byte-Pair Encoding (BPE)

In [76]:
from transformers import AutoTokenizer

# Load the tokenizer registered under the "gpt2-xl" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
# In the saved output, tokens that follow a space start with "Ġ"
# (e.g. 'Ġtoken'), while word continuations have no marker (e.g. 'izers').
tokenized_text = tokenizer.tokenize(text)
print_tokenization_stats(text, tokenized_text, tokenizer)

original text:  Using tokenizers is easy! And simple! OpenAI!

tokenized text: ['Using', 'Ġtoken', 'izers', 'Ġis', 'Ġeasy', '!', 'ĠAnd', 'Ġsimple', '!', 'ĠOpen', 'AI', '!']

length: 12

vocabulary size: 50257


SentencePiece tokenization

In [72]:
from transformers import AutoTokenizer

# Load the tokenizer registered under the "t5-base" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("t5-base")
# In the saved output, "▁" marks the start of a new word (e.g. '▁token'),
# and the sequence begins with a standalone '▁' token.
tokenized_text = tokenizer.tokenize(text)
print_tokenization_stats(text, tokenized_text, tokenizer)

original text:  Using tokenizers is easy! And simple! OpenAI!

tokenized text: ['▁', 'Using', '▁token', 'izer', 's', '▁is', '▁easy', '!', '▁And', '▁simple', '!', '▁Open', 'AI', '!']

length: 14

vocabulary size: 32100


Byte Tokenizer

In [73]:
from transformers import AutoTokenizer

# Load the tokenizer registered under the 'google/byt5-small' checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
# For this ASCII-only sentence the saved output has one token per character,
# spaces included: 45 tokens, against a reported vocabulary size of 384.
tokenized_text = tokenizer.tokenize(text)
print_tokenization_stats(text, tokenized_text, tokenizer)

original text:  Using tokenizers is easy! And simple! OpenAI!

tokenized text: ['U', 's', 'i', 'n', 'g', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r', 's', ' ', 'i', 's', ' ', 'e', 'a', 's', 'y', '!', ' ', 'A', 'n', 'd', ' ', 's', 'i', 'm', 'p', 'l', 'e', '!', ' ', 'O', 'p', 'e', 'n', 'A', 'I', '!']

length: 45

vocabulary size: 384


# Decoding

In [79]:
from transformers import AutoTokenizer

# Round trip: text -> string tokens -> integer ids -> decoded text.
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
tokenized_text = tokenizer.tokenize(text)
# Map each string token to its integer id; equal tokens get equal ids
# (every '!' becomes 0 in the saved output).
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

print(f"original text:  {text}\n")
print(f"tokenized text: {tokenized_text}\n")
print(f"input ids: {input_ids}\n")
# Turn the id sequence back into a single string.
decoded_string = tokenizer.decode(input_ids)
print(f"decoded string: {decoded_string}\n")
# Bare last expression: the notebook displays whether decoding reproduced the
# original text exactly (True in the saved output).
decoded_string == text

original text:  Using tokenizers is easy! And simple! OpenAI!

tokenized text: ['Using', 'Ġtoken', 'izers', 'Ġis', 'Ġeasy', '!', 'ĠAnd', 'Ġsimple', '!', 'ĠOpen', 'AI', '!']

input ids: [12814, 11241, 11341, 318, 2562, 0, 843, 2829, 0, 4946, 20185, 0]

decoded string: Using tokenizers is easy! And simple! OpenAI!



True