In [None]:
# Notebook: Hugging Face Pre-Trained Tokenizer Comparison
# Author: Thomas Purk
# Date: 2025-04-02
# Reference: https://huggingface.co/docs/tokenizers/index

# Hugging Face Pre-Trained Tokenizer Comparison

This notebook contains sample code comparing the major types of text tokenizers
used to process text for NLP tasks. A set of five sample sentences is processed
by each tokenizer and then printed to an output cell for side by side comparison.

Tokenizers:
- Word-Based (Hugging Face BasicTokenizer class)
- Whitespace (Python's 'text'.split() function)
- Character-Based (Python's list('text') function)
- BPE Subword (Pre-Trained by the gpt2 model)
- Wordpiece Subword (Pre-Trained by the bert-base-cased model)
- Unigram Subword (Pre-Trained by the t5-small model)

In [37]:
from transformers import AutoTokenizer, BasicTokenizer

In [3]:
# Sample inputs shared by every tokenizer in this notebook.
# Each sentence targets a different tokenization challenge.
sentences = [
    # Contractions
    "I can't believe it's already April!",
    # Classic sentence for testing full-word tokenization
    "The quick brown fox jumps over the lazy dog.",
    # Parentheses and abbreviations
    "Artificial intelligence (AI) is evolving rapidly!",
    # Rare and hyphenated words
    "Unbelievably, the anti-establishment movement gained traction.",
    # Emojis and special characters
    "🔥 Emojis & special #hashtags are tricky to tokenize!",
]


## Word-Based Tokenizers

- Splits text into words based on spaces and punctuation.
- Easy to understand but inefficient for handling rare words.
- Issue: Large vocabulary size
- Issue: Difficulty handling out-of-vocabulary (OOV) words.

**Use Cases**
- Early NLP models like Bag-of-Words (BoW)
- Traditional rule-based NLP

In [68]:
# Word-based Tokenization Example
# BasicTokenizer is imported in the notebook's import cell at the top so that
# all dependencies are declared in one place.
# Note: the tokens printed below are lowercased ('i', 'april') even though the
# input sentences are mixed-case, so the lowercasing comes from the tokenizer.
bt = BasicTokenizer()

# One token list per sample sentence, in the same order as `sentences`
word_based_tokens = [bt.tokenize(sentence) for sentence in sentences]

print(f'Example: {word_based_tokens[0]}')

Example: ['i', 'can', "'", 't', 'believe', 'it', "'", 's', 'already', 'april', '!']


## Whitespace Tokenizers
- Splits text on whitespace only; punctuation is not separated and stays attached to the adjacent word.
- Issues: Doesn't differentiate between punctuation and words, leading to inconsistencies.

**Use Cases**
- Very basic NLP preprocessing
- Rule-based text processing

In [69]:
# Whitespace Tokenizer Example
# Hugging Face does not provide a whitespace tokenizer

# Minimal whitespace tokenizer built on str.split
def whitespace_tokenizer(text):
    """Split ``text`` into tokens wherever whitespace occurs.

    Calling ``split()`` with no separator breaks on any run of whitespace
    (spaces, tabs, newlines) and never yields empty tokens. Punctuation is
    left attached to the word it touches.
    """
    pieces = text.split()
    return pieces

# One token list per sample sentence, in the same order as `sentences`
whitespace_tokens = [whitespace_tokenizer(sentence) for sentence in sentences]

print(f'Example: {whitespace_tokens[0]}')

Example: ['I', "can't", 'believe', "it's", 'already', 'April!']


## Character-Based Tokenizers
- Splits text into individual characters.
- Handles rare words well because every character exists in the vocabulary.
- Issue: Very long sequences
- Issue: Harder for models to learn meaning from individual characters.

**Use Cases**
- OCR (Optical Character Recognition)
- Some speech recognition models
- Character-level RNNs and CNNs



In [70]:
# Character-Based Tokenizer Example
# Hugging Face does not provide a character-based tokenizer

# Minimal character-level tokenizer
def character_based_tokenizer(text):
    """Return the characters of ``text`` as a list, one token per character.

    Nothing is dropped: spaces and punctuation each become their own token.
    """
    return [character for character in text]

# One character list per sample sentence, in the same order as `sentences`
character_tokens = [character_based_tokenizer(sentence) for sentence in sentences]

print(f'Example: {character_tokens[0]}')

Example: ['I', ' ', 'c', 'a', 'n', "'", 't', ' ', 'b', 'e', 'l', 'i', 'e', 'v', 'e', ' ', 'i', 't', "'", 's', ' ', 'a', 'l', 'r', 'e', 'a', 'd', 'y', ' ', 'A', 'p', 'r', 'i', 'l', '!']


## Subword Tokenizers
- Most Popular in Modern NLP
- Breaks words into meaningful subwords to balance vocabulary size and efficiency.
- Can handle rare and compound words effectively.
- Issue: Requires training a tokenizer model before use.

### Byte-Pair Encoding (BPE) Subword Tokenizer
- Merges frequently occurring character pairs to form subwords.
- Used by GPT-2 and RoBERTa; also one of the algorithms implemented by the SentencePiece library.

In [71]:
# Fetch the pretrained tokenizer published with the "gpt2" checkpoint
bpe_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Report the tokenizer wrapper class and the class of its backend model
print(f'Tokenizer Name: {bpe_tokenizer.__class__.__name__}')
print(f'Tokenizer Type: {bpe_tokenizer.backend_tokenizer.model.__class__.__name__}')

# One token list per sample sentence, in the same order as `sentences`
bpe_tokens = [bpe_tokenizer.tokenize(sentence) for sentence in sentences]

print(f'Example: {bpe_tokens[0]}')

Tokenizer Name: GPT2TokenizerFast
Tokenizer Type: BPE
Example: ['I', 'Ġcan', "'t", 'Ġbelieve', 'Ġit', "'s", 'Ġalready', 'ĠApril', '!']


### WordPiece Subword Tokenizer

- Similar to BPE, but picks each merge by how much it increases the likelihood of the training data rather than by raw pair frequency.
- Used in BERT, DistilBERT.

In [75]:
# Fetch the pretrained tokenizer published with the "bert-base-cased" checkpoint
wp_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Report the tokenizer wrapper class and the class of its backend model
print(f'Tokenizer Name: {wp_tokenizer.__class__.__name__}')
print(f'Tokenizer Type: {wp_tokenizer.backend_tokenizer.model.__class__.__name__}')

# One token list per sample sentence, in the same order as `sentences`
wp_tokens = [wp_tokenizer.tokenize(sentence) for sentence in sentences]

print(f'Example: {wp_tokens[0]}')

Tokenizer Name: BertTokenizerFast
Tokenizer Type: WordPiece
Example: ['I', 'can', "'", 't', 'believe', 'it', "'", 's', 'already', 'April', '!']


### Unigram Language Model
- Uses probabilities to split words into subwords differently based on the dataset.
- Used in ALBERT, T5, XLNet.

In [79]:
# Fetch the pretrained tokenizer published with the 't5-small' checkpoint
ulm_tokenizer = AutoTokenizer.from_pretrained('t5-small')

# Report the tokenizer wrapper class and the class of its backend model
print(f'Tokenizer Name: {ulm_tokenizer.__class__.__name__}')
print(f'Tokenizer Type: {ulm_tokenizer.backend_tokenizer.model.__class__.__name__}')

# One token list per sample sentence, in the same order as `sentences`
ulm_tokens = [ulm_tokenizer.tokenize(sentence) for sentence in sentences]

print(f'Example: {ulm_tokens[0]}')

Tokenizer Name: T5TokenizerFast
Tokenizer Type: Unigram
Example: ['▁I', '▁can', "'", 't', '▁believe', '▁it', "'", 's', '▁already', '▁April', '!']


## Print examples side-by-side

In [78]:
# Pair each section heading with the per-sentence token lists it introduces,
# listed in the order the sections are printed.
comparison_sections = [
    ('\nWord-based Tokenization:', word_based_tokens),
    ('\nWhitespace Tokenization:', whitespace_tokens),
    ('\nCharacter-based Tokenization:', character_tokens),
    ('\nBPE Subword Tokenization:', bpe_tokens),
    ('\nWordpiece Subword Tokenization:', wp_tokens),
    ('\nUnigram Subword Tokenization:', ulm_tokens),
]

# For every sample sentence, print the sentence followed by each tokenizer's
# result under its own heading, then a separator line.
for index, value in enumerate(sentences):
    print(f'Sentence: {value}')
    for heading, tokens_per_sentence in comparison_sections:
        print(heading)
        print('----------------------------------')
        print(tokens_per_sentence[index])
    print('\n=======================================================\n')

Sentence: I can't believe it's already April!

Word-based Tokenization:
----------------------------------
['i', 'can', "'", 't', 'believe', 'it', "'", 's', 'already', 'april', '!']

Whitespace Tokenization:
----------------------------------
['I', "can't", 'believe', "it's", 'already', 'April!']

Character-based Tokenization:
----------------------------------
['I', ' ', 'c', 'a', 'n', "'", 't', ' ', 'b', 'e', 'l', 'i', 'e', 'v', 'e', ' ', 'i', 't', "'", 's', ' ', 'a', 'l', 'r', 'e', 'a', 'd', 'y', ' ', 'A', 'p', 'r', 'i', 'l', '!']

BPE Subword Tokenization:
----------------------------------
['I', 'Ġcan', "'t", 'Ġbelieve', 'Ġit', "'s", 'Ġalready', 'ĠApril', '!']

Wordpiece Subword Tokenization:
----------------------------------
['I', 'can', "'", 't', 'believe', 'it', "'", 's', 'already', 'April', '!']

Unigram Subword Tokenization:
----------------------------------
['▁I', '▁can', "'", 't', '▁believe', '▁it', "'", 's', '▁already', '▁April', '!']


Sentence: The quick brown fox jump