In [5]:
# ============================================
# WordPiece Tokenizer Training + Analysis
# ============================================

# ---------------------------
# Install (Run once if needed)
# ---------------------------
!pip install tokenizers
# ---------------------------
# Import Libraries
# ---------------------------
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import os

# ---------------------------
# Step 1: Create Corpus File (If Not Exists)
# ---------------------------
corpus_file = "corpus.txt"

if not os.path.exists(corpus_file):
    corpus = [
        "Natural language processing is amazing",
        "WordPiece tokenizer is used in BERT models",
        "Tokenization splits text into subwords",
        "NLP is widely used in AI applications",
        "Machine learning and deep learning are part of AI",
        "Transformers are powerful NLP models",
        "BERT uses WordPiece tokenization technique"
    ]

    with open(corpus_file, "w", encoding="utf-8") as f:
        for line in corpus:
            f.write(line + "\n")

print("âœ… Corpus Ready")

# ---------------------------
# Step 2: Initialize Tokenizer
# ---------------------------
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# ---------------------------
# Step 3: Train Tokenizer
# ---------------------------
trainer = WordPieceTrainer(
    vocab_size=1000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

tokenizer.train([corpus_file], trainer)

print("âœ… Tokenizer Training Completed")

# ---------------------------
# Step 4: Save Tokenizer
# ---------------------------
tokenizer.save("wordpiece_tokenizer.json")
print("âœ… Tokenizer Saved")

# ---------------------------
# Step 5: Vocabulary Analysis
# ---------------------------
vocab = tokenizer.get_vocab()

print("\nðŸ“Š Vocabulary Size:", len(vocab))

print("\nðŸ”¹ Sample Vocabulary (Top 20 Tokens):")
vocab_items = list(vocab.items())[:20]

for token, idx in vocab_items:
    print(token, ":", idx)

# ---------------------------
# Step 6: Test Tokenization
# ---------------------------
sample_text = "WordPiece tokenizer is powerful"

encoded = tokenizer.encode(sample_text)

print("\nðŸ§ª Sample Text:", sample_text)
print("Tokens:", encoded.tokens)
print("Token IDs:", encoded.ids)

# ---------------------------
# Step 7: Compare Different Vocabulary Sizes
# ---------------------------
print("\nðŸ“Š Vocabulary Size Comparison")

for size in [100, 300, 500, 1000]:
    temp_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    temp_tokenizer.pre_tokenizer = Whitespace()

    temp_trainer = WordPieceTrainer(
        vocab_size=size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )

    temp_tokenizer.train([corpus_file], temp_trainer)

    temp_vocab = temp_tokenizer.get_vocab()

    print(f"Requested: {size} | Actual Learned: {len(temp_vocab)}")

print("\nâœ… All Steps Completed Successfully")


Defaulting to user installation because normal site-packages is not writeable
âœ… Corpus Ready
âœ… Tokenizer Training Completed
âœ… Tokenizer Saved

ðŸ“Š Vocabulary Size: 198

ðŸ”¹ Sample Vocabulary (Top 20 Tokens):
Mach : 169
##ep : 138
models : 114
Natural : 190
##iec : 100
A : 5
##r : 47
##okeniz : 90
Tokenization : 171
of : 126
R : 13
subwords : 196
##ts : 148
Tra : 120
##ng : 71
##on : 78
##ni : 70
de : 124
subword : 181
learning : 113

ðŸ§ª Sample Text: WordPiece tokenizer is powerful
Tokens: ['WordPiece', 'tokenizer', 'is', 'powerful']
Token IDs: [116, 166, 74, 194]

ðŸ“Š Vocabulary Size Comparison
Requested: 100 | Actual Learned: 100
Requested: 300 | Actual Learned: 198
Requested: 500 | Actual Learned: 196
Requested: 1000 | Actual Learned: 198

âœ… All Steps Completed Successfully
