In [35]:
!pip install torchtext==0.17



# Building a Vocabulary and Tokenizer

1. Build vocabulary
1. Build tokenizer that maps sentences to sequences of tokens, including:
    1. Split
    1. Normalize: lowercase, remove punctuation, etc.
    1. Tokenize
    1. Pad and Truncate
1. Build Embedding layer that maps sequences of tokens to embedding vectors

## With `torchtext`

In [36]:
!pip freeze | find "torch"

torch==2.2.0+cu121
torchdata==0.7.1
torchtext==0.17.0


In [37]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, build_vocab_from_iterator, Vocab

### Build a vocabulary from a frequency dictionary

In [38]:
from collections import OrderedDict

token_frequency_dict = OrderedDict(
    {
        "hello": 1,
        "world": 5,
        "this": 1,
        "is": 3,
        "a": 2,
        "test": 1,
    }
)


vocabulary = vocab(
    token_frequency_dict, specials=["<unk>", "<pad>", "<bos>", "<eos>"], min_freq=1
)


vocabulary.set_default_index(vocabulary["<unk>"])


sorted(vocabulary.get_stoi().items(), key=lambda x: x[1])

[('<unk>', 0),
 ('<pad>', 1),
 ('<bos>', 2),
 ('<eos>', 3),
 ('hello', 4),
 ('world', 5),
 ('this', 6),
 ('is', 7),
 ('a', 8),
 ('test', 9)]

### Build a vocabulary from a corpus

In [39]:
tokenizer = get_tokenizer("basic_english")
tokenizer("You can now install TorchText using pip!")

['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']

In [40]:
# Building a vocabulary from a list of sentences
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)


corpus = ["Hello World", "This is a test", "Hello there!"]
vocab = build_vocab_from_iterator(yield_tokens(corpus), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab.get_stoi())
print(vocab.get_itos())

{'<unk>': 0, '<pad>': 1, 'hello': 2, '!': 3, 'a': 4, 'there': 7, 'test': 6, 'is': 5, 'this': 8, 'world': 9}
['<unk>', '<pad>', 'hello', '!', 'a', 'is', 'test', 'there', 'this', 'world']


### Truncating and Padding Sequences

In [41]:
sample = "Hello world! This is a sample sentence."
# Tokenize the sample sentence
tokens = tokenizer(sample)
# Convert tokens to indices using the vocabulary
indices = [vocab[token] for token in tokens]
indices

[2, 9, 3, 8, 5, 4, 0, 0, 0]

In [42]:
[vocab.get_itos()[idx] for idx in indices]

['hello', 'world', '!', 'this', 'is', 'a', '<unk>', '<unk>', '<unk>']

In [43]:
MAX_LENGTH = 8

# Truncate or pad the sequence to the maximum length
if len(indices) > MAX_LENGTH:
    indices = indices[:MAX_LENGTH]
else:
    indices += [vocab["<pad>"]] * (MAX_LENGTH - len(indices))

# With `HuggingFace`

## Customized

In [44]:
corpus = ["Hello World", "This is a test", "Hello there!"]
sample = "Hello world! This is a sample sentence."
MAX_LENGTH = 8

In [45]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# 1. Create a sample corpus
corpus = [
    "Hello world, how are you?",
    "This is an example of building a word-level tokenizer.",
    "We're using Hugging Face tokenizers library!",
    "Word-level tokenizers split text by whitespace and punctuation.",
    "They're simple but effective for many NLP tasks.",
]

# 2. Initialize a tokenizer with WordLevel model
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))  # type: ignore

# 3. Set the pre-tokenizer to split on whitespace
tokenizer.pre_tokenizer = Whitespace()  # type: ignore

# 4. Initialize a trainer
trainer = WordLevelTrainer(
    min_frequency=1, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]  # type: ignore
)

# 5. Train the tokenizer
tokenizer.train_from_iterator(corpus, trainer)

# 6. Add post-processing for adding special tokens
# tokenizer.post_processor = TemplateProcessing(
#     single="[CLS] $A [SEP]",
#     pair="[CLS] $A [SEP] $B [SEP]",
#     special_tokens=[
#         ("[CLS]", tokenizer.token_to_id("[CLS]")),
#         ("[SEP]", tokenizer.token_to_id("[SEP]")),
#     ],
# )

# 7. Test the tokenizer
test_text = "Hello world! This is a new sentence."
output = tokenizer.encode(test_text)

print(f"Input text: {test_text}")
print(f"Output: {output}")
print(f"Token IDs: {output.ids}")
print(f"Tokens: {output.tokens}")
print(f"Decoded text: {tokenizer.decode(output.ids)}")

# # 8. Save the tokenizer for later use
# tokenizer.save("word_level_tokenizer.json")

# # 9. Loading a saved tokenizer
# loaded_tokenizer = Tokenizer.from_file("word_level_tokenizer.json")

Input text: Hello world! This is a new sentence.
Output: Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Token IDs: [15, 46, 11, 19, 33, 22, 0, 0, 5]
Tokens: ['Hello', 'world', '!', 'This', 'is', 'a', '[UNK]', '[UNK]', '.']
Decoded text: Hello world ! This is a .


In [46]:
from transformers import PreTrainedTokenizerFast  # type: ignore

# 10. Wrap with PreTrainedTokenizerFast for use with transformers
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Test the wrapped tokenizer
# Ref: https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizerFast.__call__
tokenized_output = fast_tokenizer(
    test_text,
    padding="max_length",  # require fast_tokenizer.pad_token
    truncation=True,
    max_length=8,
    return_tensors="pt",
)

print("\nPreTrainedTokenizerFast output:")
print(f"Tokenized output: {tokenized_output}")
print(f"Input IDs: {tokenized_output.input_ids}")
print(
    f"Tokens: {fast_tokenizer.decode(tokenized_output.input_ids[0], skip_special_tokens=True)}"
)

# 11. Save the tokenizer (optional)
fast_tokenizer.save_pretrained("./my_tokenizer")


PreTrainedTokenizerFast output:
Tokenized output: {'input_ids': tensor([[15, 46, 11, 19, 33, 22,  0,  0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
Input IDs: tensor([[15, 46, 11, 19, 33, 22,  0,  0]])
Tokens: Hello world ! This is a


('./my_tokenizer\\tokenizer_config.json',
 './my_tokenizer\\special_tokens_map.json',
 './my_tokenizer\\tokenizer.json')

In [None]:
test_text = [
    "This is a short sentence.",
    "This is a much longer sentence that will definitely exceed the maximum length and therefore should be split into multiple features by the tokenizer when we ask it to return overflowing tokens.",
]

output = fast_tokenizer(
    test_text,
    padding="max_length",  # require fast_tokenizer.pad_token
    truncation=True,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    stride=3,
    max_length=8,
    return_tensors="pt",
)

print("PreTrainedTokenizerFast output:")
# print(f"Tokenized output: {output}")
print(f"Input IDs shape: {output.input_ids.shape}")  # 7 chunks of 8 tokens each
print(
    f"Tokens: '{fast_tokenizer.decode(output.input_ids[0])}' and '{fast_tokenizer.decode(output.input_ids[1])}'"
)  # stride=3 of overlapping tokens between chunks
print(
    f"Overflowing chunk mapping: {output.overflow_to_sample_mapping}"
)  # map chunks to original sentences
print(
    f"Offsets mapping shape: {output.offset_mapping.shape}"
)
print(
    f"Offsets mapping: {output.offset_mapping[0]}"
)  # map tokens to original text positions

PreTrainedTokenizerFast output:
Input IDs shape: torch.Size([7, 8])
Tokens: 'This is a [UNK] [UNK] . [PAD] [PAD]' and 'This is a [UNK] [UNK] [UNK] [UNK] [UNK]'
Overflowing chunk mapping: tensor([0, 1, 1, 1, 1, 1, 1])
Offsets mapping shape: torch.Size([7, 8, 2])
Offsets mapping: tensor([[ 0,  4],
        [ 5,  7],
        [ 8,  9],
        [10, 15],
        [16, 24],
        [24, 25],
        [ 0,  0],
        [ 0,  0]])


In [48]:
fast_tokenizer.sep_token

'[SEP]'

In [49]:
fast_tokenizer.tokenize("Hello world! This is a sample sentence.")

['Hello', 'world', '!', 'This', 'is', 'a', '[UNK]', '[UNK]', '.']

## Pretrained

In [50]:
from transformers import AutoTokenizer

MAX_LENGTH = 8

# Load a pre-trained tokenizer (e.g., BERT)
# This will download the tokenizer files if not already cached. Default to Fast tokenizer if available.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [51]:
# Process a single sentence
sample = "Hello world! This is a sample sentence."

encoding = tokenizer(
    sample,
    padding="max_length",
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt",
)
print("Input text:", sample)
print("\nTokenizer output:")

for key, value in encoding.items():
    print(f"{key}: {value}")

# Decode tokens back to text
tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
print("\nTokens:", tokens)

Input text: Hello world! This is a sample sentence.

Tokenizer output:
input_ids: tensor([[ 101, 7592, 2088,  999, 2023, 2003, 1037,  102]])
token_type_ids: tensor([[0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1]])

Tokens: ['[CLS]', 'hello', 'world', '!', 'this', 'is', 'a', '[SEP]']


In [52]:
# Process corpus
corpus = ["Hello World", "This is a test", "Hello there!"]

batch_encoding = tokenizer(
    corpus,
    padding="max_length",
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt",
)

print("Batch encoded corpus shape:", batch_encoding["input_ids"].shape)
# Compare with our custom tokenization
print("Custom tokenization vs Hugging Face:")

for sentence in corpus:
    print(f"Sentence: {sentence}")
    print(f"Custom tokens: {tokenizer(sentence)}")
    print(f"HF tokens: {tokenizer.tokenize(sentence)}")
    print()

Batch encoded corpus shape: torch.Size([3, 8])
Custom tokenization vs Hugging Face:
Sentence: Hello World
Custom tokens: {'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
HF tokens: ['hello', 'world']

Sentence: This is a test
Custom tokens: {'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}
HF tokens: ['this', 'is', 'a', 'test']

Sentence: Hello there!
Custom tokens: {'input_ids': [101, 7592, 2045, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
HF tokens: ['hello', 'there', '!']



# Embedding

## With `PyTorch` (untrained)

In [53]:
from torch.nn import Embedding

In [54]:
embedding = Embedding(
    num_embeddings=len(vocab), embedding_dim=10, padding_idx=vocab["<pad>"]
)
embedding(torch.tensor([vocab["Hello"], vocab["exam"]]))

tensor([[-0.8712, -0.3465,  0.5327,  0.1636, -0.7755, -0.1996, -1.3155, -1.4636,
         -0.4670,  1.7574],
        [-0.8712, -0.3465,  0.5327,  0.1636, -0.7755, -0.1996, -1.3155, -1.4636,
         -0.4670,  1.7574]], grad_fn=<EmbeddingBackward0>)

## With `torchtext` (pretrained)

In [55]:
import torchtext.vocab as vocab_utils

# Load a pretrained embedding (GloVe)
glove = vocab_utils.GloVe(name="6B", dim=100)

# Example of using the pretrained embeddings
print(f"Vocabulary size: {len(glove.stoi):,}")
print(f"Embedding dimension: {glove.vectors.shape[1]}")

# Get vectors for specific words
word = "hello"
if word in glove.stoi:
    word_idx = glove.stoi[word]
    word_vector = glove.vectors[word_idx]
    print(f"Vector for '{word}': {word_vector[:5]}...")  # Show first 5 dimensions


# Compare similar words using cosine similarity
def get_similarity(word1, word2):
    if word1 in glove.stoi and word2 in glove.stoi:
        vec1 = glove.vectors[glove.stoi[word1]]
        vec2 = glove.vectors[glove.stoi[word2]]
        return torch.nn.functional.cosine_similarity(
            vec1.unsqueeze(0), vec2.unsqueeze(0)
        )
    return None


# Print some word similarities
word_pairs = [("king", "queen"), ("man", "woman"), ("good", "bad"), ("hello", "world")]
for w1, w2 in word_pairs:
    similarity = get_similarity(w1, w2)
    if similarity is not None:
        print(f"Similarity between '{w1}' and '{w2}': {similarity.item():.4f}")

Vocabulary size: 400,000
Embedding dimension: 100
Vector for 'hello': tensor([ 0.2669,  0.3963,  0.6169, -0.7745, -0.1039])...
Similarity between 'king' and 'queen': 0.7508
Similarity between 'man' and 'woman': 0.8323
Similarity between 'good' and 'bad': 0.7703
Similarity between 'hello' and 'world': 0.2041
