In [1]:
from __future__ import annotations

import string
import re
from transformers import AutoTokenizer

# Intro

# Tokenization Steps

### Normalization

In [2]:
def normalize_text(text: str) -> str:
    # Can be multiple actions to normalize text
    # Only keep ASCII letters, numbers, punctuation, and whitespace characters
    # Evquivalent to string.printable
    acceptable_characters = (
        string.ascii_letters
        + string.digits
        + string.punctuation
        + string.whitespace
    )
    normalized_text = ''.join(
        filter(lambda letter: letter in acceptable_characters, text)
    )
    # Make text lower-case
    normalized_text = normalized_text.lower()
    return normalized_text

### Pretokenization

In [3]:
def pretokenize_text(text: str) -> list[str]:
    # Character-based
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [4]:
def pretokenize_text(text: str) -> list[str]:
    # Split based on spaces
    smaller_pieces = text.split()
    return smaller_pieces

### Tokenization

In [5]:
# Combine normalization and pretokenization steps before breaking things further
def tokenize_text(text: str) -> list[str]:
    # Apply created steps 
    normalized_text: str = normalize_text(text)
    pretokenized_text: list[str] = pretokenize_text(normalized_text)
    tokens = []
    # Go through small pieces to make full tokens
    for word in pretokenized_text:
        tokens.extend(
            re.findall(
                f'[\w]+|[{string.punctuation}]', # Split word at punctuations 
                word,
            )
        )
    return tokens

### Postprocessing

In [6]:
# Useful for some tasks
def postprocess_tokens(tokens: list[str]) -> list[str]:
    # Add beginning and end of sequence tokens
    bos_token = '##BOS##'
    eos_token = '##EOS##'
    updated_tokens = (
        [bos_token]
        + tokens
        + [eos_token]
    )
    return updated_tokens

### Encoding: Putting It All Together

We can now try out our full tokenization process! Let's use this sample text to
see how our tokenization pipeline handles it!

In [7]:
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''

print(sample_text)
sample_text

Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"



'Mr. Louis continued to say, "Penguins are important, \nbut we mustn\'t forget the nuumber 1 priority: the READER!"\n'

In [8]:
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(tokens)

['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']


We now need to encode the tokens to IDs we'll give to a model. But first we need
to define how to map each token to a unique ID. An easy method can be to
arbitrarily count the unique tokens from our corpus.

We'll use the following as our sample corpus.

In [9]:
# Normally this would be much bigger
sample_corpus = (
    '''Mr. Louis continued to say, "Penguins are important, \nbut we mustn't forget the nuumber 1 priority: the READER!"''',
    '''BRUTUS:\nHe's a lamb indeed, that baes like a bear.''',
    '''Both by myself and many other friends:\mBut he, his own affections' counsellor,\nIs to himself--I will not say how true--\nBut to himself so secret and so close,'''
)

In [10]:
# Retrieve unique tokens (from the pipeline defined above) in a set
unique_tokens = set()
for text in sample_corpus:
    tokens_from_text = tokenize_text(text)
    tokens_from_text = postprocess_tokens(tokens_from_text)
    unique_tokens.update(tokens_from_text)

In [11]:
# Create mapping (dictionary) for unique tokens using arbitrary & unique IDs
token2id = {
    token: idx
    for idx, token in enumerate(unique_tokens)
}

For good measure, create a mapping for IDs to convert back to token

In [12]:
id2token = {idx: token for token, idx in token2id.items()}

Let's create our encoder and decoder to transform our tokens to IDS and back

In [13]:
def encode(tokens: list[str]) -> list[int]:
    # Note this doesn't handle tokens not mapped
    encoded_tokens = [
        token2id[token]
        for token in tokens
    ]
    return encoded_tokens


In [14]:
def decode(ids: list[int]) -> list[str]:
    token_strings = [
        id2token[idx]
        for idx in ids
    ]
    return token_strings

In [15]:
# Testing out encoding and decoding 
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(f'Tokens:\n{tokens}\n')

encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

decoded_tokens = decode(encoded_tokens)
print(f'Decoded Tokens:\n{decoded_tokens}\n')

Tokens:
['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']

Encoded Tokens:
[33, 43, 57, 45, 31, 41, 32, 4, 20, 14, 21, 9, 4, 46, 35, 47, 5, 39, 49, 58, 38, 51, 53, 26, 58, 6, 22, 20, 8]

Decoded Tokens:
['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']

