In [1]:
from __future__ import annotations

import string
import re
from transformers import AutoTokenizer

# Intro

# Tokenization Steps

### Normalization

In [2]:
def normalize_text(text: str) -> str:
    # Can be multiple actions to normalize text
    # Only keep ASCII letters, numbers, punctuation, and whitespace characters
    # Evquivalent to string.printable
    acceptable_characters = (
        string.ascii_letters
        + string.digits
        + string.punctuation
        + string.whitespace
    )
    normalized_text = ''.join(
        filter(lambda letter: letter in acceptable_characters, text)
    )
    # Make text lower-case
    normalized_text = normalized_text.lower()
    return normalized_text

### Pretokenization

In [3]:
def pretokenize_text(text: str) -> list[str]:
    # Character-based
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [4]:
def pretokenize_text(text: str) -> list[str]:
    # Split based on spaces
    smaller_pieces = text.split()
    return smaller_pieces

### Tokenization

In [5]:
# Combine normalization and pretokenization steps before breaking things further
def tokenize_text(text: str) -> list[str]:
    # Apply created steps 
    normalized_text: str = normalize_text(text)
    pretokenized_text: list[str] = pretokenize_text(normalized_text)
    tokens = []
    # Go through small pieces to make full tokens
    for word in pretokenized_text:
        tokens.extend(
            re.findall(
                f'[\w]+|[{string.punctuation}]', # Split word at punctuations 
                word,
            )
        )
    return tokens

### Postprocessing

In [6]:
# Useful for some tasks
def postprocess_tokens(tokens: list[str]) -> list[str]:
    # Add beginning and end of sequence tokens
    bos_token = '##BOS##'
    eos_token = '##EOS##'
    updated_tokens = (
        [bos_token]
        + tokens
        + [eos_token]
    )
    return updated_tokens

### Encoding: Putting It All Together

We can now try out our full tokenization process! Let's use this sample text to
see how our tokenization pipeline handles it!

In [7]:
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''

print(sample_text)
sample_text

Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"



'Mr. Louis continued to say, "Penguins are important, \nbut we mustn\'t forget the nuumber 1 priority: the READER!"\n'

In [8]:
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(tokens)

['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']


We now need to encode the tokens to IDs we'll give to a model. But first we need
to define how to map each token to a unique ID. An easy method can be to
arbitrarily count the unique tokens from our corpus.

We'll use the following as our sample corpus.

In [9]:
# Normally this would be much bigger
sample_corpus = (
    '''Mr. Louis continued to say, "Penguins are important, \nbut we mustn't forget the nuumber 1 priority: the READER!"''',
    '''BRUTUS:\nHe's a lamb indeed, that baes like a bear.''',
    '''Both by myself and many other friends:\mBut he, his own affections' counsellor,\nIs to himself--I will not say how true--\nBut to himself so secret and so close,'''
)

In [10]:
# Retrieve unique tokens (from the pipeline defined above) in a set
unique_tokens = set()
for text in sample_corpus:
    tokens_from_text = tokenize_text(text)
    tokens_from_text = postprocess_tokens(tokens_from_text)
    unique_tokens.update(tokens_from_text)

In [11]:
# Create mapping (dictionary) for unique tokens using arbitrary & unique IDs
token2id = {
    token: idx
    for idx, token in enumerate(unique_tokens)
}

For good measure, create a mapping for IDs to convert back to token

In [12]:
id2token = {idx: token for token, idx in token2id.items()}

Let's create our encoder and decoder to transform our tokens to IDS and back

In [13]:
def encode(tokens: list[str]) -> list[int]:
    # Note this doesn't handle tokens not mapped
    encoded_tokens = [
        token2id[token]
        for token in tokens
    ]
    return encoded_tokens


In [14]:
def decode(ids: list[int]) -> list[str]:
    token_strings = [
        id2token[idx]
        for idx in ids
    ]
    return token_strings

In [15]:
# Testing out encoding and decoding 
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(f'Tokens:\n{tokens}\n')

encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

decoded_tokens = decode(encoded_tokens)
print(f'Decoded Tokens:\n{decoded_tokens}\n')

Tokens:
['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']

Encoded Tokens:
[36, 38, 18, 32, 1, 49, 12, 29, 10, 8, 58, 16, 29, 21, 57, 2, 27, 46, 50, 47, 51, 3, 20, 13, 47, 11, 44, 10, 26]

Decoded Tokens:
['##BOS##', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '##EOS##']



## Tokenization Reflection

# Using Hugging Face Tokenizers

Tokenization is an important step in most NLP tasks. Hugging Face has been an
invaluable resource in training, using, and sharing different tokenizers!

The API is flexible where you can use a tokenizer off the shelf, fine-tune a
tokenizer with your own data, or even train your own completely from scratch!

### Loading Tokenizer

In this notebook, we'll explore Hugging Face's tokenizers by using a pretrained
model. Hugging Face has many tokenizers available that have already been trained
for specific models and tasks!

In [16]:
# Choose a pretrained tokenizer to use
my_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

### Encoding: Text to Tokens

There's a couple ways to get the tokens from text. The simplest is calling
`.tokenize()` on the text.

In [17]:
raw_text = '''Rory\'s shoes are magenta and so are Corey\'s but they aren\'t nearly as dark!'''
tokens = my_tokenizer.tokenize(raw_text)

print(tokens)

['Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!']


Another is calling the tokenizer with the text and then calling the `.tokens()`
method.

This will also return some special tokens depending on the pretrained tokenizer
used.

In [18]:
detailed_tokens = my_tokenizer(raw_text).tokens()

print(detailed_tokens)

['[CLS]', 'Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!', '[SEP]']


To get the tokens as integer IDs, there are again a few methods.

The first is using the tokenizers `.encode()` method on the text.

In [19]:
print(my_tokenizer.encode(raw_text))

[101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102]


We can also the `.convert_tokens_to_ids()` tokenizer method if we already have
the tokens (as strings) to get the IDs

In [20]:
print(detailed_tokens)
detailed_ids = my_tokenizer.convert_tokens_to_ids(detailed_tokens)
print(detailed_ids)

['[CLS]', 'Rory', "'", 's', 'shoes', 'are', 'mage', '##nta', 'and', 'so', 'are', 'Corey', "'", 's', 'but', 'they', 'aren', "'", 't', 'nearly', 'as', 'dark', '!', '[SEP]']
[101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102]


Another way can look a little complex but can be useful when working with
tokenizers for certain tasks.

We first call the tokenizer on the text like we did last time but with no extra 
method.

This returns an object that has a few different keys available.

In [21]:
my_tokenizer(raw_text)

{'input_ids': [101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

We'll then focus on `input_ids` which are the IDs associated with the
tokenizations.

In [22]:
print(my_tokenizer(raw_text).input_ids)

[101, 14845, 112, 188, 5743, 1132, 27595, 13130, 1105, 1177, 1132, 19521, 112, 188, 1133, 1152, 4597, 112, 189, 2212, 1112, 1843, 106, 102]


### Decoding: Tokens to Text

### Considerations


- Tokenizers differ
- Long sequences
- Adjusting to your use case (fine-tuning)


## Hugging Face Tokenizers Reflection

# Overall Reflection