In [1]:
from __future__ import annotations

# Exercise: Building a Tokenizer

In this exercise, you'll code your own tokenizer from scratch using base
Python!

You might normally start with a pretrained tokenizer, but this exercise will
help you get to know some of the tokenization steps better.

In [3]:
# Example sentence with mixed case, an accented character, and punctuation
example_text = 'Did Uncle Max like the jalapeño dip?'

## Normalization



This step is where you'll normalize your text by converting to lowercase,
removing punctuation, removing accented characters, etc.

For example, the text:

```python
'Did Uncle Max like the jalapeño dip?'
```

might be normalized to:

```python
'did uncle max like the jalapeno dip'
```

In [4]:
import string

def normalize_text(text: str) -> str:
    """Normalize text to lowercase ASCII letters, digits, and whitespace.

    Accented characters are reduced to their base letter (e.g. 'ñ' -> 'n')
    rather than being dropped, and punctuation and any other non-ASCII
    characters are removed.

    Args:
        text: Raw input text.

    Returns:
        The normalized, lower-cased text.
    """
    # Function-scope import so this cell does not depend on another import cell
    import unicodedata

    # Decompose accented characters into base letter + combining mark so the
    # base letter survives the ASCII filter below (the mark itself is dropped)
    decomposed_text = unicodedata.normalize('NFKD', text)
    # Only keep ASCII letters, numbers, and whitespace characters
    acceptable_characters = set(
        string.ascii_letters
        + string.digits
        + string.whitespace
    )
    normalized_text = ''.join(
        character
        for character in decomposed_text
        if character in acceptable_characters
    )
    # Make text lower-case
    return normalized_text.lower()

In [None]:
# Test out your normalization
# Normalize the example sentence; the bare name on the last line displays
# the result as the cell output.
text_cleaned = normalize_text(example_text)
text_cleaned

## Pretokenization


This step will take text and pretokenize it into a list of smaller pieces
that will form the base of the tokens.

For example, the text:

```python
'Did Uncle Max like the jalapeño dip?'
```

might be pretokenized (shown here without normalization) to:

```python
[
    'Did',
    'Uncle',
    'Max',
    'like',
    'the',
    'jalapeño',
    'dip?',
]
```

In [5]:
def pretokenize_text(text: str) -> list[str]:
    """Split text into whitespace-delimited pieces.

    Splits on any run of whitespace (spaces, tabs, newlines), so repeated,
    leading, or trailing whitespace never produces empty-string pieces.

    Args:
        text: Text to split (typically already normalized).

    Returns:
        The non-empty pieces in their original order; an empty list when the
        text is empty or contains only whitespace.
    """
    # str.split() with no separator splits on whitespace runs and drops empties
    return text.split()

In [None]:
# Test out your pretokenization step
# Note: this splits the raw `example_text`, not the normalized `text_cleaned`,
# so the pieces keep their original case, accents, and punctuation.
pretokenized_text = pretokenize_text(example_text)
pretokenized_text

## Remove stop words


Remove stop words that you define yourself.

Usually this is already precomputed from prior work, but here you'll decide
what counts as a stop word.

If you're not sure what stop words to use, you can use this list of words:
- of
- the
- a
- an
- at

In [7]:
def remove_stop_words(
    tokens: list[str],
    stop_words: list[str] = list(),
) -> str:
    # TODO: Remove stop words from incoming text
    text_no_stop_words = [
        token
        for token in tokens
        if token not in stop_words
    ]
    return text_no_stop_words

In [None]:
# Test out your stop word removal step (after previous step)
# Pretokenized pieces of the example sentence; 'the' is included so the
# filtering below actually has a stop word to remove.
example_text_as_tokens = [
    'Did',
    'Uncle',
    'Max',
    'like',
    'the',
    'jalapeño',
    'dip?',
]

my_stop_words = [
    'of',
    'the',
    'a',
    'an',
    'at',
]
tokens_filtered = remove_stop_words(
    example_text_as_tokens,
    stop_words=my_stop_words,
)
tokens_filtered

## Finalize Tokens

This step will combine all the previous steps to create the final tokens
that will be used.

For example, the text:
```python
'Did Uncle Max like the jalapeño dip?'
```

might be normalized, pretokenized, and stop words removed to form the final
tokens:

```python
[
    'did',
    'uncle',
    'max',
    'like',
    'jalapeno',
    'dip'
]
```

We will use the provided `sample_text` to test our final tokenizer.

You might decide to go back and adjust your previous steps so that `sample_text`
is tokenized into what you want & expect for your final tokens.

In [None]:
# Multi-sentence sample used to test the final tokenizer; the adjacent string
# literals below are joined into a single string.
sample_text = (
    'The first time you see The Second Renaissance it may look boring. '
    'Look at it at least twice and definitely watch part 2. '
    'It will change your view of the matrix. '
    'Are the human people the ones who started the war? Is AI a bad thing ?'
)
print(sample_text)

In [14]:
# Combine normalization, pretokenization, and stop word removal into one pipeline
def tokenize_text(
    text: str,
    stop_words: list[str] | None = None,
) -> list[str]:
    """Turn raw text into its final list of tokens.

    Steps, in order: normalize the text, split it into pieces, remove stop
    words, and drop any empty strings.

    Args:
        text: Raw input text.
        stop_words: Words to remove after splitting. ``None`` (the default)
            uses the built-in list 'of', 'the', 'a', 'an', 'at'; pass an
            empty list to keep every word.

    Returns:
        The final tokens in their original order.
    """
    if stop_words is None:
        # Default stop words suggested by the exercise
        stop_words = [
            'of',
            'the',
            'a',
            'an',
            'at',
        ]
    # normalizes text - lowercase, no punctuation, only ASCII
    text_normalized: str = normalize_text(text)
    # split into smaller parts after normalization
    pretokenized_text: list[str] = pretokenize_text(text_normalized)
    # remove stop words from the split pieces
    tokens: list[str] = remove_stop_words(
        pretokenized_text,
        stop_words=stop_words,
    )
    # finalize tokens by dropping any empty strings left over from splitting
    return [token for token in tokens if token]

In [None]:
# Run the full tokenizer pipeline on the sample text and display the tokens
tokenize_text(sample_text)