### Adapted from Deep Learning with Python, 2e and 3e, Chollet and Chollet and Watson.

#### Block 0A:  Import general purpose packages for ease of use and improved performance.

In [1]:
import matplotlib.pyplot as plt
import numpy as np

import collections
import re
import string

#### Block 0B:  Import packages for a typical deep learning workflow using TensorFlow and Keras.  Import the IMDB dataset from Keras.

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#### Block 1A:  Define a function to create a vocabulary using single characters as tokens.

In [3]:
def set_charlvl_vocabulary(inputs, max_size):
    # Determine the number of occurrences of each character-level
    # token in the text sample.
    char_counts = collections.Counter()
    for elem in inputs:
        elem = elem.lower()
        tokens = re.findall(r'.', elem)
        char_counts.update(tokens)

    # Initialize the vocabulary with the tokens used to pad vectorized
    # text to a consistent length and to signal an out-of-vocabulary
    # word.
    vocabulary = ['[PAD]', '[UNK]']

    # Fill the remainder of the vocabulary with the most common tokens.
    most_common = char_counts.most_common(max_size - len(vocabulary))
    for token, count in most_common:
        vocabulary.append(token)

    # This function returns a Python **dict** mapping each character token
    # (string) to its integer index. Verified by calling:
    #     >>> type(set_charlvl_vocabulary(["Hello!"], 50))
    #     <class 'dict'>
    return {token: i for i, token in enumerate(vocabulary)}


- The **encoded length** (X) roughly matches the count of words/characters in our test sentence.
- Any discrepancies arise because words not in the top-N get mapped to `[UNK]`, and padding or truncation may occur.


#### Block 1B:  Define a function that creates a vocabulary using individual words as tokens.

In [4]:
def set_wordlvl_vocabulary(inputs, max_size):
    # Determine the number of occurrences of each word-level
    # token in the text sample.
    word_counts = collections.Counter()
    for elem in inputs:
        elem = elem.lower()
        # Regex explanation:
        #   [\w]+    → one or more “word” characters (letters, digits, underscore)
        #   [.,!?;]? → optionally followed by a single punctuation mark
        tokens = re.findall(r'[\w]+[.,!?;]?', elem)
        word_counts.update(tokens)

    # Initialize the vocabulary with the tokens used to pad vectorized
    # text to a consistent length and to signal an out-of-vocabulary word.
    vocabulary = ['[PAD]', '[UNK]']

    # Fill the remainder of the vocabulary with the most common tokens.
    most_common = word_counts.most_common(max_size - len(vocabulary))
    for token, count in most_common:
        vocabulary.append(token)

    # This function returns a Python **dict** mapping each word token
    # (a string like "hello" or "test,") to a unique integer index.
    # We verified this by calling:
    #     >>> type(set_wordlvl_vocabulary(["This is a test."], 50))
    #     <class 'dict'>
    return {token: i for i, token in enumerate(vocabulary)}

**Regex difference in 1A vs. 1B**

- **Char-level** uses `r'.'` to grab every single character (letters, spaces, punctuation).
- **Word-level** uses `r'[\w]+[.,!?;]?` to grab whole words plus an optional punctuation mark.


#### Block 2A:  Define a class for pre-processing text data at the character level.

In [5]:
class CharTokenizer:
    def __init__(self, vocabulary: dict[str, int]):
        """
        Initialize a character-level tokenizer.

        Args:
            vocabulary: A dict mapping character tokens (str) to integer indices.
                        Must include '[PAD]' and '[UNK]'.
        """
        self.vocabulary: dict[str, int] = vocabulary
        self.pad_id: int = vocabulary['[PAD]']
        self.unk_id: int = vocabulary['[UNK]']
        # Build inverse lookup so we can go from index → token when decoding
        self.inverse_vocabulary: dict[int, str] = {idx: tok for tok, idx in vocabulary.items()}

    def __call__(self, text: str) -> list[int]:
        """
        Tokenize and encode a raw string in one call.

        Args:
            text: The input string to vectorize.

        Returns:
            A list of integer token indices.
        """
        # 1) clean up the text
        clean = self.standardize(text)
        # 2) split into character tokens
        tokens = self.tokenize(clean)
        # 3) map tokens → integer indices
        return self.encode(tokens)

    def standardize(self, text: str) -> str:
        """
        Lowercase the text and strip out any punctuation.

        Args:
            text: Raw input string.

        Returns:
            A cleaned string containing only lowercase letters, digits, and whitespace.
        """
        text = text.lower()
        # Drop any character in string.punctuation
        return "".join(ch for ch in text if ch not in string.punctuation)

    def tokenize(self, text: str) -> list[str]:
        """
        Split a cleaned string into a list of single-character tokens.

        Args:
            text: The output of `standardize()`.

        Returns:
            A list where each entry is one character.
        """
        # The regex r'.' matches every character (including spaces)
        return re.findall(r'.', text)

    def encode(self, tokens: list[str]) -> list[int]:
        """
        Map each character token to its integer index.

        Args:
            tokens: List of single-character strings.

        Returns:
            List of ints; unknown tokens become self.unk_id.
        """
        return [self.vocabulary.get(tok, self.unk_id) for tok in tokens]

    def decode(self, indices: list[int]) -> str:
        """
        Map a sequence of indices back into a string.

        Args:
            indices: List of integer token IDs.

        Returns:
            A reconstructed string, with unknown indices as '[UNK]'.
        """
        return "".join(self.inverse_vocabulary.get(idx, '[UNK]') for idx in indices)


# === Quick test / sandbox for Block 2A ===
if __name__ == "__main__":
    # Build a tiny sample vocabulary
    sample_vocab = {'[PAD]': 0, '[UNK]': 1, 'a': 2, 'b': 3, ' ': 4}
    tok = CharTokenizer(sample_vocab)

    text = "Ab a!"
    print("Original:", text)
    encoded = tok(text)
    print("Encoded IDs:", encoded)           # e.g. [2, 3, 4, 2, 1]
    print("Decoded back:", tok.decode(encoded))
    # Verify return types
    print("encode() returns:", type(encoded), "with length", len(encoded))


Original: Ab a!
Encoded IDs: [2, 3, 4, 2]
Decoded back: ab a
encode() returns: <class 'list'> with length 4


#### Block 2B:  Define a class for pre-processing text data at the word level.

In [6]:
class WordTokenizer:
    def __init__(self, vocabulary: dict[str, int]):
        """
        Initialize a word-level tokenizer.

        Args:
            vocabulary: A dict mapping word tokens (str) to integer indices.
                        Must include '[PAD]' and '[UNK]'.
        """
        self.vocabulary: dict[str, int] = vocabulary
        self.pad_id: int = vocabulary['[PAD]']
        self.unk_id: int = vocabulary['[UNK]']
        # Build reverse lookup for decode()
        self.inverse_vocabulary: dict[int, str] = {
            idx: tok for tok, idx in vocabulary.items()
        }

    def __call__(self, text: str) -> list[int]:
        """
        Standardize → tokenize → encode in one step.

        Args:
            text: Raw input string.

        Returns:
            A list of integer token IDs.
        """
        clean = self.standardize(text)
        tokens = self.tokenize(clean)
        return self.encode(tokens)

    def standardize(self, text: str) -> str:
        """
        Lowercase and strip out punctuation.

        Args:
            text: Raw input string.

        Returns:
            A cleaned string containing only lowercase letters, digits, and spaces.
        """
        text = text.lower()
        # Drop anything in string.punctuation
        return "".join(ch for ch in text if ch not in string.punctuation)

    def tokenize(self, text: str) -> list[str]:
        """
        Split text into word-level tokens.

        Args:
            text: The standardized string.

        Returns:
            A list of tokens, where each token is one “word” plus optional punctuation.
        """
        # [\w]+  → one or more word characters (letters, digits, underscore)
        # [.,!?;]? → optional trailing punctuation
        return re.findall(r'[\w]+[.,!?;]?', text)

    def encode(self, tokens: list[str]) -> list[int]:
        """
        Map each token to its integer index.

        Args:
            tokens: List of word tokens.

        Returns:
            List of ints; missing words become self.unk_id.
        """
        return [self.vocabulary.get(tok, self.unk_id) for tok in tokens]

    def decode(self, indices: list[int]) -> str:
        """
        Map a sequence of indices back into a string.

        Args:
            indices: List of token IDs.

        Returns:
            A reconstructed string (words joined by spaces), with unknowns as '[UNK]'.
        """
        return " ".join(self.inverse_vocabulary.get(idx, '[UNK]') for idx in indices)


# === Quick sandbox to verify Block 2B ===
if __name__ == "__main__":
    sample_vocab = {
        '[PAD]': 0,
        '[UNK]': 1,
        'hello': 2,
        'world': 3,
    }
    tok = WordTokenizer(sample_vocab)

    text = "Hello, world!"
    encoded = tok(text)

    print("Original:    ", text)
    print("Standardized:", tok.standardize(text))      # "hello world"
    print("Tokens:      ", tok.tokenize(tok.standardize(text)))  # ["hello", "world"]
    print("Encoded:     ", encoded)                   # [2, 3]
    print("Decoded:     ", tok.decode(encoded))       # "hello world"
    print("encode() returns:", type(encoded), "length:", len(encoded))

Original:     Hello, world!
Standardized: hello world
Tokens:       ['hello', 'world']
Encoded:      [2, 3]
Decoded:      hello world
encode() returns: <class 'list'> length: 2


#### Block 3: Retrieve the chosen book from Project Gutenberg.  Find the line numbers of the beginning and ending lines of the actual text of the book.  These line numbers will be used to avoid including extraneous text in the analysis.

In [9]:
# Read the book as a list of strings.  Each string is a line from
# the text file, terminated by a newline character.
# Block 3: Download & trim Project Gutenberg text

from tensorflow.keras.utils import get_file

# 1. Download and read all lines
path = get_file(
    'pg46.txt',
    origin='https://www.gutenberg.org/cache/epub/46/pg46.txt'
)
with open(path, encoding='utf-8') as f:
    lines = f.readlines()

# 2. Find the slice points by substring (works whether the line "starts with" or not)
start_idx = next(i for i, L in enumerate(lines) if 'START OF' in L) + 1
end_idx   = next(i for i, L in enumerate(lines) if 'END OF'   in L)

# 3. Extract just the book text
book = lines[start_idx:end_idx]

# 4. Sanity check & report
assert all('START OF' not in L for L in book)
assert all('END OF'   not in L for L in book)
print(f"Trimmed to lines {start_idx}–{end_idx}, total {len(book)} lines")


Trimmed to lines 25–3550, total 3525 lines


#### Block 4:  Verify the book content.

In [10]:
## Add code in this cell to display short portions from the beginning and end of the book.

# How many lines to show at each end
N = 20

print(f"---- First {N} lines of the book ----\n")
# Join preserves the original newlines
print("".join(book[:N]))

print(f"\n---- Last {N} lines of the book ----\n")
print("".join(book[-N:]))

---- First 20 lines of the book ----





A CHRISTMAS CAROL

IN PROSE
BEING
A Ghost Story of Christmas

by Charles Dickens



PREFACE

I HAVE endeavoured in this Ghostly little book,
to raise the Ghost of an Idea, which shall not put my
readers out of humour with themselves, with each other,
with the season, or with me.  May it haunt their houses


---- Last 20 lines of the book ----

me. Assure me that I yet may change these shadows you
have shown me, by an altered life!"

The kind hand trembled.

"I will honour Christmas in my heart, and try to keep it
all the year. I will live in the Past, the Present, and the
Future. The Spirits of all Three shall strive within me. I
will not shut out the lessons that they teach. Oh, tell me I
may sponge away the writing on this stone!"

In his agony, he caught the spectral hand. It sought to
free itself, but he was strong in his entreaty, and detained it.
The Spirit, stronger yet, repulsed him.

Holding up his hands in a last prayer to have his fa

#### Block 5A:  Build a vocabulary and tokenize it for use with the character-level tokenizer.

In [11]:
# 1. Create the vocabulary (top 100 most frequent characters) and the tokenizer
char_vocabulary = set_charlvl_vocabulary(book, max_size=100)
char_tokenizer  = CharTokenizer(char_vocabulary)

# 2. How big is our vocab?
vocab_size = len(char_vocabulary)
print(f"Character vocabulary size: {vocab_size}")

# 3. Reconstruct the ordered list of tokens from the dict
#    (so we can see which are most vs. least common)
tokens_by_index = [
    token
    for token, idx in sorted(char_vocabulary.items(), key=lambda kv: kv[1])
]

# 4. Inspect the “top” 5 (after [PAD] & [UNK]) and the bottom 5 tokens
print("Top-5 tokens:", tokens_by_index[2:7])
print("Bottom-5 tokens:", tokens_by_index[-5:])

# 5. Try it out on a sample sentence
sample = "It was the best of times, it was the worst of times."
print("\nSample sentence:", sample)

#    Expect roughly len(sample) minus punctuation characters
expected_len = len([ch for ch in sample.lower() if ch not in string.punctuation])
print("Expected encoded length (no punctuation):", expected_len)

#    Actually encode
encoded = char_tokenizer(sample)
print("Actual encoded length:", len(encoded))

#    (Optional) peek at the first 20 IDs
print("First 20 token IDs:", encoded[:20])

Character vocabulary size: 44
Top-5 tokens: [' ', 'e', 't', 'o', 'a']
Bottom-5 tokens: [')', '1', '8', '4', '3']

Sample sentence: It was the best of times, it was the worst of times.
Expected encoded length (no punctuation): 50
Actual encoded length: 50
First 20 token IDs: [8, 4, 2, 15, 6, 10, 2, 4, 7, 3, 2, 23, 3, 10, 4, 2, 5, 20, 2, 4]


#### Block 5B:  Check that encoding and decoding at the character level are (roughly) the inverse of one another.

In [12]:
## Add code in this cell that checks whether encoding and decoding at the
## character level are inverses.

# Sample texts to test
samples = [
    "It was the best of times, it was the worst of times.",
    "Hello, World!",
    "Deep Learning 101?"
]

for text in samples:
    # 1) Standardize removes punctuation and lowercases
    std = char_tokenizer.standardize(text)
    # 2) Encode then immediately decode
    encoded = char_tokenizer(text)
    decoded = char_tokenizer.decode(encoded)

    # 3) Print a side-by-side comparison
    print(f"Original     : {text}")
    print(f"Standardized : {std!r}")
    print(f"Decoded      : {decoded!r}")
    print("Match?       :", std == decoded)
    print("-" * 60)

Original     : It was the best of times, it was the worst of times.
Standardized : 'it was the best of times it was the worst of times'
Decoded      : 'it was the best of times it was the worst of times'
Match?       : True
------------------------------------------------------------
Original     : Hello, World!
Standardized : 'hello world'
Decoded      : 'hello world'
Match?       : True
------------------------------------------------------------
Original     : Deep Learning 101?
Standardized : 'deep learning 101'
Decoded      : 'deep learning 1[UNK]1'
Match?       : False
------------------------------------------------------------


#### Block 6A:  Build a vocabulary and tokenize it for use with the word-level tokenizer.

In [13]:
# 1. Create the vocabulary (top 2000 most frequent words) and the tokenizer
word_vocabulary = set_wordlvl_vocabulary(book, max_size=2000)
word_tokenizer  = WordTokenizer(word_vocabulary)

# 2. How big is our vocab?
vocab_size = len(word_vocabulary)
print(f"Word vocabulary size: {vocab_size}")

# 3. Recover tokens in index order so we can inspect frequency extremes
tokens_by_index = [
    token
    for token, idx in sorted(word_vocabulary.items(), key=lambda kv: kv[1])
]

# 4. Show the 5 most-common (after [PAD],[UNK]) and 5 least-common words
print("Top-5 tokens:", tokens_by_index[2:7])
print("Bottom-5 tokens:", tokens_by_index[-5:])

# 5. Test on a sample sentence
sample = "Natural language processing is a key part of deep learning!"
print("\nSample sentence:", sample)

# 5a. Standardize & tokenize to see expected token count
std_tokens    = word_tokenizer.tokenize(word_tokenizer.standardize(sample))
expected_len  = len(std_tokens)
print("Standardized tokens:", std_tokens)
print("Expected token count:", expected_len)

# 5b. Actually encode via the tokenizer
encoded = word_tokenizer(sample)
print("Actual encoded length:", len(encoded))
print("First 20 token IDs:", encoded[:20])

# 6. (Optional) Decode back to words to check inversion
decoded = word_tokenizer.decode(encoded)
print("Decoded back:", decoded)

Word vocabulary size: 2000
Top-5 tokens: ['the', 'and', 'a', 'to', 'of']
Bottom-5 tokens: ['know;', 'offences', 'comfortable', 'hasn', 'dislike']

Sample sentence: Natural language processing is a key part of deep learning!
Standardized tokens: ['natural', 'language', 'processing', 'is', 'a', 'key', 'part', 'of', 'deep', 'learning']
Expected token count: 10
Actual encoded length: 10
First 20 token IDs: [1, 1, 1, 37, 4, 1390, 374, 6, 605, 1]
Decoded back: [UNK] [UNK] [UNK] is a key part of deep [UNK]


#### Block 6B:  Check that encoding and decoding at the word level are (roughly) the inverse of one another.

In [14]:
## Add code in this cell that checks whether encoding and decoding at the
## word level are inverses.

# Sample sentences to test
samples = [
    "It was the best of times, it was the worst of times.",
    "Hello, world! This is a test of the word tokenizer.",
    "Deep learning transforms natural language processing."
]

for text in samples:
    std_text   = word_tokenizer.standardize(text)
    std_tokens = word_tokenizer.tokenize(std_text)
    encoded    = word_tokenizer(text)
    decoded    = word_tokenizer.decode(encoded)

    # Build the “expected” round-trip string by re-joining the standardized tokens
    expected_decoded = " ".join(std_tokens)

    print(f"Original       : {text}")
    print(f"Standardized   : {std_text!r}")
    print(f"Std tokens     : {std_tokens}")
    print(f"Decoded        : {decoded!r}")
    print(f"Matches std?   : {decoded == expected_decoded}")
    print("-" * 60)

Original       : It was the best of times, it was the worst of times.
Standardized   : 'it was the best of times it was the worst of times'
Std tokens     : ['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times']
Decoded        : 'it was the best of times it was the [UNK] of times'
Matches std?   : False
------------------------------------------------------------
Original       : Hello, world! This is a test of the word tokenizer.
Standardized   : 'hello world this is a test of the word tokenizer'
Std tokens     : ['hello', 'world', 'this', 'is', 'a', 'test', 'of', 'the', 'word', 'tokenizer']
Decoded        : '[UNK] world this is a [UNK] of the word [UNK]'
Matches std?   : False
------------------------------------------------------------
Original       : Deep learning transforms natural language processing.
Standardized   : 'deep learning transforms natural language processing'
Std tokens     : ['deep', 'learning', 'transforms', 'natural', 'language', 

**Block 6B summary**  
- When all words are in the top-2000 vocabulary, encode→decode reproduces the standardized text exactly.  
- Any word outside that set becomes `[UNK]`, causing the round-trip check to be False.  
- This behavior confirms the tokenizer’s intended handling of unknown words.

