In [1]:
from __future__ import annotations

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from helper import (
    start_time,
    time_since,
    ShakespeareDataset,
    TokenMapping,
    build_model,
    next_token,
    # Character-based helpers
    encode_text,
    # Subword-based helpers
    encode_text_from_tokenizer,
    tokenize_text_from_tokenizer,
)



In [2]:
# Deterministic training
torch.manual_seed(0)

# Attempt GPU; if not, stay on CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


# Load Data

In [3]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = '../data/shakespeare_small.txt'

with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

print(f'Number of characters in text file: {len(raw_text):,}')

Number of characters in text file: 50,085


# Character-Based Text Generation

The first model you'll build for text generation will use character-based
tokens.

Each token will be a single character from the text and the model will learn
to predict the next character (a token).

To generate text, the model will take in a new string,
character-by-character, and then generate a new likely character based on the
past input. Then the model will take into account that new character and
generate the following character and so on and so on until the model has
produced a set number of characters.

## Encode Text into Integer Tokens

### Normalization

In [4]:
raw_text[:20]

'First Citizen:\nBefor'

In [5]:
import re
def normalize_text(text: str) -> str:
    # TODO: Normalize incoming text; can be multiple actions
#     s1 = text.lower()
#     s2 = re.sub(r'[^a-z0-9]',' ', s1) #remove special charaters, punctuation
#     s3 = re.sub(r'\s+',' ', s2)  #turn multiple spaces into a single space
#     normalized_text = s3.strip() #remove whitespaces
    
    normalized_text =  text.lower()
    return normalized_text

In [6]:
# TEST: Is your text normalized the way you expected?
# Only the first 500 characters of the original text
normalized_text = normalize_text(raw_text[:500])
print(normalized_text)

first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you are all resolved rather to die than to famish?

all:
resolved. resolved.

first citizen:
first, you know caius marcius is chief enemy to the people.

all:
we know't, we know't.

first citizen:
let us kill him, and we'll have corn at our own price.
is't a verdict?

all:
no more talking on't; let it be done: away, away!

second citizen:
one word, good citizens.

first citizen:
we are accounted poor


### Pretokenization

In [7]:
def pretokenize_text(text: str) -> str | list[str]:
    # TODO: Pretokenize normalized text into character strings
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [8]:
# TEST: Is your (normalized) text pretokenized the way you expected?
# Only the first 500 characters of the original text
pretokenized_text = pretokenize_text(normalized_text)
print(pretokenized_text)

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'a', 'l', 'l', ':', '\n', 's', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', '\n', '\n', 'a', 'l', 'l', ':', '\n', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'f', 'i', 'r', 's', 't', ',', ' ', '

### Tokenize

In [9]:
# Combine normalization and pretokenization steps
def tokenize_text(text: str) -> str | list[str]:
    normalized_text: str = normalize_text(text)
    pretokenized_text: str | list[str] = pretokenize_text(normalized_text)
    # Characters are already tokens so pretokenized text is already tokenized
    tokenized_text = pretokenized_text
    return tokenized_text

In [10]:
# TEST: Is your tokenized text the way you expected?
tokenized_text = tokenize_text(raw_text[:500])
print(tokenized_text)

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'a', 'l', 'l', ':', '\n', 's', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', '\n', '\n', 'a', 'l', 'l', ':', '\n', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'f', 'i', 'r', 's', 't', ',', ' ', '

### Postprocessing

We'll skip postprocessing since we don't have any special tokens we want to
consider for our task here.

### Encode (Tokens → Integer IDs)

We have `encode_text()` from our helper module that can encode our text based on
our tokenization process from our created `tokenize_text()` function.

This will also provide us with `character_mapping`, an object that we can use to
map our tokens back and forth from strings to integer IDs.

In [11]:
encoded_text, character_mapping = encode_text(raw_text, tokenize_text)

## Prepare Dataset

In [12]:
n_tokens = character_mapping.n_tokens
dataset_size = len(encoded_text)
print(f'Size of dataset: {dataset_size:,} characters')

Size of dataset: 50,086 characters


In [13]:
# Defining sequence length that will be taken in at a time by our model
sequence_length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded_text, sequence_length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure deterministic training
    batch_size=batch_size,
)

## Define Model

We'll provide a defined model today, but this could be a step that you would
modify and experiment in other NLP projects you'll do.

In [14]:
# Defining the model to be trained and generate text with
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Generation

The `generate_text_by_char()` function will use your tokenizer and NLP model to
generate new text token-by-token (character-by-character in this case) by taking
in the input text and token sampling parameters.

We can use temperature and top-k sampling to adjust the "creativeness" of the
generated text.

We also pass in the `num_chars` parameter to tell the function how many tokens
(characters in this case) to generate.

In [15]:
def generate_text_by_char(
    input_str: str,
    model,
    token_mapping: TokenMapping = character_mapping,
    num_chars: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    # Uses your character-based tokenizer
    tokenized_text: list[str] = tokenize_text(input_str)
    # Generates token-by-token and creates a list of those tokens
    generated_tokens = []
    for _ in range(num_chars):
        # Uses the input text and generated text (so far) to get next token
        new_char = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            # Temperature & top-k sampling used in determining the next token
            temperature=temperature,
            topk=topk,
            device=device,
        )
        generated_tokens.append(new_char)
    # Returns input string plus the full generated string (of generated tokens)
    full_text = ''.join(tokenized_text + generated_tokens)
    return full_text

## Train Model

At this point, the model has not been trained so the code below will train the
NLP model that will be used to generate new text.

The model will take in the text data (broken by tokens by our character-based
tokenizer) and attempt to predict the next token. Over time, the model should
hopefully get better in predicting the next token (given the previous text).

To help us visualize how the model is training, at the end of every epoch, we
generate text using the `TEST_PHRASE` with the improving model.

In [16]:
TEST_PHRASE = 'To be or not to be'
# Use more epochs if not CPU device
epochs = 5 if device == 'cpu' else 25

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print(f'[{time_since(start)} ({epoch} {epoch / epochs * 100}) {loss:.4f}]')
    print('-'*72)
    gen_output = generate_text_by_char(
        input_str=TEST_PHRASE,
        model=model,
        num_chars=100,
    )
    print(gen_output)

Epoch 1/25, Loss: 2.530167247464482
[00m 4.7s (0 0.0) 2.1864]
------------------------------------------------------------------------
to be or not to bee and vevenptury havir
if:
comums mey:
oucidy
 irastr mavif,
met to gow ow, it!citisere-TOKEN_NOT_FOUNDctim thongi
Epoch 2/25, Loss: 2.181258435523548
[00m 9.4s (1 4.0) 1.9870]
------------------------------------------------------------------------
to be or not to bem fiese have an, ato shind thai daraster harfe uss peiind aftorfer;
hins, inius:
ine deart haet of o
Epoch 3/25, Loss: 2.0785378108771084
[00m 14.0s (2 8.0) 1.8850]
------------------------------------------------------------------------
to be or not to bevis sinsnores
in coak foar.
there cont rye sel cot cim andp, gry llott hal and aliendry no; pirse mo
Epoch 4/25, Loss: 2.019213537819469
[00m 18.6s (3 12.0) 1.8197]
------------------------------------------------------------------------
to be or not to be thies and the bericenyly he 'ut nopher iaret; do ltim uncius:iatrr

## Generate Text

Now that the model has been trained, go ahead and observe how it performs!

Try adjusting the different sampling methods using the `temperature` and `topk`
parameters on the same input string to see the differences.

You might also try different phrases as well as how many tokens (`num_chars`) to
generate and observe how it does.

In [17]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=100,
    temperature=1.0,
    topk=None,
)
print(output)

to be or not to being to the go put all scosius!
it you will end bery are here of their as your canstegred
for huld ba


In [18]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=100,
    temperature=15,
    topk=10,
)
print(output)

to be or not to be sus mlo spagars. too spenger.' husgorys:-walicu-y-'ffercicesiso,!ns',; wwubradinhug,nc:nia wie's se


In [20]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=200,
    temperature=0.00005,
    topk=1,
)
print(output)

to be or not to be the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould the prould t


# Subword Text Generation

The next model you'll build will use subword-tokenization instead of 
characters-based token to train a model and ultimately generate new text
token-by-token.

Although this could be done by creating your own tokenizer, you'll use
Hugging Face to use a pretrained tokenizer to tokenize the data.

After training the model with subword tokens, 
the model will take in a new string, token-by-token, and then generate a new
token (subword).
The model will continue producing new subword tokens based on the input text
and already produced tokens until a set number of tokens have been generated.

## Encode Text into Integer Tokens

### Choosing a Tokenizer

> NOTE:
> 
> You can load another model outside of these choices but the model
> will have to be downloaded and may or may not be effective.
>
> If you'd like to explore more, here's a link to you might want to start with
> of different available pretrained models on Hugging Face:
> https://huggingface.co/models?pipeline_tag=text-generation

In [21]:
# TODO: Choose a pretrained tokenizer to use:

# Docs: https://huggingface.co/xlm-roberta-base
# model_name = 'xlm-roberta-base'
# DOCS: https://huggingface.co/bert-base-cased
# model_name = 'bert-base-cased'
# DOCS: https://huggingface.co/bert-base-uncased 
model_name = 'bert-base-uncased'

my_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

### Encode (Tokens → Integer IDs)

We have `encode_text_from_tokenizer()` from our helper module that can encode
our text based on our tokenization process from our tokenizer `my_tokenizer`.

This will also provide us with `token_mapping`, an object that we can use to
map our tokens back and forth from strings to integer IDs.

In [22]:
encoded_text, token_mapping = encode_text_from_tokenizer(
    text=raw_text,
    tokenizer=my_tokenizer,
)

## Prepare Dataset

In [23]:
n_tokens = token_mapping.n_tokens
dataset_size = len(encoded_text)
print(f'Size of dataset: {dataset_size:,} tokens')

Size of dataset: 13,139 tokens


In [24]:
# Defining sequence length that will be taken in at a time by our model
sequence_length = 32 # Number of tokens
batch_size = 32

train_dataset = ShakespeareDataset(encoded_text, sequence_length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure deterministic training
    batch_size=batch_size,
)

## Define Model

We'll provide a defined model today, but this could be a step that you would
modify and experiment in other NLP projects you'll do.

In [25]:
# Defining the model to be trained and generate text with
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Generation

The `generate_text_by_subword()` function will use your chosen tokenizer and the
NLP model to generate new text token-by-token (subwords in this case) by taking
in the input text and token sampling parameters.

We can use temperature and top-k sampling to adjust the "creativeness" of the
generated text.

We also pass in the `num_tokens` parameter to tell the function how many
(subword)tokens to generate.

In [26]:
def generate_text_by_subword(
    input_str: str,
    model,
    token_mapping: TokenMapping = token_mapping,
    tokenizer = my_tokenizer,
    num_tokens: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    # Use your chosen subword-tokenizer
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    # Generates token-by-token and creates a list of those tokens
    generated_tokens = []
    for _ in range(num_tokens):
        # Uses the input text and generated text (so far) to get next token
        new_token = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            # Temperature & top-k sampling used in determining the next token
            temperature=temperature,
            topk=topk,
            device=device,
        )
        generated_tokens.append(new_token)
    # List of all token IDs (input text and generated text)
    output_ids = tokenizer.convert_tokens_to_ids(
        tokenized_text + generated_tokens
    )
    # Returns input string plus the full generated string from list of token IDs
    full_text = tokenizer.decode(output_ids)
    return full_text

## Train Model

At this point, the model has not been trained so the code below will train the
NLP model that will be used to generate new text.

The model will take in the text data (broken by tokens by our subword tokenizer)
and attempt to predict the next token. Over time, the model should hopefully
get better in predicting the next token (given the previous text).

To help us visualize how the model is training, at the end of every epoch, we
generate text using the `TEST_PHRASE` with the improving model.

In [27]:
TEST_PHRASE = 'To be or not to be'
# Use more epochs if not CPU device
epochs = 5 if device == 'cpu' else 25

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print(f'[{time_since(start)} ({epoch} {epoch / epochs * 100}) {loss:.4f}]')
    print('-'*72)
    output = generate_text_by_subword(
        input_str=TEST_PHRASE,
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
    )
    print(output)

Epoch 1/25, Loss: 6.535834827655699
[00m 1.5s (0 0.0) 5.5004]
------------------------------------------------------------------------
to be or not to be sons and silkbuius, fi : walk pri not lie me remember safe quick i against that, la have for so will ladder for ;eni swear
Epoch 2/25, Loss: 5.887454633014958
[00m 2.8s (1 4.0) 5.0491]
------------------------------------------------------------------------
to be or not to be and after word : look out we hearing expedition there 'us trumpets. infant co which useni to enemy lamb i sic : left thought hei,
Epoch 3/25, Loss: 5.605786710832177
[00m 4.1s (2 8.0) 4.6285]
------------------------------------------------------------------------
to be or not to be proof malrio base to i not if they : the sway : doing, hear attend tis cai. marc hear :s and rogue five auf are retire
Epoch 4/25, Loss: 5.346873565417964
[00m 5.4s (3 12.0) 4.3611]
------------------------------------------------------------------------
to be or not to be day wore's 

## Generate Text

Now that the model has been trained, go ahead and observe how it performs!

Try adjusting the different sampling methods using the `temperature` and `topk`
parameters on the same input string to see the differences.

You might also try different phrases as well as how many tokens (`num_tokens`)
to generate and observe how it does.

------------

Consider how this model differs from the results from the text generation using
the character-based tokenization.

In [28]:
output = generate_text_by_subword(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.5,
        topk=100,
    )
print(output)

to be or not to be content have enoughth you on, which that this wars and bale the gold fast them to corio a mindst say know he returned, no


In [29]:
output = generate_text_by_subword(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=200,
        temperature=1.5,
        topk=100,
    )
print(output)

to be or not to be sounds, if they may tell too. an :'whom i shall dark report shall you shout of the matter, - best to cai those labour then to sendus aufidius : if coriolanus : this assembly inh great of only want us for their tongues. what aufidius. come with smokingus rer with praises vo are for thence me! aufidius in strokes nor? mess corioet disdain the war, when hear upon s won than an interior off too a treaty of my brain me kindly way requi speedy thusbus ; but is should? they are so,'tis well stored of my with cares but conly transport they smart to mo neither me use off blood their hearts is hisatingnsope sort in compound, he to a sick of the market to thank have ever another with this preva for become no his choice! no should is able at disadvantage a concealtus'en some, who this? volumnia : why, well


In [30]:
output = generate_text_by_subword(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=200,
        temperature=0.00005,
        topk=10,
    )
print(output)

to be or not to be silent, and the gods doom him in arms. menenius : i'll not prepared for the gods of your conversation and the matter of the people, and the gods doom him in arms. menenius : i'll not prepared for the gods of your conversation and the matter of the people, and the gods doom him in arms. menenius : i'll not prepared for the gods of your conversation and the matter of the people, and the gods doom him in arms. menenius : i'll not prepared for the gods of your conversation and the matter of the people, and the gods doom him in arms. menenius : i'll not prepared for the gods of your conversation and the matter of the people, and the gods doom him in arms. menenius : i'll not prepared for the gods of your conversation and the matter of the people, and the gods doom him in arms. menenius : i'll not prepared for
