In [1]:
from __future__ import annotations

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from helper import *

In [2]:
# Deterministic training
torch.use_deterministic_algorithms(True)
torch.manual_seed(0)

# Attempt GPU; if not, stay on CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [3]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = '../data/shakespeare_small.txt'

In [4]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

In [5]:
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Text Generation

The first model we'll try for text generation will be by training the model and
generating by character.

This will mean each token will be a single character from the text and the model
will learn to predict the next character (a token).

To generate text the text, the model will take in a new string,
character-by-character, and then generate a new likely character based on the
past input. Then the model will take into account that new character and
genearate the following character and so on and so on until the model has
produced a set number of characters.

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [6]:
def normalize_text(text: str) -> str:
    # Can be multiple actions to normalize text
    normalized_text = text.lower()
    return normalized_text

In [7]:
# TEST: Is your text normalized the way you expected?
# normalized_text = normalize_text(raw_text)
# print(normalized_text)

### Pretokenization

In [8]:
def pretokenize_text(text: str) -> str | list[str]:
    # Effectively the same as a string
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [9]:
# TEST: Is your (normalized) text pretokenized the way you expected?
# pretokenized_text = pretokenize_text(normalized_text)
# print(pretokenized_text)

### Tokenize

In [10]:
# Combine normalization and pretokenization steps
def tokenize_text(text: str) -> str | list[str]:
    normalized_text: str = normalize_text(text)
    pretokenized_text: str | list[str] = pretokenize_text(normalized_text)
    # Characters are already tokens so pretokenized text is already tokenized
    tokenized_text = pretokenized_text
    return tokenized_text

In [11]:
tokenized_text = tokenize_text(raw_text)

### Postprocessing

> Skip

### Encode (Tokens → Integer IDs)

In [12]:
encoded, character_mapping = encode_text(raw_text, tokenize_text)

## Prepare Dataset

In [13]:
n_tokens = character_mapping.n_tokens
dataset_size = len(encoded)

In [14]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure determinsitic training
    batch_size=batch_size,
)

## Define Model

In [15]:
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [16]:
def generate_text_by_char(
    input_str: str,
    model,
    token_mapping: TokenMapping = character_mapping,
    num_chars: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    tokenized_text: list[str] = tokenize_text(input_str)
    generated_tokens = []
    for _ in range(num_chars):
        new_char = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            temperature=temperature,
            topk=topk,
        )
        generated_tokens.append(new_char)
    # Input string and generated string
    full_text = ''.join(tokenized_text + generated_tokens)
    return full_text

## Train Model

In [17]:
PHRASE = 'To be or not to be'
epochs = 5

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text_by_char(
        input_str=PHRASE,
        model=model,
        num_chars=100,
    )
    print(gen_output)

Epoch 1/5, Loss: 2.5301672265171624
[00m 6.1s (0 0%) 2.1864]
------------------------------------------------------------------------
to be or not to beelld bem
ton thacd an,

ickedr faratwuy ofne sutwot theceres'd to tholly id hore pefoma:
arrim? sobo
Epoch 2/5, Loss: 2.1812584218125757
[00m 12.5s (1 20%) 1.9870]
------------------------------------------------------------------------
to be or not to bent fre'n weirtinin:
and the whathy and the wowper, not to fortry ond rotse hon: the harey, the hice 
Epoch 3/5, Loss: 2.078537799755986
[00m 18.5s (2 40%) 1.8850]
------------------------------------------------------------------------
to be or not to be tas onen to prainn ther doww
sirs move cocan upfle. her and not of
coim noflenos thy thate hamsty, 
Epoch 4/5, Loss: 2.019213533934694
[00m 24.8s (3 60%) 1.8197]
------------------------------------------------------------------------
to be or not to beed ine him will pando soups thelanus:
the bat hons ylomy on and keas ame mant;
howtil

## Generate Text

In [18]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=100,
    temperature=1.0,
)
print(output)

to be or not to be ot highers
wither witr shumt.

comat hame
thears, theicans, will and adfeve toutus

and fors sprove


# Token-Based Text Generation

The next model we'll try is use subword as tokens instead of characters to train
a model and ultimately generate text token by token.

Although this could be done by creating/training our own tokenizer, we'll use
Hugging Face to use a pretrained tokenizer to tokenizer our data.

After training the model with the subword tokens, we can generate text by
again providing a text input but this time using the tokenizer to create subword
tokens. The model will then take this sequence of subwords to generate a new
token (subword), add this token as part of the sequence, produce a new token,
and so on until a set number of tokens have been generated. We can then take
this list of subword tokens and decode back to a string of text!

## Encode Text into Integer Tokens

### Tokenize (Choose)

In [19]:
# Choose a pretrained tokenizer to use
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

my_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name_uncased,
)

### Encode (Tokens → Integer IDs)

In [20]:
encoded, token_mapping = encode_text_from_tokenizer(
    text=raw_text,
    tokenizer=my_tokenizer,
)

## Prepare Dataset

In [21]:
n_tokens = token_mapping.n_tokens
dataset_size = len(encoded)

In [22]:
length = 16  # Tokens 
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure determinsitic training
    batch_size=batch_size,
)

## Define Model

In [23]:
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [24]:
def generate_text_by_token(
    input_str: str,
    model,
    token_mapping: TokenMapping = token_mapping,
    tokenizer = my_tokenizer,
    num_tokens: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    generated_tokens = []
    for _ in range(num_tokens):
        new_token = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            temperature=temperature,
            topk=topk,
        )
        generated_tokens.append(new_token)
    # Input string and generated string
    output_ids = tokenizer.convert_tokens_to_ids(tokenized_text + generated_tokens)
    full_text = tokenizer.decode(output_ids)
    return full_text

In [25]:
def generate_text(
    tokenizer,
    model,
    input_str: str,
    num_tokens: int = 100,
    temperature: float = 1.0,
) -> str:
    # Set model into "evaluation mode" (deactivates things like Dropout layers)
    model.eval()
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    input_tensor = tokens_to_id_tensor(
        tokens=tokenized_text,
        token_id_mapping=token_mapping.token2id,
    )

    generated_text = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_text.append(token_mapping.id2token(next_token_idx))
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_token_idx]], dtype=torch.long),
                ],
                1,
            )
    # Convert to text again
    output_ids = tokenizer.convert_tokens_to_ids(generated_text)
    output_str = input_str + ' ' + tokenizer.decode(output_ids)
    return output_str

## Train Model

In [26]:
epochs = 5

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    output = generate_text_by_token(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
    )
    print(output)

Epoch 1/5, Loss: 6.5734895998544065
[00m 2.7s (0 0%) 6.1417]
------------------------------------------------------------------------
to be or not to be himfect small made till have lose to wars f forforw :, ; risen, breath co our for what : now honoursher sights got mother
Epoch 2/5, Loss: 5.882414554447443
[00m 5.6s (1 20%) 5.7952]
------------------------------------------------------------------------
to be or not to be in of on trophy ; : gods :, mine press lay how -s to,ty theutus and him looks ; to seen word des this er
Epoch 3/5, Loss: 5.608338239999293
[00m 8.3s (2 40%) 5.5824]
------------------------------------------------------------------------
to be or not to be disadvantage issue often, as him swear br has that they :. bonnet abundance sound arms v blast in though weguide as andure the state s
Epoch 4/5, Loss: 5.384834153228723
[00m 11.1s (3 60%) 5.3928]
------------------------------------------------------------------------
to be or not to be the capitol baby for cry 

## Generate Text

In [27]:
output = generate_text_by_token(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
        topk=10,
    )
print(output)

to be or not to be.'s yournia in he, you have, and a people, that and the capitol of the people is the people'd : he the


# Comparison Between Generation