In [1]:
from __future__ import annotations

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from helper import *

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [3]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = '../data/shakespeare_small.txt'

In [4]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

In [5]:
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Text Generation

The first model we'll try for text generation will be by training the model and
generating by character.

This will mean each token will be a single character from the text and the model
will learn to predict the next character (a token).

To generate text the text, the model will take in a new string,
character-by-character, and then generate a new likely character based on the
past input. Then the model will take into account that new character and
genearate the following character and so on and so on until the model has
produced a set number of characters.

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [6]:
def normalize_text(text: str) -> str:
    # Can be multiple actions to normalize text
    normalized_text = text.lower()
    return normalized_text

In [7]:
# TEST: Is your text normalized the way you expected?
# normalized_text = normalize_text(raw_text)
# print(normalized_text)

### Pretokenization

In [8]:
def pretokenize_text(text: str) -> str | list[str]:
    # Effectively the same as a string
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [9]:
# TEST: Is your (normalized) text pretokenized the way you expected?
# pretokenized_text = pretokenize_text(normalized_text)
# print(pretokenized_text)

### Tokenize

In [10]:
# Combine normalization and pretokenization steps
def tokenize_text(text: str) -> str | list[str]:
    normalized_text: str = normalize_text(text)
    pretokenized_text: str | list[str] = pretokenize_text(normalized_text)
    # Characters are already tokens so pretokenized text is already tokenized
    tokenized_text = pretokenized_text
    return tokenized_text

In [11]:
tokenized_text = tokenize_text(raw_text)

### Postprocessing

> Skip

### Encode (Tokens → Integer IDs)

In [12]:
encoded, character_mapping = encode_text(raw_text, tokenize_text)

## Prepare Dataset

In [13]:
n_tokens = character_mapping.n_tokens
dataset_size = len(encoded)

In [14]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [15]:
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [16]:
def generate_text_by_char(
    input_str: str,
    model,
    token_mapping: TokenMapping = character_mapping,
    num_chars: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    tokenized_text: list[str] = tokenize_text(input_str)
    generated_tokens = []
    for _ in range(num_chars):
        new_char = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            temperature=temperature,
            topk=topk,
        )
        generated_tokens.append(new_char)
    # Input string and generated string
    full_text = ''.join(tokenized_text + generated_tokens)
    return full_text

## Train Model

In [17]:
PHRASE = 'To be or not to be'
epochs = 5

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text_by_char(
        input_str=PHRASE,
        model=model,
        num_chars=100,
    )
    print(gen_output)

Epoch 1/5, Loss: 2.2764747702656463
[00m 6.3s (0 0%) 1.9193]
------------------------------------------------------------------------
to be or not to be thee inthe to all haim cato recfoneb, eve ie to thee miands, and hit mren,
yey brated nites lermes;
Epoch 2/5, Loss: 1.9104908132705445
[00m 12.8s (1 20%) 1.9763]
------------------------------------------------------------------------
to be or not to belavg winc.
iplefess weser.

ayled, ofe sort, am plyder and gardis't, i stecmong; arope sert, i are
a
Epoch 3/5, Loss: 1.8275297895407143
[00m 19.1s (2 40%) 1.6987]
------------------------------------------------------------------------
to be or not to bein the prup to mar us the sever, go sto have heir why his are'd,
and i's: be agiolids than spere sai
Epoch 4/5, Loss: 1.7852085169892722
[00m 25.6s (3 60%) 1.9129]
------------------------------------------------------------------------
to be or not to beard be were
i shalld?

blius canto love: vence anur!
as for any inges o, he fle, wh

## Generate Text

In [18]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=100,
    temperature=1.0,
)
print(output)

to be or not to beether he eldielw.

cominius:
vird,
unou welf to gol the selved,
and not that denia in all fuch the b


# Token-Based Generation

## Encode Text into Integer Tokens

### Tokenize (Choose)

In [19]:
# Choose a pretrained tokenizer to use
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

my_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name_uncased,
)

### Encode (Tokens → Integer IDs)

In [20]:
encoded, token_mapping = encode_text_from_tokenizer(
    text=raw_text,
    tokenizer=my_tokenizer,
)

## Prepare Dataset

In [21]:
n_tokens = token_mapping.n_tokens
dataset_size = len(encoded)

In [22]:
length = 16  # Tokens 
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [23]:
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [24]:
def generate_text_by_token(
    input_str: str,
    model,
    token_mapping: TokenMapping = token_mapping,
    tokenizer = my_tokenizer,
    num_tokens: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    generated_tokens = []
    for _ in range(num_tokens):
        new_token = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            temperature=temperature,
            topk=topk,
        )
        generated_tokens.append(new_token)
    # Input string and generated string
    output_ids = tokenizer.convert_tokens_to_ids(tokenized_text + generated_tokens)
    full_text = tokenizer.decode(output_ids)
    return full_text

In [25]:
def generate_text(
    tokenizer,
    model,
    input_str: str,
    num_tokens: int = 100,
    temperature: float = 1.0,
) -> str:
    # Set model into "evaluation mode" (deactivates things like Dropout layers)
    model.eval()
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    input_tensor = tokens_to_id_tensor(
        tokens=tokenized_text,
        token_id_mapping=token_mapping.token2id,
    )

    generated_text = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_text.append(token_mapping.id2token(next_token_idx))
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_token_idx]], dtype=torch.long),
                ],
                1,
            )
    # Convert to text again
    output_ids = tokenizer.convert_tokens_to_ids(generated_text)
    output_str = input_str + ' ' + tokenizer.decode(output_ids)
    return output_str

## Train Model

In [26]:
epochs = 5

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    output = generate_text_by_token(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
    )
    print(output)

Epoch 1/5, Loss: 6.178692927905823
[00m 2.6s (0 0%) 6.0003]
------------------------------------------------------------------------
to be or not to be mat :? done they him he com fl butcher, with : your your, the,nia : but it cannotrti he an will s - forth
Epoch 2/5, Loss: 5.408855026365777
[00m 5.3s (1 20%) 5.3355]
------------------------------------------------------------------------
to be or not to be am all upon leaves sword only our op may aside world is with you. all he lip ofs ve as, prayer. that us, is,
Epoch 3/5, Loss: 4.877313198544393
[00m 8.0s (2 40%) 4.4577]
------------------------------------------------------------------------
to be or not to be intend his yarn shericertius : i off, vol your most hostages way the behold rather em blood, vol interior : would what ta pity co
Epoch 4/5, Loss: 4.477121763855871
[00m 10.7s (3 60%) 4.6280]
------------------------------------------------------------------------
to be or not to be let to cominius, i shall back home we shall

## Generate Text

In [27]:
output = generate_text_by_token(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
        topk=10,
    )
print(output)

to be or not to be the volsces are ; and your choice, the vols, and i am, and'd, and's ay? marcius : if


# Comparison Between Generation