In [1]:
from __future__ import annotations

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from helper import *

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [3]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = '../data/shakespeare_small.txt'

In [4]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

In [5]:
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Generation

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [6]:
def normalize_text(text: str) -> str:
    # Can be multiple actions to normalize text
    normalized_text = text.lower()
    return normalized_text

In [7]:
# TEST: Is your text normalized the way you expected?
# normalized_text = normalize_text(raw_text)
# print(normalized_text)

### Pretokenization

In [8]:
def pretokenize_text(text: str) -> str | list[str]:
    # Effectively the same as a string
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [9]:
# TEST: Is your (normalized) text pretokenized the way you expected?
# pretokenized_text = pretokenize_text(normalized_text)
# print(pretokenized_text)

### Tokenize

In [10]:
# Combine normalization and pretokenization steps
def tokenize_text(text: str) -> str | list[str]:
    normalized_text: str = normalize_text(text)
    pretokenized_text: str | list[str] = pretokenize_text(normalized_text)
    # Characters are already tokens so pretokenized text is already tokenized
    tokenized_text = pretokenized_text
    return tokenized_text

In [11]:
tokenized_text = tokenize_text(raw_text)

### Postprocessing

> Skip

### Encode (Tokens → Integer IDs)

In [12]:
encoded, character_mapping = encode_text(raw_text, tokenize_text)

## Prepare Dataset

In [13]:
n_tokens = character_mapping.n_tokens
dataset_size = len(encoded)

In [14]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [15]:
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [16]:
def next_character(
    input_str: str,
    model,
    token_mapping: TokenMapping,
    temperature: float = 1.0,
    k: int | None = None,
) -> str:
    # Set model into "evaluation mode" (deactivates things like Dropout layers)
    model.eval()
    tokenized_text: list[str] = tokenize_text(input_str)
    input_tensor = tokens_to_id_tensor(
        tokens=tokenized_text,
        token_id_mapping=token_mapping.token2id,
    )

    with torch.no_grad():
        output = model(input_tensor.to(device))
        # Use temperature to change probabilities
        probabilities = nn.functional.softmax(
            output[0, -1] / temperature,
            dim=0,
        )
        # Sampling from probabilities
        sorted_ids = torch.argsort(probabilities, descending=True)
        # Top-k: Defaults to using all given characters
        if k is None:
            sorted_ids_subset = sorted_ids
        else:
            sorted_ids_subset = sorted_ids[:k]
        index_of_sorted = torch.multinomial(
            probabilities[sorted_ids_subset],
            1,
        ).item()
        next_char_idx = sorted_ids_subset[index_of_sorted].item()
        
        return token_mapping.id2token(next_char_idx)

In [17]:
def generate_text_by_char(
    input_str: str,
    model,
    token_mapping: TokenMapping = character_mapping,
    num_chars: int = 100,
    temperature: float = 1.0,
    k: int | None = None,
) -> str:
    generated_tokens = []
    for _ in range(num_chars):
        next_char = next_character(
            input_str=(input_str + ''.join(generated_tokens)),
            model=model,
            token_mapping=token_mapping,
            temperature=temperature,
            k=k,
        )
        generated_tokens.append(next_char)
    # Input string and generated string
    full_text = input_str+ ''.join(generated_tokens)
    return full_text

## Train Model

In [18]:
PHRASE = 'To be or not to be'
epochs = 5

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text_by_char(
        input_str=PHRASE,
        model=model,
        num_chars=100,
    )
    print(gen_output)

Epoch 1/5, Loss: 2.2609775224051916
[00m 6.3s (0 0%) 1.8597]
------------------------------------------------------------------------
To be or not to be magk,
feard supond tebutther cucpether,
to nowd tood.

voe; tham shart, nust of bendaring that wo. 
Epoch 2/5, Loss: 1.9069556816698263
[00m 12.7s (1 20%) 1.9038]
------------------------------------------------------------------------
To be or not to beir gunt as you the do is the trun; all, is do make, endon.

menenius:
wath qummy ao'r, te't me
morci
Epoch 3/5, Loss: 1.8219082584777198
[00m 19.0s (2 40%) 1.7544]
------------------------------------------------------------------------
To be or not to be all med guare with trustoleceria:
with to the cumm.

firt begoous: moded home, i kneed corior:
spiv
Epoch 4/5, Loss: 1.779409009427689
[00m 25.4s (3 60%) 1.8045]
------------------------------------------------------------------------
To be or not to be comage he giet do dimmackees thy geart in ladiones, sun
be caius; ins ad renib the 

## Generate Text

In [19]:
output = generate_text_by_char(
    input_str='To be or not to be',
    model=model,
    num_chars=100,
    temperature=1.0,
)
print(output)

To be or not to be the generia:
now thun i but if loor,
enepont pould whose it my aless
piton this to as you whis in r


# Token-Based Generation

## Encode Text into Integer Tokens

### Tokenize (Choose)

In [20]:
# Choose a pretrained tokenizer to use
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

my_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name_uncased,
)

### Encode (Tokens → Integer IDs)

In [21]:
encoded, token_mapping = encode_text_from_tokenizer(
    text=raw_text,
    tokenizer=my_tokenizer,
)

## Prepare Dataset

In [22]:
n_tokens = token_mapping.n_tokens
dataset_size = len(encoded)

In [23]:
length = 16  # Tokens 
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [24]:
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [25]:
def generate_text(
    tokenizer,
    model,
    input_str: str,
    num_tokens: int = 100,
    temperature: float = 1.0,
) -> str:
    # Set model into "evaluation mode" (deactivates things like Dropout layers)
    model.eval()
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    input_tensor = tokens_to_id_tensor(
        tokens=tokenized_text,
        token_id_mapping=token_mapping.token2id,
    )

    generated_text = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_text.append(token_mapping.id2token(next_token_idx))
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_token_idx]], dtype=torch.long),
                ],
                1,
            )
    # Convert to text again
    output_ids = tokenizer.convert_tokens_to_ids(generated_text)
    output_str = input_str + ' ' + tokenizer.decode(output_ids)
    return output_str

## Train Model

In [26]:
epochs = 5

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    output = generate_text(my_tokenizer, model, 'To be or not to be', 30)
    print(output)


Epoch 1/5, Loss: 6.222023793380626
[00m 2.6s (0 0%) 5.8012]
------------------------------------------------------------------------
To be or not to be : of s bring holding. the prisoner with of shields tongue. shallath, they citizen coffin find masters won.lam but : auf issue are,
Epoch 2/5, Loss: 5.59244353927835
[00m 5.3s (1 20%) 4.5287]
------------------------------------------------------------------------
To be or not to be ' s the horses the battle of the i could : and fairness strokes my to see all themselves mercyus you. their than, made me er slaves
Epoch 3/5, Loss: 5.035860540803042
[00m 7.9s (2 40%) 4.0187]
------------------------------------------------------------------------
To be or not to be day ; nay, devotion are - people, sir? is coriolan not o shall and in pride : la that token then he lady, do
Epoch 4/5, Loss: 4.611682188771937
[00m 10.6s (3 60%) 5.0935]
------------------------------------------------------------------------
To be or not to be believe know, so 

## Generate Text

In [27]:
output = generate_text(my_tokenizer, model, 'To be or not to be', 30)
print(output)

To be or not to be fine. men's jun stands be of the small physicalth, hard ; men against blood to, : in is ll arguing is induced put we


# Comparison Between Generation