In [1]:
from __future__ import annotations
from collections.abc import Sequence

import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import DataLoader

In [2]:
from helper import (
    ShakespeareDataset,
    start_time,
    time_since,
    build_model,
    tokens_to_index_tensor,
    tokenize_text,
)

In [3]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


# Intro

# Load Data

In [4]:
# Relative path to the training corpus: a reduced Shakespeare text file,
# kept small to make it manageable for smaller systems.
DATA_FILE: str = '../data/shakespeare_small.txt'

In [5]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned so the result does not depend on the platform's
# default locale encoding (assumes the corpus file is UTF-8/ASCII — TODO confirm).
with open(DATA_FILE, 'r', encoding='utf-8') as data_file:
    raw_text = data_file.read()

In [6]:
# Report the corpus size, formatted with thousands separators.
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Generation

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [7]:
# Normalization step: lowercase the raw corpus, so the character vocabulary
# built from it contains no uppercase letters.
shakespeare_text = raw_text.lower()

### Pretokenization

In [8]:
# Character vocabulary: each distinct character receives an integer id,
# numbered from 0 in order of first appearance in the text.
char2idx = {
    symbol: position
    for position, symbol in enumerate(dict.fromkeys(shakespeare_text))
}

# Inverse lookup (id -> character). Iterating char2idx yields its keys in
# insertion order, which is exactly the id order assigned above.
idx2char = dict(enumerate(char2idx))

### Tokenize

In [9]:
# Replace every character of the corpus with its integer id.
encoded = [char2idx[symbol] for symbol in shakespeare_text]

### Postprocessing

> Skip

## Prepare Dataset

In [10]:
# Vocabulary size and total number of encoded characters in the corpus.
n_tokens, dataset_size = len(char2idx), len(encoded)

In [11]:
batch_size = 32
length = 32  # Number of characters (passed to ShakespeareDataset as its length argument)

# Wrap the encoded corpus in the helper dataset and serve it in shuffled batches.
train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

## Define Model

In [12]:
# Loss over the vocabulary-sized predictions.
criterion = nn.CrossEntropyLoss()

# Build the network for the character vocabulary and place it on the chosen device.
model = build_model(n_tokens)
model.to(device)

# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())

## Define Text Generation

In [13]:
def generate_text(
    model,
    input_str: str,
    num_chars: int = 100,
    temperature: float = 1.0,
) -> str:
    """Sample a character-level continuation of ``input_str`` from ``model``.

    The prompt is lowercased, converted to indices through the notebook-level
    ``char2idx`` table, and then extended one sampled character at a time.
    Every step feeds the full sequence generated so far back into the model.

    Args:
        model: Network called as ``model(indices)``; the scores found at
            ``output[0, -1]`` are used to sample the next character.
        input_str: Prompt text. Presumably every (lowercased) character must
            already be a key of ``char2idx`` — verify against
            ``tokens_to_index_tensor``.
        num_chars: Number of characters to sample after the prompt.
        temperature: Divisor applied to the scores before the softmax.
            Must be strictly positive.

    Returns:
        The lowercased prompt followed by the sampled characters.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        # Dividing by zero yields inf/NaN scores and a negative value inverts
        # the distribution; fail early with a clear message instead.
        raise ValueError(f'temperature must be > 0, got {temperature}')

    prompt = input_str.lower()

    # NOTE: the model is left in eval mode when this function returns.
    model.eval()

    # Move the prompt to the model's device once, instead of re-copying the
    # ever-growing sequence from the CPU on every sampling step.
    input_tensor = tokens_to_index_tensor(
        tokens=prompt,
        token_index_mapping=char2idx,
    ).to(device)

    sampled_chars = []
    with torch.no_grad():
        for _ in range(num_chars):
            output = model(input_tensor)
            # Only the scores at the last position decide the next character.
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_char_idx = torch.multinomial(probabilities, 1).item()
            sampled_chars.append(idx2char[next_char_idx])

            # Append the sampled index along dim 1, on the same device.
            next_step = torch.tensor(
                [[next_char_idx]],
                dtype=torch.long,
                device=input_tensor.device,
            )
            input_tensor = torch.cat([input_tensor, next_step], 1)

    return prompt + ''.join(sampled_chars)

## Train Model

In [14]:
PHRASE = 'To be or not to be'  # Prompt used to sample text after every epoch
epochs = 5

start = start_time()
for epoch in range(epochs):
    model.train()  # generate_text() below switches to eval mode, so re-enable training each epoch
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        # The model output is transposed (dims 1 and 2 swapped) before being
        # handed to the loss together with the targets.
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report progress in terms of *completed* epochs so the counter runs
    # 1..epochs and the percentage reaches 100% after the final epoch.
    completed = epoch + 1
    print(f'Epoch {completed}/{epochs}, Loss: {total_loss / len(train_loader)}')
    # The trailing number is the loss of the last batch of the epoch.
    print('[%s (%d %d%%) %.4f]' % (time_since(start), completed, completed / epochs * 100, loss.item()))
    print('-'*72)
    gen_output = generate_text(model, PHRASE, 100)
    print(gen_output)

Epoch 1/5, Loss: 2.259148428843806
[00m 6.2s (0 0%) 1.7822]
------------------------------------------------------------------------
to be or not to be hat, the lid this hace thay lot allde gofen to dethor hae our spingseled o' thaa: mose pome,
to vab
Epoch 2/5, Loss: 1.9099794088461148
[00m 12.5s (1 20%) 1.7914]
------------------------------------------------------------------------
to be or not to be it acounstis of vepory the the brutus:
you tho bliens on i gomst comefs ith and goon it the first y
Epoch 3/5, Loss: 1.8243300361755177
[00m 18.8s (2 40%) 1.6895]
------------------------------------------------------------------------
to be or not to betor's dell.


firs sodes mare fill you of nfen cantond.

firsterioue i' tha onanius.

auphfished on 
Epoch 4/5, Loss: 1.7824568124624869
[00m 25.1s (3 60%) 1.6199]
------------------------------------------------------------------------
to be or not to beat varcidicition are belear found waus:
one make
yours: us, your neart dhaingring ro

## Generate Text

In [15]:
# Sample once more from the trained character model, reusing the prompt
# constant defined with the training loop instead of repeating the literal.
output = generate_text(model, PHRASE, 100)
print(output)

to be or not to bellable the warpes
thoug. had say, haak, deedst more's to uspts not the him res thrigur a
woldonor ou


# Token-Based Generation

In [16]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

## Encode Text into Integer Tokens

### Tokenize (Choose)
- Normalization
- Pretokenization
- Tokenize
- Postprocessing

In [17]:
# Candidate pretrained tokenizers, identified by model name
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

# Only the uncased BERT tokenizer is loaded in this cell.
bert_uncased_tokenizer = AutoTokenizer.from_pretrained(bert_model_name_uncased)

In [18]:
# Tokenize the raw corpus with the pretrained tokenizer.
# NOTE: this rebinds shakespeare_text, which held the lowercased string in
# the character-based section.
shakespeare_text = tokenize_text(tokenizer=bert_uncased_tokenizer, text=raw_text)

In [19]:
# Token vocabulary: each distinct token receives an integer id, numbered
# from 0 in order of first appearance.
token2idx = {
    piece: position
    for position, piece in enumerate(dict.fromkeys(shakespeare_text))
}
# Inverse lookup (id -> token); key order of token2idx matches the id order.
idx2token = dict(enumerate(token2idx))

# Corpus as a list of token ids, plus vocabulary and corpus sizes.
encoded = [token2idx[piece] for piece in shakespeare_text]
n_tokens, dataset_size = len(token2idx), len(encoded)

## Prepare Dataset

In [20]:
# Vocabulary size and number of encoded tokens for the token-level corpus.
dataset_size = len(encoded)
n_tokens = len(token2idx)

In [21]:
batch_size = 32
length = 16  # Tokens (passed to ShakespeareDataset as its length argument)

# Wrap the token ids in the helper dataset and serve them in shuffled batches.
train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

## Define Model

In [22]:
# Loss over the vocabulary-sized predictions.
criterion = nn.CrossEntropyLoss()

# Fresh network sized for the token vocabulary, placed on the chosen device.
# This rebinds `model`, replacing the character-level model.
model = build_model(n_tokens)
model.to(device)

# New Adam optimizer (default hyperparameters) bound to the new model's parameters.
optimizer = optim.Adam(model.parameters())

## Define Text Generation

In [23]:
def generate_text(
    tokenizer,
    model,
    input_str: str,
    num_tokens: int = 100,
    temperature: float = 1.0,
) -> str:
    """Sample a token-level continuation of ``input_str`` from ``model``.

    NOTE: this definition shadows the character-level ``generate_text``
    defined earlier in the notebook and takes ``tokenizer`` as an extra
    first argument.

    The prompt is tokenized with ``tokenizer``, converted to indices through
    the notebook-level ``token2idx`` table, and then extended one sampled
    token at a time. Every step feeds the full sequence so far back into the
    model. The sampled tokens are finally converted to tokenizer ids and
    decoded back into text.

    Args:
        tokenizer: Tokenizer used to split the prompt and to decode the
            sampled tokens (needs ``convert_tokens_to_ids`` and ``decode``).
        model: Network called as ``model(indices)``; the scores found at
            ``output[0, -1]`` are used to sample the next token.
        input_str: Prompt text. Presumably every prompt token must already be
            a key of ``token2idx`` — verify against ``tokens_to_index_tensor``.
        num_tokens: Number of tokens to sample after the prompt.
        temperature: Divisor applied to the scores before the softmax.
            Must be strictly positive.

    Returns:
        ``input_str`` (unchanged), a single space, and the decoded sampled
        tokens.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        # Dividing by zero yields inf/NaN scores and a negative value inverts
        # the distribution; fail early with a clear message instead.
        raise ValueError(f'temperature must be > 0, got {temperature}')

    # NOTE: the model is left in eval mode when this function returns.
    model.eval()

    tokenized_text = tokenize_text(tokenizer=tokenizer, text=input_str)
    # Move the prompt to the model's device once, instead of re-copying the
    # ever-growing sequence from the CPU on every sampling step.
    input_tensor = tokens_to_index_tensor(
        tokens=tokenized_text,
        token_index_mapping=token2idx,
    ).to(device)

    generated_tokens = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor)
            # Only the scores at the last position decide the next token.
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_tokens.append(idx2token[next_token_idx])

            # Append the sampled index along dim 1, on the same device.
            next_step = torch.tensor(
                [[next_token_idx]],
                dtype=torch.long,
                device=input_tensor.device,
            )
            input_tensor = torch.cat([input_tensor, next_step], 1)

    # Map the sampled tokens back to tokenizer ids so they can be decoded to text.
    output_ids = tokenizer.convert_tokens_to_ids(generated_tokens)
    return input_str + ' ' + tokenizer.decode(output_ids)

## Train Model

In [24]:
epochs = 5

start = start_time()
for epoch in range(epochs):
    model.train()  # generate_text() below switches to eval mode, so re-enable training each epoch
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        # The model output is transposed (dims 1 and 2 swapped) before being
        # handed to the loss together with the targets.
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report progress in terms of *completed* epochs so the counter runs
    # 1..epochs and the percentage reaches 100% after the final epoch.
    completed = epoch + 1
    print(f'Epoch {completed}/{epochs}, Loss: {total_loss / len(train_loader)}')
    # The trailing number is the loss of the last batch of the epoch.
    print('[%s (%d %d%%) %.4f]' % (time_since(start), completed, completed / epochs * 100, loss.item()))
    print('-'*72)
    # Same prompt constant as the character-level training loop; the sample is
    # kept in its own name so the model-output variable is not rebound to a string.
    gen_output = generate_text(bert_uncased_tokenizer, model, PHRASE, 30)
    print(gen_output)


Epoch 1/5, Loss: 6.209641674290532
[00m 2.7s (0 0%) 5.7561]
------------------------------------------------------------------------
To be or not to be , pin first with it one - thato though mean itmen city : this't, dissent : than citizen [SEP] where pre but task and acc
Epoch 2/5, Loss: 5.377411510633386
[00m 5.6s (1 20%) 5.3217]
------------------------------------------------------------------------
To be or not to be man : garland'gains : a enemy. la menuck, what praise the and trumpetsriolan door we at little lovedjo in who thirst do
Epoch 3/5, Loss: 4.812482122061909
[00m 8.4s (2 40%) 5.0367]
------------------------------------------------------------------------
To be or not to be t attend know he is! come,'d marcius : there, hoenius : large, and doubt for my plaster he [SEP] l friends
Epoch 4/5, Loss: 4.416869077705531
[00m 11.2s (3 60%) 4.4550]
------------------------------------------------------------------------
To be or not to be very as - - coriolanus : therefore : whe

## Generate Text

In [25]:
# Sample once more from the trained token-level model, reusing the prompt
# constant from the character-based section instead of repeating the literal.
output = generate_text(bert_uncased_tokenizer, model, PHRASE, 30)
print(output)

To be or not to be brave fathers abundant ; that we shall you are since flower have on, we! bold stink at the world that pair hate, that'lamb double the


# Comparison Between Generation Approaches