In [1]:
from __future__ import annotations
from collections.abc import Sequence

import time
import math
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [3]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = '../data/shakespeare_small.txt'

In [4]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

In [5]:
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Generation

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [6]:
shakespeare_text = raw_text.lower()

### Pretokenization

In [7]:
char2idx = {
    char: idx
    for idx, (char, _) in enumerate(Counter(shakespeare_text).items())
}

idx2char = {
    idx: char
    for char, idx in char2idx.items()
}

### Tokenize

In [8]:
encoded = [
    char2idx[char] for char in shakespeare_text
]

### Postprocessing

> Skip

## Prepare Dataset

In [9]:
n_tokens = len(char2idx)
dataset_size = len(encoded)

In [10]:
class ShakespeareDataset(Dataset):
    def __init__(self, encoded_text: Sequence, sequence_length: int):
        self.encoded_text = encoded_text
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, index):
        x = torch.tensor(
            self.encoded_text[index: (index+self.sequence_length)],
            dtype=torch.long,
        )
        # Target is shifted by one character
        y = torch.tensor(
            self.encoded_text[(index+1): (index+self.sequence_length+1)],
            dtype=torch.long,
        )
        return x, y

In [11]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [12]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens: int, embedding_dim: int, hidden_dim: int):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [13]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [14]:
def generate_text(
    model,
    input_str: str,
    num_chars: int = 100,
    temperature: float = 1.0,
) -> str:
    model.eval()
    generated_text = input_str.lower()
    input_tensor = (
        torch.tensor(
            [char2idx[char] for char in input_str.lower()],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    
    with torch.no_grad():
        for _ in range(num_chars):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_char_idx = torch.multinomial(probabilities, 1).item()
            generated_text += idx2char[next_char_idx]
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_char_idx]], dtype=torch.long),
                ],
                1,
            )
            
    return generated_text

## Train Model

In [15]:
def time_since(since: float) -> str:
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [16]:
PHRASE = 'To be or not to be'
epochs = 5

start = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text(model, PHRASE, 100)
    print(gen_output)

Epoch 1/5, Loss: 2.271035790138732
[0m 6s (0 0%) 2.1613]
------------------------------------------------------------------------
to be or not to beses
hathgr mysher splin aruso ant, waver generie-:
male, anf you you indie, rour wayall your,
carene
Epoch 2/5, Loss: 1.9209989490600439
[0m 12s (1 20%) 1.8412]
------------------------------------------------------------------------
to be or not to bety thatled as of it prowen hisous frught comenb the in the blous:
and thest raga sabluft mised seggt
Epoch 3/5, Loss: 1.8381634871418864
[0m 18s (2 40%) 1.7816]
------------------------------------------------------------------------
to be or not to be from al propy trying
ws
youous, quarges,
my madnty's and on't a grings sire in bear mesest enomant?
Epoch 4/5, Loss: 1.7927972703696058
[0m 25s (3 60%) 1.8368]
------------------------------------------------------------------------
to be or not to bense, virster obence home.

menenius:
whint they showe fach, and of thesey prpeyeat comen i shalt

## Generate Text

In [17]:
output = generate_text(model, 'To be or not to be', 100)
print(output)

to be or not to be ive hawino,
thasd stele a tike theme us meences.

volomply to it frots throbes my torcher
and chath


# Token-Based Generation

In [18]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

## Encode Text into Integer Tokens

### Tokenize (Choose)
- Normalization
- Pretokenization
- Tokenize
- Postprocessing

In [19]:
# Choose a pretrained tokenizer to use
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

bert_uncased_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name_uncased,
)

In [20]:
def tokenize_text(
    tokenizer,
    text: str,
) -> list[str]:
    max_seq_length = tokenizer.model_max_length
    # Chunk the string so tokenizer can take in full input
    chunks_generator = (
        text[i:i+max_seq_length]
        for i in range(0, len(text), max_seq_length)
    )
    # Special tokens to ignore
    ignore_tokens = (
        tokenizer.cls_token,
    )
    # Get list of tokens (one chunk at a time)
    tokenized_text = [
        token
        for chunk in chunks_generator
        for token in tokenizer(chunk).tokens()
        if (
            token not in ignore_tokens
        )
    ]

    return tokenized_text

In [21]:
shakespeare_text = (
    tokenize_text(
        tokenizer=bert_uncased_tokenizer,
        text=raw_text,
    )
)

In [22]:
token2idx = {token: idx for idx, (token, _) in enumerate(Counter(shakespeare_text).items())}
idx2token = {idx: token for token, idx in token2idx.items()}
encoded = [token2idx[token] for token in shakespeare_text]
n_tokens = len(token2idx)
dataset_size = len(encoded)

## Prepare Dataset

In [23]:
n_tokens = len(token2idx)
dataset_size = len(encoded)

In [24]:
class ShakespeareDataset(Dataset):
    def __init__(self, encoded_text: Sequence, sequence_length: int):
        self.encoded_text = encoded_text
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, index):
        x = torch.tensor(
            self.encoded_text[index: (index+self.sequence_length)],
            dtype=torch.long,
        )
        # Target is shifted by one token
        y = torch.tensor(
            self.encoded_text[(index+1): (index+self.sequence_length+1)],
            dtype=torch.long,
        )
        return x, y

In [25]:
length = 16  # Tokens 
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [26]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens: int, embedding_dim: int, hidden_dim: int):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [27]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [28]:
def generate_text(
    tokenizer,
    model,
    input_str: str,
    num_tokens: int = 100,
    temperature: float = 1.0,
) -> str:
    model.eval()
    tokenized_text = tokenize_text(tokenizer=tokenizer, text=input_str)
    input_tensor = (
        torch.tensor(
            [token2idx[token] for token in tokenized_text],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    generated_text = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_text.append(idx2token[next_token_idx])
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_token_idx]], dtype=torch.long),
                ],
                1,
            )
    # Convert to text again
    output_ids = tokenizer.convert_tokens_to_ids(generated_text)
    output_str = input_str + ' ' + tokenizer.decode(output_ids)
    return output_str

## Train Model

In [29]:
def time_since(since: float) -> str:
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [30]:
epochs = 5

start = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    output = generate_text(bert_uncased_tokenizer, model, 'To be or not to be', 30)
    print(output)


Epoch 1/5, Loss: 6.166176602460336
[0m 2s (0 0%) 5.4258]
------------------------------------------------------------------------
To be or not to be risen this ofecteni between, was do who brook foremost'let may ll the le common with i o he to what have letter., off
Epoch 2/5, Loss: 5.406025084896364
[0m 5s (1 20%) 5.2818]
------------------------------------------------------------------------
To be or not to be burn and tired shall these farewell ; ilina great, than, : no, i the be surf mal good art predecessors a sds by first!
Epoch 3/5, Loss: 4.871879844849812
[0m 8s (2 40%) 4.1865]
------------------------------------------------------------------------
To be or not to be has the ice [SEP] they? all : away even to lip, we are? upon him corn clap look to there'we ne be officers.
Epoch 4/5, Loss: 4.4878922153786185
[0m 10s (3 60%) 4.3801]
------------------------------------------------------------------------
To be or not to be ours to this teous that or strength. com. valeria : co

## Generate Text

In [31]:
output = generate_text(bert_uncased_tokenizer, model, 'To be or not to be', 30)
print(output)

To be or not to be , and di at and poor! and ; sir, during the matter : t virgilia : i heard her would letter'instruments [SEP] times ; i


# Comparison Between Generation