In [1]:
from __future__ import annotations
from collections.abc import Sequence

import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import DataLoader

In [2]:
from helper import (
    ShakespeareDataset,
    start_time,
    time_since,
)

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [4]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = '../data/shakespeare_small.txt'

In [5]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

In [6]:
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Generation

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [7]:
shakespeare_text = raw_text.lower()

### Pretokenization

In [8]:
char2idx = {
    char: idx
    for idx, (char, _) in enumerate(Counter(shakespeare_text).items())
}

idx2char = {
    idx: char
    for char, idx in char2idx.items()
}

### Tokenize

In [9]:
encoded = [
    char2idx[char] for char in shakespeare_text
]

### Postprocessing

> Skip

## Prepare Dataset

In [10]:
n_tokens = len(char2idx)
dataset_size = len(encoded)

In [11]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [12]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens: int, embedding_dim: int, hidden_dim: int):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [13]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [14]:
def generate_text(
    model,
    input_str: str,
    num_chars: int = 100,
    temperature: float = 1.0,
) -> str:
    model.eval()
    generated_text = input_str.lower()
    input_tensor = (
        torch.tensor(
            [char2idx[char] for char in input_str.lower()],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    
    with torch.no_grad():
        for _ in range(num_chars):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_char_idx = torch.multinomial(probabilities, 1).item()
            generated_text += idx2char[next_char_idx]
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_char_idx]], dtype=torch.long),
                ],
                1,
            )
            
    return generated_text

## Train Model

In [15]:
PHRASE = 'To be or not to be'
epochs = 5

start = start_time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text(model, PHRASE, 100)
    print(gen_output)

Epoch 1/5, Loss: 2.2826013823286795
[00m 6.3s (0 0%) 1.9859]
------------------------------------------------------------------------
to be or not to be:
and then lthaseren, thice.
.

fitga beand put.

valinizibive he tor worts call word
homay romate e
Epoch 2/5, Loss: 1.908258936961238
[00m 12.6s (1 20%) 1.8876]
------------------------------------------------------------------------
to be or not to bears of his gon him compinot mius.

shim
in wento whis sean afalen wout?

brutod,
my to hergs rnevera
Epoch 3/5, Loss: 1.8248796020452969
[00m 19.0s (2 40%) 1.9155]
------------------------------------------------------------------------
to be or not to be all the?

aufirst not pecm; must bear gots!

cimcinius:
sonke nor isue you,
we our valin wiels,
tha
Epoch 4/5, Loss: 1.7833617346736188
[00m 25.4s (3 60%) 2.0215]
------------------------------------------------------------------------
to be or not to bell.:
nod prove i a
me not yould vim.

martius:
tell i,
the culp! tho store, indere s

## Generate Text

In [16]:
output = generate_text(model, 'To be or not to be', 100)
print(output)

to be or not to bebers theg shall they fiblat had you areshere in that,
een peot cither fender:
then one the pratues! 


# Token-Based Generation

In [17]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

## Encode Text into Integer Tokens

### Tokenize (Choose)
- Normalization
- Pretokenization
- Tokenize
- Postprocessing

In [18]:
# Choose a pretrained tokenizer to use
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

bert_uncased_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name_uncased,
)

In [19]:
def tokenize_text(
    tokenizer,
    text: str,
) -> list[str]:
    max_seq_length = tokenizer.model_max_length
    # Chunk the string so tokenizer can take in full input
    chunks_generator = (
        text[i:i+max_seq_length]
        for i in range(0, len(text), max_seq_length)
    )
    # Special tokens to ignore
    ignore_tokens = (
        tokenizer.cls_token,
    )
    # Get list of tokens (one chunk at a time)
    tokenized_text = [
        token
        for chunk in chunks_generator
        for token in tokenizer(chunk).tokens()
        if (
            token not in ignore_tokens
        )
    ]

    return tokenized_text

In [20]:
shakespeare_text = (
    tokenize_text(
        tokenizer=bert_uncased_tokenizer,
        text=raw_text,
    )
)

In [21]:
token2idx = {token: idx for idx, (token, _) in enumerate(Counter(shakespeare_text).items())}
idx2token = {idx: token for token, idx in token2idx.items()}
encoded = [token2idx[token] for token in shakespeare_text]
n_tokens = len(token2idx)
dataset_size = len(encoded)

## Prepare Dataset

In [22]:
n_tokens = len(token2idx)
dataset_size = len(encoded)

In [23]:
class ShakespeareDataset(Dataset):
    def __init__(self, encoded_text: Sequence, sequence_length: int):
        self.encoded_text = encoded_text
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, index):
        x = torch.tensor(
            self.encoded_text[index: (index+self.sequence_length)],
            dtype=torch.long,
        )
        # Target is shifted by one token
        y = torch.tensor(
            self.encoded_text[(index+1): (index+self.sequence_length+1)],
            dtype=torch.long,
        )
        return x, y

In [24]:
length = 16  # Tokens 
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [25]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens: int, embedding_dim: int, hidden_dim: int):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [26]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Define Text Genaration

In [27]:
def generate_text(
    tokenizer,
    model,
    input_str: str,
    num_tokens: int = 100,
    temperature: float = 1.0,
) -> str:
    model.eval()
    tokenized_text = tokenize_text(tokenizer=tokenizer, text=input_str)
    input_tensor = (
        torch.tensor(
            [token2idx[token] for token in tokenized_text],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    generated_text = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_text.append(idx2token[next_token_idx])
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_token_idx]], dtype=torch.long),
                ],
                1,
            )
    # Convert to text again
    output_ids = tokenizer.convert_tokens_to_ids(generated_text)
    output_str = input_str + ' ' + tokenizer.decode(output_ids)
    return output_str

In [28]:
tokenized_text = tokenize_text(tokenizer=bert_uncased_tokenizer, text='To be or not to')
input_tensor = (
    torch.tensor(
        [token2idx[token] for token in tokenized_text],
        dtype=torch.long,
    )
    .unsqueeze(0)
    )

## Train Model

In [29]:
epochs = 5

start = start_time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    output = generate_text(bert_uncased_tokenizer, model, 'To be or not to be', 30)
    print(output)


Epoch 1/5, Loss: 6.169780093114733
[00m 2.7s (0 0%) 4.9686]
------------------------------------------------------------------------
To be or not to be ##us silenced co'word pray the good most,! plaguebed : meat for citizenutus greatest is : touching ir. think of, madam common to
Epoch 2/5, Loss: 5.345452580475001
[00m 5.4s (1 20%) 4.9929]
------------------------------------------------------------------------
To be or not to be stride a years : i advance super characterus, come crueldon andural malice for utmost, elders sent, that toter. when wheret shall
Epoch 3/5, Loss: 4.8034395468983675
[00m 8.1s (2 40%) 4.7401]
------------------------------------------------------------------------
To be or not to be ##ia : briefly and allrice at time have thy infant : every shall that the people jupiter, heaven had hs, that note. virgilia to
Epoch 4/5, Loss: 4.420099650028247
[00m 10.9s (3 60%) 4.3923]
------------------------------------------------------------------------
To be or not to be 

## Generate Text

In [30]:
output = generate_text(bert_uncased_tokenizer, model, 'To be or not to be', 30)
print(output)

To be or not to be sworn with hisnts and their noble,, mutually know for ma good report thankful caius, we shall will in thirst. sicinius : therefore


# Comparison Between Generation