In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import requests
from collections import Counter
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [3]:
shakespeare_url = 'https://homl.info/shakespeare'
response = requests.get(shakespeare_url)
raw_text = response.text

# Character-Based Generation

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [4]:
shakespeare_text = raw_text.lower()

### Pretokenization

In [5]:
char2idx = {
    char: idx
    for idx, (char, _) in enumerate(Counter(shakespeare_text).items())
}

idx2char = {
    idx: char
    for char, idx in char2idx.items()
}

### Tokenize

In [6]:
encoded = [
    char2idx[char] for char in shakespeare_text
]

### Postprocessing

> Skip

## Prepare Dataset

In [7]:
n_tokens = len(char2idx)
dataset_size = len(encoded)

In [8]:
class ShakespeareDataset(Dataset):
    def __init__(self, encoded_text, sequence_length):
        self.encoded_text = encoded_text
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, index):
        x = torch.tensor(
            self.encoded_text[index: (index+self.sequence_length)],
            dtype=torch.long,
        )
        # Target is shifted by one character
        y = torch.tensor(
            self.encoded_text[(index+1): (index+self.sequence_length+1)],
            dtype=torch.long,
        )
        return x, y

In [9]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [10]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens, embedding_dim, hidden_dim):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [11]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Train Model

In [12]:
import time, math

def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [13]:
def generate_text(model, input_str, num_chars=100):
    model.eval()
    generated_text = input_str.lower()
    input_tensor = torch.tensor([char2idx[char] for char in input_str.lower()], dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        for _ in range(num_chars):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(output[0, -1], dim=0)
            next_char_idx = torch.multinomial(probabilities, 1).item()
            generated_text += idx2char[next_char_idx]
            input_tensor = torch.cat([input_tensor, torch.tensor([[next_char_idx]], dtype=torch.long)], 1)
            
    return generated_text

In [15]:
PHRASE = 'To be or not to be'
epochs = 5

start = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text(model, PHRASE, 100)
    print(gen_output)

Epoch 1/5, Loss: 1.897376342147541
[2m 21s (0 0%) 1.7804]
------------------------------------------------------------------------
to be or not to be
meese;
and requather; well her they heard af enooungnty.
the thmer the thou
spetied so, deards read
Epoch 2/5, Loss: 1.8148567532970459
[4m 48s (1 20%) 1.6553]
------------------------------------------------------------------------
to be or not to be doth what dear to levereve noble sake othergy mark; not read, yet many my utitee the litus:
in
if s
Epoch 3/5, Loss: 1.8023768587378644
[7m 5s (2 40%) 1.7877]
------------------------------------------------------------------------
to be or not to ben to do of our turs of lencctions shall, men mean of i so not stain, that not nurdinghat, our blone.
Epoch 4/5, Loss: 1.795731641097383
[9m 15s (3 60%) 2.0107]
------------------------------------------------------------------------
to be or not to be is oust better,
awerle, wear all tirg busuress yourshon enough now muft
hue plait onletience, ma

## Generate Text

In [17]:
def generate_text(model, input_str, num_chars=100):
    model.eval()
    generated_text = input_str.lower()
    input_tensor = (
        torch.tensor(
            [char2idx[char] for char in input_str.lower()],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    
    with torch.no_grad():
        for _ in range(num_chars):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(output[0, -1], dim=0)
            next_char_idx = torch.multinomial(probabilities, 1).item()
            generated_text += idx2char[next_char_idx]
            input_tensor = torch.cat([input_tensor, torch.tensor([[next_char_idx]], dtype=torch.long)], 1)
            
    return generated_text

In [16]:
output = generate_text(model, 'To be or not to be', 100)
print(output)

to be or not to be
piece.

lugidererty:
of maknio:
toun the the ah thee shonven.

hore mente, nor, much anvend the sug


# Token-Based Generation

## Encode Text into Integer Tokens

### Tokenize (Choose)
- Normalization
- Pretokenization
- Tokenize
- Postprocessing

## Define Model

## Train Model

## Generate Text

# Comparison Between Generation