In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Intro

# Load Data

In [3]:
# Reduced data to make it manageable for smaller systems
DATA_FILE = '../data/shakespeare_small.txt'

In [4]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

In [5]:
print(f'Number of characters: {len(raw_text):,}')

Number of characters: 50,085


# Character-Based Generation

## Encode Text into Integer Tokens

### Normalization

> - Skip?
> - lowercase?

In [6]:
shakespeare_text = raw_text.lower()

### Pretokenization

In [7]:
char2idx = {
    char: idx
    for idx, (char, _) in enumerate(Counter(shakespeare_text).items())
}

idx2char = {
    idx: char
    for char, idx in char2idx.items()
}

### Tokenize

In [8]:
encoded = [
    char2idx[char] for char in shakespeare_text
]

### Postprocessing

> Skip

## Prepare Dataset

In [9]:
n_tokens = len(char2idx)
dataset_size = len(encoded)

In [10]:
class ShakespeareDataset(Dataset):
    def __init__(self, encoded_text, sequence_length):
        self.encoded_text = encoded_text
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, index):
        x = torch.tensor(
            self.encoded_text[index: (index+self.sequence_length)],
            dtype=torch.long,
        )
        # Target is shifted by one character
        y = torch.tensor(
            self.encoded_text[(index+1): (index+self.sequence_length+1)],
            dtype=torch.long,
        )
        return x, y

In [11]:
length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [12]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens, embedding_dim, hidden_dim):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [13]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Train Model

In [14]:
import time, math

def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [15]:
def generate_text(
    model,
    input_str,
    num_chars=100,
    temperature=1.0,
):
    model.eval()
    generated_text = input_str.lower()
    input_tensor = (
        torch.tensor(
            [char2idx[char] for char in input_str.lower()],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    
    with torch.no_grad():
        for _ in range(num_chars):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_char_idx = torch.multinomial(probabilities, 1).item()
            generated_text += idx2char[next_char_idx]
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_char_idx]], dtype=torch.long),
                ],
                1,
            )
            
    return generated_text

In [16]:
PHRASE = 'To be or not to be'
epochs = 5

start = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    gen_output = generate_text(model, PHRASE, 100)
    print(gen_output)

Epoch 1/5, Loss: 2.268483464938764
[0m 6s (0 0%) 1.8684]
------------------------------------------------------------------------
to be or not to beslo that tarres:
not! spaly and headerius:
hall nord to when and seneminenoutiesswingone the the to 
Epoch 2/5, Loss: 1.9106965876996707
[0m 12s (1 20%) 1.9195]
------------------------------------------------------------------------
to be or not to be's shalnteser, marf, that forther as in done ritutugup calack
gace ir wanger or tatgy wart of chpoce
Epoch 3/5, Loss: 1.8223891508845855
[0m 19s (2 40%) 1.6410]
------------------------------------------------------------------------
to be or not to bereadak him a thelded to be a do nese-tor a on the swar hack?

marcius:
the daon, othed,
yy
parcius f
Epoch 4/5, Loss: 1.778055978811587
[0m 25s (3 60%) 1.6997]
------------------------------------------------------------------------
to be or not to bead and they yous inss abund the notlicer are the gows
fuefn the commomverine
for'd, go belown; th

## Generate Text

In [17]:
output = generate_text(model, 'To be or not to be', 100)
print(output)

to be or not to be volp fiant,--'
when humsel, to bene. wor, as
of
yett begrich the ind be shall reecome ler that in y


# Token-Based Generation

In [18]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

## Encode Text into Integer Tokens

### Tokenize (Choose)
- Normalization
- Pretokenization
- Tokenize
- Postprocessing

In [19]:
# Choose a pretrained tokenizer to use
xlmr_model_name = 'xlm-roberta-base'
bert_model_name = 'bert-base-cased'
bert_model_name_uncased = 'bert-base-uncased'

bert_uncased_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name_uncased,
)

In [20]:
shakespeare_text = bert_uncased_tokenizer(raw_text).tokens()

Token indices sequence length is longer than the specified maximum sequence length for this model (13047 > 512). Running this sequence through the model will result in indexing errors


In [21]:
token2idx = {token: idx for idx, (token, _) in enumerate(Counter(shakespeare_text).items())}
idx2token = {idx: token for token, idx in token2idx.items()}
encoded = [token2idx[token] for token in shakespeare_text]
n_tokens = len(token2idx)
dataset_size = len(encoded)

## Prepare Dataset

In [22]:
n_tokens = len(token2idx)
dataset_size = len(encoded)

In [23]:
class ShakespeareDataset(Dataset):
    def __init__(self, encoded_text, sequence_length):
        self.encoded_text = encoded_text
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.encoded_text) - self.sequence_length

    def __getitem__(self, index):
        x = torch.tensor(
            self.encoded_text[index: (index+self.sequence_length)],
            dtype=torch.long,
        )
        # Target is shifted by one character
        y = torch.tensor(
            self.encoded_text[(index+1): (index+self.sequence_length+1)],
            dtype=torch.long,
        )
        return x, y

In [24]:
length = 16  # Tokens 
batch_size = 32

train_dataset = ShakespeareDataset(encoded, length)
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

## Define Model

In [25]:
class ShakespeareModel(nn.Module):
    def __init__(self, n_tokens, embedding_dim, hidden_dim):
        super(ShakespeareModel, self).__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_tokens)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

In [26]:
embed_dim = 16
hidden_dim = 32

model = ShakespeareModel(n_tokens, embed_dim, hidden_dim)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Train Model

In [27]:
import time, math

def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [28]:
def generate_text(
    tokenizer,
    model,
    input_str,
    num_tokens=100,
    temperature=1.0,
):
    model.eval()
    tokenized_text = tokenizer(input_str).tokens()
    input_tensor = (
        torch.tensor(
            [token2idx[token] for token in tokenized_text],
            dtype=torch.long,
        )
        .unsqueeze(0)
    )
    generated_text = []
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(input_tensor.to(device))
            probabilities = nn.functional.softmax(
                output[0, -1] / temperature,
                dim=0,
            )
            next_token_idx = torch.multinomial(probabilities, 1).item()
            generated_text.append(idx2token[next_token_idx])
            input_tensor = torch.cat(
                [
                    input_tensor,
                    torch.tensor([[next_token_idx]], dtype=torch.long),
                ],
                1,
            )
    # Convert to text again
    output_ids = tokenizer.convert_tokens_to_ids(generated_text)
    output_str = input_str + ' ' + tokenizer.decode(output_ids)
    return output_str

In [29]:
epochs = 5

start = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / epochs * 100, loss))
    print('-'*72)
    output = generate_text(bert_uncased_tokenizer, model, 'To be or not to be', 30)
    print(output)


Epoch 1/5, Loss: 6.197644301489288
[0m 2s (0 0%) 5.7094]
------------------------------------------------------------------------
To be or not to be he, how or whatdro la w where are but have home, no should mule you city spiritum never not is tis thy bands,ini marc
Epoch 2/5, Loss: 5.333558495138206
[0m 5s (1 20%) 4.8525]
------------------------------------------------------------------------
To be or not to be maycu : and,rti wears give wounded they hot he, corio this to,er report? rats justice got so members him :ith statutes
Epoch 3/5, Loss: 4.787915608462165
[0m 8s (2 40%) 4.2919]
------------------------------------------------------------------------
To be or not to be us to be content - - when mothers dogs subjects there arecrest me, holding - letter and the as cannot he means mine. men's woo
Epoch 4/5, Loss: 4.409752887838027
[0m 10s (3 60%) 4.6681]
------------------------------------------------------------------------
To be or not to be ##chers here't you is the noblestang

## Generate Text

In [30]:
output = generate_text(bert_uncased_tokenizer, model, 'To be or not to be', 30)
print(output)

To be or not to be cupboard sous : and you find you havee to steel company, chests the dispatch is the blood is the gentle flower. else. volumnia


# Comparison Between Generation