In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch
from torch import nn
import lightning as L
from torch.utils.data import DataLoader, Dataset
from lightning import LightningModule, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from torchmetrics.functional import accuracy
import random
import re


%load_ext autoreload
%autoreload 2
%load_ext rich

In [7]:
class ShakespeareDataset(Dataset):
    """Character-level next-character prediction dataset.

    Every sample pairs a window of ``seq_length`` consecutive character ids
    with the id of the character that immediately follows the window.
    Windows start at every position of ``text`` (stride 1).

    Attributes:
        chars: Sorted list of the distinct characters found in ``text``.
        char_to_int: Mapping from character to its integer id.
        int_to_char: Inverse mapping of ``char_to_int``.
        data_size: Number of characters in ``text``.
        vocab_size: Number of distinct characters.
        seq_length: Length of each input window.
        x: Integer array of shape ``(num_samples, seq_length)``. It is a
            read-only sliding-window view over the encoded text.
        y: Integer array of shape ``(num_samples,)`` with the target ids.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, text, seq_length=100):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        self.chars = sorted(set(text))
        self.char_to_int = {ch: i for i, ch in enumerate(self.chars)}
        self.int_to_char = {i: ch for i, ch in enumerate(self.chars)}
        self.data_size, self.vocab_size = len(text), len(self.chars)
        self.seq_length = seq_length

        # Encode the text exactly once; windows are derived from this array.
        encoded = np.fromiter(
            (self.char_to_int[ch] for ch in text),
            dtype=np.int64,
            count=self.data_size,
        )

        if self.data_size > seq_length:
            # Zero-copy view: row i is encoded[i:i + seq_length]. Building the
            # windows as nested Python lists would need data_size * seq_length
            # Python ints before conversion.
            windows = np.lib.stride_tricks.sliding_window_view(encoded, seq_length)
            # The final window has no following character, so it is dropped.
            self.x = windows[:-1]
            self.y = encoded[seq_length:]
        else:
            # Text too short to form a single (window, next char) pair.
            self.x = np.empty((0, seq_length), dtype=np.int64)
            self.y = np.empty((0,), dtype=np.int64)

    def __len__(self):
        """Return the number of (window, next character) samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(window, target)`` as ``torch.long`` tensors.

        ``torch.tensor`` copies its input, so the returned window does not
        alias the shared read-only view held in ``self.x``.
        """
        return (
            torch.tensor(self.x[index], dtype=torch.long),
            torch.tensor(self.y[index], dtype=torch.long),
        )


class TextGenerator(L.LightningModule):
    """Embedding -> LSTM -> linear head that scores the next token.

    ``forward`` returns one row of ``vocab_size`` logits per input sequence,
    computed from the LSTM output at the last time step only.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, layers):
        super().__init__()
        # Presumably what lets load_from_checkpoint rebuild the module without
        # explicit constructor arguments - confirm against the Lightning docs.
        self.save_hyperparameters()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map a batch of token-id sequences to logits for the next token."""
        embedded = self.embedding(x)
        sequence_out, _ = self.lstm(embedded)
        # batch_first=True, so axis 1 is time; keep only the final step.
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

    def _batch_loss(self, batch):
        """Cross-entropy between the predicted logits and the batch targets."""
        inputs, targets = batch
        logits = self.forward(inputs)
        return nn.functional.cross_entropy(logits, targets)

    def training_step(self, batch):
        """Compute the training loss and log its epoch-level value."""
        train_loss = self._batch_loss(batch)
        self.log('train_loss', train_loss, logger=True,
                 on_step=False, on_epoch=True, prog_bar=True)
        return train_loss

    def validation_step(self, batch):
        """Compute the validation loss and log its epoch-level value."""
        val_loss = self._batch_loss(batch)
        self.log('val_loss', val_loss, logger=True,
                 on_step=False, on_epoch=True, prog_bar=True)
        return val_loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a fixed learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

In [8]:
def load_data(file_path, seq_length=100, train_split=0.7, valid_split=0.15,
              seed=None):
    """Read a text file and split its windows into train/valid/test subsets.

    Args:
        file_path: Path of the UTF-8 text file to read.
        seq_length: Window length passed to ``ShakespeareDataset``.
        train_split: Fraction of samples assigned to the training subset.
        valid_split: Fraction of samples assigned to the validation subset.
            Whatever remains after both fractions becomes the test subset.
        seed: Optional integer. When given, the random split is drawn from a
            dedicated ``torch.Generator`` seeded with it, so the same
            partition is reproduced on every run. ``None`` keeps using the
            global torch RNG.

    Returns:
        Tuple ``(dataset, train_dataset, valid_dataset, test_dataset)``.

    Raises:
        ValueError: If a fraction lies outside ``[0, 1]`` or the two fractions
            together claim more samples than the dataset holds.

    NOTE(review): samples are stride-1 windows, so a random split places
    windows that overlap in almost all characters into different subsets;
    validation/test loss is therefore optimistic. Consider splitting the raw
    text into contiguous regions first - confirm this is acceptable.
    """
    for label, fraction in (("train_split", train_split),
                            ("valid_split", valid_split)):
        if not 0 <= fraction <= 1:
            raise ValueError(f"{label} must be within [0, 1], got {fraction}")

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    dataset = ShakespeareDataset(text, seq_length)

    total = len(dataset)
    train_size = int(total * train_split)
    valid_size = int(total * valid_split)
    test_size = total - train_size - valid_size
    # A negative remainder would still sum to len(dataset) and slip past
    # random_split's own length check, so reject it here.
    if test_size < 0:
        raise ValueError(
            "train_split + valid_split must not exceed 1 "
            f"(got {train_split} + {valid_split})"
        )

    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, valid_size, test_size], **split_kwargs)
    return dataset, train_dataset, valid_dataset, test_dataset


# Build the character dataset together with its train/valid/test partitions
dataset, train_dataset, valid_dataset, test_dataset = load_data('input.txt', seq_length=100)

# Only the training loader reshuffles; the evaluation loaders keep a fixed order
train_loader, valid_loader, test_loader = (
    DataLoader(subset, batch_size=128, shuffle=reshuffle)
    for subset, reshuffle in (
        (train_dataset, True),
        (valid_dataset, False),
        (test_dataset, False),
    )
)

In [14]:
# Restore the trained network from a checkpoint written by an earlier training run
checkpoint_path = './lightning_logs/version_10/checkpoints/epoch=8-step=54900.ckpt'
model = TextGenerator.load_from_checkpoint(checkpoint_path)

In [16]:
def generate_text(model, start_str, gen_length=100, temperature=1.0):
    """Sample ``gen_length`` characters from ``model`` as a continuation of a prompt.

    Each step feeds the most recent context (at most ``dataset.seq_length``
    characters) through ``model`` from a fresh LSTM state. Carrying the hidden
    state across steps while also re-feeding the whole window would make the
    network process the same context repeatedly.

    Relies on the module-level ``dataset`` for the character <-> id mappings
    and the window length.

    Args:
        model: Network whose call returns next-character logits for a batch of
            id sequences; must expose ``eval()`` and ``device``.
        start_str: Non-empty prompt; every character must be in the vocabulary.
        gen_length: Number of characters to sample.
        temperature: Positive softmax temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        ``start_str`` followed by the sampled characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character missing from
            ``dataset.char_to_int``.

    Side effects:
        Puts ``model`` into evaluation mode and leaves it there.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()
    seq_length = dataset.seq_length
    prompt_ids = [dataset.char_to_int[c] for c in start_str[-seq_length:]]
    window = torch.tensor(prompt_ids, dtype=torch.long, device=model.device).unsqueeze(0)

    sampled_chars = []
    # No gradients are needed for sampling; this also stops an autograd graph
    # from accumulating over the generation loop.
    with torch.no_grad():
        for _ in range(gen_length):
            logits = model(window)
            probs = torch.softmax(logits / temperature, dim=-1).squeeze(0)
            # float32 round-off can trip np.random.choice's sum-to-one check,
            # so renormalise in float64 before sampling.
            p = probs.cpu().numpy().astype(np.float64)
            p /= p.sum()
            char_ind = int(np.random.choice(len(dataset.chars), p=p))
            sampled_chars.append(dataset.int_to_char[char_ind])

            next_id = torch.tensor([[char_ind]], dtype=torch.long, device=model.device)
            # Grow the context until it reaches seq_length, then slide it.
            window = torch.cat((window, next_id), dim=1)[:, -seq_length:]

    return start_str + ''.join(sampled_chars)

    
# Sample a 1000-character continuation of the prompt and show it as plain text
sample_text = generate_text(
    model,
    'To be or not to be',
    gen_length=1000,
    temperature=1.0,
)
print(sample_text)


To be or not to behavour.

HENRY BOLINGBROKE:
Uncleave Serempties
To beder'd me well stay!

SICINIUS:
Pray you
And yet false, then
Edward within the east,
As one riled words.
Patience and crown
To crase my moyself.
My Lord Angelo, I
Do with that one for
offence, and theer heart
Ay, there six fray goodly gentle?
Ours: hed! O, for heaven
wash the bild himself.
Grean a facl down of no!
Scape.
You worthy of each,
On each heart of toil,
And banish'd me father
Among the beneration
Lord Edward prence!
For where you have burn.

CAPULET:
what, faults hence,
I have heard her; and,
young prespiter'd liege,
Unless zenelul case:
Edward! for the noble rafe
Suck but in extremit,
That Prince over as her with
were but dower them
past my advential achie,
Right pind to my boy.

FRIAR LAURENCE:
Then, to she is my woes
And makes me born to't;
Which officer's mean
His signior proport,
Are for my answer the yieldon.

LUCIO:
If we fled country,
Or early royal fined to
me thequager wash'd days.

HERMIONE:
My c