In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch
from torch import nn
import lightning as L
from torch.utils.data import DataLoader, Dataset
from lightning import LightningModule, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from torchmetrics.functional import accuracy
import random


%load_ext autoreload
%autoreload 2
%load_ext rich

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rich extension is already loaded. To reload it, use:
  %reload_ext rich


# Prepare Data

## Preprocess data

In [6]:
# Read the raw corpus and collect its character vocabulary.
# The encoding is pinned to UTF-8 so this cell decodes the file the same way
# load_data() does, instead of depending on the platform default encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Only the read needs the open handle; the vocabulary is derived afterwards.
vocab = sorted(set(data))
print('The number of unique characters in the corpus is', len(vocab))
print('A slice of the unique characters set:\n', vocab)


The number of unique characters in the corpus is 65
A slice of the unique characters set:
 ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [38]:
class ShakespeareDataset(Dataset):
    """Character-level next-character prediction dataset.

    Every sample pairs a window of ``seq_length`` consecutive characters
    (encoded as integer ids) with the id of the character that immediately
    follows the window. Windows advance one character at a time, so a text of
    ``n`` characters yields ``max(n - seq_length, 0)`` samples.

    Attributes:
        chars: Sorted list of the distinct characters in ``text``.
        char_to_int: Mapping from a character to its index in ``chars``.
        int_to_char: Inverse of ``char_to_int``.
        data_size: Number of characters in ``text``.
        vocab_size: Number of distinct characters.
        seq_length: Length of every input window.
        x: int64 array of shape ``(num_samples, seq_length)`` holding the
            input windows. It is a read-only view onto the encoded text.
        y: int64 array of shape ``(num_samples,)`` holding the target ids.

    Args:
        text: The corpus as a single string.
        seq_length: Number of characters per input window (must be >= 1).

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, text, seq_length=100):
        if seq_length < 1:
            raise ValueError(f'seq_length must be >= 1, got {seq_length}')

        self.chars = sorted(set(text))
        self.char_to_int = {ch: i for i, ch in enumerate(self.chars)}
        self.int_to_char = {i: ch for i, ch in enumerate(self.chars)}
        self.data_size, self.vocab_size = len(text), len(self.chars)
        self.seq_length = seq_length

        # Encode the corpus exactly once. Building one Python list per window
        # would allocate roughly data_size * seq_length Python ints before the
        # conversion to an array.
        encoded = np.fromiter(
            (self.char_to_int[ch] for ch in text),
            dtype=np.int64,
            count=self.data_size,
        )

        num_samples = max(self.data_size - seq_length, 0)
        if num_samples > 0:
            # Strided view: row i is encoded[i:i + seq_length] without copying.
            # The final window is dropped because it has no following
            # character to act as its target.
            windows = np.lib.stride_tricks.sliding_window_view(encoded, seq_length)
            self.x = windows[:num_samples]
            self.y = encoded[seq_length:]
        else:
            # Text no longer than one window: there is nothing to predict.
            self.x = np.empty((0, seq_length), dtype=np.int64)
            self.y = np.empty((0,), dtype=np.int64)

    def __len__(self):
        """Return the number of (window, next character) samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(input_ids, target_id)`` for ``index`` as ``torch.long`` tensors."""
        # torch.tensor copies, so the read-only view in self.x is never
        # exposed to in-place modification by consumers.
        return torch.tensor(self.x[index], dtype=torch.long), torch.tensor(self.y[index], dtype=torch.long)

## Load dataset

In [39]:
def load_data(file_path, seq_length=100, train_split=0.7, valid_split=0.15, seed=None):
    """Read a text file and split its windows into train/validation/test subsets.

    Args:
        file_path: Path of the UTF-8 text corpus.
        seq_length: Window length forwarded to ``ShakespeareDataset``.
        train_split: Fraction of samples assigned to the training subset.
        valid_split: Fraction of samples assigned to the validation subset.
            Whatever remains after both fractions becomes the test subset.
        seed: Optional integer. When given, the random assignment is drawn
            from a dedicated ``torch.Generator`` seeded with it, so the split
            is repeatable. ``None`` keeps using torch's global generator.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)`` as produced by
        ``torch.utils.data.random_split`` over one shared dataset object.

    Raises:
        ValueError: If a fraction lies outside ``[0, 1]`` or the two fractions
            add up to more than 1.

    NOTE(review): samples are assigned to subsets at random while the windows
    advance one character at a time, so windows in different subsets can share
    most of their characters. Validation/test loss is likely optimistic.
    """
    # Validate before touching the file: fractions summing above 1 would make
    # the computed test size negative instead of failing with a clear message.
    for label, fraction in (('train_split', train_split), ('valid_split', valid_split)):
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f'{label} must be within [0, 1], got {fraction}')
    if train_split + valid_split > 1.0 + 1e-9:
        raise ValueError(
            f'train_split + valid_split must not exceed 1, got {train_split + valid_split}'
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    dataset = ShakespeareDataset(text, seq_length)

    total = len(dataset)
    train_size = int(total * train_split)
    valid_size = int(total * valid_split)
    # The remainder absorbs rounding so the three sizes always sum to total.
    test_size = total - train_size - valid_size

    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)

    train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, valid_size, test_size], **split_kwargs
    )
    return train_dataset, valid_dataset, test_dataset

# Model Definition

In [40]:
class TextGenerator(L.LightningModule):
    """Embedding -> stacked LSTM -> linear head predicting the next character.

    Args:
        vocab_size: Number of distinct token ids; sizes both the embedding
            table and the output layer.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Hidden size of every LSTM layer.
        layers: Number of stacked LSTM layers.
        lr: Learning rate handed to Adam in ``configure_optimizers``.

    All constructor arguments are recorded via ``save_hyperparameters`` and
    are available on ``self.hparams``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, layers, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits computed from the last position of ``x``.

        ``x`` holds token ids laid out batch-first (the LSTM is built with
        ``batch_first=True``); the result has one row of ``vocab_size`` logits
        per batch element.
        """
        x = self.embedding(x)
        x, _ = self.lstm(x)
        # Only the output at the final time step feeds the classifier.
        x = self.fc(x[:, -1, :])
        return x

    def _shared_step(self, batch, stage):
        """Compute cross-entropy for ``batch`` and log it as ``'<stage>_loss'``."""
        x, y = batch
        # Call the module rather than .forward() so registered hooks still run.
        y_hat = self(x)
        loss = nn.functional.cross_entropy(y_hat, y)
        self.log(f'{stage}_loss', loss, logger=True, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss; logged per epoch as ``train_loss``."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss; logged per epoch as ``val_loss``."""
        return self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        """Return the test loss; logged per epoch as ``test_loss``."""
        return self._shared_step(batch, 'test')

    def configure_optimizers(self):
        """Create the Adam optimizer using the learning rate stored in ``hparams``."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


In [41]:
# Settings for this run, gathered in one place so they are easy to tune.
SEED = 42
SEQ_LENGTH = 100
BATCH_SIZE = 128
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_LAYERS = 2
MAX_EPOCHS = 20

# Seed Python, NumPy and PyTorch before anything random happens so the data
# split, batch shuffling and weight initialisation repeat across kernel restarts.
L.seed_everything(SEED)

# Load your datasets
train_dataset, valid_dataset, test_dataset = load_data('input.txt', seq_length=SEQ_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Create the model; the vocabulary size comes from the dataset shared by all splits
model = TextGenerator(
    len(train_dataset.dataset.chars),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    layers=NUM_LAYERS,
)

# Setup trainer and fit the model using Lightning's Trainer.
# The checkpoint callback tracks the 'val_loss' value logged by validation_step.
trainer = L.Trainer(max_epochs=MAX_EPOCHS, callbacks=[ModelCheckpoint(monitor="val_loss")])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 8.3 K 
1 | lstm      | LSTM      | 921 K 
2 | fc        | Linear    | 16.7 K
----------------------------------------
946 K     Trainable params
0         Non-trainable params
946 K     Total params
3.786     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [34]:
# Launch TensorBoard on the Lightning log directory to inspect the logged losses.
# Run as a shell command, it occupies the cell until it is interrupted
# (the recorded output below ends with ^C).
!tensorboard --logdir ./lightning_logs/

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.15.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [32]:
def generate_text(model, start_str, gen_length=100, temperature=1.0, dataset=None):
    """Sample ``gen_length`` characters from ``model`` as a continuation of ``start_str``.

    At every step the model is run on the current context window starting from
    a fresh recurrent state. The context begins as the last ``seq_length``
    characters of ``start_str``, grows by one sampled character per step, and
    is trimmed to at most ``seq_length`` characters.

    Args:
        model: Callable module exposing ``.device`` and ``.eval()``. It is
            expected to map a ``(1, length)`` tensor of character ids to
            ``(1, vocab_size)`` next-character logits, as
            ``TextGenerator.forward`` does.
        start_str: Non-empty seed text; every character must be present in
            the dataset vocabulary.
        gen_length: Number of characters to append.
        temperature: Positive softmax temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it.
        dataset: Object providing ``chars``, ``char_to_int``, ``int_to_char``
            and ``seq_length``. Defaults to the notebook-level
            ``test_dataset.dataset``.

    Returns:
        ``start_str`` followed by the sampled characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character outside the vocabulary.
    """
    if dataset is None:
        # Backward-compatible default: the split created earlier in the notebook.
        dataset = test_dataset.dataset
    if not start_str:
        raise ValueError('start_str must contain at least one character')
    if temperature <= 0:
        raise ValueError(f'temperature must be positive, got {temperature}')

    window = dataset.seq_length
    vocab_size = len(dataset.chars)
    model.eval()

    seed_ids = [dataset.char_to_int[c] for c in start_str[-window:]]
    context = torch.tensor(seed_ids, dtype=torch.long, device=model.device).unsqueeze(0)

    generated = []
    # Sampling needs no gradients; without no_grad every step would keep its
    # autograd graph alive.
    with torch.no_grad():
        for _ in range(gen_length):
            # Re-run the model on the window with a fresh recurrent state.
            # Carrying the LSTM state across steps while also re-feeding the
            # window would process the same characters repeatedly.
            logits = model(context)
            probs = torch.softmax(logits / temperature, dim=-1).cpu().numpy()[0]
            char_ind = int(np.random.choice(vocab_size, p=probs))
            generated.append(dataset.int_to_char[char_ind])

            next_token = torch.tensor([[char_ind]], dtype=torch.long, device=model.device)
            # Let the context grow up to the window length, then slide it.
            context = torch.cat((context, next_token), dim=1)[:, -window:]

    return start_str + ''.join(generated)

# Sample 1000 characters seeded with a speaker tag; the sub-1 temperature
# sharpens the sampling distribution. print() is used so newlines render as line breaks.
generated_sample = generate_text(model, 'ROMEO:', gen_length=1000, temperature=0.5)
print(generated_sample)

ROMEO:
Foul not to can do
The sent to the dust
To say the truth
A said to the sent
At our grant thou hate
What he had would
never not to the truth
And stand so head
The head it.
Come, and legs,
As it was not
Betweed some hath
a custom me to thee sure
The faith, as we have heart
With my son,
I will not our return't
Than thou still they we have before
Another-in tribunes
To say an every
Was it is
To the truth,
For that I talk there is a
fatting great man
The consulded
That stand catchious have more
All thou speak
That saying, or how it
That before things
In all the live
With of the truth
That stand cheeks
That way their
Than the first trum;
And such as
With the even thanks
Which with all dreams
That stand so it
With his breaths
at it was a
still apt at
Our strive of patiens
There we will before
That was a grant
With the sun of your voices
Stand and tell your with his
but the gods.

AUFIDIU stand they
And still in this sure
The conselly,
If they are not
By the people.

SICINIUS: then, ins