### Imports

In [None]:
# https://www.geeksforgeeks.org/how-to-create-an-empty-tuple-in-python/

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out)
        return out
    
    

# input_size = 1
# hidden_size = 20
# output_size = 1
# model = SimpleRNN(input_size, hidden_size, output_size)

In [30]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from tqdm import tqdm

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f'Device = {device}')
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())

FILE_PATH = './shakes.txt'
SEQ_LENGTH = 25
BATCH_SIZE = 128

HIDDEN_SIZE = 256
HIDDEN_LAYERS = 1

NUM_EPOCHS = 1
LEARNING_RATE = 0.001

Device = mps
True
True


### Data pipeline

In [31]:
class DataPreprocessing:
    def __init__(self, file_path, seq_length, batch_size):
        self.file_path = file_path
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.data = None
        self.K = None
        self.char_to_ind = None
        self.ind_to_char = None
    
    def load_data(self):
        """Prepares all the data necessary to train an RNN
        """
        fid = open(self.file_path, "r")
        book_data = fid.read()
        fid.close()
        self.data = book_data
        unique_chars = list(set(book_data))
        K = len(unique_chars)
        self.K = K
        mapping_value = np.arange(K)
        char_to_ind = dict(zip(unique_chars, mapping_value))
        ind_to_char = dict(zip(mapping_value, unique_chars))
        self.char_to_ind = char_to_ind
        self.ind_to_char = ind_to_char

    def get_one_hot_encoding(self, X_chars):
        """Encodes text as a one hot array

        Args:
            char_to_ind (dict): the mapping
            X_chars (string): characters to encode

        Returns:
            np.ndarray: one-hot encoding
        """
        seq_length = len(X_chars)
        one_hot = np.zeros((self.K, seq_length))
        for i, char in enumerate(X_chars):
            ind = self.char_to_ind[char]
            one_hot[ind, i] = 1
        return one_hot

    def get_decoded_one_hot(self, Y):
        """Decodes one-hot array back to text

        Args:
            ind_to_char (dict): the mapping
            Y (np.ndarray): one-hot encoding

        Returns:
            string: the decoded text
        """
        text = ''
        for t in range(Y.shape[1]):
            char_max = np.argmax(Y[:, t])
            text += self.ind_to_char[char_max]
        return text

    def preprocess(self):
        """Prepares the data as a tuple of inputs and outputs 
        """
        encoded_data = self.get_one_hot_encoding(self.data)
        num_sequences = len(self.data) // self.seq_length # discarding the tail
        sequences = []
        t = 0 # pointer in text
        for seq in range(num_sequences):
            inputs = encoded_data[:, t: t+self.seq_length]
            outputs = encoded_data[:, t+1: t+self.seq_length+1]
            sequences.append((inputs, outputs))
            t += self.seq_length
        return sequences
 
DP = DataPreprocessing(FILE_PATH, SEQ_LENGTH, BATCH_SIZE)
DP.load_data()
sequences = DP.preprocess()
print(len(DP.data))
print(DP.K)
print(sequences[0][0].shape)
print(sequences[0][0])

1115394
65
(65, 25)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Network architecture

### Training loop

### Run pipeline

### Text synthesis and evaluation metrics

### Save and load model