In [4]:
import numpy as np
import torch
torch.manual_seed(1) # set random seed

<torch._C.Generator at 0x29c38980af0>

In [7]:
# import text data
with open('datasets/frankenstein.txt', 'r', encoding='utf-8') as file:
    frankenstein = file.read()

In [9]:
# getting a subset of text so the system wont fail
first_letter_text = frankenstein[1380:8230]

print(first_letter_text)

Letter 1

_To Mrs. Saville, England._


St. Petersburgh, Dec. 11th, 17—.


You will rejoice to hear that no disaster has accompanied the
commencement of an enterprise which you have regarded with such evil
forebodings. I arrived here yesterday, and my first task is to assure
my dear sister of my welfare and increasing confidence in the success
of my undertaking.

I am already far north of London, and as I walk in the streets of
Petersburgh, I feel a cold northern breeze play upon my cheeks, which
braces my nerves and fills me with delight. Do you understand this
feeling? This breeze, which has travelled from the regions towards
which I am advancing, gives me a foretaste of those icy climes.
Inspirited by this wind of promise, my daydreams become more fervent
and vivid. I try in vain to be persuaded that the pole is the seat of
frost and desolation; it ever presents itself to my imagination as the
region of beauty and delight. There, Margaret, the sun is for ever
visible, its broad disk

In [11]:
# tokenize text
tokenized_text = list(first_letter_text)
print(len(tokenized_text))

6850


In [13]:
# mapping unique tokens
unique_char_tokens = sorted(list(set(tokenized_text)))

In [21]:
# create a dictionary mapping tokens to unique characters
c2ix = {char : idx for idx, char in enumerate(unique_char_tokens)}
print(c2ix)

{'\n': 0, ' ': 1, '!': 2, ',': 3, '-': 4, '.': 5, '1': 6, '7': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'B': 12, 'D': 13, 'E': 14, 'F': 15, 'G': 16, 'H': 17, 'I': 18, 'J': 19, 'L': 20, 'M': 21, 'N': 22, 'O': 23, 'P': 24, 'R': 25, 'S': 26, 'T': 27, 'U': 28, 'W': 29, 'Y': 30, '_': 31, 'a': 32, 'b': 33, 'c': 34, 'd': 35, 'e': 36, 'f': 37, 'g': 38, 'h': 39, 'i': 40, 'j': 41, 'k': 42, 'l': 43, 'm': 44, 'n': 45, 'o': 46, 'p': 47, 'q': 48, 'r': 49, 's': 50, 't': 51, 'u': 52, 'v': 53, 'w': 54, 'x': 55, 'y': 56, 'z': 57, '—': 58, '’': 59}


In [27]:
# reverse c2ix to be able to match back to characters
ix2c = {idx: char for char, idx in c2ix.items()}

In [31]:
# turn our text sample into numerical map
tokenized_id_text = [c2ix[char] for char in tokenized_text]

In [33]:
from torch.utils.data import Dataset, DataLoader

In [57]:
class TextDataset(Dataset):
    def __init__(self, tokenized_text, seq_length):
        self.tokenized_text = tokenized_text
        self.seq_length = seq_length

    def __len__(self):
        return len(self.tokenized_text) - self.seq_length

    def __getitem__(self, idx):
        features = torch.tensor(self.tokenized_text[idx:idx+self.seq_length])
        labels = torch.tensor(self.tokenized_text[idx+1:idx+self.seq_length+1])
        return features, labels

In [60]:
# call textdataset class from previously made to make a dataset of specific size
dataset = TextDataset(tokenized_id_text, 48)

In [63]:
batch_size = 36

dataloader = DataLoader(dataset, batch_size, shuffle=True)

In [67]:
# begin training the neural network
import torch.nn as nn

In [70]:
class CharacterLSTM(nn.Module):
    def __init__(self):
        super(CharacterLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=48)
        self.lstm = nn.LSTM(input_size=48, hidden_size=96, batch_first=True)
        self.linear = nn.Linear(96, vocab_size)

    def forward(self, x, states):
        x = self.embedding(x)
        output, states = self.lstm(x, states)
        output = self.linear(output)
        output = output.reshape(-1, output.size(2))
        return output, states

    def init_state(self, batch_size):
        hidden = torch.zeros(1, batch_size, 96)
        cell = torch.zeros(1, batch_size, 96)
        return hidden, cell

In [73]:
#create instance of CharacterLSTM() class
lstm_model = CharacterLSTM()

In [76]:
# create loss function
loss = nn.CrossEntropyLoss()

In [79]:
#setup optimizer
import torch.optim as optim

optimizer = optim.Adam(lstm_model.parameters(), lr=0.015)

In [82]:
# train model through loop and print loss
num_epochs = 5

for epoch in range(num_epochs):
    for features, labels in dataloader:
        optimizer.zero_grad()
        states = lstm_model.init_state(features.size(0))
        output, states = lstm_model(features, states)
        CEloss = loss(output, labels.view(-1,))
        CEloss.backward()
        optimizer.step()

    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], CELoss: {CEloss.item():.4f}')

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/5], CELoss: 1.0969
Epoch [2/5], CELoss: 0.7321
Epoch [3/5], CELoss: 0.5033
Epoch [4/5], CELoss: 0.4037
Epoch [5/5], CELoss: 0.4265


In [84]:
# begin generating text starting with fixed prompt
starting_prompt = "You will rejoice to hear"

In [86]:
# tokenize starting prompt
tokenized_id_prompt = torch.tensor([[c2ix[char] for char in starting_prompt]])

print(tokenized_id_prompt)

tensor([[30, 46, 52,  1, 54, 40, 43, 43,  1, 49, 36, 41, 46, 40, 34, 36,  1, 51,
         46,  1, 39, 36, 32, 49]])


In [88]:
lstm_model.eval()

CharacterLSTM(
  (embedding): Embedding(60, 48)
  (lstm): LSTM(48, 96, batch_first=True)
  (linear): Linear(in_features=96, out_features=60, bias=True)
)

In [92]:
# let the trained model generate text off of prompt.
num_generated_chars = 500

with torch.no_grad():
    states = lstm_model.init_state(1)
    for _ in range(num_generated_chars):
        output, states = lstm_model(tokenized_id_prompt, states)
        predicted_id = torch.argmax(output[-1, :], dim=-1).item()
        predicted_char = ix2c[predicted_id]
        starting_prompt += predicted_char
        tokenized_id_prompt = torch.tensor([[predicted_id]])

print(starting_prompt)

You will rejoice to hear that a
history of medicine, and whaler, and when theirs are failing.

This is the most favourable period for travelling in Russia. They fly
quickly over the secret of the magnet, which, if at
all possible, can only be effected by an undertaking such as mine.

These reflections and features may be withe months are
require only this
voyage to render the secret, by ease monts on several expedition has been the favourite dream of my earlier bent.

Six years have passed since I resolved on my presen my countrier of my eldicine, and my
intention is
firm; but my hopes fluctuate, and my first task is to assure
my dear sister, I will put
some trust in preceding navigators—there snow in their sledges; the motion
with which I shall confer on all
mankind, to the last generation, by an undertaking such as mine.

These reflections and features may be withe months are
require only this
voyage to render the secret, by ease monts on several expedition has been the favourite dre