In [1]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
FILL_IN = "FILL_IN"

### Get the data and process
- This is the Mysterious island found in Project Gutenberg.

In [3]:
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()

text.index('THE MYSTERIOUS ISLAND')


766

In [4]:
# ## Reading and processing text
# with open('1268-0.txt', 'r', encoding="utf8") as fp:
#     text=fp.read()

# # Get the index of 'THE MYSTERIOUS ISLAND'
# start_indx = text.index('THE MYSTERIOUS ISLAND')
# # Get the index of 'End of the Project Gutenberg'
# end_indx = text.index('END OF THE PROJECT GUTENBERG')

# # Set text to the text between start and end idx.
# text = text[start_indx:end_indx]
# # Get the unique set of characters.
# char_set = set(text)
# print('Total Length:', len(text))
# print('Unique Characters:', len(char_set))
# assert(len(text) == 1130711)
# assert(len(char_set) == 85)

In [5]:
## Reading and processing text
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()

# Get the index of 'THE MYSTERIOUS ISLAND'
start_indx = text.index('THE MYSTERIOUS ISLAND')
# Get the index of 'End of the Project Gutenberg'
end_indx = text.index('END OF THE PROJECT GUTENBERG')

# Set text to the text between start and end idx.
text = text[start_indx:end_indx]
# Get the unique set of characters.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))
assert(len(text) == 1112300)
assert(len(char_set) == 80)

Total Length: 1112300
Unique Characters: 80


### Tokenze and get other helpers
- We do this manually since everything is character based.

In [6]:
# The universe of words.
chars_sorted = sorted(char_set)

# Effectively, these maps are the tokenizer.
# Map each char to a unique int. This is a dict.
char2int = {c:i for i, c in enumerate(chars_sorted)}
# Do the revverse of the above, this should be a np array.
int2char = np.array(chars_sorted)

# Tokenize the entire corpus. This should be an np array of np.int32 type.
text_encoded = np.array([char2int[char] for char in text], dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1112300,)
THE MYSTERIOUS       == Encoding ==>  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]  == Reverse  ==>  ISLAND


#### Examples

In [7]:
print('Text encoded shape: ', text_encoded.shape)
print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1112300,)
THE MYSTERIOUS       == Encoding ==>  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]  == Reverse  ==>  ISLAND


In [8]:
# assert(
#     np.array_equal(
#     text_encoded[:15],
#         [48, 36, 33, 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1]
#     )
# )

In [9]:
assert(
    np.array_equal(
    text_encoded[:15],
        [44, 32, 29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1]
    )
)

### Process the data and get the data loader

In [10]:
seq_length = 40
chunk_size = seq_length + 1

# Break up the data into chunks of size 41. This should be a list of lists.
# Use text_encoded. This will be used to get (x, y) pairs.
text_chunks = [text_encoded[i:i+chunk_size]for i in range(0, len(text_encoded)-chunk_size)]

In [11]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        # Get the text chunk at index idx.
        text_chunk = text_chunks[idx]
        # Return (x, y) where x has length 40 and y has length 40.
        # y should be x shifted by 1 time.
        return torch.tensor(text_chunk[:-1]), torch.tensor(text_chunk[1:])
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [12]:
for i, (seq, target) in enumerate(seq_dataset):
    # 40 characters for source and target ...
    print(seq.shape, target.shape)
    print('Input (x):', repr(''.join(int2char[seq])))
    print('Target (y):', repr(''.join(int2char[target])))
    print()
    if i == 1:
        break 

torch.Size([40]) torch.Size([40])
Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

torch.Size([40]) torch.Size([40])
Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [13]:
device = torch.device("cuda:0")

In [14]:
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Write the models

In [19]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        # Set to an embedding layer of vocab_size by embed_dim.
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        # Set to an LSTM with x having embed_dim and h dimension rnn_hidden_size.
        # batch_first shoould be true.
        self.rnn = torch.nn.LSTM(input_size=embed_dim, 
                                 hidden_size=self.rnn_hidden_size, 
                                 batch_first=True)
        
        # Make a linear layer from rnn_hidden_size to vocab_size.
        # This will be used to get the yt for each xt.
        self.fc = torch.nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, text, hidden=None, cell=None):
        # Get the embeddings for text.
        out = self.embedding(text)
        
        # Pass out, hidden and cell through the rnn.
        # If hidden is None, don't specify it and just use out.
        if hidden is not None:
            out, (hidden, cell) = self.rnn(out,(hidden, cell))
        else:
            out, (hidden, cell) = self.rnn(out)
        
        # Pass out through fc.
        out = self.fc(out)
        
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        # Initialize to zeros of 1 by ??? appropriate dimensions.
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

### Do this right way - across all data all at once!

In [20]:
vocab_size = len(int2char)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [23]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Set to 10000.
num_epochs = 10000

torch.manual_seed(1)

# epochs here will mean batches.
# If the above takes too long, use 1000.

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    
    # Get the next batch from seq_dl
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch, target_batch = seq_batch.type(torch.LongTensor), target_batch.type(torch.LongTensor)
        
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    
    optimizer.zero_grad()
    
    loss = 0

    # Pass through the model.
    logits, _ = model(seq_batch, hidden, cell)
    
    # Get the loss.
    # You'll need to reshape / view things to make this work.
    loss += criterion(logits.view(-1,vocab_size), target_batch.view(-1))
        
    # Do back prop.
    loss.backward()
    optimizer.step()
    
    # Get the value in the tensor loss.
    loss = loss.item()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 3.5844
Epoch 100 loss: 1.6981
Epoch 200 loss: 1.5276
Epoch 300 loss: 1.4950
Epoch 400 loss: 1.3867
Epoch 500 loss: 1.4001
Epoch 600 loss: 1.3485
Epoch 700 loss: 1.2865
Epoch 800 loss: 1.3097
Epoch 900 loss: 1.3092
Epoch 1000 loss: 1.3148
Epoch 1100 loss: 1.2799
Epoch 1200 loss: 1.3213
Epoch 1300 loss: 1.2625
Epoch 1400 loss: 1.2436
Epoch 1500 loss: 1.2614
Epoch 1600 loss: 1.2811
Epoch 1700 loss: 1.2191
Epoch 1800 loss: 1.1974
Epoch 1900 loss: 1.2199
Epoch 2000 loss: 1.2419
Epoch 2100 loss: 1.2105
Epoch 2200 loss: 1.1819
Epoch 2300 loss: 1.1465
Epoch 2400 loss: 1.1910
Epoch 2500 loss: 1.1993
Epoch 2600 loss: 1.1632
Epoch 2700 loss: 1.1891
Epoch 2800 loss: 1.1852
Epoch 2900 loss: 1.1843
Epoch 3000 loss: 1.2206
Epoch 3100 loss: 1.2032
Epoch 3200 loss: 1.1711
Epoch 3300 loss: 1.1830
Epoch 3400 loss: 1.2539
Epoch 3500 loss: 1.1283
Epoch 3600 loss: 1.1700
Epoch 3700 loss: 1.1328
Epoch 3800 loss: 1.2670
Epoch 3900 loss: 1.1779
Epoch 4000 loss: 1.1741
Epoch 4100 loss: 1.1548
Epoc

In [53]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)

logits = torch.tensor([[-1.0, 1.0, 3.0]])

# Get the probabilities for these logits.
print('Probabilities:', torch.nn.Softmax(dim=1)(logits))

# Get a Categorical random variable with the above probabilities for each of the classes.
m = Categorical(probs=torch.nn.Softmax(dim=1)(logits))
# Generate 10 things.
samples = m.sample(torch.Size([10]))

print(samples.numpy())

Probabilities: tensor([[0.0159, 0.1173, 0.8668]])
[[1]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [52]:
logits = torch.tensor([[-1.0, 1.0, 3.0]])
m = Categorical(logits.numpy())


AttributeError: 'numpy.ndarray' object has no attribute 'dim'

### Random decoding.
- This compounds problems: once you make a mistake, you can't undo it.

In [34]:
encoded_input = torch.tensor([char2int[s] for s in 'The island'])
encoded_input.view(1,-1).shape
encoded_input

tensor([44, 57, 54,  1, 58, 68, 61, 50, 63, 53])

In [74]:
def random_sample(
    model,
    starting_str, 
    len_generated_text=500, 
):

    # Encode starting string into a tensor using char2str.
    encoded_input = torch.tensor([char2int[s] for s in starting_str]).to(device)
    
    # Reshape to be 1 by ??? - let PyTorch figure this out.
    encoded_input = encoded_input.view(1,-1)

    # This will be what you generate, but it starts off with something.
    generated_str = starting_str

    # Put model in eval mode. This matters if we had dropout o batch / layer norms.
    model.eval()
    
    hidden, cell = model.init_hidden(1)
    print(hidden.shape, cell.shape)
    
    hidden = hidden.to(device)
    
    cell = cell.to(device)
        
    # Build up the starting hidden and cell states.
    # You can do this all in one go?
    for c in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state.
        out = encoded_input[:,c].unsqueeze(0)
        # Pass out through, note we update hidden and cell and use them again
        _, (hidden, cell) = model(out, hidden, cell)
    
    # Gte the last char; note we did not do go to the last char above.
    last_char = encoded_input[:,-1]
    # Generate chars one at a time, add them to generated_str.
    # Do this over and over until you get the desired length.

    for i in range(len_generated_text):
        last_char = last_char.unsqueeze(0)
        # Use hidden and cell from the above.
        # Use last_char, which will be updated over and over.
        logits, (hidden, cell) = model(last_char, hidden, cell)
        # Get the logits.
        logits = logits.squeeze()
        
        # m is a random variable with probabilities based on the softmax of the logits.
        m = Categorical(probs=torch.nn.Softmax(dim=0)(logits))
        
        # Generate from m 1 char.
        last_char = m.sample().unsqueeze(0)
        
        # Add the geenrated char to generated_str, but pass it through int2str so that
        generated_str += int2char[last_char.item()]
        
    return generated_str

torch.manual_seed(1)
model.to(device)
print(random_sample(model, starting_str='The island'))

torch.Size([1, 1, 512]) torch.Size([1, 1, 512])
The island?”

“Forward!” exclaimed Harding.

Neb heard, saying to them. The road of their approaches, and in a stormy half an idea, but he had obscuceful
with and did not made to a hundred feet into a
human been quickly by pyrite which the convicts hixed more than the long, questions, he tried, and there wake their situations when all this master!”

“Or the dockyard known about their own banks, there was not moving it quitted,” added the reporter, “she at the lower mass of asked.

At a smoke, so devoted t
