In [24]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [1]:
# Sentinel used throughout this notebook to mark every spot that still has to be completed.
FILL_IN = "FILL_IN"

### Get the data and process it
- The text is *The Mysterious Island*, taken from Project Gutenberg.

In [27]:
## Read the raw file, then trim it to the span between the two markers below.
with open('data/1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()

# Index in `text` at which the marker 'THE MYSTERIOUS ISLAND' starts.
start_indx = FILL_IN
# Index in `text` at which the marker 'End of the Project Gutenberg' starts.
end_indx = FILL_IN

# Keep only the text between start_indx and end_indx.
text = FILL_IN
# The set of distinct characters occurring in the trimmed text.
char_set = FILL_IN
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))
# Sanity checks: a correct trim of this file yields exactly these counts.
assert(len(text) == 1130711)
assert(len(char_set) == 85)

Total Length: 1130711
Unique Characters: 85


### Tokenize and get other helpers
- We do this manually since everything is character based.

In [26]:
# The vocabulary: every distinct character, in sorted order.
chars_sorted = sorted(char_set)

# Effectively, these two maps are the tokenizer.
# Map each char to a unique int. This is a dict.
char2int = FILL_IN
# The reverse of the above (int -> char). This should be an np array, because it is
# indexed below with a whole array of ints at once.
int2char = FILL_IN

# Tokenize the entire corpus. This should be an np array of np.int32 type.
text_encoded = FILL_IN

print('Text encoded shape: ', text_encoded.shape)

# Round trip: encode the first 15 characters, then decode the next 6 ids.
print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


#### Examples

In [6]:
# Show the corpus size, then a round trip through the tokenizer:
# characters -> ids for the first 15 characters, ids -> characters for the next 6.
print('Text encoded shape: ', text_encoded.shape)
print(
    text[:15],
    '     == Encoding ==> ',
    text_encoded[:15],
)
print(
    text_encoded[15:21],
    ' == Reverse  ==> ',
    ''.join(int2char[idx] for idx in text_encoded[15:21]),
)

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


In [33]:
# The first 15 token ids must match the expected encoding of the opening 'THE MYSTERIOUS '.
assert np.array_equal(
    text_encoded[:15],
    [48, 36, 33, 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47, 1],
)

### Process the data and get the data loader

In [34]:
# Number of characters in each model input.
seq_length = 40
# One extra character per chunk, so that the target can be the input shifted by one.
chunk_size = seq_length + 1

# Break up the data into chunks of size chunk_size (41). This should be a list of lists.
# Use text_encoded. Each chunk is later split into an (x, y) pair.
# NOTE: the sample pairs printed further down start one character apart, i.e. consecutive
# chunks overlap (stride 1).
text_chunks = FILL_IN

In [35]:
class TextDataset(Dataset):
    """Dataset over fixed-size chunks of token ids; each item is an (x, y) pair."""

    def __init__(self, text_chunks):
        # Stored as given; __len__ and __getitem__ work directly on it.
        self.text_chunks = text_chunks

    def __len__(self):
        """Return the number of chunks."""
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        """Return the (x, y) pair built from the chunk at position ``idx``."""
        # Get the text chunk at index idx.
        text_chunk = FILL_IN
        # Return (x, y) where x has length 40 and y has length 40.
        # y should be x shifted by one time step: y[t] is the character that follows x[t].
        return FILL_IN
    
# The chunks are converted to one tensor up front, so the dataset is indexing a tensor.
seq_dataset = TextDataset(torch.tensor(text_chunks))

In [36]:
# Peek at the first two samples only: tensor shapes, then the decoded input and target.
for i, (seq, target) in zip(range(2), seq_dataset):
    # 40 characters for source and target ...
    print(seq.shape, target.shape)
    print('Input (x):', repr(''.join(int2char[seq])))
    print('Target (y):', repr(''.join(int2char[target])))
    print()

torch.Size([40]) torch.Size([40])
Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

torch.Size([40]) torch.Size([40])
Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [11]:
# Device that the model, the batches and the initial LSTM states are moved to (CPU here).
device = torch.device("cpu")

In [37]:
# Fix PyTorch's global seed, then build the loader that serves shuffled mini-batches.
torch.manual_seed(1)
batch_size = 64
seq_dl = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,    # visit the chunks in random order
    drop_last=True,  # drop the final partial batch so every batch has batch_size rows
)

### Write the models

In [38]:
class RNN(nn.Module):
    """Character-level model: embedding -> LSTM -> linear layer over the vocabulary."""

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        """Create the three layers.

        Args:
            vocab_size: number of distinct tokens (rows of the embedding, size of the output).
            embed_dim: dimension of each token embedding (the LSTM input size).
            rnn_hidden_size: dimension of the LSTM hidden and cell states.
        """
        super().__init__()
        # Set to an embedding layer of vocab_size by embed_dim.
        self.embedding = FILL_IN
        self.rnn_hidden_size = rnn_hidden_size
        # Set to an LSTM whose input x has dimension embed_dim and whose hidden state h
        # has dimension rnn_hidden_size.
        # batch_first should be True.
        self.rnn = FILL_IN
        
        # Make a linear layer from rnn_hidden_size to vocab_size.
        # This will be used to get the yt for each xt.
        self.fc = FILL_IN

    def forward(self, text, hidden=None, cell=None):
        """Embed ``text``, run it through the LSTM and map every step to vocabulary scores.

        Args:
            text: tensor of token ids (assumed batch-first, since the LSTM is batch_first).
            hidden: optional initial hidden state; when None the LSTM uses its default.
            cell: optional initial cell state, used together with ``hidden``.

        Returns:
            ``(out, (hidden, cell))``: the output of ``fc`` and the final LSTM states.
        """
        # Get the embeddings for text.
        out = FILL_IN
        
        # Pass out, hidden and cell through the rnn.
        # If hidden is None, don't specify it and just use out.
        # Either branch must leave out, hidden and cell assigned for the code below.
        if hidden is not None:
            FILL_IN
        else:
            out, (hidden, cell) = FILL_IN
        
        # Pass out through fc.
        out = FILL_IN
        
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(hidden, cell)`` states, already moved to ``device``."""
        # Initialize to zeros of 1 by ??? appropriate dimensions.
        hidden = FILL_IN
        cell = FILL_IN
        return hidden.to(device), cell.to(device)

### Do this the right way - across all the data at once!

In [39]:
# Model hyperparameters; the output layer has one class per entry of int2char.
embed_dim, rnn_hidden_size = 256, 512
vocab_size = len(int2char)

# Seed before construction, then build the model and move it to the target device.
torch.manual_seed(1)
model = RNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_hidden_size=rnn_hidden_size,
).to(device)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [2]:
# Loss over the predicted character classes, and the optimizer for all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Number of training iterations. Set to 10000 for the real run;
# if that takes too long, use 1000.
num_epochs = 1

torch.manual_seed(1)

# "Epochs" here really means batches: each iteration trains on one batch.

for epoch in range(num_epochs):
    # NOTE(review): hidden and cell are not used again in the visible code; they only
    # matter if the fill-in below passes them to the model.
    hidden, cell = model.init_hidden(batch_size)
    
    # Get the next batch from seq_dl
    seq_batch, target_batch = FILL_IN
        
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()
    
    loss = 0

    # Pass through the model.
    logits, _ = FILL_IN
    
    # Get the loss.
    # You'll need to reshape / view things to make this work
    # (logits and targets must line up the way criterion expects).
    loss += FILL_IN
        
    # Do back prop.
    # NOTE(review): optimizer.step() does not appear anywhere in this loop, so this
    # fill-in presumably has to apply the update as well - confirm.
    FILL_IN
    
    # Get the value in the tensor loss (a plain number, so it can be formatted below).
    loss = FILL_IN
    
    # Report progress every 100 iterations.
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

NameError: name 'nn' is not defined

In [43]:
from torch.distributions.categorical import Categorical

# Fix the seed so that the samples drawn below are repeatable.
torch.manual_seed(1)

# Unnormalised scores for three classes (one row).
logits = torch.tensor([[-1.0, 1.0, 3.0]])

# Get the probabilities for these logits (their softmax).
print('Probabilities:', FILL_IN)

# Get a Categorical random variable with the above probabilities for each of the classes.
m = FILL_IN
# Draw 10 samples from m; each sample is a class index.
samples = FILL_IN
 
print(samples.numpy())

Probabilities: [[0.01587624 0.11731043 0.86681336]]
[[1]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


### Random decoding.
- This compounds problems: once you make a mistake, you can't undo it.

In [44]:
def random_sample(
    model,
    starting_str, 
    len_generated_text=500, 
):
    """Generate text by sampling one character at a time from the model.

    Args:
        model: the model to sample from; it must provide ``init_hidden``.
        starting_str: seed text; the returned string begins with it.
        len_generated_text: number of characters generated after the seed.

    Returns:
        ``starting_str`` followed by the generated characters.
    """

    # Encode starting string into a tensor using char2int.
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    
    # Reshape to be 1 by ??? - let PyTorch figure this out.
    encoded_input = FILL_IN

    # This will be what you generate, but it starts off with something.
    generated_str = starting_str

    # Put model in eval mode. This matters if we had dropout or batch / layer norms.
    FILL_IN
    
    # Fresh states for a batch of one sequence.
    hidden, cell = model.init_hidden(1)
    
    # NOTE(review): when model is the RNN class from this notebook, init_hidden already
    # returns tensors on `device`, so the two moves below are redundant but harmless.
    hidden = hidden.to(device)
    
    cell = cell.to(device)
        
    # Build up the starting hidden and cell states.
    # You can do this all in one go?
    for c in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state.
        out = FILL_IN
        # Pass out through, note we update hidden and cell and use them again
        _, (hidden, cell) = FILL_IN
    
    # Get the last char; note that the loop above stopped before the last char.
    last_char = FILL_IN
    # Generate chars one at a time, add them to generated_str.
    # Do this over and over until you get the desired length.
    for i in range(len_generated_text):
        
        # Use hidden and cell from the above.
        # Use last_char, which will be updated over and over.
        logits, (hidden, cell) = FILL_IN
        
        # Get the logits.
        logits = FILL_IN
        
        # m is a random variable with probabilities based on the softmax of the logits.
        m = FILL_IN
        
        # Generate from m 1 char.
        last_char = FILL_IN
        
        # Add the generated char to generated_str, but pass it through int2char first so
        # that a character (not an int) is appended.
        generated_str += FILL_IN
        
    return generated_str

# Seed so that the sampled text is repeatable, and make sure the model is on `device`.
torch.manual_seed(1)
model.to(device)
# Call random_sample, the sampler defined directly above; the name greedy_sample
# has no definition in this cell, so calling it fails with a NameError.
print(random_sample(model, starting_str='The island'))

The island endeavored that ever’thel words, was no feithful perpossing was not care no hould. They cyling at the possing, was under up see they passed his complete the neighbeer
task of their pale of
eight, was a trees, then, and there we doubt which these prevent obliquely to poid all the air. The woods was quighter, pried any over his
fairs, but the dost then was agree his companions had
just. Neb, energy-devening, “part foon might, someat strepain
eir answer.
The colony was watercome to the settlers of
