In [15]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [16]:
# Fill in the code below to make Beam Search work.
FILL_IN = "FILL IN"

### Get the data and process
- This is the Mysterious island found in Project Gutenberg.

In [17]:
## Reading and processing text
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()
    
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

text = text[start_indx:end_indx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))
assert(len(text) == 1130711)
assert(len(char_set) == 85)

Total Length: 1130711
Unique Characters: 85


### Tokenze and get other helpers
- We do this manually since everything is character based.

In [18]:
# The universe of words.
chars_sorted = sorted(char_set)

# Effectively, these maps are the tokenizer.
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
int2char = np.array(chars_sorted)

# Tokenize the entire corpus.
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


### Load the model

In [19]:
device = torch.device("cuda:0")

In [20]:
!du -h hw7_model.pt

'du' is not recognized as an internal or external command,
operable program or batch file.


In [21]:
# Lost a traced version of this model.
# Note this is the same as the HW 7 model but a little different
# The HW7 model had an if-else in its forward method, and this is not allowed.
# The forward method of this model takes hidden and cell, which could be all zeros but the user has to specify.
model = torch.jit.load('hw7_model.pt')

In [22]:
model

RecursiveScriptModule(
  original_name=RNN
  (embedding): RecursiveScriptModule(original_name=Embedding)
  (rnn): RecursiveScriptModule(original_name=LSTM)
  (fc): RecursiveScriptModule(original_name=Linear)
)

In [23]:
# 'jit' does not save other methods on a model, we need to define this helper and use it below.
def init_hidden(model, batch_size):
    return (
        torch.zeros(1, batch_size, model.rnn_hidden_size),
        torch.zeros(1, batch_size, model.rnn_hidden_size)
    )

### Beam search algorithm.
- Good article: https://towardsdatascience.com/foundations-of-nlp-explained-visually-beam-search-how-it-works-1586b9849a24

In [24]:
encoded_input = torch.tensor([char2int[char] for char in 'the island'])
encoded_input = encoded_input.view(1,-1)
encoded_input[:,1].item()

62

In [25]:
test_node = torch.rand((1,2,1)).squeeze()
print(test_node)
test_node = torch.nn.Softmax(dim=0)(test_node)
print(test_node)
log_test_node = torch.log(test_node)
print(log_test_node)

tensor([0.7576, 0.2793])
tensor([0.6174, 0.3826])
tensor([-0.4823, -0.9606])


In [26]:
def beam_search_decoding(
    model,
    starting_str, 
    len_generated_text=500, 
    beams=5,
    print_paths=True
):
    assert(len(starting_str) != 0)

    # Get the encoding of the starting_str as a tensor of ints.
    encoded_input = torch.tensor([char2int[char] for char in starting_str]).to(device)
    
    # Reshape the above to be of appropiate dimension.
    encoded_input = encoded_input.view(1,-1)

    # Put the model in eval mode.
    model.eval()
    
    # Unfortunately, jit save does not save methods other than forward.
    # Use init_hidden to get the first hidden and cell states.
    hidden, cell = init_hidden(model, 1)
    
    hidden = hidden.to(device)
    
    cell = cell.to(device)
    
    generated_log_prob = 0
    generated_str = starting_str[0]
        
    # Build up the starting hidden and cell states.
    # You can do this all in one go?
    for i in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state.
        # Get the character at index i and push it through the model.
        # Get the logits, the hidden and cell states, which would will need later.
        out = encoded_input[:,i].unsqueeze(0)
        logits, (hidden, cell) = model(out, hidden, cell)
        logits = logits.squeeze()
        # print('logits dimension: ', logits.shape)

        # Get the probability of the generated character.
        # For input of index i, we want the probability that the model generated i+1.
        # We push the startiing_str[i] into the model, and append starting_str[i+1] to generated_str.
        generated_str += starting_str[i+1]
        
        # Get the probabilities of the different characters that the model this. 
        # You need to apply Softmax to the logits.
        probs = torch.nn.Softmax(dim=0)(logits)
        
        # Add the log probability of the appened char (int) to the running generated log probability.
        # print(i)
        # print('encoded_input dimension: ', encoded_input.shape)
        # print('probs dimension', probs.shape)
        generated_log_prob += torch.log(probs[encoded_input[:,i+1].item()])
    
    # Get the last character in the encoded input.
    last_char_int = encoded_input[:,-1].unsqueeze(0)
    
    # Push this through the model.
    logits, (hidden, cell) = model(last_char_int, hidden, cell)
    logits = logits.squeeze()

    # As before, get the probaility per character.
    probs = torch.nn.Softmax(dim=0)(logits)
    
    new_beams = []
    
    for j, prob in enumerate(probs):
        # For each probability, append the tuple (hidden, cell, the generatd str with the jth index char, generated str's the log probablity)
        # Note this is the running generated str and generated log probability.
        new_beams.append(
            (hidden, 
             cell, 
             generated_str + int2char[j], 
             generated_log_prob + torch.log(prob))
        )

    # Sort the beams from most proable to least. Use -log(generated_prob).
    print(new_beams[0][-1])
    new_beams = sorted(new_beams, key=lambda x: x[-1], reverse=True)
    
    beam_to_beam_data = {}
    
    # Add the top "beams" = 5 beams to the hash map.
    # We should have a map going {beam_id -> (hidden, cell, generated_str, geenrated_log_prob)}
    for beam in range(beams):
        beam_to_beam_data[beam] = new_beams[beam]
    
    print('The number of beams is', len(beam_to_beam_data))
    
    # For each index of generated text.
    for i in range(len_generated_text):
        # Define new beams.
        new_beams = []
        
        # For each beam.
        for beam in range(beams):
            
            # Grab the 4 elements associated with this beam from beam_to_beam_data.
            hidden, cell, generated_str, generated_log_prob = beam_to_beam_data[beam]

            # Get the last char in the str that's in the beam.
            last_char_int = torch.tensor([char2int[generated_str[-1]]]).unsqueeze(0).to(device)
            # Push hidden, cell and the last_char_int through the model.
            logits, (hidden, cell) = model(last_char_int, hidden, cell)
            logits = logits.squeeze()
            # Get the probabilities.
            probs = torch.nn.Softmax(dim=0)(logits)
            
            # As before, append the 4 elements associated with this new beam to new beams.
            for j, prob in enumerate(probs):
                new_beams.append(
                    (hidden, 
                     cell, 
                     generated_str + int2char[j], 
                     generated_log_prob + torch.log(prob))
                )
        
        # Sort the beams from most probable to least. Use -log(p).
        new_beams = sorted(new_beams, key=lambda x: x[-1], reverse=True)
                
        # The number of beams considered should always satisfy this.
        # Except for the first iteration.
        assert(len(new_beams) == beams * len(char2int))
        
        # Leave this to true.
        if print_paths:
            print("The first 5 paths beam paths and the associated data for them: ")
            for beam in range(5):
                generated_str, generated_log_prob = new_beams[beam][2:]
                print("Text: \"{}\" Prob {:0.30f}".format(
                        generated_str, torch.exp(generated_log_prob)
                ))
            # _ = input("Insert anything to continue ...")
            print("\n")
                
        # Update the beams to be equal to the top beams.
        for beam in range(beams):
            beam_to_beam_data[beam] = new_beams[beam]
            
    generated_strs = []
    generated_log_probs = []
    
    # Grab the top beams, and return them.
    for beam in range(beams):
        (_, _, generated_str, generated_log_prob) = beam_to_beam_data[beam]
        generated_strs.append(generated_str)
        generated_log_probs.append(generated_log_prob)        
                
    return generated_strs, [torch.exp(_) for _ in generated_log_probs]

In [27]:
torch.manual_seed(1)
model.to(device)
beams=5
len_generated_text=500

generated_strs, generated_probs = beam_search_decoding(
    model,
    starting_str="The island",
    len_generated_text=len_generated_text,
    beams=beams
)

for beam in range(beams):
    print(f"Beam {beam} information: ")
    print(generated_strs[beam])
    print(generated_probs[beam])

tensor(-11.5041, device='cuda:0', grad_fn=<AddBackward0>)
The number of beams is 5
The first 5 paths beam paths and the associated data for them: 
Text: "The island, " Prob 0.000520036730449646711349487305
Text: "The island w" Prob 0.000227799449930898845195770264
Text: "The island?”" Prob 0.000097528783953748643398284912
Text: "The island o" Prob 0.000093910748546477407217025757
Text: "The island i" Prob 0.000063718689489178359508514404


The first 5 paths beam paths and the associated data for them: 
Text: "The island wa" Prob 0.000172444546478800475597381592
Text: "The island, w" Prob 0.000092942616902291774749755859
Text: "The island, a" Prob 0.000090473171439953148365020752
Text: "The island, t" Prob 0.000070329326263163238763809204
Text: "The island?”
" Prob 0.000062701932620257139205932617


The first 5 paths beam paths and the associated data for them: 
Text: "The island was" Prob 0.000168390935868956148624420166
Text: "The island, an" Prob 0.000071394621045328676700592041
Text

RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
  File "code/__torch__.py", line 17, in forward
    out = (embedding).forward(text, )
    rnn = self.rnn
    _0 = (rnn).forward__0(out, (hidden, cell), )
          ~~~~~~~~~~~~~~~ <--- HERE
    out0, _1, = _0
    hidden0, cell0, = _1
  File "code/__torch__/torch/nn/modules/rnn.py", line 76, in forward__0
    training = self.training
    _13, _14, = hx0
    _15, _16, _17 = torch.lstm(input0, [_13, _14], _flat_weights, True, 1, 0., training, False, True)
                    ~~~~~~~~~~ <--- HERE
    hidden = (_16, _17)
    if torch.__not__(is_batched):

Traceback of TorchScript, original code (most recent call last):
  File "/var/folders/x8/2_vxppc52znb82mg86nv4y000000gp/T/ipykernel_9790/3976977723.py", line 17, in forward
        out = self.embedding(text)
    
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
                              ~~~~~~~~ <--- HERE
        
        out = self.fc(out)
  File "/opt/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 774, in forward__0
        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
                     ~~~~~~~~ <--- HERE
                              self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 8.00 GiB total capacity; 5.48 GiB already allocated; 0 bytes free; 7.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
