In [0]:
'''
Create ground truth with all states pre-calculated
'''

In [0]:
# Sliding-window settings used when cutting the corpus into character sequences.
char_seq_len = 10  # number of characters in each window
delta_step = 1     # offset (in characters) between the starts of consecutive windows

In [0]:
# Imports used throughout the notebook
import os

import numpy as np
import torch
import torch.nn.functional as F
from google.colab import drive
from torch import nn

# Mount drive
# Mount Google Drive; the data paths configured in the next cell are rooted under this mount point.
drive.mount('/content/drive/') 

In [0]:
# Configs: every input and output location is derived from a single root on Drive.
data_folder = "/content/drive/My Drive/TensorSMC/check_char_lstm/data/"

# Training corpus (plain text).
train_dataset = f"{data_folder}training_dataset/nietzsche.txt"
# Directory holding the trained model checkpoints.
folder_models = f"{data_folder}trained_models/"
# Directory where the generated ground truth is written.
output_ground_truth = f"{data_folder}ground_truth/"

In [0]:
class CharRNN(nn.Module):
    ''' Character-level language model: stacked LSTM -> dropout -> linear layer.

        Arguments
        ---------
        tokens: ordered collection of the characters in the vocabulary; the
                position of a character is its integer code.
        n_hidden: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability, used both between LSTM layers and on
                   the LSTM output.
        lr: learning rate; only stored on the instance, not used by this class.
    '''

    def __init__(self, tokens, n_hidden=612, n_layers=4,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # character <-> integer lookup tables; codes follow the order of `tokens`
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # one-hot characters in, n_hidden features out; batch is the first axis
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        # extra dropout applied to the LSTM output
        self.dropout = nn.Dropout(drop_prob)

        # maps each hidden vector to one score per character
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        ''' Forward pass through the network.

            x is a (batch, seq_len, len(chars)) tensor and `hidden` is the
            (hidden state, cell state) pair of the LSTM.
            Returns the character scores with shape
            (batch * seq_len, len(chars)) and the new hidden/cell state pair.
        '''
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)

        # Flatten batch and time so the linear layer sees one row per step;
        # contiguous() is required before view() on the LSTM output.
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        ''' Returns a zeroed (hidden state, cell state) pair.

            Both tensors have shape n_layers x batch_size x n_hidden and are
            created with the dtype and device of the model parameters, so the
            state always matches wherever the model currently lives instead
            of depending on a global GPU flag.
        '''
        weight = next(self.parameters()).data
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [0]:
def predict(net, char, h=None, top_k=None):
    ''' Given a character, predict the most likely next character.

        Arguments
        ---------
        net: the character model; must expose `chars`, `char2int`,
             `int2char`, `init_hidden` and be callable as `net(inputs, h)`.
        char: the current character (must be a key of `net.char2int`).
        h: (hidden state, cell state) pair from the previous step, or None
           to start from a fresh zero state.
        top_k: unused; kept so existing calls keep working.

        Returns the predicted character, the new hidden state and the
        probability assigned to the predicted character.

        NOTE: the model's mode is not changed here. Call `net.eval()` first
        if the model contains dropout, otherwise repeated calls can return
        different results.
    '''
    # Build the (1, 1, n_chars) one-hot input directly on the model's own
    # device/dtype, so no global GPU flag or external encoding helper is needed.
    weight = next(net.parameters())
    inputs = torch.zeros(1, 1, len(net.chars),
                         dtype=weight.dtype, device=weight.device)
    inputs[0, 0, net.char2int[char]] = 1.0

    # start from a zero state when the caller did not supply one
    if h is None:
        h = net.init_hidden(1)

    # detach hidden state from history
    h = tuple(each.data for each in h)

    with torch.no_grad():
        out, h = net(inputs, h)
        # softmax turns the scores into next-character probabilities
        p = F.softmax(out, dim=1)

    # reshape(-1) instead of squeeze() so a one-character vocabulary still
    # yields a 1-D array
    prob = p.cpu().numpy().reshape(-1)
    char_index = int(np.argmax(prob))

    return net.int2char[char_index], h, prob[char_index]

In [0]:
# Read the training corpus and derive every character sequence (and every
# prefix of it) for which ground truth has to be computed.
import re

# `with` closes the file; the encoding is explicit so the result does not
# depend on the runtime's locale.
with open(train_dataset, 'r', encoding='utf-8') as f:
    content = f.read()

# Normalise the text: drop punctuation, turn newlines into spaces, lower-case.
content = re.sub(r'[^\w\s]', '', content)
content = re.sub('\n', ' ', content)
content = content.lower()

content_size = len(content)
tokens_max_cnt = int(content_size/delta_step)
print('content_size = {} tokens_max_cnt = {}'.format(content_size, tokens_max_cnt))

# Slide a window of char_seq_len characters over the text, moving delta_step
# characters at a time. Windows that start near the end of the text are
# shorter than char_seq_len because slicing stops at the end of the string.
tokens_all = set()
for i in range(tokens_max_cnt):
  start = int(i*delta_step)
  tokens_all.add(content[start:start + char_seq_len])

# Sorted so the printed sample and the list order are reproducible between runs.
tokens = sorted(tokens_all)
print('Example of tokens {}'.format(tokens[0:3]))

# Incremental tokens: every prefix of every window, from the empty string up
# to the full window.
tokens_incr = sorted({token[:i] for token in tokens for i in range(len(token) + 1)})
len(tokens_incr)

In [0]:
# Load models

Creating Ground Truth Data Structure

In [0]:
!pip install marisa-trie


Collecting marisa-trie
[?25l  Downloading https://files.pythonhosted.org/packages/20/95/d23071d0992dabcb61c948fb118a90683193befc88c23e745b050a29e7db/marisa-trie-0.7.5.tar.gz (270kB)
[K     |█▏                              | 10kB 13.3MB/s eta 0:00:01[K     |██▍                             | 20kB 3.2MB/s eta 0:00:01[K     |███▋                            | 30kB 4.6MB/s eta 0:00:01[K     |████▉                           | 40kB 3.0MB/s eta 0:00:01[K     |██████                          | 51kB 3.7MB/s eta 0:00:01[K     |███████▎                        | 61kB 4.3MB/s eta 0:00:01[K     |████████▌                       | 71kB 5.0MB/s eta 0:00:01[K     |█████████▊                      | 81kB 5.6MB/s eta 0:00:01[K     |███████████                     | 92kB 6.2MB/s eta 0:00:01[K     |████████████                    | 102kB 4.9MB/s eta 0:00:01[K     |█████████████▎                  | 112kB 4.9MB/s eta 0:00:01[K     |██████████████▌                 | 122kB 4.9MB/s eta 0:00

In [0]:
import marisa_trie
# https://stackoverflow.com/questions/11015320/how-to-create-a-trie-in-python

In [0]:
# Build a trie whose keys are all the incremental tokens (every prefix of every window).
trie = marisa_trie.Trie(tokens_incr)

In [0]:
# Sanity check: list the keys stored in the trie that are prefixes of 'see mo'.
trie.prefixes('see mo')

['', 's', 'se', 'see', 'see ', 'see m', 'see mo']

In [0]:
# Exploration aid: names of all callable attributes exposed by the trie object.
object_methods = list(filter(lambda attr: callable(getattr(trie, attr)), dir(trie)))
object_methods

['__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_build',
 '_config_flags',
 'frombytes',
 'get',
 'has_keys_with_prefix',
 'items',
 'iter_prefixes',
 'iteritems',
 'iterkeys',
 'key_id',
 'keys',
 'load',
 'mmap',
 'prefixes',
 'read',
 'restore_key',
 'save',
 'tobytes',
 'write']

In [0]:
# 1. get list of models from a directory
# Every regular file found directly inside `folder_models` is treated as a
# checkpoint; sorting makes the processing order reproducible.
model_list = sorted(
    os.path.join(folder_models, name)
    for name in os.listdir(folder_models)
    if os.path.isfile(os.path.join(folder_models, name))
)

In [0]:
for model_path in model_list:
  # NOTE: torch.load unpickles the file, so only load checkpoints that come
  # from a trusted source.
  with open(model_path, 'rb') as f:
    # map_location='cpu' lets checkpoints saved on a GPU load on a CPU-only
    # runtime; the weights are copied into a CPU-constructed model below anyway.
    checkpoint = torch.load(f, map_location='cpu')

  loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
  loaded.load_state_dict(checkpoint['state_dict'])
  # Inference mode: disables dropout so the predictions used as ground truth
  # are deterministic.
  loaded.eval()

  # 2. predict Y and H for all elements of marisa-trie
  # 3. save
  
  