In [74]:
'''
Create ground truth with all states pre-calculated
'''

import pandas as pd
from torch import nn
import torch.nn.functional as F
import marisa_trie
# https://stackoverflow.com/questions/11015320/how-to-create-a-trie-in-python
import glob
import torch
import time
import numpy as np
import pickle
import os 

import h5py

# ground truth parameters
char_seq_len=10
delta_step=1
# Configs
data_folder = "data/"
train_dataset = data_folder + "training_dataset/nietzsche.txt"
folder_models = data_folder + "trained_models/"
output_ground_truth = data_folder + "ground_truth/"


class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=612, n_layers=4,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## TODO: define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        ## TODO: Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
      
      
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot
  

def predict(net, char, h=None, top_k=None, return_vectors=True):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        # print("h={}".format(h))
        # print("len h {}".format(len(h)))
        h = tuple([each.data for each in h])
        # get the output of the model
        
        # print("Is CUDA: {}".format(inputs.is_cuda))
        # inputs.is_cuda

        
        out, h = net(inputs, h)

        # get the character probabilities
        # apply softmax to get p probabilities for the likely next character giving x
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        
        # keep probability to use later
        prob = p.numpy().squeeze()
        
        if not return_vectors:        
          char_index = np.argmax(prob)
          char_pred = net.int2char[char_index]
          prob_pred = prob[char_index]
          return char_pred, h, prob_pred 
        else: 
          return h, prob       

In [75]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
device

device(type='cuda', index=0)

In [76]:
print("********** Training data processing - Started. ***************")
f = open(train_dataset, 'r')
content = f.read()
import re
s = "string. With. Punctuation?"
s = re.sub(r'[^\w\s]','',content)
s = re.sub('\n', ' ', s)
s = str.lower(s)
content = s

content_size = len(content)
tokens_max_cnt = int(content_size/delta_step)
print('content_size = {} tokens_max_cnt = {}'.format(content_size, tokens_max_cnt))

tokens_all = set()
for i in range(0, tokens_max_cnt):  
  start = int(i*delta_step)
  end = int(i*delta_step + char_seq_len)
  # print('start = {}, end = {}'.format(start, end))  
  seq = content[start:end]
  # print('seq = {}'.format(seq))
  tokens_all.add(seq)
  
tokens = list(tokens_all)
print('Example of tokens {}'.format(tokens[0:3]))

tokens_incr = set() # incremental tokens

for token in tokens:
  for i in range(0, len(token)+1):
    token_i = token[0: i]
    #print(token_i)
    tokens_incr.add(token_i)
tokens_incr = list(tokens_incr)
tokens_incr.remove('')
print("tokens_incr = {}".format(len(tokens_incr)))
print('Example of tokens incr {}'.format(tokens_incr[0:30]))
tokens_len = [ len(x) for x in tokens_incr]
print("Min token len: {}".format(np.min(tokens_len)))
print("Max token len: {}".format(np.max(tokens_len)))
print("Avg token len: {}".format(np.mean(tokens_len)))
print("********** Training data processing - Completed. ***************")


print("********** Construct Ground Truth Data Structure=Prefix tree - Started. ***************")
trie = marisa_trie.Trie(tokens_incr)
print(trie.prefixes('see mo'))
# object_methods = [method_name for method_name in dir(trie)
#                   if callable(getattr(trie, method_name))]
# object_methods
print("********** Construct Ground Truth Data Structure=Prefix tree - Completed. ***************")

# 1. get list of models from a directory

model_fname_list = glob.glob(folder_models+"/*")
print("model_fname_list: {}".format(model_fname_list))



********** Training data processing - Started. ***************
content_size = 580741 tokens_max_cnt = 580741
Example of tokens ['use the la', 'od grounds', ' sinful in']
tokens_incr = 1418404
Example of tokens incr [' one sta', 'inal enoug', 'oohuman f', ' asses', 'esses his ', 'and goes', 'the tedi', 'him  114 t', 'ianism by ', 'efic', 'as life', ' victim', 'f denial', 'nes acts ', 'bted first', 'ng disposi', 'lish eq', 'ly losing', 'to so emph', 'emocrats', 'eoise ju', 'he spee', ' its def', 'ison and a', 'ctions pr', 'er v the n', 'ape obedie', 'w of hu', 'angled to', 'ropical l']
Min token len: 1
Max token len: 10
Avg token len: 8.26963051429635
********** Training data processing - Completed. ***************
********** Construct Ground Truth Data Structure=Prefix tree - Started. ***************
['s', 'se', 'see', 'see ', 'see m', 'see mo']
********** Construct Ground Truth Data Structure=Prefix tree - Completed. ***************
model_fname_list: ['data/trained_models\\model_1024_1

In [78]:
model_fname = 'data/trained_models/model_124_1_80_50'
model_base_name = os.path.basename(model_fname)


print("********** model_fname = {} *********".format(model_fname))
with open(model_fname, 'rb') as f:
      checkpoint = torch.load(f)
model = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
model.load_state_dict(checkpoint['state_dict'])
print("Model info: {}".format(model))

  # Add for-loop for elements of tokens_incr
  
  # 2. predict Y and H for all elements of marisa-trie
  # 3. save as trie? or dict?!!!!


********** model_fname = data/trained_models/model_124_1_80_50 *********
Model info: CharRNN(
  (lstm): LSTM(44, 124, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=124, out_features=44, bias=True)
)


In [79]:
# Ground Truth
class GroundTruth :
  
  def __init__(self, _trie, _ymap, _hmap):
    self.trie = _trie
    self.ymap = _ymap # mapping of trie node ID to y variable
    self.hmap = _hmap # mapping of trie node ID to hidden state variable

In [80]:
next(model.parameters()).is_cuda

False

In [81]:
model = model.to(device)
next(model.parameters()).is_cuda

True

In [82]:
ymap = {}
hmap = {}
train_on_gpu = True

def predict_seq(_chars, _model):
  h = model.init_hidden(1)
  for ch in _chars:
    h, prob = predict(model, ch, h)
    # print("ch = {}, char = {}, prob = {}".format(ch, char, prob))    
  return h, prob      

In [83]:
trie.save(output_ground_truth+'/gt_'+model_base_name+'.marisa')

In [84]:
part = 1

In [85]:
f = h5py.File(output_ground_truth+'/gt_'+model_base_name+'_'+str(part)+'_states.h5py', 'w')

In [86]:
trie_keys = trie.keys()

In [87]:
delta = len(trie_keys)/4.0
start = round((part-1)*delta)
end = round(part*delta)

In [88]:
print("part:{}, start={}, end={}".format(part, start, end))

part:1, start=0, end=354601


In [89]:
end

354601

In [90]:
element_ids = [x[1] for x in trie.items()]

In [91]:
element_ids[0:10]

[0, 40, 788, 6745, 30956, 93672, 215776, 409317, 678428, 1018535]

In [92]:
len(trie.items()[start : end])

354601

In [93]:
trie.items()[0:10]

[(' ', 0),
 (' a', 40),
 (' an', 788),
 (' and', 6745),
 (' and ', 30956),
 (' and s', 93672),
 (' and se', 215776),
 (' and sel', 409317),
 (' and self', 678428),
 (' and self ', 1018535)]

In [94]:
# using hdf5
k = 0
for elem, elem_id in trie.items()[start : end]:
  h, prob = predict_seq(elem, model)
  f['y_'+str(elem_id)] = prob
  f['h_'+str(elem_id)] = h[0].data.cpu().numpy() # only the values, do not save gradient

  k=k+1
  if (k%50000==0):
    print("elem = {}. k = {}".format(elem, k))
  # if k>10:
    # break
  
# model_gt = GroundTruth(trie, ymap, hmap)

f.close()

elem =  if one i. k = 50000
elem =  mankinda. k = 100000
elem =  remain g. k = 150000
elem = e left af. k = 200000
elem = eserts. k = 250000
elem = ey are alr. k = 300000
elem = alsest op. k = 350000


In [59]:
# read  and check that order of elems is the same
g = h5py.File(output_ground_truth+'/gt_'+model_base_name+'_'+str(part)+'_states.h5py', 'r')

In [57]:
trie_keys[0:10]

[' ',
 ' a',
 ' an',
 ' and',
 ' and ',
 ' and s',
 ' and se',
 ' and sel',
 ' and self',
 ' and self ']

In [60]:
g_trie_keys = list(g.keys())

In [71]:
g['y_1018535'][...]

array([4.15970162e-02, 1.02252125e-04, 3.18567851e-04, 1.17493124e-04,
       4.23897393e-02, 3.73146759e-04, 2.39318255e-02, 8.48555639e-02,
       9.20189632e-05, 9.47463363e-02, 2.53079622e-03, 4.12827125e-03,
       3.13363667e-03, 1.02065147e-04, 2.29419004e-02, 9.67583582e-02,
       7.77364075e-02, 7.15183513e-03, 5.76696023e-02, 1.02002057e-04,
       4.92777872e-05, 4.79606800e-02, 1.94603726e-02, 9.63516650e-05,
       1.07069046e-03, 2.20848620e-03, 1.93535147e-04, 6.43221214e-02,
       7.88175687e-02, 8.63932510e-05, 1.41437287e-02, 7.15470524e-05,
       9.15253913e-05, 4.38000150e-02, 3.91744189e-02, 3.57919931e-02,
       7.22919474e-04, 3.01809353e-03, 5.95195706e-05, 5.39685041e-02,
       1.87969841e-02, 1.45215364e-02, 7.59283954e-04, 3.56424353e-05],
      dtype=float32)

In [72]:
g.close()

In [45]:
h[0]

tensor([[[ 6.4322e-03, -2.8410e-02,  3.1460e-01,  7.0690e-02, -9.0526e-02,
           8.5714e-02,  8.1651e-02,  3.4326e-02, -1.8042e-01, -1.6631e-01,
          -6.0355e-03, -3.2680e-01, -2.2718e-02, -9.7644e-02,  4.8761e-02,
           1.6566e-02, -4.8820e-02, -4.6266e-02, -9.4683e-02,  1.4593e-01,
           8.7851e-04,  2.4238e-02,  5.9539e-02,  5.5022e-03, -5.8738e-02,
          -4.3814e-02,  2.1783e-03,  2.1565e-02,  1.0630e-02,  8.2591e-03,
           7.4929e-03, -1.8718e-02,  2.4823e-02,  8.5742e-03, -2.8612e-02,
          -1.0967e-01, -1.6261e-02,  4.1212e-02, -2.2671e-02, -1.4108e-01,
          -8.6997e-02, -6.7066e-02, -2.5027e-03,  1.4641e-02,  1.0700e-01,
           1.4883e-01, -3.8611e-02, -1.8133e-01,  8.4864e-03,  8.8618e-02,
           7.8809e-03,  4.2759e-02,  8.3621e-02,  1.2503e-01,  1.3461e-02,
          -2.0543e-02, -1.1269e-02,  1.3726e-01,  6.8421e-02, -9.7921e-02,
           6.9353e-02, -2.8590e-01,  1.6102e-02, -6.4238e-02, -1.6368e-01,
           2.4099e-04,  4

In [41]:
f['h_0'] = prob

In [None]:
# using pickle

k = 0
for elem in trie:
  #print("elem = {}. k = {}".format(elem, k))
  # print("elem len = {}".format(len(elem)))
  h, prob = predict_seq(elem, model)
  elem_id = trie[elem]
  ymap[elem_id] = prob
  hmap[elem_id] = h
  k=k+1
  if (k%50000==0):
    print("elem = {}. k = {}".format(elem, k))
  # if k>10:
    # break
  
# model_gt = GroundTruth(trie, ymap, hmap)

with open(output_ground_truth+'/gt_'+model_base_name+'.pickle', 'wb') as handle:
    pickle.dump(model_gt, handle, protocol=pickle.HIGHEST_PROTOCOL)