In [1]:
# Widen the notebook container to 90% of the browser window so wide tables fit.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import os
import re
import codecs
from IPython.display import display
from six.moves import cPickle as pickle
import string
from PIL import Image
import numpy as np
import h5py

In [3]:
# pandas display configuration for this notebook: show up to 600 rows, do not
# limit the number of columns or sequence items, allow 600 characters per cell,
# never wrap a frame across several blocks and left-justify column headers.
width = None
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', width)
pd.set_option('display.max_colwidth', 600)
pd.set_option('display.width', width)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')

In [4]:
import data_commons as dtc
import dl_commons as dlc

In [14]:
class VisualizeDir(object):
    """Browse the per-step HDF5 files kept in one run's store directory.

    Files are addressed as ``<graph>_<step>.h5`` inside ``storedir``. Integer
    ids read from those files can be rendered as words using the dictionaries
    pickled in ``gen_datadir``.
    """

    def __init__(self, storedir, gen_datadir='../data/generated2'):
        """
        Args:
            storedir:    directory holding the '<graph>_<step>.h5' files
            gen_datadir: directory holding 'dict_vocab.pkl' and 'dict_id2word.pkl'
        """
        self._storedir = storedir
        ## NOTE(review): read_pickle unpickles arbitrary objects - only point this at trusted files.
        self._word2id = pd.read_pickle(os.path.join(gen_datadir, 'dict_vocab.pkl'))
        i2w = pd.read_pickle(os.path.join(gen_datadir, 'dict_id2word.pkl'))
        ## Map ids -1 .. -10 to their decimal string so that they can be rendered too.
        for i in range(-1,-11,-1):
            i2w[i] = '%d'%i
        self._id2word = {}
        ## Append space after all commands beginning with a backslash (except backslash alone)
        for i, w in i2w.items():
            ## w[:1] instead of w[0]: an empty token must not raise IndexError
            if w[:1] == '\\':
                self._id2word[i] = w + " "
            else:
                self._id2word[i] = w
        ## The lone backslash is restored without the trailing space added above.
        self._id2word[self._word2id['id']['\\']] = '\\'

    @property
    def w2i(self):
        """The 'id' entry of the vocabulary loaded from dict_vocab.pkl (word -> id lookup)."""
        return self._word2id['id']

    @property
    def i2w(self):
        """Dict mapping id -> display string (commands carry a trailing space)."""
        return self._id2word

    @property
    def max_step(self):
        """Largest step number found among the file names in the store directory.

        The step is the integer between the last '_' and the first following '.'
        of each file name; a name that does not follow that shape raises ValueError.
        """
        steps = [int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in os.listdir(self._storedir)]
        return sorted(steps)[-1]

    @property
    def args(self):
        """Result of dtc.load(storedir, "..", 'args.pkl') - presumably the run's saved arguments."""
        return dtc.load(self._storedir, "..", 'args.pkl')

    @property
    def hyper(self):
        """Result of dtc.load(storedir, "..", 'hyper.pkl') - presumably the run's saved hyper-parameters."""
        return dtc.load(self._storedir, "..", 'hyper.pkl')

    def _h5_path(self, graph, step):
        """Path of the HDF5 file written for `graph` at `step`."""
        return os.path.join(self._storedir, '%s_%d.h5'%(graph,step))

    def keys(self, graph, step):
        """
        Args:
            graph: 'training' or 'validation'
            step:  step who's file is to be inspected
        Returns:
            list of the top-level keys in that file
        """
        ## Open read-only: older h5py versions default to a mode that may create/modify the file.
        with h5py.File(self._h5_path(graph, step), 'r') as h5:
            ## Materialise the keys before the file closes; a keys view of a closed file is unusable.
            return list(h5.keys())

    def np(self, graph, step, key=None):
        """
        Args:
            graph: 'training' or 'validation'
            step:  step who's output is to be fetched
            key:   key of object to fetch - e.g. 'predicted_ids'
        Returns:
            the full content of that object, read into memory
        """
        with h5py.File(self._h5_path(graph, step), 'r') as h5:
            return h5[key][...]

    def df(self, graph, step, key):
        """Same data as np(), wrapped in a pandas DataFrame."""
        return pd.DataFrame(self.np(graph, step, key))

    def words(self, graph, step, key):
        """DataFrame of df() with every id replaced by its display string (KeyError on unknown id)."""
        df = self.df(graph, step, key)
        ## DataFrame.applymap was renamed to DataFrame.map in newer pandas; use whichever exists.
        elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
        return elementwise(df, lambda x: self._id2word[x])

    def strs(self, graph, step, key):
        """One-column DataFrame holding, per row of words(), the concatenation of its strings."""
        df_str = self.words(graph, step, key)

        ## Each command token (one beginning with a backslash, except the lone backslash)
        ## has a space appended to it, therefore the joined string should compile
        ## if the prediction was syntactically correct.
        return pd.DataFrame(["".join(row) for row in df_str.itertuples(index=False)])

class VisualizeStep():
    """View of a visualizer (e.g. VisualizeDir) with the step fixed at construction time.

    Every method forwards to the method of the same name on the wrapped
    visualizer, supplying the stored step.
    """

    def __init__(self, visualizer, step):
        """
        Args:
            visualizer: object providing keys/np/df/words/strs(graph, step, ...)
            step:       step supplied to every forwarded call
        """
        self._step = step
        self._visualizer = visualizer

    def keys(self, graph):
        """Keys available for `graph` at the bound step."""
        return self._visualizer.keys(graph, self._step)

    def np(self, graph, key=None):
        """Object `key` of `graph` at the bound step, as returned by the visualizer's np()."""
        return self._visualizer.np(graph, self._step, key)

    def df(self, graph, step=None, key=None):
        """Object `key` of `graph` as returned by the visualizer's df().

        Accepts both call shapes:
            df(graph, key)        - uses the bound step (like the sibling methods)
            df(graph, step, key)  - explicit step, as in the original signature
        """
        if key is None:
            ## Two-argument form: the second positional argument is the key.
            step, key = None, step
        if step is None:
            step = self._step
        return self._visualizer.df(graph, step, key)

    def words(self, graph, key):
        """Object `key` of `graph` at the bound step, as returned by the visualizer's words()."""
        return self._visualizer.words(graph, self._step, key)

    def strs(self, graph, key):
        """Object `key` of `graph` at the bound step, as returned by the visualizer's strs()."""
        return self._visualizer.strs(graph, self._step, key)

class DiffParams(object):
    """Compare the pickled parameter files of two run directories."""

    def __init__(self, dir1, dir2):
        """
        Args:
            dir1: directory of the first run
            dir2: directory of the second run
        """
        self._dir1, self._dir2 = dir1, dir2

    def get(self, filename, to_str):
        """Load `filename` from both directories.

        Args:
            filename: name of the file passed to dtc.load for each directory
            to_str:   when truthy, each loaded object is passed through dlc.to_dict
        Returns:
            tuple (object from dir1, object from dir2)
        """
        loaded = [dtc.load(directory, filename) for directory in (self._dir1, self._dir2)]
        if to_str:
            loaded = [dlc.to_dict(params) for params in loaded]
        first, second = loaded
        return first, second

    def print_dict(self, filename, to_str):
        """Pretty-print (dtc.pprint) the dlc.diff_dict of the two loaded objects."""
        pair = self.get(filename, to_str)
        dtc.pprint(dlc.diff_dict(*pair))

    def _table(self, filename):
        """Display the two parts returned by dlc.diff_table as DataFrames, one after the other."""
        head, tail = dlc.diff_table(*self.get(filename, False))
        for part in (head, tail):
            display(pd.DataFrame(part))

    def args(self, to_str=True):
        """Display the difference tables for 'args.pkl'.

        `to_str` is accepted but not used: _table always loads without the
        dlc.to_dict conversion.
        """
        self._table('args.pkl')

    def hyper(self, to_str=True):
        """Display the difference tables for 'hyper.pkl'.

        `to_str` is accepted but not used: _table always loads without the
        dlc.to_dict conversion.
        """
        self._table('hyper.pkl')

    def get_args(self):
        """Both 'args.pkl' objects, each converted with dlc.to_dict."""
        return self.get('args.pkl', True)

    def get_hyper(self):
        """Both 'hyper.pkl' objects, each converted with dlc.to_dict."""
        return self.get('hyper.pkl', True)

In [22]:
# Choose the two run directories whose args.pkl / hyper.pkl are compared below.
# The commented-out lines are alternative run pairs; exactly one assignment is active.
# diff = DiffParams('./tb_metrics/2017-10-07 14-33-35 PDT', './tb_metrics_dev/2017-10-09 12-45-15 PDT')
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT', './tb_metrics/2017-10-07 14-33-35 PDT')
# diff = DiffParams('./tb_metrics/2017-10-07 14-33-35 PDT', './tb_metrics/2017-10-08 12-26-45 PDT')
diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT', './tb_metrics/2017-10-08 12-26-45 PDT')
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT/w=1', './tb_metrics/2017-10-08 12-26-45 PDT')
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT', './tb_metrics_dev/2017-10-09 12-45-15 PDT')

In [26]:
# Display the two difference tables (head, then tail) for hyper.pkl of the two runs.
diff.hyper()

Unnamed: 0,0,1
0,CALSTM_STACK_1.CTCBlankTokenID ===> 556,CALSTM_STACK_1.CTCBlankTokenID ===> None
1,dtype_np ===> <type numpy.float32>,dtype_np ===> <type 'numpy.float32'>
2,CALSTM_STACK_1.dtype_np ===> <type numpy.float32>,CALSTM_STACK_1.dtype_np ===> <type 'numpy.float32'>
3,int_type_np ===> <type numpy.int32>,int_type_np ===> <type 'numpy.int32'>
4,CALSTM_STACK_1.use_ctc_loss ===> undefined,CALSTM_STACK_1.use_ctc_loss ===> False
5,CTCBlankTokenID ===> 556,CTCBlankTokenID ===> None
6,CALSTM_STACK_1.int_type_np ===> <type numpy.int32>,CALSTM_STACK_1.int_type_np ===> <type 'numpy.int32'>


Unnamed: 0,0,1
0,CALSTM_STACK_1.logger ===> <logging.Logger object at 0x7f4006201410>,CALSTM_STACK_1.logger ===> None
1,init_model.weights_initializer ===> <function _initializer at 0x7f40062d3b18>,init_model.weights_initializer ===> <function _initializer at 0x7f5cdf338668>
2,CALSTM_STACK_1.biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7f40061ea810>,CALSTM_STACK_1.biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7f850f035890>
3,init_model_final_layers.weights_regularizer ===> <function l2 at 0x7f406bd2b8c0>,init_model_final_layers.weights_regularizer ===> <function l2 at 0x7f5d66613230>
4,init_model_final_layers.activation_fn ===> <function tanh at 0x7f4079de36e0>,init_model_final_layers.activation_fn ===> <function tanh at 0x7f5ceea85668>
5,embeddings_regularizer ===> <function l2 at 0x7f406bd2b8c0>,embeddings_regularizer ===> <function l2 at 0x7f5d66613230>
6,output_layers.activation_fn ===> <function relu at 0x7f4079eeaa28>,output_layers.activation_fn ===> <function relu at 0x7f5ceebeb9b0>
7,biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7f40061ea810>,biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7f850f035890>
8,embeddings_initializer ===> <function _initializer at 0x7f40062d3b18>,embeddings_initializer ===> <function _initializer at 0x7f5cdf338668>
9,CALSTM_STACK_1.att_layers.weights_initializer ===> <function _initializer at 0x7f40062d3b18>,CALSTM_STACK_1.att_layers.weights_initializer ===> <function _initializer at 0x7f5cdf338668>


In [None]:
# Inspect the 'CALSTM_STACK' entry of the first run's hyper.pkl (after dlc.to_dict conversion).
diff.get_hyper()[0]['CALSTM_STACK']

In [None]:
# Display the two difference tables (head, then tail) for args.pkl of the two runs.
diff.args()

In [None]:
# Inspect the 'squash_input_seq' entry of the second run's hyper.pkl (after dlc.to_dict conversion).
diff.get_hyper()[1]['squash_input_seq']

In [27]:
# Open the store directory of the run to inspect; gen_datadir keeps its default.
# The commented-out lines are earlier choices of run directory.
# v = Visualize('./tb_metrics_dev/2017-10-06 17-56-47 PDT/store', '../data/generated2')
# v = VisualizeDir('./tb_metrics/2017-10-08 12-26-45 PDT/store')
v = VisualizeDir('./tb_metrics_dev/2017-10-09 12-45-15 PDT/store')


In [29]:
# Show what dtc.load returns for (storedir, "..", 'args.pkl') of this run.
v.args

{'B': '40',
 'MeanSumAlphaEquals1': 'False',
 'NOTE': 'CHECK # of LSTM LAYERS',
 'StartTokenID': '<dl_commons.equalto object at 0x7ffb5ff27590>',
 'assert_whole_batch': 'False',
 'beamsearch_length_penalty': '1.0',
 'build_image_context': '0',
 'ctc_beam_width': '10',
 'data_dir': '../data',
 'doTrain': 'True',
 'doValidate': 'False',
 'dropout': 'None',
 'generated_data_dir': '../data/generated2',
 'image_dir': '../data/formula_images_2',
 'logdir': 'tb_metrics_dev/2017-10-09 12-45-15 PDT',
 'make_training_accuracy_graph': 'False',
 'num_epochs': '10',
 'num_gpus': '2',
 'num_steps': '0',
 'pLambda': '0.005',
 'print_batch': 'True',
 'print_steps': '50',
 'rLambda': '5e-05',
 'restore_from_checkpoint': 'False',
 'seq2seq_beam_width': '10',
 'squash_input_seq': 'True',
 'storedir': 'tb_metrics_dev/2017-10-09 12-45-15 PDT/store',
 'sum_logloss': 'False',
 'swap_memory': 'False',
 'tb': {'tb_activations': 'Activations',
  'tb_biases': 'Biases',
  'tb_logdir': 'tb_metrics_dev',
  'tb_weig

In [None]:
# Fix the step at 46620, then list the keys of that step's 'validation' HDF5 file.
vs = VisualizeStep(v, 46620)
vs.keys('validation')

In [None]:
# 'predicted_ids' of the validation file, mapped to words and joined into one string per row.
vs.strs('validation', 'predicted_ids')

In [None]:
# 'y' of the same file rendered the same way (presumably the target sequences - TODO confirm).
vs.strs('validation', 'y')

In [None]:
# Id that the vocabulary assigns to the lone backslash token.
v.w2i['\\']