In [1]:
# Widen the notebook's main container to 90% of the browser window.
# `display` and `HTML` are taken from the public `IPython.display` module
# (the same module the import cell below uses); importing `display` from
# `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [1]:
import pandas as pd
import os
import re
import codecs
from IPython.display import display
from six.moves import cPickle as pickle
import string
from PIL import Image
import numpy as np
import h5py

In [2]:
# pandas display configuration for this notebook: show up to 600 rows and
# 600 characters per cell, with no limit on columns, line width or sequence
# items. Frames are never wrapped and column headers are left-justified.
width = None
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', width)
pd.set_option('display.max_colwidth', 600)
pd.set_option('display.width', width)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')

In [3]:
import data_commons as dtc
import dl_commons as dlc

In [23]:
class VisualizeDir(object):
    """Browse and prune the outputs stored by one training run.

    `storedir` holds HDF5 files named '<graph>_<step>.h5' (the code looks for
    names starting with 'training' and 'validation'). Its parent directory is
    treated as the run's log directory, which holds 'snapshot-*' files and
    normally 'hyper.pkl' / 'args.pkl'.
    """

    def __init__(self, storedir, gen_datadir='../data/generated2'):
        """
        Args:
            storedir:    directory holding the '<graph>_<step>.h5' files.
            gen_datadir: directory holding 'dict_vocab.pkl' and
                         'dict_id2word.pkl'.
        """
        self._storedir = storedir
        self._logdir = os.path.join(storedir, '..')
        try:
            self._hyper = dtc.load(self._logdir, 'hyper.pkl')
            self._args = dtc.load(self._logdir, 'args.pkl')
        except Exception:
            # The parameter files are not always in the parent directory; fall
            # back to storedir. (A bare `except:` would also have swallowed
            # KeyboardInterrupt / SystemExit.)
            self._hyper = dtc.load(self._storedir, 'hyper.pkl')
            self._args = dtc.load(self._storedir, 'args.pkl')

        # NOTE(security): unpickling executes arbitrary code - only point
        # gen_datadir at directories you trust.
        self._word2id = pd.read_pickle(os.path.join(gen_datadir, 'dict_vocab.pkl'))
        i2w = pd.read_pickle(os.path.join(gen_datadir, 'dict_id2word.pkl'))
        # Ids -1 .. -10 are rendered as their own decimal string.
        for i in range(-1, -11, -1):
            i2w[i] = '%d' % i
        self._id2word = {}
        ## Append space after all commands beginning with a backslash (except backslash alone)
        for i, w in i2w.items():
            # Slicing (rather than w[0]) keeps an empty word from raising IndexError.
            if w[:1] == '\\':
                self._id2word[i] = w + " "
            else:
                self._id2word[i] = w
        self._id2word[self._word2id['id']['\\']] = '\\'

    @staticmethod
    def _step_of(filename):
        """Return the integer step encoded in a '<prefix>_<step>.<ext>' file name."""
        return int(os.path.basename(filename).split('_')[-1].split('.')[0])

    def _h5_path(self, graph, step):
        """Return the path of the HDF5 file written for `graph` at `step`."""
        return os.path.join(self._storedir, '%s_%d.h5' % (graph, step))

    def _to_words(self, frame):
        """Map every id in `frame` to its word; an unknown id raises KeyError."""
        lookup = self._id2word.__getitem__
        # Column-wise Series.map is available in every pandas version, unlike
        # DataFrame.applymap / DataFrame.map.
        return frame.apply(lambda col: col.map(lookup))

    @staticmethod
    def _join_rows(frame):
        """Concatenate the cells of each row of `frame` into one string per row."""
        return ["".join(row) for row in frame.itertuples(index=False)]

    @staticmethod
    def _report(header, files):
        """Print `header` followed by `files` rendered as a pandas Series."""
        print('%s\n %s' % (header, pd.Series(files)))

    @property
    def storedir(self):
        """Directory holding the '<graph>_<step>.h5' files."""
        return self._storedir

    @property
    def w2i(self):
        """Word -> id mapping (the 'id' entry of the vocabulary pickle)."""
        return self._word2id['id']

    @property
    def i2w(self):
        """Id -> word mapping, with a trailing space on backslash commands."""
        return self._id2word

    @property
    def max_steps(self):
        """Tuple (largest step of any file in storedir, largest 'validation' step).

        Every file in storedir must follow the '<prefix>_<step>.<ext>' naming,
        otherwise parsing the step raises ValueError.
        """
        names = os.listdir(self._storedir)
        steps = [self._step_of(f) for f in names]
        epoch_steps = [self._step_of(f) for f in names if f.startswith('validation')]
        return sorted(steps)[-1], sorted(epoch_steps)[-1]

    @property
    def args(self):
        """Object loaded from 'args.pkl'."""
        return self._args

    @property
    def hyper(self):
        """Object loaded from 'hyper.pkl'."""
        return self._hyper

    def keys(self, graph, step):
        """Return the list of dataset names stored for `graph` at `step`."""
        # Open read-only, and materialise the keys before the file is closed:
        # a keys view is not usable once the file handle is gone.
        with h5py.File(self._h5_path(graph, step), 'r') as h5:
            return list(h5.keys())

    def np(self, graph, step, key):
        """
        Args:
            graph: 'training' or 'validation'
            step:  step whose output is to be fetched
            key:   key of object to fetch - e.g. 'predicted_ids'
        Returns:
            The full contents of the dataset, read into memory.
        """
        with h5py.File(self._h5_path(graph, step), 'r') as h5:
            return h5[key][...]

    def df(self, graph, step, key):
        """Same as `np`, wrapped in a DataFrame."""
        return pd.DataFrame(self.np(graph, step, key))

    def words(self, graph, step, key, key2=None):
        """Return the ids stored under `key` translated to words.

        With `key2`, both datasets are translated and placed side by side: the
        result's columns are a two-level index whose outer level is the key
        name, so `result[key]` and `result[key2]` give the individual frames.
        """
        first = self._to_words(self.df(graph, step, key))
        if key2 is None:
            return first
        second = self._to_words(self.df(graph, step, key2))
        # A dict of DataFrames is not a valid DataFrame constructor argument;
        # concat with keys yields the intended side-by-side layout.
        return pd.concat([first, second], axis=1, keys=['%s' % key, '%s' % key2])

    def strs(self, graph, step, key, key2=None, mingle=True):
        """Return one string per row, made by concatenating the row's words.

        Args:
            key2:   optional second dataset to show next to `key`.
            mingle: with `key2`, True interleaves the rows of both datasets in
                    a single column (key row, key2 row, ...); False puts each
                    dataset in its own column.
        """
        ## Tokens beginning with a backslash (except backslash alone) carry a
        ## trailing space, therefore the joined string should compile if the
        ## prediction was syntactically correct.
        first = self._join_rows(self.words(graph, step, key))
        if key2 is None:
            return pd.DataFrame(first)

        second = self._join_rows(self.words(graph, step, key2))
        if mingle:
            title = '%s_%d %s / %s\t\t(%s)' % (graph, step, key, key2, self._storedir)
            data = {title: [e for pair in zip(first, second) for e in pair]}
        else:
            data = {
                '%s_%d.%s\t\t(%s)' % (graph, step, key, self._storedir): first,
                '%s_%d.%s\t\t(%s)' % (graph, step, key2, self._storedir): second,
            }
        return pd.DataFrame(data)

    def prune_logs(self, save_epochs=1, dry_run=True):
        """Save the latest save_epochs logs and remove the rest.

        Only 'training' files are ever removed, and only those whose step is
        below the cut-off validation step and is not itself a validation step.

        Args:
            save_epochs: number of most recent validation steps whose
                         preceding training files are kept.
            dry_run:     when True only report what would be removed.
        Returns:
            False when there are not enough validation steps to prune
            anything, otherwise None.
        """
        get_step = self._step_of
        all_files = os.listdir(self._storedir)

        epoch_steps = list(set(get_step(f) for f in all_files if f.startswith('validation')))
        print('epoch_steps: %s' % (epoch_steps,))
        if len(epoch_steps) <= save_epochs:
            # Report the count; formatting the list itself with %d raises TypeError.
            print('Only %d full epochs were found. Deleting nothing.' % len(epoch_steps))
            return False

        epoch_steps.sort(reverse=True)
        max_step = epoch_steps[save_epochs]
        epoch_step_set = set(epoch_steps)
        training_files = [f for f in all_files if f.startswith('training')]
        files_to_remove = set(
            f for f in training_files
            if get_step(f) < max_step and get_step(f) not in epoch_step_set
        )
        files_to_keep = set(all_files) - files_to_remove
        removed_sorted = sorted(files_to_remove, key=get_step)

        if dry_run:
            self._report('%d files will be kept' % len(files_to_keep),
                         sorted(files_to_keep, key=get_step))
            self._report('%d files will be removed' % len(files_to_remove), removed_sorted)
        else:
            for f in files_to_remove:
                os.remove(os.path.join(self._storedir, f))
            self._report('Removed %d files' % len(files_to_remove), removed_sorted)

    def prune_snapshots(self, keep=10, dry_run=True):
        """Keep the files of the latest `keep` snapshot steps. Delete the rest.

        Snapshot files are the 'snapshot-<step>.*' files in the log directory
        (the parent of storedir).

        Args:
            keep:    number of most recent snapshot steps to keep.
            dry_run: when True only report what would be removed.
        """
        def get_step(f):
            return int(os.path.basename(f).split('.')[0].split('snapshot-')[1])

        files = [f for f in os.listdir(self._logdir) if f.startswith('snapshot-')]
        steps = list(set(get_step(f) for f in files))
        if len(steps) <= keep:
            print('Nothing to delete')
            return

        steps.sort(reverse=True)
        steps_to_keep = set(steps[:keep])
        steps_to_remove = set(steps) - steps_to_keep
        print('steps to keep:  %s' % (sorted(steps_to_keep),))
        print('steps to remove:  %s' % (sorted(steps_to_remove),))
        files_to_remove = sorted(
            (f for f in files if get_step(f) not in steps_to_keep), key=get_step)

        if dry_run:
            self._report('%d files will be removed' % len(files_to_remove), files_to_remove)
        else:
            for f in files_to_remove:
                os.remove(os.path.join(self._logdir, f))
            self._report('%d files removed' % len(files_to_remove), files_to_remove)
        
class VisualizeStep(object):
    """View of a visualizer that is bound to one fixed step.

    Every method forwards to the same-named method of the wrapped visualizer,
    inserting the bound step as the `step` argument.
    """

    def __init__(self, visualizer, step):
        """
        Args:
            visualizer: object providing keys/np/df/words/strs that take
                        (graph, step, ...) - e.g. a VisualizeDir.
            step:       the step all calls are made for.
        """
        self._step = step
        self._visualizer = visualizer

    def keys(self, graph):
        """Return the visualizer's keys for `graph` at the bound step."""
        return self._visualizer.keys(graph, self._step)

    def np(self, graph, key):
        """Return the visualizer's raw value for `key` at the bound step."""
        return self._visualizer.np(graph, self._step, key)

    def df(self, graph, key):
        """Return the visualizer's DataFrame for `key` at the bound step.

        The step is the bound one, as for every other method of this class.
        (This method previously took an extra `step` argument but could never
        succeed: it called the non-existent `pd.DataFrame.df` and passed three
        arguments to the two-argument `np`.)
        """
        return self._visualizer.df(graph, self._step, key)

    def words(self, graph, key, key2=None):
        """Return the visualizer's word table for `key` (and `key2`) at the bound step."""
        return self._visualizer.words(graph, self._step, key, key2)

    def strs(self, graph, key, key2=None, mingle=True):
        """Return the visualizer's joined strings for `key` (and `key2`) at the bound step."""
        return self._visualizer.strs(graph, self._step, key, key2, mingle)

class DiffParams(object):
    """Compare the parameter pickles ('args.pkl', 'hyper.pkl') of two directories."""

    def __init__(self, dir1, dir2):
        """Remember the two directories to compare."""
        self._dir1, self._dir2 = dir1, dir2

    def get(self, filename, to_str):
        """Load `filename` from both directories.

        Args:
            filename: name of the file to load from each directory.
            to_str:   when truthy, each loaded object is passed through
                      `dlc.to_dict` before being returned.
        Returns:
            Tuple (object from dir1, object from dir2).
        """
        loaded = [dtc.load(directory, filename) for directory in (self._dir1, self._dir2)]
        if to_str:
            loaded = [dlc.to_dict(obj) for obj in loaded]
        first, second = loaded
        return first, second

    def print_dict(self, filename, to_str):
        """Pretty-print the result of `dlc.diff_dict` for the two loaded objects."""
        first, second = self.get(filename, to_str)
        difference = dlc.diff_dict(first, second)
        dtc.pprint(difference)

    def _table(self, filename):
        """Display the two parts returned by `dlc.diff_table` as DataFrames."""
        first, second = self.get(filename, False)
        for part in dlc.diff_table(first, second):
            display(pd.DataFrame(part))

    def args(self, to_str=True):
        """Display the difference tables for 'args.pkl' (`to_str` is accepted but not used)."""
        self._table('args.pkl')

    def hyper(self, to_str=True):
        """Display the difference tables for 'hyper.pkl' (`to_str` is accepted but not used)."""
        self._table('hyper.pkl')

    def get_args(self):
        """Return the 'args.pkl' objects of both directories, converted with `dlc.to_dict`."""
        return self.get('args.pkl', to_str=True)

    def get_hyper(self):
        """Return the 'hyper.pkl' objects of both directories, converted with `dlc.to_dict`."""
        return self.get('hyper.pkl', to_str=True)

In [28]:
# Select the run to inspect by pointing VisualizeDir at its store directory.
# The commented-out lines are runs that were selected previously; only the
# last, uncommented assignment takes effect.
# v = Visualize('./tb_metrics_dev/2017-10-06 17-56-47 PDT/store', '../data/generated2')
# v = VisualizeDir('./tb_metrics/2017-10-08 12-26-45 PDT/store')
# v = VisualizeDir('./tb_metrics_dev/2017-10-09 12-45-15 PDT/store')
# vd = VisualizeDir('./tb_metrics/2017-10-09 17-43-49 PDT/store')
# vd = VisualizeDir('tb_metrics/2017-10-09 16-01-07 PDT/store')
# vd2 = VisualizeDir('tb_metrics/2017-09-26 22-40-18 PDT/new_code 2017-10-10 15-10-17 PDT/store')
# vd = VisualizeDir('./tb_metrics/2017-10-07 14-33-35 PDT_my_decoder/store')
# vd = VisualizeDir('./tb_metrics_view/2017-10-10 19-14-54 PDT good 3_decoder_LSTMs_my_decoder/store')
# vd = VisualizeDir('./tb_metrics_view/2017-10-11 17-46-12 PDT 3lstm_3att/store')
# vd = VisualizeDir('/zpool_3TB/i2l/tb_metrics/2017-10-12 00-15-53 PDT good 3lstm_attMLP/store')
# vd = VisualizeDir('./tb_metrics_view/2017-10-12 19-06-32 PDT 3.1LSTM_noShare_3att/store')
vd = VisualizeDir('./tb_metrics_view/2017-10-17 15-41-15 PDT 3.1LSTM_3att_noGather/store_2')

In [29]:
# The two pruning calls are kept commented out because, with dry_run=False,
# they delete files from the run's directories. Uncomment deliberately.
# vd.prune_logs(save_epochs=0, dry_run=False)
# vd.prune_snapshots(keep=5, dry_run=False)
# Show the hyper-parameters loaded from the selected run's 'hyper.pkl'.
vd.hyper

{'B': 40,
 'CALSTM_STACK': [{'B': 40,
   'CTCBlankTokenID': None,
   'D': 512,
   'H': 3,
   'K': 557,
   'L': 99,
   'MaxSeqLen': 151,
   'NullTokenID': 0,
   'SpaceTokenID': 556,
   'StartTokenID': 556,
   'W': 33,
   'att_layers': {'activation_fn': <function tensorflow.python.ops.math_ops.tanh>,
    'biases_initializer': <tensorflow.python.ops.init_ops.Zeros at 0x7f64f831e110>,
    'biases_regularizer': None,
    'dropout': None,
    'layers_units': [512, 512, 512, 1],
    'normalizer_fn': None,
    'op_name': 'MLP',
    'tb': {'logdir_tag': '3.1LSTM_3att_noGather',
     'tb_activations': 'Activations',
     'tb_biases': 'Biases',
     'tb_logdir': 'tb_metrics',
     'tb_weights': 'Weights'},
    'weights_initializer': <function tensorflow.contrib.layers.python.layers.initializers._initializer>,
    'weights_regularizer': <function tensorflow.contrib.layers.python.layers.regularizers.l2>},
   'att_share_weights': True,
   'att_weighted_gather': False,
   'biases_initializer': <tenso

In [16]:
# Show (largest step of any file in the store directory, largest validation step).
display(vd.max_steps)
# display(vd2.max_steps)

(134680, 24864)

In [9]:
# Bind the visualizer to step 134680 and show the 'predicted_ids' and 'y'
# strings of the training graph in two separate columns.
# NOTE(review): the recorded run of this cell raised
# KeyError: "... object 'predicted_ids' doesn't exist" - check
# vs.keys('training') for the keys actually stored at this step.
vs = VisualizeStep(vd, 134680)
vs.strs('training', 'predicted_ids', 'y', mingle=False)

KeyError: "Unable to open object (object 'predicted_ids' doesn't exist)"

In [17]:
# Compare the hyper-parameters of two runs. The commented-out lines are pairs
# that were compared previously; only the last assignment takes effect.
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT', './tb_metrics/2017-10-07 14-33-35 PDT_2CALSTMs')
# diff = DiffParams('./tb_metrics/2017-10-07 14-33-35 PDT', './tb_metrics/2017-10-08 12-26-45 PDT')
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT', './tb_metrics/2017-10-08 12-26-45 PDT')
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT/w=1', './tb_metrics/2017-10-08 12-26-45 PDT')
# diff = DiffParams('./tb_metrics/2017-10-09 16-01-07 PDT_good', './tb_metrics/2017-10-09 17-43-49 PDT')
# diff = DiffParams('./tb_metrics/2017-09-26 22-40-18 PDT', './tb_metrics/2017-10-09 16-01-07 PDT_good')
diff = DiffParams('./tb_metrics_view/2017-10-11 17-46-12 PDT 3lstm_2att', './tb_metrics_view/2017-10-12 19-06-32 PDT 3.1LSTM_noShare_3att')
# Displays the two tables returned by dlc.diff_table for 'hyper.pkl'.
diff.hyper()

Unnamed: 0,0,1
0,"CALSTM_STACK_1.att_layers.layers_units ===> [512, 512]","CALSTM_STACK_1.att_layers.layers_units ===> [99, 99, 99]"
1,output_layers.tb.logdir_tag ===> 3lstm_2att,output_layers.tb.logdir_tag ===> 3.1LSTM_noShare_3att
2,init_model.tb.logdir_tag ===> 3lstm_2att,init_model.tb.logdir_tag ===> 3.1LSTM_noShare_3att
3,CALSTM_STACK_1.att_share_weights ===> True,CALSTM_STACK_1.att_share_weights ===> False
4,tb.logdir_tag ===> 3lstm_2att,tb.logdir_tag ===> 3.1LSTM_noShare_3att
5,CALSTM_STACK_1.decoder_lstm.tb.logdir_tag ===> 3lstm_2att,CALSTM_STACK_1.decoder_lstm.tb.logdir_tag ===> 3.1LSTM_noShare_3att
6,init_model_final_layers.tb.logdir_tag ===> 3lstm_2att,init_model_final_layers.tb.logdir_tag ===> 3.1LSTM_noShare_3att
7,"CALSTM_STACK_1.decoder_lstm.layers_units ===> [1000, 1000, 557]","CALSTM_STACK_1.decoder_lstm.layers_units ===> [1000, 1000, 1000]"
8,CALSTM_STACK_1.att_layers.tb.logdir_tag ===> 3lstm_2att,CALSTM_STACK_1.att_layers.tb.logdir_tag ===> 3.1LSTM_noShare_3att
9,CALSTM_STACK_1.tb.logdir_tag ===> 3lstm_2att,CALSTM_STACK_1.tb.logdir_tag ===> 3.1LSTM_noShare_3att


Unnamed: 0,0,1
0,CALSTM_STACK_1.logger ===> undefined,CALSTM_STACK_1.logger ===> <logging.Logger object at 0x7ff0abbb5b90>
1,init_model.weights_initializer ===> <function _initializer at 0x7ff0abce1cf8>,init_model.weights_initializer ===> <function _initializer at 0x7ff0abb6f320>
2,CALSTM_STACK_1.weights_regularizer ===> <function l2 at 0x7ff0abca0668>,CALSTM_STACK_1.weights_regularizer ===> <function l2 at 0x7ff0abb6f230>
3,CALSTM_STACK_1.biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7ff0abbb56d0>,CALSTM_STACK_1.biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7ff0abbb59d0>
4,init_model_final_layers.weights_regularizer ===> <function l2 at 0x7ff0abca0668>,init_model_final_layers.weights_regularizer ===> <function l2 at 0x7ff0abb6f230>
5,embeddings_regularizer ===> <function l2 at 0x7ff0abca0668>,embeddings_regularizer ===> <function l2 at 0x7ff0abb6f230>
6,biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7ff0abbb56d0>,biases_initializer ===> <tensorflow.python.ops.init_ops.Zeros object at 0x7ff0abbb59d0>
7,embeddings_initializer ===> <function _initializer at 0x7ff0abce1cf8>,embeddings_initializer ===> <function _initializer at 0x7ff0abb6f320>
8,CALSTM_STACK_1.att_layers.weights_initializer ===> <function _initializer at 0x7ff0abce1cf8>,CALSTM_STACK_1.att_layers.weights_initializer ===> <function _initializer at 0x7ff0abb6f320>
9,CALSTM_STACK_1.att_layers.weights_regularizer ===> <function l2 at 0x7ff0abca0668>,CALSTM_STACK_1.att_layers.weights_regularizer ===> <function l2 at 0x7ff0abb6f230>


In [None]:
# From the second directory's hyper-parameters (index 1 of the returned pair),
# show the entry at index 1 of 'CALSTM_STACK'.
diff.get_hyper()[1]['CALSTM_STACK'][1]