In [12]:
import os
import sys
import logging
import argparse
import tensorflow as tf
import numpy as np
from src.model import get_model_instance
from src.dataset import get_data_iterator
from src import util

In [13]:
def get_paths_in_dirs(dirs, shuffle=False):
    paths = []
    for dir in dirs:
        paths.extend(list(map(lambda filename: os.path.join(dir, filename), os.listdir(dir))))
    if shuffle is True:
        random.shuffle(paths)
    return paths


def get_dataset_from_dirs(data_dirs, batch_size, is_train=True):

    def parse_exmp(serial_exmp):
        feats = tf.parse_single_example(serial_exmp, features={'inp': tf.VarLenFeature(tf.int64),
                                                               'resp': tf.VarLenFeature(tf.int64),
                                                               'label': tf.FixedLenFeature([1], tf.int64)})
        inp = tf.sparse_tensor_to_dense(feats['inp'])[:100]
        resp = tf.sparse_tensor_to_dense(feats['resp'])[:100]
        label = feats['label']
        return inp, resp, label

    paths = get_paths_in_dirs(data_dirs)
    dataset = tf.data.TFRecordDataset(paths, 'GZIP').map(parse_exmp)
    padded_shapes = tuple(dataset.output_shapes)
    padded_values = tuple([tf.convert_to_tensor(0, tf.int64)] * len(padded_shapes))
    if is_train is True:
        dataset = dataset.shuffle(buffer_size=20000).repeat(None)
    dataset = dataset.padded_batch(batch_size, padded_shapes, padded_values).prefetch(10)
    return dataset

In [18]:
hparams = util.load_hparams("config/maga.yaml")
print hparams

[('D_hidden_sizes', [512]), ('activation', 'tanh'), ('debug', False), ('decay_rate', 0.99), ('decay_steps', 100000), ('dot_inp_dnn_sizes', [-1]), ('dot_resp_dnn_sizes', [-1]), ('embed_dim', 256), ('encoder_type', 'dan'), ('experiment_dir', 'experiment/maga'), ('learning_rate', 0.001), ('max_epoch', 10000000), ('max_seqlen', 100), ('model_type', 'dot_product'), ('optimizer', 'adam'), ('steps_per_checkpoint', 20), ('steps_per_eval', 20), ('steps_per_info', 10), ('train_batch_size', 256), ('train_dirs', ['data/tfrecord/train']), ('val_batch_size', 256), ('val_dirs', ['data/tfrecord/val']), ('vocab_size', 5744)]


In [23]:
# define data
train_dataset = get_dataset_from_dirs(hparams.train_dirs, hparams.train_batch_size, is_train=True)
val_dataset = get_dataset_from_dirs(hparams.val_dirs, hparams.val_batch_size, is_train=False)
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
inp, resp, label = iterator.get_next()
train_iter_init_op = iterator.make_initializer(train_dataset)
val_iter_init_op = iterator.make_initializer(val_dataset)

In [36]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    sess.run(train_iter_init_op)
    sess.run(val_iter_init_op)
    i = sess.run(inp)
    r = sess.run(resp)
    l = sess.run(label)
    print i[1]
    print r[1]
    print l[1]
    print i.shape
    print r.shape
    print l.shape

[4423 1271   32 2117 2532 3967 5477 3092 4873 2211 2583 3652 5069    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
[5702 4882   24 4331 3092 2669 4000 4971    0    0    0    0    0    0
    0    0    0    0    0]
[0]
(256, 28)
(256, 19)
(256, 1)
