In [1]:
import tensorflow as tf
import numpy as np

In [2]:
## BATCHING & PADDING DATA (high-level approach)
tf.reset_default_graph()

# x = [1, 2, ..., 9]
x = tf.range(1, 10, name="x")

# A queue that outputs 0, 1, 2, 3, ...
range_q = tf.train.range_input_producer(limit=10, shuffle=False)
slice_end = range_q.dequeue()

# Slice x to variable length, i.e. [], [1], [1, 2], [1, 2, 3], ...
# (the first dequeued size is 0, hence the leading empty slice)
y = tf.slice(x, [0], [slice_end], name="y")

# Batch the variable length tensor with dynamic padding:
# every element is zero-padded to the longest element of its batch
batched_data = tf.train.batch(
    tensors=[y],
    batch_size=6,
    dynamic_pad=True,
    name="y_batch"
)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        result = sess.run(batched_data)
        # Print inside the try block: `result` is unbound if sess.run raised
        print(result)
    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        # Ask the queue-runner threads to stop, whatever happened above
        coord.request_stop()
    # Wait for the queue-runner threads to finish before closing the session
    coord.join(threads)

# Alternative way to run the graph:
# tf.contrib.learn.run_n takes care of starting the queue runners for us
#res = tf.contrib.learn.run_n({"y": batched_data}, n=1, feed_dict=None)
 
# Print the result
#print("Batch shape: {}".format(res[0]["y"].shape))
#print(res[0]["y"])

    

[[0 0 0 0 0]
 [1 0 0 0 0]
 [1 2 0 0 0]
 [1 2 3 0 0]
 [1 2 3 4 0]
 [1 2 3 4 5]]


In [3]:
## BATCHING & PADDING DATA (low-level approach)
tf.reset_default_graph()

# x = [1, 2, ..., 9]
x = tf.range(1, 10, name="x")

# A queue that outputs 0, 1, 2, 3, ...
range_q = tf.train.range_input_producer(limit=10, shuffle=False)
slice_end = range_q.dequeue()

# Slice x to variable length, i.e. [], [1], [1, 2], [1, 2, 3], ...
# (the first dequeued size is 0, hence the leading empty slice)
y = tf.slice(x, [0], [slice_end], name="y")

# Batch the variable length tensor with PaddingFIFOQueue.
# shapes=[[None]] declares a single rank-1 component of unknown length;
# dequeue_many pads each element with zeros to the longest one in the batch.
padding_q = tf.PaddingFIFOQueue(
    capacity=10,
    dtypes=tf.int32,
    shapes=[[None]])
enqueue_op = padding_q.enqueue([y])
# Register a queue runner so that start_queue_runners() launches the enqueue thread
qr = tf.train.QueueRunner(padding_q, [enqueue_op])
tf.train.add_queue_runner(qr)
batched_data = padding_q.dequeue_many(6)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        result = sess.run(batched_data)
        # Print inside the try block: `result` is unbound if sess.run raised
        print(result)
    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        # Ask the queue-runner threads to stop, whatever happened above
        coord.request_stop()
    # Wait for the queue-runner threads to finish before closing the session
    coord.join(threads)

[[0 0 0 0 0]
 [1 0 0 0 0]
 [1 2 0 0 0]
 [1 2 3 0 0]
 [1 2 3 4 0]
 [1 2 3 4 5]]


In [4]:
## SequenceExample (1: serialize/write part)
import os

tf.reset_default_graph()

# Toy dataset: seven variable-length integer sequences (one inner list per example)
sequences = [
    [1, 2, 3],
    [4, 5, 1],
    [1, 2],
    [0, 2, 4, 7],
    [9, 8],
    [5, 4, 3, 2, 1],
    [3, 6, 9],
]
# One label per time step: label_sequences[i] lines up with sequences[i]
label_sequences = [
    [0, 1, 0],
    [1, 0, 0],
    [1, 1],
    [0, 1, 1, 0],
    [1, 0],
    [0, 1, 1, 0, 0],
    [1, 0, 1],
]


def make_sequence_example(inputs, labels):
    """Build a tf.train.SequenceExample from parallel input/label sequences.

    Args:
        inputs: iterable of ints, one value per time step.
        labels: iterable of ints, one label per time step; must contain as
            many items as `inputs`.

    Returns:
        A tf.train.SequenceExample whose context carries the int64 feature
        "length" (number of time steps) and whose feature lists "inputs" and
        "labels" hold one single-value int64 feature per time step.

    Raises:
        ValueError: if `inputs` and `labels` differ in length. Previously the
            extra items were silently dropped by zip() while "length" still
            reported len(inputs), yielding an inconsistent example.
    """
    inputs = list(inputs)
    labels = list(labels)
    sequence_length = len(inputs)
    # Validate before building anything so a bad pair never gets serialized
    if len(labels) != sequence_length:
        raise ValueError(
            "inputs and labels must have the same length, got %d and %d"
            % (sequence_length, len(labels)))

    # The object we return
    ex = tf.train.SequenceExample()
    # A non-sequential feature of our example
    ex.context.feature["length"].int64_list.value.append(sequence_length)
    # Feature lists for the two sequential features of our example
    fl_inputs = ex.feature_lists.feature_list["inputs"]
    fl_labels = ex.feature_lists.feature_list["labels"]
    for _input, _label in zip(inputs, labels):
        fl_inputs.feature.add().int64_list.value.append(_input)
        fl_labels.feature.add().int64_list.value.append(_label)
    return ex
 
# Write all examples into a TFRecords file
output_file = os.path.join(os.getcwd(), 'Sequence_test1.tfr')
# Parenthesized single-argument print behaves the same on Python 2 and 3
print(output_file)
writer = tf.python_io.TFRecordWriter(output_file)
try:
    for sequence, label_sequence in zip(sequences, label_sequences):
        ex = make_sequence_example(sequence, label_sequence)
        writer.write(ex.SerializeToString())
finally:
    # Close the writer even if building or writing an example fails
    writer.close()

/Users/Winston/workspace/LSTM/Sequence_test1.tfr


In [5]:
## SequenceExample (2: deserialize/read part-1, leverage context feature in TFR)
import os

tf.reset_default_graph()

BATCH_SIZE = 4

# Read serialized SequenceExamples one at a time from the TFRecords file
file_list = [os.path.join(os.getcwd(), 'Sequence_test1.tfr')]
# Parenthesized single-argument print behaves the same on Python 2 and 3
print(file_list)
# num_epochs=1: go through the file once; the epoch counter is a local
# variable, which is why local_variables_initializer() is run below
file_queue = tf.train.string_input_producer(file_list, num_epochs=1)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(file_queue)

# Define how to parse the example
context_features = {
    "length": tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
    "inputs": tf.FixedLenSequenceFeature([], dtype=tf.int64),
    "labels": tf.FixedLenSequenceFeature([], dtype=tf.int64)
}

# Parse the example
context, sequence = tf.parse_single_sequence_example(
    serialized=serialized_example,
    context_features=context_features,
    sequence_features=sequence_features)

# Batch the variable length tensors with dynamic padding.
# The unpadded length comes from the "length" context feature.
# allow_smaller_final_batch=True lets the last batch hold fewer than
# BATCH_SIZE examples instead of dropping them.
batch_lengths, batch_sequences, batch_labels = tf.train.batch(
    [context["length"], sequence["inputs"], sequence["labels"]],
    batch_size=BATCH_SIZE,
    dynamic_pad=True,
    allow_smaller_final_batch=True,
    name="input_batching")


with tf.Session() as sess:
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        # Fetch two batches
        for _ in range(2):
            lens, seqs, lbls = sess.run([batch_lengths, batch_sequences, batch_labels])
            print('actual lengths =', lens)
            print('sequences = ', seqs)
            print('labels = ', lbls)
    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        coord.request_stop()
    # Wait for the queue-runner threads to finish before closing the session
    coord.join(threads)



['/Users/Winston/workspace/LSTM/Sequence_test1.tfr']
('actual lengths =', array([3, 3, 2, 4]))
('sequences = ', array([[1, 2, 3, 0],
       [4, 5, 1, 0],
       [1, 2, 0, 0],
       [0, 2, 4, 7]]))
('labels = ', array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 1, 0]]))
('actual lengths =', array([2, 5, 3]))
('sequences = ', array([[9, 8, 0, 0, 0],
       [5, 4, 3, 2, 1],
       [3, 6, 9, 0, 0]]))
('labels = ', array([[1, 0, 0, 0, 0],
       [0, 1, 1, 0, 0],
       [1, 0, 1, 0, 0]]))


In [6]:
## SequenceExample (2: deserialize/read part-2, no use of context feature in TFR)
import os

tf.reset_default_graph()

BATCH_SIZE = 4

# Read serialized SequenceExamples one at a time from the TFRecords file
file_list = [os.path.join(os.getcwd(), 'Sequence_test1.tfr')]
# Parenthesized single-argument print behaves the same on Python 2 and 3
print(file_list)
# num_epochs=1: go through the file once; the epoch counter is a local
# variable, which is why local_variables_initializer() is run below
file_queue = tf.train.string_input_producer(file_list, num_epochs=1)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(file_queue)

# Define how to parse the example
sequence_features = {
    "inputs": tf.FixedLenSequenceFeature([], dtype=tf.int64),
    "labels": tf.FixedLenSequenceFeature([], dtype=tf.int64)
}

# Parse the example; the context part is ignored here
_, sequence = tf.parse_single_sequence_example(
    serialized=serialized_example,
    sequence_features=sequence_features)
# Derive the unpadded length from the parsed tensor itself (an int32 scalar)
# instead of reading it from a stored context feature
actual_length = tf.shape(sequence["inputs"])[0]

# Batch the variable length tensors with dynamic padding.
# allow_smaller_final_batch=True lets the last batch hold fewer than
# BATCH_SIZE examples instead of dropping them.
batch_lengths, batch_sequences, batch_labels = tf.train.batch(
    [actual_length, sequence["inputs"], sequence["labels"]],
    batch_size=BATCH_SIZE,
    dynamic_pad=True,
    allow_smaller_final_batch=True,
    name="input_batching")


with tf.Session() as sess:
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        # Fetch two batches
        for _ in range(2):
            lens, seqs, lbls = sess.run([batch_lengths, batch_sequences, batch_labels])
            print('actual_lengths =', lens)
            print('sequences = ', seqs)
            print('labels = ', lbls)
    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        coord.request_stop()
    # Wait for the queue-runner threads to finish before closing the session
    coord.join(threads)



['/Users/Winston/workspace/LSTM/Sequence_test1.tfr']
('actual_lengths =', array([3, 3, 2, 4], dtype=int32))
('sequences = ', array([[1, 2, 3, 0],
       [4, 5, 1, 0],
       [1, 2, 0, 0],
       [0, 2, 4, 7]]))
('labels = ', array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 1, 0]]))
('actual_lengths =', array([2, 5, 3], dtype=int32))
('sequences = ', array([[9, 8, 0, 0, 0],
       [5, 4, 3, 2, 1],
       [3, 6, 9, 0, 0]]))
('labels = ', array([[1, 0, 0, 0, 0],
       [0, 1, 1, 0, 0],
       [1, 0, 1, 0, 0]]))


In [None]:
## Another reference: code snippet from TensorFlow Magenta's sequence_example_lib.py

def make_sequence_example(inputs, labels):
  """Packs per-step inputs and labels into a tf.train.SequenceExample.

  Args:
    inputs: A list of input vectors, each of which is a list of floats.
    labels: A list of ints.

  Returns:
    A tf.train.SequenceExample with the feature lists 'inputs' and 'labels'.
  """
  # One float-list feature per input vector
  input_features = []
  for input_ in inputs:
    float_list = tf.train.FloatList(value=input_)
    input_features.append(tf.train.Feature(float_list=float_list))

  # One single-value int64 feature per label
  label_features = []
  for label in labels:
    int64_list = tf.train.Int64List(value=[label])
    label_features.append(tf.train.Feature(int64_list=int64_list))

  feature_lists = tf.train.FeatureLists(feature_list={
      'inputs': tf.train.FeatureList(feature=input_features),
      'labels': tf.train.FeatureList(feature=label_features),
  })
  return tf.train.SequenceExample(feature_lists=feature_lists)


def get_padded_batch(file_list, batch_size, input_size,
                     num_enqueuing_threads=4):
  """Builds a queue pipeline yielding zero-padded batches of SequenceExamples.

  SequenceExamples of different lengths are supported: within a batch, every
  sequence is padded with zeros up to the length of the longest one.

  Args:
    file_list: A list of paths to TFRecord files containing SequenceExamples.
    batch_size: The number of SequenceExamples to include in each batch.
    input_size: The size of each input vector. The returned batch of inputs
        will have a shape [batch_size, num_steps, input_size].
    num_enqueuing_threads: The number of threads to use for enqueuing
        SequenceExamples.

  Returns:
    inputs: A tensor of shape [batch_size, num_steps, input_size] of float32s.
    labels: A tensor of shape [batch_size, num_steps] of int64s.
    lengths: A tensor of shape [batch_size] of int32s. The lengths of each
        SequenceExample before padding.
  """
  filename_queue = tf.train.string_input_producer(file_list)
  record_reader = tf.TFRecordReader()
  _, serialized = record_reader.read(filename_queue)

  # Parsing spec: a float vector and a scalar int label per time step
  inputs_spec = tf.FixedLenSequenceFeature(shape=[input_size],
                                           dtype=tf.float32)
  labels_spec = tf.FixedLenSequenceFeature(shape=[], dtype=tf.int64)
  _, parsed = tf.parse_single_sequence_example(
      serialized,
      sequence_features={'inputs': inputs_spec, 'labels': labels_spec})

  # Number of time steps of this example, taken before any padding happens
  num_steps = tf.shape(parsed['inputs'])[0]

  # Components: (inputs, labels, length); None marks the padded time axis
  padding_queue = tf.PaddingFIFOQueue(
      capacity=1000,
      dtypes=[tf.float32, tf.int64, tf.int32],
      shapes=[(None, input_size), (None,), ()])

  enqueue_op = padding_queue.enqueue(
      [parsed['inputs'], parsed['labels'], num_steps])
  # The same enqueue op is listed once per enqueuing thread
  enqueue_ops = [enqueue_op for _ in range(num_enqueuing_threads)]
  runner = tf.train.QueueRunner(padding_queue, enqueue_ops)
  tf.train.add_queue_runner(runner)
  return padding_queue.dequeue_many(batch_size)
