In [1]:
import tensorflow as tf
import numpy as np

In [2]:
################################################
## BATCHING & PADDING DATA (high-level approach)
################################################
# Clear the default graph so ops created by earlier cells do not accumulate.
tf.reset_default_graph()

# x = [1, 2, ..., 9]  (tf.range(1, 10) starts at 1 and excludes 10)
x = tf.range(1, 10, name="x")

# A queue that yields the integers 0, 1, 2, ..., 9 in order (shuffle=False)
range_q = tf.train.range_input_producer(limit=10, shuffle=False)
slice_end = range_q.dequeue()

# Take the first `slice_end` elements of x, giving variable-length tensors:
# [], [1], [1, 2], [1, 2, 3], ...  (the first slice is empty because the
# queue starts at 0)
y = tf.slice(x, [0], [slice_end], name="y")


# Batch the variable-length tensor; dynamic_pad=True zero-pads every element
# up to the longest element in the batch (see the printed result below).
batched_data = tf.train.batch(
    tensors=[y],
    batch_size=6,
    dynamic_pad=True,
    name="y_batch"
)

# Run the graph once: start the queue-runner threads, fetch a single padded
# batch, then shut the threads down cleanly before printing.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # Pre-bind `result` so the print below cannot hit a NameError when the
    # fetch is cut short by OutOfRangeError.
    result = None
    try:
        result = sess.run(batched_data)
    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        # Always ask the queue-runner threads to stop, even on failure.
        coord.request_stop()

    # Wait for the threads to finish while the session is still open.
    coord.join(threads)
    if result is not None:
        print(result)

# Run the graph
# tf.contrib.learn takes care of starting the queues for us
#res = tf.contrib.learn.run_n({"y": batched_data}, n=1, feed_dict=None)
 
# Print the result
#print("Batch shape: {}".format(res[0]["y"].shape))
#print(res[0]["y"])

    

[[0 0 0 0 0]
 [1 0 0 0 0]
 [1 2 0 0 0]
 [1 2 3 0 0]
 [1 2 3 4 0]
 [1 2 3 4 5]]


In [3]:
################################################
## BATCHING & PADDING DATA (low-level approach)
################################################
# Clear the default graph so ops created by earlier cells do not accumulate.
tf.reset_default_graph()

# x = [1, 2, ..., 9]  (tf.range(1, 10) starts at 1 and excludes 10)
x = tf.range(1, 10, name="x")

# A queue that yields the integers 0, 1, 2, ..., 9 in order (shuffle=False)
range_q = tf.train.range_input_producer(limit=10, shuffle=False)
slice_end = range_q.dequeue()

# Take the first `slice_end` elements of x, giving variable-length tensors:
# [], [1], [1, 2], [1, 2, 3], ...  (the first slice is empty because the
# queue starts at 0)
y = tf.slice(x, [0], [slice_end], name="y")


# Batch the variable-length tensor by hand with a PaddingFIFOQueue.
# `dtypes` and `shapes` are parallel lists with one entry per queue
# component; shape [None] declares a 1-D component of varying length.
padding_q = tf.PaddingFIFOQueue(
    capacity=10,
    dtypes=[tf.int32],
    shapes=[[None]])
enqueue_op = padding_q.enqueue([y])
# Register a queue runner so that tf.train.start_queue_runners() launches a
# thread that keeps running `enqueue_op`.
qr = tf.train.QueueRunner(padding_q, [enqueue_op])
tf.train.add_queue_runner(qr)
# Dequeue 6 elements at once; they come back zero-padded to a common length
# (see the printed result below).
batched_data = padding_q.dequeue_many(6)


# Run the graph once: start the queue-runner threads, fetch a single padded
# batch, then shut the threads down cleanly before printing.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # Pre-bind `result` so the print below cannot hit a NameError when the
    # fetch is cut short by OutOfRangeError.
    result = None
    try:
        result = sess.run(batched_data)
    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        # Always ask the queue-runner threads to stop, even on failure.
        coord.request_stop()

    # Wait for the threads to finish while the session is still open.
    coord.join(threads)
    if result is not None:
        print(result)

[[0 0 0 0 0]
 [1 0 0 0 0]
 [1 2 0 0 0]
 [1 2 3 0 0]
 [1 2 3 4 0]
 [1 2 3 4 5]]


In [4]:
#############################################################
## SequenceExample 1 [leverage context feature in TFR]
## Note:
##      input feature is simply a float32 per time-step, 
##      each sequence has different time-steps
#############################################################

### 1: serialize/write part

import os

# Clear the default graph so ops created by earlier cells do not accumulate.
tf.reset_default_graph()

# Toy dataset: seven float sequences of differing length ...
sequences = [
    [1., 2., 3.],
    [4., 5., 1.],
    [1., 2.],
    [0., 2., 4., 7.],
    [9., 8.],
    [5., 4., 3., 2., 1.],
    [3., 6., 9.],
]
# ... and, for each sequence, one integer label per time-step.
label_sequences = [
    [0, 1, 0],
    [1, 0, 0],
    [1, 1],
    [0, 1, 1, 0],
    [1, 0],
    [0, 1, 1, 0, 0],
    [1, 0, 1],
]

def make_sequence_example(inputs, labels):
    """Pack one sequence and its labels into a tf.train.SequenceExample.

    Args:
        inputs: a list of floats, one value per time-step.
        labels: a list of ints, one label per time-step.

    Returns:
        A tf.train.SequenceExample whose context holds the int64 feature
        'length' (= len(inputs)) and whose feature lists 'inputs' and
        'labels' hold one single-value Feature per time-step.
    """
    def _float_step(value):
        # One time-step of the float sequence.
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    def _int64_step(value):
        # One time-step of the integer label sequence.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    # Non-sequential (context) part: just the sequence length.
    length_feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=[len(inputs)]))
    context = tf.train.Features(feature={'length': length_feature})

    # Sequential part: the per-time-step inputs and labels.
    feature_lists = tf.train.FeatureLists(feature_list={
        'inputs': tf.train.FeatureList(
            feature=[_float_step(step) for step in inputs]),
        'labels': tf.train.FeatureList(
            feature=[_int64_step(step) for step in labels]),
    })
    return tf.train.SequenceExample(context=context,
                                    feature_lists=feature_lists)
 
# Write all examples into a TFRecords file.
# The writer is used as a context manager so the file is closed even if
# building or serializing an example raises part-way through.
output_file = os.path.join(os.getcwd(), 'Sequence_test_1.tfr')
with tf.python_io.TFRecordWriter(output_file) as writer:
    for sequence, label_sequence in zip(sequences, label_sequences):
        ex = make_sequence_example(sequence, label_sequence)
        writer.write(ex.SerializeToString())

# Alternative:
# def make_sequence_example(inputs, labels):
#     # The object we return
#     ex = tf.train.SequenceExample()
#     # A non-sequential feature of our example
#     sequence_length = len(inputs)
#     ex.context.feature["length"].int64_list.value.append(sequence_length)
#     # Feature lists for the two sequential features of our example
#     fl_inputs = ex.feature_lists.feature_list["inputs"]
#     fl_labels = ex.feature_lists.feature_list["labels"]
#     for _input, _label in zip(inputs, labels):
#         fl_inputs.feature.add().float_list.value.append(_input)
#         fl_labels.feature.add().int64_list.value.append(_label)
#     return ex  



### 2: deserialize/read part
# Build the reading pipeline in a fresh graph.
tf.reset_default_graph()

BATCH_SIZE = 4
file_list = [os.path.join(os.getcwd(), 'Sequence_test_1.tfr')]
# Function-call form prints the same thing on Python 2 and Python 3.
print(file_list)
# num_epochs=1: the file is read exactly once. The epoch counter is a local
# variable, which is why the session initializes local variables as well.
file_queue = tf.train.string_input_producer(file_list, num_epochs=1)
reader = tf.TFRecordReader()
# Each run of `serialized_example` yields the next record from the file.
_, serialized_example = reader.read(file_queue)

# Parsing spec for the context part: `length` is a scalar int64.
context_features = {
    "length": tf.FixedLenFeature([], dtype=tf.int64),
}
# Parsing spec for the sequential part: one scalar per time-step.
sequence_features = {
    "inputs": tf.FixedLenSequenceFeature([], dtype=tf.float32),
    "labels": tf.FixedLenSequenceFeature([], dtype=tf.int64),
}

# Decode one serialized SequenceExample into (context dict, sequence dict).
context, sequence = tf.parse_single_sequence_example(
    serialized=serialized_example,
    context_features=context_features,
    sequence_features=sequence_features,
)

# Group BATCH_SIZE examples per batch. dynamic_pad pads the variable-length
# tensors to the longest one in the batch; allow_smaller_final_batch lets the
# leftover examples come out as a short last batch.
tensors_to_batch = [
    context["length"],
    sequence["inputs"],
    sequence["labels"],
]
batch_lengths, batch_sequences, batch_labels = tf.train.batch(
    tensors_to_batch,
    batch_size=BATCH_SIZE,
    dynamic_pad=True,
    allow_smaller_final_batch=True,
    name="input_batching",
)


# Pull two batches from the input pipeline and print them.
with tf.Session() as sess:
    # Local variables are initialized too: string_input_producer with
    # num_epochs set keeps its epoch counter in a local variable.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for i in range(2):
            lens, seqs, lbls = sess.run([batch_lengths, batch_sequences, batch_labels])
            print('actual_lengths =', lens)
            print('batch_size=%d, time_steps=%d' % (lbls.shape[0],lbls.shape[1]))
            print('sequences = ', seqs)
            print('labels = ', lbls)
    except tf.errors.OutOfRangeError:
        # Raised once the single epoch of input is exhausted.
        print('Done')
    finally:
        # Always ask the queue-runner threads to stop, even on failure.
        coord.request_stop()

    # Wait for the queue-runner threads to exit before the session closes.
    coord.join(threads)


['/Users/Winston/workspace/LSTM/Sequence_test_1.tfr']
('actual_lengths =', array([3, 3, 2, 4]))
batch_size=4, time_steps=4
('sequences = ', array([[ 1.,  2.,  3.,  0.],
       [ 4.,  5.,  1.,  0.],
       [ 1.,  2.,  0.,  0.],
       [ 0.,  2.,  4.,  7.]], dtype=float32))
('labels = ', array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 1, 0]]))
('actual_lengths =', array([2, 5, 3]))
batch_size=3, time_steps=5
('sequences = ', array([[ 9.,  8.,  0.,  0.,  0.],
       [ 5.,  4.,  3.,  2.,  1.],
       [ 3.,  6.,  9.,  0.,  0.]], dtype=float32))
('labels = ', array([[1, 0, 0, 0, 0],
       [0, 1, 1, 0, 0],
       [1, 0, 1, 0, 0]]))


In [5]:
#############################################################
## SequenceExample 2 [no use of context feature in TFR]
## Note:
##      input feature is simply a float32 per time-step, 
##      each sequence has different time-steps
#############################################################

### 1: serialize/write part 

import os

# Clear the default graph so ops created by earlier cells do not accumulate.
tf.reset_default_graph()

# Toy dataset: seven float sequences of differing length ...
sequences = [
    [1., 2., 3.],
    [4., 5., 1.],
    [1., 2.],
    [0., 2., 4., 7.],
    [9., 8.],
    [5., 4., 3., 2., 1.],
    [3., 6., 9.],
]
# ... and, for each sequence, one integer label per time-step.
label_sequences = [
    [0, 1, 0],
    [1, 0, 0],
    [1, 1],
    [0, 1, 1, 0],
    [1, 0],
    [0, 1, 1, 0, 0],
    [1, 0, 1],
]

def make_sequence_example(inputs, labels):
    """Pack one sequence and its labels into a tf.train.SequenceExample.

    No context features are stored; the record only carries feature lists.

    Args:
        inputs: a list of floats, one value per time-step.
        labels: a list of ints, one label per time-step.

    Returns:
        A tf.train.SequenceExample with feature lists 'inputs' and 'labels',
        each holding one single-value Feature per time-step.
    """
    # Wrap every float time-step in its own Feature.
    input_steps = []
    for value in inputs:
        float_list = tf.train.FloatList(value=[value])
        input_steps.append(tf.train.Feature(float_list=float_list))

    # Wrap every integer label in its own Feature.
    label_steps = []
    for value in labels:
        int64_list = tf.train.Int64List(value=[value])
        label_steps.append(tf.train.Feature(int64_list=int64_list))

    feature_lists = tf.train.FeatureLists(feature_list={
        'inputs': tf.train.FeatureList(feature=input_steps),
        'labels': tf.train.FeatureList(feature=label_steps),
    })
    return tf.train.SequenceExample(feature_lists=feature_lists)
 
# Write all examples into a TFRecords file.
# The writer is used as a context manager so the file is closed even if
# building or serializing an example raises part-way through.
output_file = os.path.join(os.getcwd(), 'Sequence_test_2.tfr')
with tf.python_io.TFRecordWriter(output_file) as writer:
    for sequence, label_sequence in zip(sequences, label_sequences):
        ex = make_sequence_example(sequence, label_sequence)
        writer.write(ex.SerializeToString())



### 2: deserialize/read part
# Build the reading pipeline in a fresh graph.
tf.reset_default_graph()

BATCH_SIZE = 4
file_list = [os.path.join(os.getcwd(), 'Sequence_test_2.tfr')]
# Function-call form prints the same thing on Python 2 and Python 3.
print(file_list)
# num_epochs=1: the file is read exactly once. The epoch counter is a local
# variable, which is why the session initializes local variables as well.
file_queue = tf.train.string_input_producer(file_list, num_epochs=1)
reader = tf.TFRecordReader()
# Each run of `serialized_example` yields the next record from the file.
_, serialized_example = reader.read(file_queue)

# Parsing spec: only sequential features, one scalar per time-step.
sequence_features = {
    "inputs": tf.FixedLenSequenceFeature([], dtype=tf.float32),
    "labels": tf.FixedLenSequenceFeature([], dtype=tf.int64),
}

# Decode one serialized SequenceExample; the context part is unused here.
_, sequence = tf.parse_single_sequence_example(
    serialized=serialized_example,
    sequence_features=sequence_features,
)
# No stored length in this file, so take it from the parsed tensor's
# leading dimension.
inputs_shape = tf.shape(sequence["inputs"])
actual_length = inputs_shape[0]

# Group BATCH_SIZE examples per batch. dynamic_pad pads the variable-length
# tensors to the longest one in the batch; allow_smaller_final_batch lets the
# leftover examples come out as a short last batch.
tensors_to_batch = [
    actual_length,
    sequence["inputs"],
    sequence["labels"],
]
batch_lengths, batch_sequences, batch_labels = tf.train.batch(
    tensors_to_batch,
    batch_size=BATCH_SIZE,
    dynamic_pad=True,
    allow_smaller_final_batch=True,
    name="input_batching",
)


# Pull two batches from the input pipeline and print them.
with tf.Session() as sess:
    # Local variables are initialized too: string_input_producer with
    # num_epochs set keeps its epoch counter in a local variable.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for i in range(2):
            lens, seqs, lbls = sess.run([batch_lengths, batch_sequences, batch_labels])
            print('actual_lengths =', lens)
            print('batch_size=%d, time_steps=%d' % (lbls.shape[0],lbls.shape[1]))
            print('sequences = ', seqs)
            print('labels = ', lbls)
    except tf.errors.OutOfRangeError:
        # Raised once the single epoch of input is exhausted.
        print('Done')
    finally:
        # Always ask the queue-runner threads to stop, even on failure.
        coord.request_stop()

    # Wait for the queue-runner threads to exit before the session closes.
    coord.join(threads)


['/Users/Winston/workspace/LSTM/Sequence_test_2.tfr']
('actual_lengths =', array([3, 3, 2, 4], dtype=int32))
batch_size=4, time_steps=4
('sequences = ', array([[ 1.,  2.,  3.,  0.],
       [ 4.,  5.,  1.,  0.],
       [ 1.,  2.,  0.,  0.],
       [ 0.,  2.,  4.,  7.]], dtype=float32))
('labels = ', array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 1, 0]]))
('actual_lengths =', array([2, 5, 3], dtype=int32))
batch_size=3, time_steps=5
('sequences = ', array([[ 9.,  8.,  0.,  0.,  0.],
       [ 5.,  4.,  3.,  2.,  1.],
       [ 3.,  6.,  9.,  0.,  0.]], dtype=float32))
('labels = ', array([[1, 0, 0, 0, 0],
       [0, 1, 1, 0, 0],
       [1, 0, 1, 0, 0]]))


In [6]:
#############################################################
## SequenceExample 3 [no use of context feature in TFR]
## Note:
##      input feature is a float32 vector per time-step,
##      input feature dimension is the same for all sequences,
##      actual time-steps may differ for each sequence
#############################################################

### 1: serialize/write part 

import os

# Clear the default graph so ops created by earlier cells do not accumulate.
tf.reset_default_graph()

# Width of the feature vector attached to every time-step.
FEATURE_SIZE_PER_TIMESTEP = 5

# Toy dataset: seven sequences of differing length; every time-step is a
# vector of FEATURE_SIZE_PER_TIMESTEP floats.
sequences = [
    [[1., 1., 1., 1., 1.],
     [2., 3., 4., 5., 6.],
     [3., 2., 1., 0., -1.]],
    [[4., 3., 1., 2., 5.],
     [5., 5., 5., 5., 5.],
     [1., 2., 3., 4., 5.]],
    [[1., 0., 0., 0., 1.],
     [2., 2., 2., 2., 2.]],
    [[0., 0., 0., 0., 0.],
     [2., 1., 0., -1., -2.],
     [4., 8., 12., 16., 20.],
     [7., 7., 7., 0., 1.]],
    [[9., 9., 9., 9., 9.],
     [8., 8., 1., 1., 1.]],
    [[5., 4., 3., 2., 1.],
     [4., 4., 8., 8., 8.],
     [3., 3., 3., 6., 6.],
     [2., 2., 2., 2., 1.],
     [1., 1., 1., 1., 1.]],
    [[3., 0., 3., 0., 3.],
     [6., 8., 3., 1., 1.],
     [9., 9., 9., 9., 8.]],
]
# One integer label per time-step of the matching sequence.
label_sequences = [
    [0, 1, 0],
    [1, 0, 0],
    [1, 1],
    [0, 1, 1, 0],
    [1, 0],
    [0, 1, 1, 0, 0],
    [1, 0, 1],
]

def make_sequence_example(inputs, labels):
    """Pack one vector-valued sequence and its labels into a SequenceExample.

    Args:
        inputs: a list of input vectors, one per time-step; each vector is a
            list of floats (FEATURE_SIZE_PER_TIMESTEP entries in this
            notebook's data).
        labels: a list of ints, one label per time-step.

    Returns:
        A tf.train.SequenceExample with feature lists 'inputs' (one
        multi-value float Feature per time-step) and 'labels' (one
        single-value int64 Feature per time-step).
    """
    def _vector_step(vector):
        # The whole per-time-step vector goes into a single FloatList.
        return tf.train.Feature(float_list=tf.train.FloatList(value=vector))

    def _label_step(label):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))

    inputs_feature_list = tf.train.FeatureList(
        feature=[_vector_step(vector) for vector in inputs])
    labels_feature_list = tf.train.FeatureList(
        feature=[_label_step(label) for label in labels])

    return tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(feature_list={
            'inputs': inputs_feature_list,
            'labels': labels_feature_list,
        }))

# Write all examples into a TFRecords file.
# The writer is used as a context manager so the file is closed even if
# building or serializing an example raises part-way through.
output_file = os.path.join(os.getcwd(), 'Sequence_test_3.tfr')
with tf.python_io.TFRecordWriter(output_file) as writer:
    for sequence, label_sequence in zip(sequences, label_sequences):
        ex = make_sequence_example(sequence, label_sequence)
        writer.write(ex.SerializeToString())


## 2: deserialize/read part
# Build the reading pipeline in a fresh graph.
tf.reset_default_graph()

BATCH_SIZE = 4
# Must match the vector width used when the file was written.
FEATURE_SIZE_PER_TIMESTEP = 5

file_list = [os.path.join(os.getcwd(), 'Sequence_test_3.tfr')]
# Function-call form prints the same thing on Python 2 and Python 3.
print(file_list)
# num_epochs=1: the file is read exactly once. The epoch counter is a local
# variable, which is why the session initializes local variables as well.
file_queue = tf.train.string_input_producer(file_list, num_epochs=1)
reader = tf.TFRecordReader()
# Each run of `serialized_example` yields the next record from the file.
_, serialized_example = reader.read(file_queue)

# Parsing spec: each `inputs` time-step is a vector of
# FEATURE_SIZE_PER_TIMESTEP floats; each `labels` time-step is a scalar.
sequence_features = {
    "inputs": tf.FixedLenSequenceFeature([FEATURE_SIZE_PER_TIMESTEP], dtype=tf.float32),
    "labels": tf.FixedLenSequenceFeature([], dtype=tf.int64),
}

# Decode one serialized SequenceExample; the context part is unused here.
_, sequence = tf.parse_single_sequence_example(
    serialized=serialized_example,
    sequence_features=sequence_features,
)
# No stored length in this file, so take it from the parsed tensor's
# leading dimension.
inputs_shape = tf.shape(sequence["inputs"])
actual_length = inputs_shape[0]

# Group BATCH_SIZE examples per batch. dynamic_pad pads the variable-length
# tensors to the longest one in the batch; allow_smaller_final_batch lets the
# leftover examples come out as a short last batch.
tensors_to_batch = [
    actual_length,
    sequence["inputs"],
    sequence["labels"],
]
batch_lengths, batch_sequences, batch_labels = tf.train.batch(
    tensors_to_batch,
    batch_size=BATCH_SIZE,
    dynamic_pad=True,
    allow_smaller_final_batch=True,
    name="input_batching",
)


# Pull two batches from the input pipeline and print them.
with tf.Session() as sess:
    # Local variables are initialized too: string_input_producer with
    # num_epochs set keeps its epoch counter in a local variable.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for i in range(2):
            lens, seqs, lbls = sess.run([batch_lengths, batch_sequences, batch_labels])
            print('actual_lengths =', lens)
            print('batch_size=%d, time_steps=%d' % (lbls.shape[0],lbls.shape[1]))
            print('sequences = ', seqs)
            print('labels = ', lbls)
    except tf.errors.OutOfRangeError as e:
        # Raised once the single epoch of input is exhausted.
        print('Done')
        print(e.error_code, e.message)
    finally:
        # Always ask the queue-runner threads to stop, even on failure.
        coord.request_stop()

    # Wait for the queue-runner threads to exit before the session closes.
    coord.join(threads)



['/Users/Winston/workspace/LSTM/Sequence_test_3.tfr']
('actual_lengths =', array([3, 3, 2, 4], dtype=int32))
batch_size=4, time_steps=4
('sequences = ', array([[[  1.,   1.,   1.,   1.,   1.],
        [  2.,   3.,   4.,   5.,   6.],
        [  3.,   2.,   1.,   0.,  -1.],
        [  0.,   0.,   0.,   0.,   0.]],

       [[  4.,   3.,   1.,   2.,   5.],
        [  5.,   5.,   5.,   5.,   5.],
        [  1.,   2.,   3.,   4.,   5.],
        [  0.,   0.,   0.,   0.,   0.]],

       [[  1.,   0.,   0.,   0.,   1.],
        [  2.,   2.,   2.,   2.,   2.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.]],

       [[  0.,   0.,   0.,   0.,   0.],
        [  2.,   1.,   0.,  -1.,  -2.],
        [  4.,   8.,  12.,  16.,  20.],
        [  7.,   7.,   7.,   0.,   1.]]], dtype=float32))
('labels = ', array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 1, 0]]))
('actual_lengths =', array([2, 5, 3], dtype=int32))
batch_size=3, time_steps=5
('se