# Notebook to Practice Bucketing
* [TF docs](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/training/bucket_by_sequence_length)
```
bucket_by_sequence_length(
    input_length,
    tensors,
    batch_size,
    bucket_boundaries,
    num_threads=1,
    capacity=32,
    bucket_capacities=None,
    shapes=None,
    dynamic_pad=False,
    allow_smaller_final_batch=False,
    keep_input=True,
    shared_name=None,
    name=None
)
```

**Relevant**
* [GitHub Issue 8182: how do you dequeue buckets](https://github.com/tensorflow/tensorflow/issues/8182)
* [GitHub Issue 5069: how do you use bucketing functions](https://github.com/tensorflow/tensorflow/issues/5609)

**Code I used to get this working**
* \*\* **[Working minimal example code](http://pythonexample.com/search/dtype/3)** \*\*
  * [Updated Version](https://gist.github.com/raviqqe/2b7112f8fe4f97ee8c723d83c51e3197)
* [Function using bucket_by_sequence_length in Stanford Tutorials](https://github.com/chiphuyen/tf-stanford-tutorials/blob/master/examples/cgru/data_reader.py)

**More Examples**
* [programtalk: Example 60 shows a function using tf.learn_by_sequence_length](http://programtalk.com/python-examples/tensorflow.reset_default_graph/?ipage=2)

Notes: I treat word-embeddings and one-hot vectors as the same thing. Generally, word-embeddings are the output of some function on one-hot vectors

(This is practice for bucketting sequences of words so my examples will be oriented towards this)

# What have I learned?

* You need to create queue runners in order to use the bucketting function


In [16]:
import tensorflow as tf
import numpy as np

In [17]:
# function to generate a sequence of one_hot vectors (i.e. a sequence of "words")
def gen_sequence(words, embedding_size):
    sentences = np.zeros((len(words), embedding_size))
    for i in range(len(words)):
        sentences[i,words[i]] = 1
    return sentences

# sentence with 4 words
print (gen_sequence([1, 5, 6, 2], 10)) 

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]]


In [28]:
# generate N=6 sentences with variable number of words
# Assume a vocabulary of size V=30
# maximum words per sentence will be m=10
V=20
sentences = []
N=4
m=10
for i in range(N):
    nwords = np.random.randint(1, m+1)
    words = np.random.randint(V, size=nwords)
    sentence = gen_sequence(words, V)
#     print(sentence)
#     print()
    sentences.append(sentence)

# print sentence lengths
print([len(s) for s in sentences])

[4, 10, 3, 8]


In [19]:
# assume a batch_size of 1 (unrealistic and will change iteratively)
batch_size=1
bucket_boundaries = [2, 4, 6, 8]

# lengths = tf.placeholder(tf.int32, ())
x = tf.placeholder(tf.float32, shape=(None, ))
q_capacity=100

# assign operations to enqueue and dequeue"word embeddings" or "one-hot vectors"
q = tf.FIFOQueue(q_capacity, tf.float32, shapes=[x.get_shape()])
enq_op = q.enqueue(x)
deq_op = q.dequeue()

# unsure about what length to use for input_length
batches = tf.contrib.training.bucket_by_sequence_length(
                input_length=q_capacity, 
                tensors=[deq_op], 
                batch_size=batch_size, 
                bucket_boundaries = bucket_boundaries,
                allow_smaller_final_batch=True)

In [27]:
print (sentences[0])
sess = tf.Session()

sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

threads = tf.train.start_queue_runners(sess)
for sentence in sentences:
#     encode a single sentence
#     sess.run(enq_op, feed_dict={x: np.reshape(sentence, [1,0])})
#     for word in sentence:
#         # need to expand each one-hot from (20,) to (1,20) since that's the shape of our placeholder
#         sess.run(enq_op, feed_dict={x: word})
#     print("batches", sess.run(batches))
    break
sess.close()

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]
batches (array([100], dtype=int32), [array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  1.,  0.,  0.,  0.]], dtype=float32)])


In [11]:
x = tf.constant([123, 456, 789])
q = tf.FIFOQueue(100, tf.int32, shapes=[x.get_shape()])
enq_op = q.enqueue(x)
x = q.dequeue()
 
deq = tf.contrib.training.bucket_by_sequence_length(tf.shape(x)[0], [x], 1, [1],
    allow_smaller_final_batch=True)
print("deq", deq)
 
s = tf.InteractiveSession()
 
for _ in range(64):
  s.run(enq_op)
 
tf.train.start_queue_runners(s)
 
print("run_deq", s.run(deq))
 
# # Outputs
# (<tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:1' shape=(?,) dtype=int32>, [<tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:2' shape=(?, 3) dtype=int32>])
# (array([3], dtype=int32), [array([[123, 456, 789]], dtype=int32)])

deq (<tf.Tensor 'bucket_by_sequence_length_3/bucket/dequeue_top:1' shape=(?,) dtype=int32>, [<tf.Tensor 'bucket_by_sequence_length_3/bucket/dequeue_top:2' shape=(?, 3) dtype=int32>])
run_deq (array([3], dtype=int32), [array([[123, 456, 789]], dtype=int32)])


In [4]:
# # Process sentences for bucketting
# input_length=np.array([len(s) for s in sentences])
# # not sure why but input_length seems to need to be the largest length of all
# max_length=0
# for s in sentences: max_length = max(max_length, len(s))

# sentence_tensors = [tf.convert_to_tensor(s) for s in sentences]

In [None]:
q = tf.FIFOQueue(100, tf.int32, shapes=[x.get_shape() for x in sentence_tensors])

In [5]:
def batch_examples(examples, batch_size, bucket_boundaries=None):
  """Given a queue of examples, create batches of examples with similar lengths.
  We assume that examples is a dictionary with string keys and tensor values,
  possibly coming from a queue, e.g., constructed by examples_queue above.
  Each tensor in examples is assumed to be 1D. We will put tensors of similar
  length into batches togeter. We return a dictionary with the same keys as
  examples, and with values being batches of size batch_size. If elements have
  different lengths, they are padded with 0s. This function is based on
  tf.contrib.training.bucket_by_sequence_length so see there for details.
  For example, if examples is a queue containing [1, 2, 3] and [4], then
  this function with batch_size=2 will return a batch [[1, 2, 3], [4, 0, 0]].
  Args:
    examples: a dictionary with string keys and 1D tensor values.
    batch_size: a python integer or a scalar int32 tensor.
    bucket_boundaries: a list of integers for the boundaries that will be
      used for bucketing; see tf.contrib.training.bucket_by_sequence_length
      for more details; if None, we create a default set of buckets.
  Returns:
    A dictionary with the same keys as examples and with values being batches
    of examples padded with 0s, i.e., [batch_size x length] tensors.
  """
  # Create default buckets if none were provided.
  if bucket_boundaries is None:
    # Small buckets -- go in steps of 8 until 64.
    small_buckets = [8 * (i + 1) for i in range(8)]
    # Medium buckets -- go in steps of 32 until 256.
    medium_buckets = [32 * (i + 3) for i in range(6)]
    # Large buckets -- go in steps of 128 until maximum of 1024.
    large_buckets = [128 * (i + 3) for i in range(6)]
    # By default use the above 20 bucket boundaries (21 queues in total).
    bucket_boundaries = small_buckets + medium_buckets + large_buckets
  with tf.name_scope("batch_examples"):
    # The queue to bucket on will be chosen based on maximum length.
    max_length = 0
    for v in examples:  # We assume 0-th dimension is the length.
      max_length = tf.maximum(max_length, tf.shape(v)[0])
    (_, outputs) = tf.contrib.training.bucket_by_sequence_length(
        max_length, examples, batch_size, bucket_boundaries,
        capacity=2 * batch_size, dynamic_pad=True)
    return outputs

In [None]:
outputs = batch_examples(sentence_tensors, 2)
print(outputs)
sess = tf.Session()

sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)

# print(sess.run(outputs))
# sess.close()

[<tf.Tensor 'batch_examples_3/bucket_by_sequence_length/bucket/dequeue_top:2' shape=(2, 1, 20) dtype=float64>, <tf.Tensor 'batch_examples_3/bucket_by_sequence_length/bucket/dequeue_top:3' shape=(2, 2, 20) dtype=float64>, <tf.Tensor 'batch_examples_3/bucket_by_sequence_length/bucket/dequeue_top:4' shape=(2, 8, 20) dtype=float64>, <tf.Tensor 'batch_examples_3/bucket_by_sequence_length/bucket/dequeue_top:5' shape=(2, 1, 20) dtype=float64>]


In [None]:

sentence_tensors = [tf.convert_to_tensor(s) for s in sentences]
capacity=32 # arbitrary
q = tf.FIFOQueue(capacity, tf.float32, shapes=[x.get_shape()])

sequence_lengths, outputs = tf.contrib.training.bucket_by_sequence_length(
                                input_length=max_length, 
                                tensors = sentence_tensors, 
                                batch_size = 2, # arbitrary
                                capacity=capacity,
                                bucket_boundaries = [8], 
                                dynamic_pad=True, 
                                allow_smaller_final_batch=True,
                                name="bucketting_example")

sess = tf.Session()

sess.run(tf.global_variables_initializer())
print(sess.run(outputs))
sess.close()

In [7]:
# http://pythonexample.com/search/dtype/3
x = tf.constant([123, 456, 789])
q = tf.FIFOQueue(100, tf.int32, shapes=[x.get_shape()])
enq_op = q.enqueue(x)
x = q.dequeue()
 
deq = tf.contrib.training.bucket_by_sequence_length(tf.shape(x)[0], [x], 1, [1],
    allow_smaller_final_batch=True)
print("deq", deq)
 
s = tf.InteractiveSession()
 
for _ in range(64):
  s.run(enq_op)
 
tf.train.start_queue_runners(s)
 
print("run_deq", s.run(deq))
 
# # Outputs
# (<tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:1' shape=(?,) dtype=int32>, [<tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:2' shape=(?, 3) dtype=int32>])
# (array([3], dtype=int32), [array([[123, 456, 789]], dtype=int32)])

deq (<tf.Tensor 'bucket_by_sequence_length_1/bucket/dequeue_top:1' shape=(?,) dtype=int32>, [<tf.Tensor 'bucket_by_sequence_length_1/bucket/dequeue_top:2' shape=(?, 3) dtype=int32>])
run_deq (array([3], dtype=int32), [array([[123, 456, 789]], dtype=int32)])


## ProgramTalk Example

In [6]:
tf.reset_default_graph()

# All inputs must be identical lengths across tuple index.
# The input reader will get input_length from the first tuple
# entry.
allow_small_batch=True
data_len = 4
labels_len = 3
input_pairs = [
  (length,
   ([np.int64(length)] * data_len,
    [str(length).encode("ascii")] * labels_len))
  for length in (1, 3, 4, 5, 6, 10)]

lengths = tf.placeholder(tf.int32, ())
data = tf.placeholder(tf.int64, (data_len,))
labels = tf.placeholder(tf.string, (labels_len,))

batch_size = 8
bucket_boundaries = [3, 4, 5, 10]

# Make capacity very large so we can feed all the inputs in the
# main thread without blocking
input_queue = tf.FIFOQueue(
  5000, (tf.int32, tf.int64, tf.string),
  ((), (data_len,), (labels_len,)))
input_enqueue_op = input_queue.enqueue((lengths, data, labels))
lengths_t, data_t, labels_t = input_queue.dequeue()
close_input_op = input_queue.close()

(out_lengths_t, data_and_labels_t) = (
  tf.contrib.training.bucket_by_sequence_length(
      input_length=lengths_t,
      tensors=[data_t, labels_t],
      batch_size=batch_size,
      bucket_boundaries=bucket_boundaries,
      allow_smaller_final_batch=allow_small_batch,
      num_threads=10))

expected_batch_size = None if allow_small_batch else batch_size

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Feed the inputs, then close the input thread.
for _ in range(50 * batch_size + 100):
    which = np.random.randint(0, len(input_pairs) - 1)
    length, pair = input_pairs[which]
    print ("length:", length)
    print ("pair:", pair)
    print()
    sess.run(input_enqueue_op, feed_dict={
    lengths: length, data: pair[0], labels: pair[1]})
sess.run(close_input_op)

# # Start the queue runners
threads = tf.train.start_queue_runners(sess)
# Read off the top of the bucket and ensure correctness of output
def _read_test(sess):
    for _ in range(50):
      (out_lengths, (data, labels)) = sess.run(
          (out_lengths_t, data_and_labels_t))
    print("out_lengths", out_lengths, "(data, labels)", (data, labels))
_read_test(sess)

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 5
pair: ([5, 5, 5, 5], [b'5', b'5', b'5'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 5
pair: ([5, 5, 5, 5], [b'5', b'5', b'5'])

length: 5
pair: ([5, 5, 5, 5], [b'5', b'5', b'5'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 5
pair: ([5, 5, 5, 5], [b'5', b'5', b'5'])

length: 1
pair: ([1, 1, 1, 1], [b'1', b'1', b'1'])

length: 1
pair: ([1, 1, 1, 1], [b'1', b'1', b'1'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 6
pa


length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 1
pair: ([1, 1, 1, 1], [b'1', b'1', b'1'])

length: 5
pair: ([5, 5, 5, 5], [b'5', b'5', b'5'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 5
pair: ([5, 5, 5, 5], [b'5', b'5', b'5'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 6
pair: ([6, 6, 6, 6], [b'6', b'6', b'6'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 1
pair: ([1, 1, 1, 1], [b'1', b'1', b'1'])

length: 3
pair: ([3, 3, 3, 3], [b'3', b'3', b'3'])

length: 1
pair: ([1, 1, 1, 1], [b'1', b'1', b'1'])

length: 4
pair: ([4, 4, 4, 4], [b'4', b'4', b'4'])

length: 1
pair: ([1, 1, 1, 1], [b'1', b'1', b'1'])

length: 1
p