### Datasets

https://www.tensorflow.org/guide/datasets

The `tf.data` API enables you to build complex input pipelines from simple, reusable pieces. There are two main aspects of the `tf.data` library:

1. `Dataset`, which holds both the **source** and the **transformations**.
2. `Iterator`, which takes a `Dataset` and acts like a generator.

Note, datasets are intended to replace the earlier `Queue` and `Dequeue` APIs. https://www.tensorflow.org/api_guides/python/reading_data


In [32]:
import tensorflow as tf
import numpy as np

In [8]:
# List the names exposed by the tf.data module, dropping anything with a dunder.
[name for name in dir(tf.data) if '__' not in name]

['Dataset',
 'FixedLengthRecordDataset',
 'Iterator',
 'TFRecordDataset',
 'TextLineDataset']

### Dataset methods

In [9]:
# List the attributes of tf.data.Dataset, dropping anything with a dunder
# (single-underscore private names are still included).
[name for name in dir(tf.data.Dataset) if '__' not in name]

['_GeneratorState',
 '_as_serialized_graph',
 '_as_variant_tensor',
 '_enumerate',
 '_tf_api_names',
 '_tf_api_names_v1',
 'apply',
 'batch',
 'cache',
 'concatenate',
 'filter',
 'flat_map',
 'from_generator',
 'from_sparse_tensor_slices',
 'from_tensor_slices',
 'from_tensors',
 'interleave',
 'list_files',
 'make_initializable_iterator',
 'make_one_shot_iterator',
 'map',
 'output_classes',
 'output_shapes',
 'output_types',
 'padded_batch',
 'prefetch',
 'range',
 'repeat',
 'shard',
 'shuffle',
 'skip',
 'take',
 'zip']

In [15]:
# A dataset sliced from a single [4, 10] tensor along its first axis.
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))

# A (features, target) dataset: both tensors are sliced along their first axis.
feat = tf.random_uniform([10, 3])
targ = tf.random_uniform([10])
dataset2 = tf.data.Dataset.from_tensor_slices((feat, targ))

# Existing datasets can be zipped together into one nested dataset.
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

# Report the element dtypes and shapes of each dataset, in creation order.
for ds in (dataset1, dataset2, dataset3):
    print(ds.output_types)
    print(ds.output_shapes)

<dtype: 'float32'>
(10,)
(tf.float32, tf.float32)
(TensorShape([Dimension(3)]), TensorShape([]))
(tf.float32, (tf.float32, tf.float32))
(TensorShape([Dimension(10)]), (TensorShape([Dimension(3)]), TensorShape([])))


### Iterator

>  Currently, one-shot iterators are the only type that is easily usable with an Estimator.

In [22]:
# Stream the integers 0..99 through a one-shot iterator.
dataset5 = tf.data.Dataset.range(100)
iterator = dataset5.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # Every run of `next_element` yields the next value from the dataset.
    for _ in range(100):
        print(sess.run(next_element), end=', ')

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 

### Dataset Transformations

```python
    dataset.map(lambda x : ..)
    dataset.flat_map(lambda x : ..)
    dataset.filter(lambda x : ..)
```

In [23]:
# Double every element of 0..99 with `map`, then stream the results.
dataset6 = tf.data.Dataset.range(100).map(lambda x: x * 2)
iterator = dataset6.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # Every run of `next_element` yields the next mapped value.
    for _ in range(100):
        print(sess.run(next_element), end=', ')

0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 

### Numpy -> Datasets
https://www.tensorflow.org/guide/datasets#consuming_numpy_arrays

In [41]:
# 3x3 grid holding 0..8, one record per row.
array = np.arange(9).reshape(3, 3)
print(array)

# The last column is the label; everything before it is the feature vector.
feat, labl = array[:, :-1], array[:, -1]

# NumPy arrays can be passed straight to from_tensor_slices.
dataset_npy = tf.data.Dataset.from_tensor_slices((feat, labl))
print(dataset_npy.output_types)
print(dataset_npy.output_shapes)

[[0 1 2]
 [3 4 5]
 [6 7 8]]
(tf.int64, tf.int64)
(TensorShape([Dimension(2)]), TensorShape([]))


### CSV -> Datasets

In [47]:
import pandas as pd

# Write the same 6x6 grid of 0..35 to two CSV files.
array = np.array(np.split(np.arange(36), 6))
frame = pd.DataFrame(array)
for path in ('tmp.csv', 'tmp2.csv'):
    # to_csv writes a header row and an index column by default; both are
    # disabled so each line holds exactly the 6 numeric fields that
    # CsvDataset is told to expect below (it is created with header=False).
    frame.to_csv(path, index=False, header=False)

# One entry per CSV column. Passing a dtype (rather than a default value)
# marks the column as required.
ncols = array.shape[1]  # 6 columns
record_defaults = [tf.float32] * ncols
dataset_csv = tf.contrib.data.CsvDataset(['tmp.csv', 'tmp2.csv'], record_defaults)
print(dataset_csv.output_types)
print(dataset_csv.output_shapes)

(tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32)
(TensorShape([]), TensorShape([]), TensorShape([]), TensorShape([]), TensorShape([]), TensorShape([]))


### Batching

In [50]:
# 6x6 grid of 0..35; the last column is the label, the rest are features.
array = np.array(np.split(np.arange(36), 6))
print(array)
feat = array[:, :-1]
labl = array[:, -1]

# Group consecutive records into batches of 2 rows.
dataset_batch = tf.data.Dataset.from_tensor_slices((feat, labl))
batched_dataset = dataset_batch.batch(2)

iterator = batched_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# The context manager closes the session even if a run() call raises.
with tf.Session() as sess:
    # 6 rows in batches of 2 -> exactly 3 batches.
    for _ in range(3):
        print(sess.run(next_element))

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]]
(array([[ 0,  1,  2,  3,  4],
       [ 6,  7,  8,  9, 10]]), array([ 5, 11]))
(array([[12, 13, 14, 15, 16],
       [18, 19, 20, 21, 22]]), array([17, 23]))
(array([[24, 25, 26, 27, 28],
       [30, 31, 32, 33, 34]]), array([29, 35]))


### Shuffling

In [62]:
# 12x6 grid of 0..71; the last column is the label, the rest are features.
array = np.array(np.split(np.arange(72), 12))
print(array)
feat = array[:, :-1]
labl = array[:, -1]

dataset_batch = tf.data.Dataset.from_tensor_slices((feat, labl))
shuffle_dataset = dataset_batch.batch(2)

# Shuffle is applied AFTER batching, so whole batches of 2 rows are reordered
# while the rows inside each batch stay together. 12 rows / 2 = 6 batches, so
# buffer_size=6 holds every batch at once.
shuffle_dataset = shuffle_dataset.shuffle(buffer_size=6)

iterator = shuffle_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

# The context manager closes the session even if a run() call raises.
with tf.Session() as sess:
    # Print the first 3 of the 6 shuffled batches.
    for _ in range(3):
        print(sess.run(next_element))

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]
 [36 37 38 39 40 41]
 [42 43 44 45 46 47]
 [48 49 50 51 52 53]
 [54 55 56 57 58 59]
 [60 61 62 63 64 65]
 [66 67 68 69 70 71]]
(array([[24, 25, 26, 27, 28],
       [30, 31, 32, 33, 34]]), array([29, 35]))
(array([[36, 37, 38, 39, 40],
       [42, 43, 44, 45, 46]]), array([41, 47]))
(array([[12, 13, 14, 15, 16],
       [18, 19, 20, 21, 22]]), array([17, 23]))
