<a href="https://colab.research.google.com/github/venkat-krish/basics_tensorflow/blob/master/Transformation_of_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating input data pipeline in Tensorflow

Data pipeline can be created using tf.data API in tensorflow

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  !pip install tf-nightly-2.0-preview
except Exception:
  pass

import tensorflow as tf

Collecting tf-nightly-2.0-preview
[?25l  Downloading https://files.pythonhosted.org/packages/b8/be/e4e2cc0b4896648fe6d5e45dda6d8c3b784823301708cfe4ff96de9e01cf/tf_nightly_2.0_preview-2.0.0.dev20191002-cp36-cp36m-manylinux2010_x86_64.whl (95.2MB)
[K     |████████████████████████████████| 95.2MB 93kB/s 
Collecting tb-nightly<2.2.0a0,>=2.1.0a0
[?25l  Downloading https://files.pythonhosted.org/packages/48/6b/b9e735120c77721570aed36cec55390827db0d580b14a5ffd93a4cce5997/tb_nightly-2.1.0a20191206-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 35.5MB/s 
Collecting tensorflow-estimator-2.0-preview
[?25l  Downloading https://files.pythonhosted.org/packages/db/f5/790508e193121ab301cb40cada7f451c531404051ac9249f21b1f5484450/tensorflow_estimator_2.0_preview-2.0.0-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 50.0MB/s 
Installing collected packages: tb-nightly, tensorflow-estimator-2.0-preview, tf-nightly-2.0-preview
Successfully install

In [0]:
import pandas as pd
import numpy as np

np.set_printoptions(precision=4)

In [7]:
dataset = tf.data.Dataset.from_tensor_slices([8,3,0,8,2,1])
dataset # Resulting dataset is a scalar which does not have any size

<TensorSliceDataset shapes: (), types: tf.int32>

In [6]:
for elem in dataset:
  print(elem.numpy())

8
3
0
8
2
1


In [8]:
# we can use python iterator to iterate over the dataset
it = iter(dataset)
print(next(it).numpy())

8


Transformation of the dataset can be performed by transform functions.

In [10]:
# here the transform function is reduce function which does the sum of given dataset values
print(dataset.reduce(0, lambda state, value: state + value).numpy())

22


### Dataset structure



In [12]:
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4,10]))
dataset1.element_spec

TensorSpec(shape=(10,), dtype=tf.float32, name=None)

In [15]:
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]),
    tf.random.uniform([4,100], maxval=100, dtype=tf.int32)))
dataset2.element_spec

(TensorSpec(shape=(), dtype=tf.float32, name=None),
 TensorSpec(shape=(100,), dtype=tf.int32, name=None))

In [18]:
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
dataset3.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 (TensorSpec(shape=(), dtype=tf.float32, name=None),
  TensorSpec(shape=(100,), dtype=tf.int32, name=None)))

In [20]:
# Dataset containing sparse tensor
dataset4 = tf.data.Dataset.from_tensors(tf.SparseTensor(indices=[[0,0],[1,2]], values=[1,2], dense_shape=[3,4]))

dataset4.element_spec.value_type

tensorflow.python.framework.sparse_tensor.SparseTensor

### Batching
The simplest form of batching stacks `n` consecutive elements into a single element.

In [34]:
inc_dataset = tf.data.Dataset.range(100)
dec_dataset = tf.data.Dataset.range(0, 100, -1)
dataset = tf.data.Dataset.zip((inc_dataset, dec_dataset))
batched_dataset = dataset.batch(4)

# print(inc_dataset, dec_dataset)
itr = iter(batched_dataset)
for batch in batched_dataset.take(4):
  print(batch)
  print([a.numpy() for a in batch])

print(batched_dataset)

batched_dataset = dataset.batch(7, drop_remainder=True)
batched_dataset

<BatchDataset shapes: ((None,), (None,)), types: (tf.int64, tf.int64)>


<BatchDataset shapes: ((7,), (7,)), types: (tf.int64, tf.int64)>

In [37]:
# Padding on different size of the tensors
dataset = tf.data.Dataset.range(100)
dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
dataset = dataset.padded_batch(4, padded_shapes=(None,))

for batch in dataset.take(4):
  print(batch.numpy())

[[0 0 0]
 [1 0 0]
 [2 2 0]
 [3 3 3]]
[[4 4 4 4 0 0 0]
 [5 5 5 5 5 0 0]
 [6 6 6 6 6 6 0]
 [7 7 7 7 7 7 7]]
[[ 8  8  8  8  8  8  8  8  0  0  0]
 [ 9  9  9  9  9  9  9  9  9  0  0]
 [10 10 10 10 10 10 10 10 10 10  0]
 [11 11 11 11 11 11 11 11 11 11 11]]
[[12 12 12 12 12 12 12 12 12 12 12 12  0  0  0]
 [13 13 13 13 13 13 13 13 13 13 13 13 13  0  0]
 [14 14 14 14 14 14 14 14 14 14 14 14 14 14  0]
 [15 15 15 15 15 15 15 15 15 15 15 15 15 15 15]]
