<a href="https://colab.research.google.com/github/vaibhavbaswal95/ComputerVision/blob/master/DataInput_Pipelines%20with%20Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## tf.data API

In [0]:
# tf.data API enables us to build complex input pipelines from simple, reusable pieces.

# For example, the pipeline for an image model might aggregate data from files in a distributed file system,
# apply random perturbations to each image, and merge randomly selected images into a batch for training. 

# The pipeline for a text model might involve extracting symbols from raw text data, converting them to embedding 
# identifiers with a lookup table, and batching together sequences of different lengths.

## tf.data.Dataset

In [0]:
# The tf.data API introduces a tf.data.Dataset abstraction that represents a sequence of elements, in which each element consists of one or more components.

# For example, in an image pipeline, an element might be a single training example, with a pair of tensor components representing the image and its label.

In [0]:
# There are two distinct ways to create a dataset:

# A data source constructs a Dataset from data stored in memory or in one or more files.
# A data transformation constructs a dataset from one or more tf.data.Dataset objects.

In [0]:
# few import for python future functionality
from __future__ import absolute_import, division, print_function, unicode_literals

In [0]:
import tensorflow as tf
tf.enable_eager_execution() #important for getting results instantaneously

import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.set_printoptions(precision=4)

In [0]:
# Creating input pipeline from data sources
# from_tensors()
# from_tensor_slices()
# TFRecordDataset()

# Once you have a Dataset object, you can transform it into a new Dataset by chaining method calls on the tf.data.Dataset object.
# For example, you can apply per-element transformations such as Dataset.map(), and multi-element transformations such as Dataset.batch()

In [135]:
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1])
dataset

<DatasetV1Adapter shapes: (), types: tf.int32>

In [136]:
# The Dataset object is a Python iterable. This makes it possible to consume its elements using a for loop:

for elem in dataset:
  print(elem.numpy())

8
3
0
8
2
1


In [137]:
# creating a Python iterator using iter and consuming its elements using next
it = iter(dataset)
print(next(it).numpy())

8


In [138]:
# Alternatively, dataset elements can be consumed using the reduce transformation, which reduces all elements to produce a single result.

for i in dataset:
  print(i.numpy())


8
3
0
8
2
1


In [139]:
print(dataset.reduce(0, lambda state, value : state + value).numpy())

# 0 is the initial state 
# lambda function takes 2 argument old_state and new element to return new state 

22


# Dataset **Structure**

In [0]:
# # A dataset contains elements that each have the same (nested) structure and the individual components of the structure can be of any type representable by
# # tf.TypeSpec, including Tensor, SparseTensor, RaggedTensor, TensorArray, or Dataset.

# The Dataset.element_spec property allows you to inspect the type of each element component. The property returns a nested structure of tf.TypeSpec objects,
#  matching the structure of the element, 
#  which may be a single component, 
#  a tuple of components, 
#  or a nested tuple of components. 
 
#  For example:

In [141]:
tf.random.uniform([4]) # tensor with 4 elements picked randomly from uniform distribution

<tf.Tensor: id=54399, shape=(4,), dtype=float32, numpy=array([0.5189, 0.1423, 0.0793, 0.0326], dtype=float32)>

In [142]:
tf.random.uniform([4,100]) # tensor with 4 elements with 100 elements in each, picked randomly from uniform distribution

<tf.Tensor: id=54406, shape=(4, 100), dtype=float32, numpy=
array([[2.6000e-01, 7.0136e-01, 9.3297e-01, 6.1512e-01, 9.6711e-01,
        9.1897e-01, 5.0030e-03, 5.2029e-01, 4.9330e-01, 6.7616e-01,
        3.2295e-01, 1.9213e-01, 1.4459e-01, 8.5800e-01, 4.3934e-01,
        7.8393e-01, 8.9524e-01, 6.9418e-01, 9.8573e-01, 8.2910e-01,
        1.7645e-01, 8.2130e-01, 1.6727e-01, 1.8965e-02, 2.2085e-03,
        1.2577e-01, 3.3763e-01, 3.2875e-01, 2.4778e-01, 3.7950e-01,
        5.3768e-01, 2.0966e-02, 5.5340e-01, 2.1140e-01, 9.2948e-01,
        2.9901e-01, 9.3281e-01, 7.1184e-01, 2.5234e-01, 5.3230e-01,
        1.9655e-01, 6.4796e-01, 7.9475e-01, 2.1970e-01, 7.2693e-01,
        6.3158e-01, 9.0722e-01, 6.2177e-01, 1.0747e-02, 8.5942e-01,
        3.2445e-01, 7.3720e-01, 5.5915e-02, 2.1608e-01, 7.2705e-01,
        8.9620e-01, 4.1577e-01, 5.9589e-01, 8.6361e-01, 3.2141e-01,
        6.9932e-01, 1.2920e-01, 2.6142e-01, 6.8537e-01, 1.6269e-01,
        3.8085e-01, 1.8124e-01, 1.0200e-01, 6.0669e-01, 

In [143]:
# dataset from tensor
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4, 10]))

dataset1.element_spec

TensorSpec(shape=(10,), dtype=tf.float32, name=None)

In [144]:
# dataset from many tensors
dataset2 = tf.data.Dataset.from_tensor_slices(
   (tf.random.uniform([4]),
    tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)))

dataset2.element_spec

(TensorSpec(shape=(), dtype=tf.float32, name=None),
 TensorSpec(shape=(100,), dtype=tf.int32, name=None))

In [145]:
# combining together
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

dataset3.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 (TensorSpec(shape=(), dtype=tf.float32, name=None),
  TensorSpec(shape=(100,), dtype=tf.int32, name=None)))

In [146]:
tf.SparseTensor(indices=[[0,0],[1,2]], values=[1,2], dense_shape=[3,4])
# SparseTensor

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x7fa74d3e3978>

In [147]:
# Dataset containing a sparse tensor.
dataset4 = tf.data.Dataset.from_tensors(tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))

dataset4.element_spec

SparseTensorSpec(TensorShape([Dimension(3), Dimension(4)]), tf.int32)

## Dataset **Transformations**

In [148]:
# The Dataset transformations support datasets of any structure. 
# When using the Dataset.map(), and Dataset.filter() transformations, which apply a function to each element, the element structure determines the arguments of the function:

dataset1 = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform([4, 10], minval=1, maxval=10, dtype=tf.int32))

dataset1

<DatasetV1Adapter shapes: (10,), types: tf.int32>

In [149]:
for z in dataset1:
  print(z.numpy())

[2 4 9 4 1 5 3 3 8 6]
[9 1 8 8 5 5 2 6 3 7]
[5 6 3 8 5 8 8 4 6 5]
[4 5 9 8 3 5 7 4 9 4]


In [150]:
dataset2 = tf.data.Dataset.from_tensor_slices(
   (tf.random.uniform([4]),
    tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)))

dataset2

<DatasetV1Adapter shapes: ((), (100,)), types: (tf.float32, tf.int32)>

In [151]:
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

dataset3

<DatasetV1Adapter shapes: ((10,), ((), (100,))), types: (tf.int32, (tf.float32, tf.int32))>

In [152]:
for a, (b,c) in dataset3:
  print('shapes: {a.shape}, {b.shape}, {c.shape}'.format(a=a, b=b, c=c))

shapes: (10,), (), (100,)
shapes: (10,), (), (100,)
shapes: (10,), (), (100,)
shapes: (10,), (), (100,)


# Reading **input** data

In [0]:
# If all of your input data fits in memory, the simplest way to create a Dataset from them is to convert them to tf.Tensor objects 
# use Dataset.from_tensor_slices()

train, test = tf.keras.datasets.fashion_mnist.load_data()

In [154]:
images, labels = train
images = images/255

dataset = tf.data.Dataset.from_tensor_slices((images, labels))
dataset

<DatasetV1Adapter shapes: ((28, 28), ()), types: (tf.float64, tf.uint8)>

# Consuming Python **Generators**

In [0]:
# Consuming Python generators
# Another common data source that can easily be ingested as a tf.data.Dataset is the python generator.

def count(stop):
  i = 0
  while i<stop:
    yield i
    i += 1

In [156]:
for n in count(5):
  print(n)

0
1
2
3
4


In [0]:
# The Dataset.from_generator constructor converts the python generator to a fully functional tf.data.Dataset.
ds_counter = tf.data.Dataset.from_generator(count, args = [25], output_types=tf.int32, output_shapes=(),)

# The constructor takes a callable as input, not an iterator. This allows it to restart the generator when it reaches the end. 
# It takes an optional args argument, which is passed as the callable's arguments.

# The output_types argument is required because tf.data builds a tf.Graph internally, and graph edges require a tf.dtype.

In [158]:
for count_batch in ds_counter.repeat().batch(10).take(5):
  print(count_batch.numpy())

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24  0  1  2  3  4]
[ 5  6  7  8  9 10 11 12 13 14]
[15 16 17 18 19 20 21 22 23 24]


In [0]:
# The output_shapes argument is not required but is highly recomended as many tensorflow operations do not support tensors with unknown rank. 
# If the length of a particular axis is unknown or variable, set it as None in the output_shapes

In [0]:
# Here is an example generator that demonstrates both aspects, it returns tuples of arrays, where the second array is a vector with unknown length
def gen_series():
  i = 0
  while True:
    size = np.random.randint(0, 10)
    yield i, np.random.normal(size=(size,))
    i += 1

In [191]:
for i, series in gen_series():
  print(i, ":", str(series))
  if i > 5:
    break

0 : [ 0.3295  1.4132  0.0534  0.7677 -0.0115 -0.2631]
1 : []
2 : [1.8892 0.2117 0.5582]
3 : [ 0.3612  0.5921 -0.1273 -1.2374  0.1988 -1.445   1.6463 -0.3292  0.6723]
4 : []
5 : [-1.1839 -1.1007 -1.296  -0.4785  0.3676 -1.0314  0.3547 -1.4683]
6 : [-0.5354  0.3232 -0.1999 -1.5655]


In [162]:
for i,s in gen_series():
  print(type(i),":",type(s))
  if i>5:
    break

<class 'int'> : <class 'numpy.ndarray'>
<class 'int'> : <class 'numpy.ndarray'>
<class 'int'> : <class 'numpy.ndarray'>
<class 'int'> : <class 'numpy.ndarray'>
<class 'int'> : <class 'numpy.ndarray'>
<class 'int'> : <class 'numpy.ndarray'>
<class 'int'> : <class 'numpy.ndarray'>


In [192]:
ds_series = tf.data.Dataset.from_generator(
    gen_series, 
    output_types=(tf.int32, tf.float32), 
    output_shapes=((), (None,)))

ds_series

<DatasetV1Adapter shapes: ((), (?,)), types: (tf.int32, tf.float32)>

In [193]:
for (i,j) in ds_series:
  print(i,j)
  i += 1
  if i.numpy() > 5:
    break


tf.Tensor(0, shape=(), dtype=int32) tf.Tensor([ 0.9216 -1.0667 -0.513   0.4258 -0.3533  1.59  ], shape=(6,), dtype=float32)
tf.Tensor(1, shape=(), dtype=int32) tf.Tensor([ 1.9053 -0.3863 -0.2576 -0.5493], shape=(4,), dtype=float32)
tf.Tensor(2, shape=(), dtype=int32) tf.Tensor([ 1.6094  1.0931  0.3305 -0.083   1.217   0.5942], shape=(6,), dtype=float32)
tf.Tensor(3, shape=(), dtype=int32) tf.Tensor([ 0.3737 -1.7276  0.1943  1.4043 -0.3213  0.09   -0.4149 -0.0825], shape=(8,), dtype=float32)
tf.Tensor(4, shape=(), dtype=int32) tf.Tensor([ 1.9798  0.7006 -0.1994  0.1067 -0.89    0.3898  0.8002  0.2242], shape=(8,), dtype=float32)
tf.Tensor(5, shape=(), dtype=int32) tf.Tensor([-1.0003  0.1941 -1.4914], shape=(3,), dtype=float32)


In [194]:
ds_series.element_spec

(TensorSpec(shape=(), dtype=tf.int32, name=None),
 TensorSpec(shape=(?,), dtype=tf.float32, name=None))

In [227]:
ds_series_batch = ds_series.shuffle(20).padded_batch(batch_size=10,padded_shapes=([],[10,]))

ids, sequence_batch = next(iter(ds_series_batch))
print(ids.numpy())
print()
print(sequence_batch.numpy())

[17  5 12 11 21  9  4 22 19 26]

[[ 1.7809e+00  2.4111e+00 -8.9688e-01  5.1726e-01  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00]
 [-1.7194e+00  2.7396e-01  2.4720e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00]
 [-2.0499e+00 -1.7956e+00 -4.1560e-01  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00]
 [ 2.1430e+00  9.7250e-01  8.6747e-01 -6.8606e-01 -1.1640e+00  1.4149e-01
  -4.8939e-01 -4.7391e-02  0.0000e+00  0.0000e+00]
 [-1.2061e+00  1.1375e+00  7.1115e-01 -1.7961e+00 -4.4770e-01  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00]
 [ 9.6129e-01  3.4106e-01 -1.3752e-03 -9.0193e-01 -8.4836e-01  1.0217e+00
  -1.1765e+00  7.5524e-01  0.0000e+00  0.0000e+00]
 [ 3.7254e-01  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00]
 [ 6.0329e-01  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0

Some **HandsOn** with some **PreProcessing**

In [0]:
flowers = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

In [0]:
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255, rotation_range=20)

In [202]:
type(img_gen)

tensorflow.python.keras.preprocessing.image.ImageDataGenerator

In [217]:
for i,j in img_gen.flow_from_directory(flowers):
  print(i.shape,j.shape)
  break

Found 3670 images belonging to 5 classes.
(32, 256, 256, 3) (32, 5)


In [240]:
ds = tf.data.Dataset.from_generator(
    img_gen.flow_from_directory, args=[flowers], 
    output_types=(tf.float32, tf.float32), 
    output_shapes=([32,256,256,3], [32,5])
)

ds

<DatasetV1Adapter shapes: ((32, 256, 256, 3), (32, 5)), types: (tf.float32, tf.float32)>

# Consuming **TFRecord** data

In [243]:
# Creates a dataset that reads all of the examples from two files.
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001


In [244]:
dataset = tf.data.TFRecordDataset(filenames = [fsns_test_file])
dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

In [0]:
raw_example = next(iter(dataset))

In [250]:
parsed = tf.train.Example.FromString(raw_example.numpy()) ### what is tf.train.Example, didn't get it

parsed.features.feature['image/text']

bytes_list {
  value: "Rue Perreyon"
}

In [258]:
type(raw_example)

tensorflow.python.framework.ops.EagerTensor

# Consuming **text** data

In [259]:
directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

file_paths = [
    tf.keras.utils.get_file(file_name, directory_url + file_name)
    for file_name in file_names
]

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


In [261]:
file_paths

['/root/.keras/datasets/cowper.txt',
 '/root/.keras/datasets/derby.txt',
 '/root/.keras/datasets/butler.txt']

In [0]:
dataset = tf.data.TextLineDataset(file_paths)

In [264]:
for line in dataset.take(5):
  print(line)

tf.Tensor(b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;", shape=(), dtype=string)
tf.Tensor(b'His wrath pernicious, who ten thousand woes', shape=(), dtype=string)
tf.Tensor(b"Caused to Achaia's host, sent many a soul", shape=(), dtype=string)
tf.Tensor(b'Illustrious into Ades premature,', shape=(), dtype=string)
tf.Tensor(b'And Heroes gave (so stood the will of Jove)', shape=(), dtype=string)


In [265]:
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv


In [266]:
for line in titanic_lines.take(10):
  print(line.numpy())

b'survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone'
b'0,male,22.0,1,0,7.25,Third,unknown,Southampton,n'
b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y'
b'0,male,2.0,3,1,21.075,Third,unknown,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'


In [0]:
def survived(line):
  return tf.not_equal(tf.strings.substr(line,0,1),'0')

survivors = titanic_lines.skip(1).filter(survived)

In [270]:
for line in survivors.take(10):
  print(line.numpy())

b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'
b'1,male,28.0,0,0,13.0,Second,unknown,Southampton,y'
b'1,female,28.0,0,0,7.225,Third,unknown,Cherbourg,y'
b'1,male,28.0,0,0,35.5,First,A,Southampton,y'
b'1,female,38.0,1,5,31.3875,Third,unknown,Southampton,n'


# Consuming **CSV** data