### Dataset Basic API

In [2]:
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from tensorflow import keras
from tensorflow.python.keras.callbacks import History

In [7]:
# Build a Dataset whose elements are the scalars 0..9, one per slice of
# the source array along its first axis.
source_values = np.arange(10)
dataset = tf.data.Dataset.from_tensor_slices(source_values)

# Bare last expression: displays the Dataset repr (element shapes/dtypes).
dataset

<TensorSliceDataset shapes: (), types: tf.int64>

In [10]:
# A Dataset is iterable in eager mode; each element is yielded as a tf.Tensor.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


### Common Dataset operations needed for ML
* Repeated read of data -> Epoch
* Get batch -> define a batch size for gradient descent

### 1. Generate data from existing dataset

In [13]:
# Rebuild the pipeline from the source range instead of re-assigning
# `dataset` from itself: `dataset = dataset.repeat(3).batch(7)` would stack
# another repeat/batch on top of the previous result every time this cell
# is re-run. Building from scratch keeps the cell idempotent.
dataset = (
    tf.data.Dataset.from_tensor_slices(np.arange(10))
    .repeat(3)
    .batch(7)
)

for item in dataset:
    print(item)

# repeat(3) -> produces 0-9 three times, so we end up with 30 numbers
# in total.

# batch(7) -> groups 7 consecutive elements into one tensor. After four
# full batches (28 items consumed) only 2 elements remain, so the last
# batch is a partial one with shape (2,).

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)
tf.Tensor([8 9], shape=(2,), dtype=int64)



#### Interleave:
<br>
Iterates over each dataset element and applies an operation to it that produces a new dataset. The resulting datasets are then interleaved (merged) into one single dataset.
<br>
Usecase:
 <br>
It can iterate over a data directory, read all data files in that directory, and collect all pieces of data into one single dataset at the end.

In [18]:
# Arguments passed to Dataset.interleave in this cell:
# 1. map_func     -> turns each input element into its own Dataset
# 2. cycle_length -> how many input elements are processed at the same time
# 3. block_length -> how many consecutive items are taken from one nested
#                    dataset before moving on to the next one

data_merged = dataset.interleave(
    map_func=lambda batch: tf.data.Dataset.from_tensor_slices(batch),
    cycle_length=5,
    block_length=5,
)

for item in data_merged:
    print(item)

# `dataset` holds the five batches produced by repeat(3).batch(7). With
# cycle_length = 5 all five batches are open at once, and with
# block_length = 5 up to five items are taken from each batch in turn.
#
# Round 1:
#   [0 1 2 3 4 5 6] -> 0 1 2 3 4   ([5 6] left over)
#   [7 8 9 0 1 2 3] -> 7 8 9 0 1   ([2 3] left over)
#   [4 5 6 7 8 9 0] -> 4 5 6 7 8   ([9 0] left over)
#   [1 2 3 4 5 6 7] -> 1 2 3 4 5   ([6 7] left over)
#   [8 9]           -> 8 9         (only two items, now exhausted)
#
# Round 2 emits the leftovers in the same batch order:
#   5 6, 2 3, 9 0, 6 7

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)

### 2. Merge two sets together as training data and labels

In [29]:
# Three 2-element feature rows and one string label per row.
train_data = np.array([[1, 2], [3, 4], [5, 6]])
label = np.array(['cat', 'dog', 'bird'])

# Passing a tuple of arrays slices them in lockstep along the first axis,
# so every dataset element is a (feature_row, label) pair.
feature_label_pairs = (train_data, label)
zip_data = tf.data.Dataset.from_tensor_slices(feature_label_pairs)

for x, y in zip_data:
    # .numpy() unwraps the eager tensors; string labels print as bytes.
    print(x.numpy(), y.numpy())

[1 2] b'cat'
[3 4] b'dog'
[5 6] b'bird'


In [33]:
# zip as dict: every dataset element is a dict with the same keys as the
# input dict, holding one slice of each array.
train_data = np.array([[1, 2], [3, 4], [5, 6]])
label = np.array(['cat', 'dog', 'bird'])

zip_data = tf.data.Dataset.from_tensor_slices(
    {
        "feature": train_data,
        "label": label,
    }
)

for item in zip_data:
    # Index by the dict's string keys. The previous `item[x], item[y]`
    # used the leftover loop tensors from the tuple example as keys
    # instead of "feature" / "label".
    print(item["feature"], item["label"])

tf.Tensor([1 2], shape=(2,), dtype=int64) tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor([3 4], shape=(2,), dtype=int64) tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor([5 6], shape=(2,), dtype=int64) tf.Tensor(b'bird', shape=(), dtype=string)
