There are two distinct ways to create a dataset:

 1. A ***data source*** constructs a Dataset from data stored in memory in one ore more files.
 
 2. A ***data transformation*** constructs a dataset from one or more tf.data.Dataset objects.



 * If data is in memory:
    * `tf.data.Dataset.from_tensors()`
    * `tf.data.Dataset.from_tensor_slices()`
    
 * If data is in a stored file like `TFRecord` format:
    * `tf.data.TFRecordDataset()`

In [2]:
import tensorflow as tf

dataset = tf.data.Dataset.range(16)

for i, elem in enumerate(dataset):
    print(elem)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(10, shape=(), dtype=int64)
tf.Tensor(11, shape=(), dtype=int64)
tf.Tensor(12, shape=(), dtype=int64)
tf.Tensor(13, shape=(), dtype=int64)
tf.Tensor(14, shape=(), dtype=int64)
tf.Tensor(15, shape=(), dtype=int64)


In [3]:
batched_dataset = dataset.batch(4)

for i, elem in enumerate(batched_dataset):
    print(elem)

tf.Tensor([0 1 2 3], shape=(4,), dtype=int64)
tf.Tensor([4 5 6 7], shape=(4,), dtype=int64)
tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int64)
tf.Tensor([12 13 14 15], shape=(4,), dtype=int64)


In [24]:
dataset = tf.data.Dataset.range(16)
    
print(dataset.reduce(tf.constant(0, dtype='int64'), lambda state, value: state + value))

tf.Tensor(120, shape=(), dtype=int64)


## Applying transformations

In [28]:
A = tf.random.uniform([4, 5])
print(A)

ds1 = tf.data.Dataset.from_tensor_slices(A)

## transforming
ds2 = ds1.map(lambda x: 2*x - 1.0)

for elem in ds2:
    print(elem)

tf.Tensor(
[[0.95739675 0.42748928 0.24500048 0.93642485 0.12181795]
 [0.66007257 0.30977857 0.9706702  0.07444978 0.9007733 ]
 [0.5745696  0.11842835 0.35037625 0.23001218 0.12898052]
 [0.13482332 0.13462055 0.1631118  0.27879    0.58187914]], shape=(4, 5), dtype=float32)
tf.Tensor([ 0.9147935  -0.14502144 -0.50999904  0.8728497  -0.7563641 ], shape=(5,), dtype=float32)
tf.Tensor([ 0.32014513 -0.38044286  0.94134045 -0.85110044  0.8015466 ], shape=(5,), dtype=float32)
tf.Tensor([ 0.14913917 -0.7631433  -0.2992475  -0.53997564 -0.74203897], shape=(5,), dtype=float32)
tf.Tensor([-0.73035336 -0.7307589  -0.6737764  -0.44242     0.16375828], shape=(5,), dtype=float32)


In [42]:
A = tf.random.uniform([4, 2])
B = tf.random.uniform([4, 3])

ds1 = tf.data.Dataset.from_tensor_slices(A)
ds2 = tf.data.Dataset.from_tensor_slices(B)
    
ds3 = tf.data.Dataset.zip((ds1, ds2))
ds2.filter(lambda x, y: )

tf.Tensor(
[[0.29966164 0.8284199  0.35511267]
 [0.9710816  0.23627019 0.7314824 ]
 [0.98152757 0.15410101 0.2688105 ]
 [0.8460425  0.8091365  0.9296279 ]
 [0.01610696 0.03823996 0.86665654]], shape=(5, 3), dtype=float32)
tf.Tensor([0.67093456 0.75870633 0.3019017  0.9223398 ], shape=(4,), dtype=float32)


In [23]:
a = tf.constant(0, dtype='int64')
print(a)

b = tf.random.uniform([4, 5])
print(b)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(
[[0.58689535 0.18678069 0.8242061  0.98863506 0.08575535]
 [0.52911484 0.35827053 0.1366111  0.42180657 0.85821474]
 [0.46148193 0.3394221  0.10129654 0.14419067 0.744663  ]
 [0.35144687 0.251024   0.55445135 0.6826707  0.28955042]], shape=(4, 5), dtype=float32)


In [43]:
A.get_shape()

TensorShape([4, 5])

## Repeat, Shuffle and Batch

In [91]:
# Let's build a toy dataset

import numpy as np
x_arr = np.arange(12).reshape(-1, 1)

ds_orig = tf.data.Dataset.from_tensor_slices(x_arr)

print(ds_orig)

<TensorSliceDataset shapes: (1,), types: tf.int64>


### Shuffle with `buffer_size`

In [105]:
ds = ds_orig.shuffle(buffer_size=2)
ds = ds.batch(batch_size=4)

for batch in ds.take(3):
    print(batch.numpy().flatten())

[0 2 3 1]
[4 5 6 8]
[ 9  7 10 11]


### Full-shuffling with `buffer_size=len(x_arr)`

In [107]:
ds = ds_orig.shuffle(buffer_size=len(x_arr))
ds = ds.batch(batch_size=4)

for batch in ds.take(3):
    print(batch.numpy().flatten())

[ 5 11  6  3]
[8 0 1 2]
[ 7  4  9 10]


### Order matters:

 * **(1) Batch -> Shuffle -> Repeat**
 
   **$\longrightarrow$ The lements within each batch are not shuffled**

In [109]:
ds = ds_orig.batch(batch_size=5)
ds = ds.shuffle(buffer_size=len(x_arr))
ds = ds.repeat()

for batch in ds.take(4):
    print(batch.numpy().flatten())

[10 11]
[5 6 7 8 9]
[0 1 2 3 4]
[0 1 2 3 4]


 * **(2) Shuffle -> Batch -> Repeat**


In [92]:
ds = ds_orig.shuffle(buffer_size=len(x_arr))
ds = ds.batch(batch_size=5)
ds = ds.repeat()

for batch in ds.take(4):
    print(batch.numpy().flatten())

[ 4 10  6  8  0]
[ 1  3  7  9 11]
[5 2]
[ 5  2 11 10  4]


 * **(3) Shuffle -> Repeat -> Batch**

In [95]:
ds = ds_orig.shuffle(buffer_size=len(x_arr))
ds = ds.repeat()
ds = ds.batch(5)

for batch in ds.take(4):
    print(batch.numpy().flatten())

[9 4 5 8 2]
[ 1 10  7  3  0]
[ 6 11  3  2  8]
[10  1  6  7  4]


## Apply a function

 * Change the range of input values from $[0, 1]$ to $[-1, 1]$

In [113]:
############ 
def change_range(x):
    return 2*x-1

x_arr = np.random.uniform(size=(6))
print(x_arr)

ds = tf.data.Dataset.from_tensor_slices(x_arr)
ds = ds.map(change_range)

ds = ds.batch(len(x_arr))
batch, = ds.take(1)
print(batch.numpy())

[0.9545886  0.87917942 0.62141088 0.09069598 0.92583994 0.43794825]
[ 0.90917719  0.75835883  0.24282176 -0.81860804  0.85167988 -0.12410351]


# Feature-column

 * A configuration class that transforms and prepares features for input to a model
 * Mainly for structured data
 * The instructions given by `feature_column` will be embedded into model graph

### 1. Categorical Features

In [66]:
cat_feats = tf.feature_column.categorical_column_with_identity(
    key='cat-feat-1', num_buckets=4)

#help(tf.feature_column.categorical_column_with_identity)
cat_feats

IdentityCategoricalColumn(key='cat-feat-1', number_buckets=4, default_value=None)

In [68]:
## embed this feature to one-hot encoding

feat_embed = tf.feature_column.embedding_column(
    cat_feats, dimension=10)

feat_embed

EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='cat-feat-1', number_buckets=4, default_value=None), dimension=10, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7f937017dac8>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True)

### 2. Numeric Features

In [69]:
numeric_feats = tf.feature_column.numeric_column(
    key='numeric-feats', shape=(10,))

numeric_feats

NumericColumn(key='numeric-feats', shape=(10,), default_value=None, dtype=tf.float32, normalizer_fn=None)

### 3. Combine all features

In [73]:
columns = [feat_embed, numeric_feats]

columns

[EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='cat-feat-1', number_buckets=4, default_value=None), dimension=10, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7f937017dac8>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True),
 NumericColumn(key='numeric-feats', shape=(10,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

 * Now, we can define a feature layer based on these features

In [74]:
feature_layer = tf.keras.layers.DenseFeatures(columns)

feature_layer

<tensorflow.python.feature_column.feature_column_v2.DenseFeatures at 0x7f93700da6a0>