In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
X = tf.range(10)
X

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>

In [11]:
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

### Chaining Transformations

In [12]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [13]:
dataset = dataset.repeat(3).batch(7)
dataset

<BatchDataset shapes: (None,), types: tf.int32>

In [14]:
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [15]:
dataset = dataset.map(lambda x : x * 2, num_parallel_calls=3) # use 3 threads for rapidity

In [16]:
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [17]:
# apply used with athe entire dataset and not item by item
dataset = dataset.apply(tf.data.experimental.unbatch())
# or 
# dataset = dataset.unbatch()

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.


In [18]:
dataset = dataset.filter(lambda x : x < 10)

In [19]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [20]:
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


### Shuffling the DATA

In [21]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)

In [22]:
for item in dataset.take(3):
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)


#### Interleaving lines from multiple files 

In [23]:
## getting the caleforina housing data
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [24]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths


In [25]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [26]:
train_filepaths

['datasets\\housing\\my_train_00.csv',
 'datasets\\housing\\my_train_01.csv',
 'datasets\\housing\\my_train_02.csv',
 'datasets\\housing\\my_train_03.csv',
 'datasets\\housing\\my_train_04.csv',
 'datasets\\housing\\my_train_05.csv',
 'datasets\\housing\\my_train_06.csv',
 'datasets\\housing\\my_train_07.csv',
 'datasets\\housing\\my_train_08.csv',
 'datasets\\housing\\my_train_09.csv',
 'datasets\\housing\\my_train_10.csv',
 'datasets\\housing\\my_train_11.csv',
 'datasets\\housing\\my_train_12.csv',
 'datasets\\housing\\my_train_13.csv',
 'datasets\\housing\\my_train_14.csv',
 'datasets\\housing\\my_train_15.csv',
 'datasets\\housing\\my_train_16.csv',
 'datasets\\housing\\my_train_17.csv',
 'datasets\\housing\\my_train_18.csv',
 'datasets\\housing\\my_train_19.csv']

In [88]:
# read all training files into the dataset
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [89]:
# interleave 5 files in parallel: this is actually sequencial
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length = n_readers)

# to do it in parallel
#dataset = filepath_dataset.interleave(
#    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
#    cycle_length = n_readers, num_parallel_calls=tf.data.experimental.AUTOTUNE or a number_of_threads )

In [92]:
for line in dataset.take(5):
    #print(line)
    print(line.numpy())

b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205'
b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67'
b'3.8456,35.0,5.461346633416459,0.9576059850374065,1154.0,2.8778054862842892,37.96,-122.05,1.598'
b'3.0217,22.0,4.983870967741935,1.1008064516129032,615.0,2.4798387096774195,38.76,-120.6,1.069'
b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782'


### Preprocessing data: parsing and scalling

In [95]:
n_inputs = 8

@tf.function
def preprocess(line):
    defaults =[0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defaults)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return x - X_mean / X_std, y

In [96]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([  2.1699545,  41.735565 ,   3.1839366,  -1.4453683, 844.7002   ,
          1.0839758,  20.770391 , -62.435696 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

### Putting all together

In [97]:
def csv_reader_dataset(filepaths, n_readers = 5, num_read_threads = None, batch_size = 32,
                       num_parse_threads=5, shuffle_buffer_size=10000, repeat = 1):
    filepath_dataset = tf.data.Dataset.list_files(filepaths, seed=42)
    dataset = filepath_dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length = n_readers, num_parallel_calls = num_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls= num_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)


### Using the dataset with tf.keras

In [98]:
train_set = csv_reader_dataset(train_filepaths) 
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [108]:
keras.backend.clear_session()
model = keras.models.Sequential([
    keras.layers.Input(shape=(8,)),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1)], name="My_Model")


model.compile(loss="mse", optimizer=keras.optimizers.SGD(lr=1e-3))
model.summary()

Model: "My_Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                270       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________


In [110]:
model.fit(train_set, epochs=100, validation_data=valid_set)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1f25b6bb760>

In [111]:
model.evaluate(test_set)
new_set = test_set.take(3).map(lambda X, y: X)
model.predict(new_set)



array([[2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],
       [2.0832763],


### TFRecords Format

In [10]:
with tf.io.TFRecordWriter("mydata.tfrecord") as f:
    for i in range(10000):
        f.write(b"this is a record!!!")

In [11]:
filepaths = ["mydata.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)

In [12]:
for item in dataset:
    print(item)

tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'th

#### Compressed TFRecords files

In [14]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("mydata_compressed.tfrecord", options) as f:
    for i in range(10000):
        f.write(b"this is a record!!!")

In [18]:
dataset = tf.data.TFRecordDataset(["mydata_compressed.tfrecord"], compression_type="GZIP")

In [19]:
for item in dataset.take(3):
    print(item)

tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)
tf.Tensor(b'this is a record!!!', shape=(), dtype=string)


#### Tensorflow Protocol Buffers

In [2]:
# Tensofrlow protobuff
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

person_example = Example(
    features = Features( 
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"]))
        }
    ))

now we put the serialized person into our tfrecord file. NOte that we should do that for multiple persons Examples,
So ideally wa can write a scipt that get lines from a csv file, and for each line it transfrom it to an Example protobuff, serialize it
and put it into the tfrecord file

In [3]:
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())

### Loading and parsing Examples

In [9]:
feature_description = {
    "name" : tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id" : tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string)
}

for serialized_example in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)
    print(parsed_example)

{'emails': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000001A572F3A280>, 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}


In [10]:
# parce all examples
dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).batch(10)
for serialized_exmples in dataset:
    parsed_exmaples = tf.io.parse_example(serialized_exmples, feature_description) 

## Processing the input features (using a keras lambda layer)

In [28]:
means = np.mean(X_train, axis=0, keepdims=True)
stds = np.std(X_train, axis=0, keepdims=True)
keras.backend.clear_session()
eps = keras.backend.epsilon()
model = keras.models.Sequential([
    keras.layers.Input(shape=(8,)),
    keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps) ),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1)], name="My_Model")


model.compile(loss="mse", optimizer=keras.optimizers.SGD(lr=1e-3))
model.summary()

Model: "My_Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda (Lambda)              (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 30)                270       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1a574f98310>

In [35]:
## other methode: create a layer class
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)
    def call(self, inputs):
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())

# use a data sample to compute means and stds
number_of_rows = X_train.shape[0]
random_indices = np.random.choice(number_of_rows, size=1000, replace=False)
data_sample = X_train[random_indices,:]
std_layer = Standardization()
std_layer.adapt(data_sample)

# After taht we can use this layer in out model def

In [34]:
X_train.shape

(11610, 8)

### Encoding Categorical Features USING One-hot vectors

In [37]:
# create a table to map each category to an index
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
# number of out of vector buckets: we use it in case the number of categories is very lasrge or changing,
# so we create the table just with a sample of data
num_oov_buckets = 2 
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [38]:
# ENCODE A SIMPLE EXAMPLE OF CATEGORIES
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [40]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

### Encodging Categorical Features USING Embeddings

In [41]:
# Manually just to undestand
embedding_dim = 2
enbed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(enbed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.03523946, 0.06090033],
       [0.39394176, 0.0516572 ],
       [0.03743649, 0.056301  ],
       [0.7998544 , 0.12611687],
       [0.707814  , 0.1607995 ],
       [0.2377187 , 0.22600007],
       [0.4484222 , 0.8978292 ]], dtype=float32)>

In [42]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [43]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.7998544 , 0.12611687],
       [0.2377187 , 0.22600007],
       [0.39394176, 0.0516572 ],
       [0.39394176, 0.0516572 ]], dtype=float32)>

In [44]:
# we can use keras.layers.TextVectorization (when available i think tensorflow 2.2 ?)
# or
regular_inputs = keras.layers.Input(shape=[8])
categories_inputs = keras.layers.Input(shape=[], dtype=tf.string)
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories_inputs)
cat_embed = keras.layers.Embedding(input_dim=len(vocab)+ num_oov_buckets, output_dim=2)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs, categories_inputs], outputs=[outputs])

In [None]:
# Keras preprocessing layers (in new version of tensorflow 2.X)
normalization = keras.layers.Normalisation()
textvectorization = keras.layers.Textvertorization()
discretization = keras.layers.Descitization()
# you can define piplines
pipeline = keras.layers.PreprocessingStage([normalization, discretization])
pipeline.adapt(data_sample)

## TF Transform

In [None]:
!pip install tfx

In [None]:
# TFDS