<a href="https://colab.research.google.com/github/sumanyurosha/tensorflow-specialization/blob/master/Hands-on%20ML/chapter13/Preprocessing_the_Input_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [4]:
# Load the California housing data: `x` holds the feature matrix and `y`
# the regression target.
housing_data = fetch_california_housing()
x, y = housing_data.data, housing_data.target

# Pin the shuffle seed so the split -- and every statistic computed from
# `x_train` further down -- is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(x_train.shape, x_test.shape)

(15480, 8) (5160, 8)


In [7]:
# Per-feature statistics of the training set. keepdims=True keeps a leading
# axis of length 1 so both arrays broadcast against (batch, features) inputs.
means = np.mean(x_train, axis=0, keepdims=True)
std = np.std(x_train, axis=0, keepdims=True)
# Small constant added to the denominator so a zero-variance feature does
# not cause a division by zero.
eps = tf.keras.backend.epsilon()

model = tf.keras.models.Sequential([
    # Standardize inside the model. The lambda must read `means` (the name
    # defined above); its parameter is called `inputs` so it does not shadow
    # the notebook-level `x` array.
    tf.keras.layers.Lambda(lambda inputs: (inputs - means) / (std + eps)),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [13]:
class Standardization(tf.keras.layers.Layer):
    """Layer that standardizes its inputs with statistics learned by `adapt`.

    Call `adapt` on a data sample first; afterwards the layer maps inputs to
    ``(x - mean) / (std + epsilon)``, where `mean` and `std` are the axis-0
    statistics of that sample.
    """

    # Both statistics stay None until `adapt` has run, which lets `call`
    # detect use-before-adapt and fail with a clear message.
    mean_ = None
    std_ = None

    def adapt(self, x):
        """Compute and store the mean and standard deviation of `x`.

        Args:
            x: Data sample; statistics are taken along axis 0 with
                ``keepdims=True`` so they broadcast against batched inputs.
        """
        self.mean_ = np.mean(x, axis=0, keepdims=True)
        self.std_ = np.std(x, axis=0, keepdims=True)

    def call(self, x):
        """Return `x` standardized with the statistics stored by `adapt`.

        Args:
            x: Inputs to standardize.

        Returns:
            ``(x - mean_) / (std_ + epsilon)``.

        Raises:
            RuntimeError: If `adapt` has not been called yet.
        """
        if self.mean_ is None or self.std_ is None:
            raise RuntimeError(
                "Standardization layer has no statistics yet; "
                "call adapt() on a data sample before using the layer."
            )
        # Keras' epsilon keeps the division finite for zero-variance columns.
        return (x - self.mean_) / (self.std_ + tf.keras.backend.epsilon())

In [14]:
# Fit the standardization layer on the first 100 rows of the training set.
scaler = Standardization()
data_sample = x_train[:100]
scaler.adapt(data_sample)

In [15]:
# Start a new Sequential model whose first layer is the adapted scaler.
model = tf.keras.models.Sequential([scaler])

# Encoding Categorical Values to One-hot Vectors

In [17]:
# Known categories; each one is mapped to its position in this list.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# Number of extra buckets reserved for out-of-vocabulary strings.
oov_buckets = 2

indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=indices)
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=oov_buckets,
)

In [19]:
# "DESERT" is not in `vocab`, so the table assigns it to one of the
# out-of-vocabulary buckets instead of a vocabulary index.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(keys=categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [20]:
# One column per vocabulary entry plus one per out-of-vocabulary bucket.
cat_one_hot = tf.one_hot(
    indices=cat_indices,
    depth=oov_buckets + len(vocab),
)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>