<a href="https://colab.research.google.com/github/sumanyurosha/tensorflow-specialization/blob/master/Hands-on%20ML/chapter13/Preprocessing_the_Input_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [2]:
# Load the California housing dataset and separate the feature matrix from the target.
housing_data = fetch_california_housing()
x, y = housing_data.data, housing_data.target

# Fix the seed so the train/test split (and every statistic later computed
# from x_train) is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(x_train.shape, x_test.shape)

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


(15480, 8) (5160, 8)


In [3]:
# Per-feature statistics of the training set. keepdims=True keeps a leading
# axis of size 1 so the arrays broadcast against a (batch, features) input.
means = np.mean(x_train, axis=0, keepdims=True)
std = np.std(x_train, axis=0, keepdims=True)
# Small constant added to the denominator to avoid dividing by zero.
eps = tf.keras.backend.epsilon()

model = tf.keras.models.Sequential([
    # Standardize inside the model. The lambda must read `means` (the name
    # defined above); the parameter is called `inputs` so it does not shadow
    # the dataset variable `x`.
    tf.keras.layers.Lambda(lambda inputs: (inputs - means) / (std + eps)),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [4]:
class Standardization(tf.keras.layers.Layer):
    """Layer that standardizes its input with statistics learned by `adapt`.

    `adapt` has to be called before the layer is used, because `call` reads
    the `mean_` and `std_` attributes that `adapt` sets.
    """

    def adapt(self, x):
        """Store the mean and standard deviation of `x` taken along axis 0.

        Both statistics keep the reduced axis (keepdims=True).
        """
        reduce_kwargs = dict(axis=0, keepdims=True)
        self.mean_ = np.mean(x, **reduce_kwargs)
        self.std_ = np.std(x, **reduce_kwargs)

    def call(self, x):
        """Return `(x - mean_) / (std_ + epsilon)` using the stored statistics."""
        centered = x - self.mean_
        # The Keras backend epsilon keeps the denominator away from zero.
        scale = self.std_ + tf.keras.backend.epsilon()
        return centered / scale

In [5]:
# Fit the standardization statistics on the first 100 training rows only.
scaler = Standardization()
data_sample = x_train[:100]
scaler.adapt(x=data_sample)

In [6]:
# New Sequential model whose only layer so far is the adapted scaler.
model = tf.keras.models.Sequential([scaler])

# Encoding Categorical Values to One-hot Vectors

In [7]:
# Known categories; a category's position in this list is its integer id.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# Number of extra ids reserved for strings that are not in `vocab`.
oov_buckets = 2

indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=indices)
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=oov_buckets,
)

In [8]:
# Sample batch; "DESERT" is not in `vocab`, so it maps to an out-of-vocabulary id.
categories = tf.constant(
    ["NEAR BAY", "DESERT", "INLAND", "INLAND"]
)
cat_indices = table.lookup(keys=categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [9]:
# One column per possible id: the known categories plus the OOV buckets.
cat_one_hot = tf.one_hot(
    indices=cat_indices,
    depth=len(vocab) + oov_buckets,
)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

# **Encoding Categorical Features using Embeddings**

In [12]:
# Width of each embedding vector.
embed_dim = 2
# Random starting values: one row per id (vocab + OOV buckets), embed_dim columns.
embed_init = tf.random.uniform(
    shape=[len(vocab) + oov_buckets, embed_dim]
)
embedding_matrix = tf.Variable(initial_value=embed_init)

In [13]:
# Display the randomly initialised embedding matrix (one row per id, embed_dim columns).
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.12861013, 0.4142785 ],
       [0.18494403, 0.830976  ],
       [0.72081065, 0.60406303],
       [0.12199879, 0.47760797],
       [0.8836411 , 0.75761044],
       [0.26383793, 0.7781397 ],
       [0.52805746, 0.7617295 ]], dtype=float32)>

In [14]:
# Same sample batch as in the one-hot section, converted to integer ids again.
categories = tf.constant(
    ["NEAR BAY", "DESERT", "INLAND", "INLAND"]
)
cat_indices = table.lookup(keys=categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [15]:
# Pick the row of the embedding matrix that corresponds to each category id.
tf.nn.embedding_lookup(params=embedding_matrix, ids=cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.12199879, 0.47760797],
       [0.26383793, 0.7781397 ],
       [0.18494403, 0.830976  ],
       [0.18494403, 0.830976  ]], dtype=float32)>

In [17]:
# Keras layer equivalent of the manual lookup: it owns its own embedding matrix.
embeddings = tf.keras.layers.Embedding(
    input_dim=len(vocab) + oov_buckets,
    output_dim=embed_dim,
)

embeddings(cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.03124557,  0.00882273],
       [ 0.041356  , -0.01476697],
       [-0.01374805, -0.02689283],
       [-0.01374805, -0.02689283]], dtype=float32)>

In [18]:
# Two-input functional model: 8 numeric features plus one categorical string.
regular_input = tf.keras.layers.Input(shape=[8])
categorical_input = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Map each category string to its integer id with the lookup table.
cat_indices = tf.keras.layers.Lambda(lambda x: table.lookup(x))(categorical_input)

# Size the embedding from the vocabulary settings instead of hard-coded 7 and 2,
# so it stays in sync if `vocab`, `oov_buckets` or `embed_dim` change.
cat_embeddings = tf.keras.layers.Embedding(
    input_dim=len(vocab) + oov_buckets,
    output_dim=embed_dim,
)(cat_indices)

# Join the numeric features with the category embedding and regress one value.
concat = tf.keras.layers.Concatenate()([regular_input, cat_embeddings])
output = tf.keras.layers.Dense(1)(concat)

model = tf.keras.Model(inputs=[regular_input, categorical_input],
                       outputs=output)


In [20]:
# Print the layer-by-layer summary of the two-input model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None,)              0           input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 2)            14          lambda_1[0][0]                   
_______________________________________________________________________________________