In [9]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

## Load Data

In [10]:
# Read the Titanic training split from the public TensorFlow datasets bucket.
TITANIC_TRAIN_CSV_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
titanic = pd.read_csv(TITANIC_TRAIN_CSV_URL)

# Preview the first rows only.
titanic.head()


Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


## Extract Features and Labels from data

In [11]:
# Split the raw table: 'survived' is the label, every remaining column is a feature.
# Both results are independent of `titanic`, which is left untouched.
titanic_labels = titanic['survived'].copy()
titanic_features = titanic.drop(columns='survived')


In [12]:
# Display the feature frame to confirm that the 'survived' label column was removed.
titanic_features

Unnamed: 0,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,female,35.0,1,0,53.1000,First,C,Southampton,n
4,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...
622,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,female,19.0,0,0,30.0000,First,B,Southampton,y
625,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


## Mark Features as Numerical or String

In [13]:
# Build one symbolic Keras input per feature column, keyed by column name.
inputs = {}

for name, column in titanic_features.items():
  # Decide by "is this column numeric?" rather than `dtype == object`: text
  # columns that pandas stores with a dedicated string or categorical dtype are
  # not `object` and would otherwise be wrongly declared as float32.
  if pd.api.types.is_numeric_dtype(column.dtype):
    dtype = tf.float32
  else:
    dtype = tf.string

  # Each feature arrives as one scalar per example, hence shape=(1,).
  inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

inputs


{'sex': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'sex')>,
 'age': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 'n_siblings_spouses': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'n_siblings_spouses')>,
 'parch': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'parch')>,
 'fare': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'fare')>,
 'class': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'class')>,
 'deck': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'deck')>,
 'embark_town': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'embark_town')>,
 'alone': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'alone')>}

## Preprocess Numerical Features: Normalize

In [14]:
# Keep only the symbolic inputs that were declared as float32 (the numeric features).
numeric_inputs = {
    feature_name: keras_input
    for feature_name, keras_input in inputs.items()
    if keras_input.dtype == tf.float32
}

# Join the numeric inputs into a single tensor.
x = layers.Concatenate()(list(numeric_inputs.values()))

# Fit the normalization layer on the matching DataFrame columns,
# then apply it to the concatenated symbolic tensor.
numeric_columns = list(numeric_inputs.keys())
norm = layers.Normalization()
norm.adapt(np.array(titanic[numeric_columns]))
all_numeric_inputs = norm(x)

all_numeric_inputs

<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'normalization_1')>

In [15]:
# Seed the list of preprocessed tensors with the normalized numeric block;
# the encoded string features are appended to this list in a later cell.
preprocessed_inputs = [all_numeric_inputs]

## Preprocess String Data: Vocabulary Lookup and One-Hot Encoding

In [16]:
# Start again from the normalized numeric block so that re-running this cell
# does not append a second copy of every encoded string feature.
preprocessed_inputs = [all_numeric_inputs]

# The loop variable is deliberately not called `input`: a module-level loop
# variable of that name would shadow the builtin `input()` for the whole kernel.
for name, feature_input in inputs.items():
  # Numeric features are already covered by `all_numeric_inputs`.
  if feature_input.dtype == tf.float32:
    continue

  # Map each distinct string in the column to an integer index ...
  lookup = layers.StringLookup(vocabulary=np.unique(titanic_features[name]))
  # ... and encode that index as a vector with one slot per vocabulary token.
  one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  x = lookup(feature_input)
  x = one_hot(x)
  preprocessed_inputs.append(x)


## Concatenate Normalized Numerical and Encoded String features

In [17]:
# Join the normalized numeric block and the encoded string features into one tensor.
concat_layer = layers.Concatenate()
preprocessed_inputs_cat = concat_layer(preprocessed_inputs)

# Wrap the preprocessing graph as a model from the raw inputs to that tensor.
titanic_preprocessing = tf.keras.Model(inputs=inputs, outputs=preprocessed_inputs_cat)


In [18]:
# Draw the preprocessing graph left-to-right with tensor shapes shown.
tf.keras.utils.plot_model(
    model=titanic_preprocessing,
    rankdir="LR",
    dpi=72,
    show_shapes=True,
)


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


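#### The above step plots the architecture of the preprocessing model (this requires `pydot` and Graphviz to be installed, as the message above indicates)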

## Convert pandas dataframe to a dictionary of tensors

But why?

The `titanic_features` variable is a pandas DataFrame. A DataFrame is a two-dimensional labeled data structure with columns potentially of different types. It's similar to a spreadsheet or SQL table, or a dictionary of Series objects.

Keras models don't automatically convert pandas DataFrames into tensors. A tensor is a generalization of vectors and matrices and is easily understood as a multidimensional array. In the context of TensorFlow, tensors are the primary data structure that TensorFlow uses to operate on the computational graph.

The **reason Keras doesn't automatically convert DataFrames to tensors** is that it's not clear whether the DataFrame should be converted into one tensor or a dictionary of tensors.

To resolve this, the code creates a dictionary of tensors manually from the DataFrame. This is done using a dictionary comprehension, which is a concise way to create dictionaries.

Here's what the dictionary comprehension does:

```python
titanic_features_dict = {name: np.array(value) 
                         for name, value in titanic_features.items()}
```

This code iterates over the items in the `titanic_features` DataFrame. For each item, it extracts the column name and its values, and assigns them to `name` and `value` respectively. It then uses `np.array(value)` to convert the column values into a numpy array (which is compatible with TensorFlow). The column name and the numpy array are then added as a key-value pair to the `titanic_features_dict` dictionary.

The result is a dictionary where the keys are the column names from the DataFrame, and the values are NumPy arrays of the values in those columns. Keras converts these arrays to tensors when the dictionary is passed to a model, so it can be used directly with Keras models.

In [19]:
## A DataFrame could map either to a single tensor or to one tensor per column,
## so Keras does not convert it implicitly. Build the per-column mapping
## explicitly: column name -> NumPy array of that column's values.

titanic_features_dict = {
    column_name: np.array(column_values)
    for column_name, column_values in titanic_features.items()
}


In [20]:
# Keep only the first row of every feature to get a one-example batch.
features_dict = {
    feature_name: feature_values[:1]
    for feature_name, feature_values in titanic_features_dict.items()
}
features_dict

{'sex': array(['male'], dtype=object),
 'age': array([22.]),
 'n_siblings_spouses': array([1]),
 'parch': array([0]),
 'fare': array([7.25]),
 'class': array(['Third'], dtype=object),
 'deck': array(['unknown'], dtype=object),
 'embark_town': array(['Southampton'], dtype=object),
 'alone': array(['n'], dtype=object)}

In [None]:
# Run the preprocessing model on the one-row sample to check that it accepts
# the dictionary of arrays and to inspect the resulting feature vector.
titanic_preprocessing(features_dict)

## Build Model

In [None]:
def build_titanic_model(preprocessing_head, inputs):
  """Assemble and compile the end-to-end Titanic survival classifier.

  Args:
    preprocessing_head: Callable (in this notebook, the `titanic_preprocessing`
      Keras model) applied to `inputs` to produce the tensor fed to the dense
      body.
    inputs: The symbolic `tf.keras.Input` objects (a dict keyed by feature name
      in this notebook) that become the inputs of the returned model.

  Returns:
    A compiled `tf.keras.Model` mapping `inputs` to one logit per example.
  """
  body = tf.keras.Sequential([
    # A non-linear activation is needed here: two stacked Dense layers without
    # one collapse into a single linear map, making the hidden layer redundant.
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])

  preprocessed_inputs = preprocessing_head(inputs)
  result = body(preprocessed_inputs)
  model = tf.keras.Model(inputs, result)

  # The last Dense layer has no activation, so the loss is told to expect logits.
  model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam())
  return model

# The builder has its own name so that re-running this cell works: previously
# `titanic_model` was rebound from the function to the built model, and a second
# run tried to call that model with two arguments.
titanic_model = build_titanic_model(titanic_preprocessing, inputs)


## Fit Model

In [None]:
# Train on the full dictionary of feature arrays against the 'survived' labels for 10 epochs.
titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10)
