# Tabular Data

## Import Libraries

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Normalization, StringLookup, CategoryEncoding, IntegerLookup
pd.set_option('display.max_rows', 10000)

## Load Dataset

In [None]:
# Locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
TEST_DATA_URL = 'https://storage.googleapis.com/tf-datasets/titanic/eval.csv'

# Fetch each CSV through Keras' get_file helper and keep the resulting local
# paths; they are handed to pd.read_csv in the split cell.
train_file_path = tf.keras.utils.get_file('train.csv', TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file('eval.csv', TEST_DATA_URL)

## Prepare & Analyze Data


### Train/Val/Test Split

In [None]:
# Load the training and evaluation CSVs into DataFrames.
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

# Split the evaluation file in half: one half stays the test set, the other
# becomes the validation set. A fixed random_state keeps the split identical
# between runs so that validation/test metrics are comparable.
test, val = train_test_split(test, test_size=0.5, random_state=42)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

### Exploratory Analysis

In [None]:
# Histogram of the age column, using 25 bins.
train.age.hist(bins=25)

In [None]:
# Horizontal bar chart of the number of rows per value of `sex`.
train.sex.value_counts().plot(kind='barh')

In [None]:
# Horizontal bar chart of the number of rows per value of `class`.
# Bracket access is needed because `class` is a Python keyword.
train['class'].value_counts().plot(kind='barh')

In [None]:
# Mean of `survived` for each value of `sex`, drawn as a horizontal bar chart.
# NOTE(review): mean() yields a fraction, not a percentage, if `survived` is
# 0/1 — confirm the '% survived' axis label is what is intended.
train.groupby('sex').survived.mean().plot(kind='barh').set_xlabel('% survived')

In [None]:
# First few rows where age is below 18 and class equals 'First'.
train[(train.age < 18) & (train['class'] == 'First')].head()

## `tf.data` API
- TensorFlow 2.0 documentation
    - [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
    - [tf.data.Dataset.from_tensor_slices](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices)
- This helper function will:
    - Remove label feature
    - Convert DataFrame to Tensor dataset
    - Shuffle and batch records
- Using `prefetch` lets us prepare the next batch while the model is processing the current batch
    - Note that it does not require a GPU; it works the same way when training on a CPU

In [None]:
# dataframe = train.copy()

# data = tf.data.Dataset.from_tensor_slices((dict(dataframe), dataframe.pop('survived')))
# for item in data:
#   print(item)

In [None]:
def df_to_dataset(df, shuffle=True, batch_size=3):
    """Turn a DataFrame into a batched ``tf.data.Dataset`` of (features, label).

    Args:
        df: DataFrame holding the feature columns plus a ``survived`` label
            column. The caller's frame is left untouched; a copy is used.
        shuffle: When true, shuffle the rows with a buffer as large as the
            frame.
        batch_size: Number of rows per batch.

    Returns:
        A dataset whose elements are ``(features, labels)`` batches, where
        ``features`` is a dict keyed by column name.
    """
    df = df.copy()
    labels = df.pop('survived')
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    # prefetch() counts dataset elements, and after batch() an element is a
    # whole batch. Let tf.data size the buffer instead of holding
    # `batch_size` batches in memory.
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

Now that you have created the input pipeline, let's call it to see the format of the data it returns. You have used a small batch size to keep the output readable.

In [None]:
# Build the training dataset with the helper's defaults (shuffled, batches
# of 3) and display it.
train_ds = df_to_dataset(train)
train_ds

In [None]:
# take(1) yields a single batch; unpack it into its feature dict and labels,
# then display the labels.
[(train_features, label_batch)] = train_ds.take(1)
label_batch

- You can see that we have a batch of 3 passengers.

In [None]:
# Pull one batch again and print the feature names, the ages in the batch
# and the matching labels.
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of ages:', train_features['age'])
print('A batch of labels:', label_batch )

## Feature engineering

### Numeric columns


In [None]:
# Summary statistics of the training DataFrame.
train.describe()


Let's create a function `get_normalization_layer` that returns a layer which applies featurewise normalization to numerical features.

In [None]:
def get_normalization_layer(name, dataset):
    """Return a Normalization layer fitted to one numeric feature.

    Args:
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of ``(features, label)`` pairs to learn from.

    Returns:
        A ``Normalization`` layer already adapted to the values of ``name``.
    """
    def select_feature(features, _label):
        # Keep only the requested feature and drop the label.
        return features[name]

    layer = Normalization(axis=None)

    # adapt() computes the layer's statistics up front; this is a
    # preprocessing layer, not a trainable one.
    layer.adapt(dataset.map(select_feature))
    return layer

The TensorFlow 2.0 Documentation has more information on the [adapt method](https://www.tensorflow.org/guide/keras/preprocessing_layers#the_adapt_method) if you want to find out more.

In [None]:
# The ages from the batch fetched earlier, used below as sample input.
age_column = train_features['age']
age_column

- You can see that when we pass a batch of 3 numerical ages to the layer returned by `get_normalization_layer`, they come back as normalized features.

In [None]:
# Fit a normalization layer on the 'age' feature of train_ds, then apply it
# to the sample batch of ages.
numeric_layer = get_normalization_layer('age', train_ds)
numeric_layer(age_column)

### Categorical columns


In this dataset, the town where passengers embarked is represented as a string (e.g. 'Southampton', 'Cherbourg', 'Queenstown' or 'unknown'). You cannot feed strings directly to a model. The preprocessing layer takes care of representing each string as a one-hot encoded vector.

In [None]:
# Distinct values present in the embark_town column.
train.embark_town.unique()

`get_category_encoding_layer` function returns a layer which maps values from a vocabulary to integer indices and one-hot encodes the features.

In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that one-hot encodes the categorical feature ``name``.

    Args:
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of ``(features, label)`` pairs used to learn the
            vocabulary.
        dtype: ``'string'`` selects a ``StringLookup``; any other value
            selects an ``IntegerLookup``.
        max_tokens: Optional cap on the vocabulary size, forwarded to the
            lookup layer.

    Returns:
        A function that maps a feature tensor to its encoded form by running
        it through the lookup layer and then the category encoder.
    """
    # Pick the lookup layer that turns raw values into integer indices.
    if dtype == 'string':
        index = StringLookup(max_tokens=max_tokens)
    else:
        index = IntegerLookup(max_tokens=max_tokens)

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encode every index as its own one-hot vector. The layer's default
    # 'multi_hot' mode would merge a rank-1 batch of values into a single
    # vector; 'one_hot' keeps one vector per element and produces the same
    # result for inputs shaped (batch, 1).
    encoder = CategoryEncoding(
        num_tokens=index.vocabulary_size(),
        output_mode='one_hot',
    )

    # Chain lookup and encoding and return them as a single callable.
    return lambda feature: encoder(index(feature))

The TensorFlow 2.0 Documentation has more details on [StringLookup](https://www.tensorflow.org/api_docs/python/tf/keras/layers/StringLookup), [IntegerLookup](https://www.tensorflow.org/api_docs/python/tf/keras/layers/IntegerLookup) and [CategoryEncoding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/CategoryEncoding)

In [None]:
# Standalone walk-through of the lookup step: adapt a StringLookup (no
# vocabulary cap) to the embark_town feature and print what it learned.
index = StringLookup(max_tokens=None)
feature_ds = train_ds.map(lambda x, y: x['embark_town'])
index.adapt(feature_ds)

print(f'vocabulary: {index.get_vocabulary()}')
print(f'vocabulary size: {index.vocabulary_size()}')

In [None]:
# Show the lookup layer's out-of-vocabulary token.
index.oov_token

Let's take a look at a batch of 3 entries. 

In [None]:
# The embark_town values from the batch fetched earlier, used below as
# sample input.
embark_town_column = train_features['embark_town']
embark_town_column

In [None]:
# Build the encoder for the string feature 'embark_town' from train_ds, then
# apply it to the sample batch.
categorical_layer = get_category_encoding_layer('embark_town', train_ds, 'string')
categorical_layer(embark_town_column)

### Creating a pipeline

In [None]:
# Rebuild the datasets with a larger batch size for training. Only the
# training data is shuffled; validation and test data keep their row order.
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

- Next let's take a look at our numeric and categorical features

In [None]:
# Model inputs and their preprocessed outputs, appended in matching order.
all_inputs = []
encoded_features = []

# Numeric features: one input of shape (1,) per column, normalized by a layer
# adapted on train_ds.
numeric_columns = ['age', 'n_siblings_spouses', 'parch', 'fare']
for column_name in numeric_columns:
    numeric_column = tf.keras.Input(shape=(1,), name=column_name)
    normalization_layer = get_normalization_layer(column_name, train_ds)
    encoded_numeric_column = normalization_layer(numeric_column)
    all_inputs.append(numeric_column)
    encoded_features.append(encoded_numeric_column)

# Categorical features encoded as string: one string input per column, run
# through the lookup + category encoding built from train_ds.
# max_tokens=5 is forwarded to the lookup layer as its vocabulary cap;
# presumably values beyond the cap fall back to the OOV index — confirm this
# is acceptable for every column listed here.
categorical_columns = ['sex', 'class', 'embark_town', 'deck', 'alone']
for column_name in categorical_columns:
    categorical_column = tf.keras.Input(shape=(1,), name=column_name, dtype='string')
    encoding_layer = get_category_encoding_layer(column_name, train_ds, dtype='string', max_tokens=5)
    encoded_categorical_column = encoding_layer(categorical_column)
    all_inputs.append(categorical_column)
    encoded_features.append(encoded_categorical_column)

Let's take a look at the list `all_inputs` to see what we have. You can see these are the different columns.

In [None]:
all_inputs

The `encoded_features` list holds the output of each preprocessing layer: normalized values for the numeric columns and category encodings for the categorical columns.

In [None]:
encoded_features

## Train and evaluate the model

### Design Model
- Working with tabular data requires more preprocessing than imagery
- However the model design is pretty similar to Fashion-MNIST
- Some differences
    - Don't need to flatten because we're not using grid values
    - Uses a dropout to prevent overfitting
    - Final output is only a single node
    - Switched to `BinaryCrossentropy` for loss function


In [None]:
# Build and compile the classifier.
# Join every preprocessed feature into one tensor for the dense layers.
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(128, activation='relu')(all_features)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
# Single output unit with no activation, so the model emits a raw logit.
output = tf.keras.layers.Dense(1)(x)

# Tie the raw inputs to the output.
model = tf.keras.Model(all_inputs, output)

# from_logits=True matches the activation-free output layer above.
# NOTE(review): the string metric 'accuracy' may threshold the raw logit at
# 0.5 rather than 0 — confirm the reported accuracy is what is intended.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Draw the model graph with tensor shapes shown.
# rankdir='LR' is used to make the graph horizontal.
tf.keras.utils.plot_model(model, show_shapes=True, rankdir='LR')

### Train and evaluate model

In [None]:
# Train on train_ds for 10 epochs, using val_ds as the validation data.
model.fit(train_ds, validation_data=val_ds, epochs=10)

In [None]:
# Score the trained model on the test dataset and print its accuracy.
loss, accuracy = model.evaluate(test_ds)
print('Accuracy', accuracy)

In [None]:
# Save the trained model under the path 'classifier'.
model.save('classifier')