In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow import feature_column 

import os
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import load_model

#### Load Dataset

In [None]:
# Read heart.csv from the URL below into a pandas DataFrame and preview its first rows.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
dataframe.head()

In [None]:
# The first step for a classification model is to check the class balance of the dataset.
# There are many ways to handle imbalance; for now the dataset is treated as-is.
# np.bincount returns the per-class counts; the two-way unpack expects exactly two
# classes (labels 0 and 1) in the 'target' column.
class_counts = np.bincount(dataframe['target'])
neg, pos = class_counts
total = neg + pos

pos_pct = 100 * pos / total
neg_pct = 100 * neg / total

summary_template = 'Total Number Of Samples: {}\n Positive: {} ({:.2f}% of total)\n Negative:{} ({:.2f}% of total)'
print(summary_template.format(total, pos, pos_pct, neg, neg_pct))

#### Prepare Train, Test, Val datasets

In [None]:
# Fix the seed so the train/validation/test partition (and therefore every metric
# computed downstream) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

#### Create input pipeline using tf.data Module. 
    
In practice, you would typically use Spark to fetch the data from the data lake and then use tf.data to read it directly from disk.

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs from a DataFrame.

    Args:
        dataframe: pandas DataFrame holding the feature columns plus the label column.
            It is copied first, so the caller's frame is not modified.
        shuffle: When True, shuffle the examples using a buffer as large as the frame.
        batch_size: Number of examples per batch.
        label_column: Name of the column used as the label. Defaults to ``'target'``,
            which matches the previous hard-coded behaviour.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``(dict mapping column name -> feature batch, label batch)``.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    # Work on a copy: pop() below removes the label column in place.
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        # A buffer covering every row gives a full shuffle of the dataset.
        ds = ds.shuffle(buffer_size=len(features))
    ds = ds.batch(batch_size)
    return ds

In [None]:
# Build the three input pipelines with a shared batch size. The training set uses the
# helper's default shuffling; validation and test are created with shuffle=False.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Peek at a single training batch to sanity-check the pipeline: the feature names,
# the values of one feature ('age'), and the corresponding labels.
for feature_batch, label_batch in train_ds.take(1):
    print('Features:', list(feature_batch.keys()))
    print('Ages:', feature_batch['age'])
    print('Targets:', label_batch )

#### Preprocess data with the TensorFlow Transform module

TensorFlow Transform is a library for preprocessing input data for TensorFlow, including creating features that require a full pass over the training dataset. 

For example, using TensorFlow Transform you could:

    Normalize an input value by using the mean and standard deviation
    Convert strings to integers by generating a vocabulary over all of the input values
    Convert floats to integers by assigning them to buckets, based on the observed data distribution
    
    
The output of tf.Transform is exported as a TensorFlow graph which you can use for both training and serving. Using the same graph for both training and serving can prevent skew, since the same transformations are applied in both stages.

The TensorFlow Transform module works with Apache Beam. (Different companies use different approaches.)

Another option is to use Spark for the transformations.

Yet another approach is to preprocess the data with tf.feature_column.

##### tf.feature_column module demo

In [None]:
def demo(feature_column):
    """Print how a feature column transforms one batch of the training data.

    A ``DenseFeatures`` layer is built from ``feature_column`` and applied to the
    features of the next batch drawn from the module-level ``train_ds``; the
    resulting dense array is printed.

    Args:
        feature_column: The feature column (or list of columns) to demonstrate.
            Note that inside this function the name refers to the argument, not to
            the imported ``feature_column`` module.
    """
    dense_layer = keras.layers.DenseFeatures(feature_column, dtype="float64")
    # Each dataset element is a (features, labels) pair; only the features are needed.
    example_batch = next(iter(train_ds))
    example_features = example_batch[0]
    print(dense_layer(example_features).numpy())

In [None]:
# numeric_column => Represents real-valued (numerical) features; the input is passed through unchanged.
# `age` is reused below as the source column for the bucketized column.
age = feature_column.numeric_column("age")
demo(age)

In [None]:
# bucketized_column => Represents discretized dense input, bucketed by the given boundaries.
# Ten boundaries produce eleven buckets (below 18, 18-25, ..., 65 and above).
age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
age_buckets = feature_column.bucketized_column(age, boundaries=age_boundaries)
demo(age_buckets)

In [None]:
# Categorical feature column helpers:
#   categorical_column_with_vocabulary_list => A CategoricalColumn with an in-memory vocabulary.
#   categorical_column_with_hash_bucket     => Represents a sparse feature whose ids are set by hashing.
#   categorical_column_with_identity        => A CategoricalColumn that returns identity values.
#   categorical_column_with_vocabulary_file => A CategoricalColumn with a vocabulary file.
#   embedding_column                        => DenseColumn that converts from sparse, categorical input.
thal_vocabulary = ['fixed', 'normal', 'reversible']
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=thal_vocabulary,
)

# Indicator (one-hot style) representation of the categorical column.
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)

# Dense 8-dimensional embedding of the same categorical column.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
demo(thal_embedding)

In [None]:
# crossed_column => Returns a column that crosses several categorical features,
# i.e. combines them into a single feature. The crossed values are hashed into
# `hash_bucket_size` buckets.
crossed_feature = feature_column.crossed_column(
    [age_buckets, thal],
    hash_bucket_size=1000,
)

# A crossed column is categorical, so wrap it in an indicator column to get a dense view.
crossed_indicator = feature_column.indicator_column(crossed_feature)
demo(crossed_indicator)

###### Apply the above feature column transformations to the current task

In [None]:
# Choose the columns used to train the model.

# Numeric columns are passed through unchanged.
numeric_headers = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']
feature_columns = [feature_column.numeric_column(name) for name in numeric_headers]

# Bucketized age and the embedded `thal` categorical column.
feature_columns.append(age_buckets)
feature_columns.append(thal_embedding)

# Age-bucket x thal cross, wrapped in an indicator column so it can feed a dense layer.
crossed_feature = feature_column.indicator_column(
    feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
)
feature_columns.append(crossed_feature)

# Show every selected column for a quick visual check.
for column in feature_columns:
    print(column)

In [None]:
# Create the feature layer: a DenseFeatures layer built from the selected feature columns.
# It is used as the first layer of the model so raw feature dicts can be fed in directly.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

#### Create, compile, and train the model 

A Model groups layers into an object with training and inference features.

Methods:
    
    compile => Configures the model for training.

        optimizer
        loss function
        metrics
        
    fit => Trains the model for a fixed number of epochs (iterations on a dataset).
        
                      x: Input Data
                      y: Target Data
             batch_size: Number of samples per gradient update
                 epochs: An epoch is an iteration over the entire x and y data provided
              callbacks: List of callbacks to apply during training
       validation_split: Float between 0 and 1. Fraction of the training data to be used as validation data.
        validation_data: Data on which to evaluate the loss and any model metrics at the end of each epoch. 
          initial_epoch: Epoch at which to start training (useful for resuming a previous training run).
        steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next.
      
    evaluate => Returns the loss value & metrics values for the model in test mode. Computation is done in batches.
    
    predict  => Generates output predictions for the input samples.
    
    save     => Saves the model to a TensorFlow SavedModel or a single HDF5 file.
   

In [None]:
def create_model():
    """Build and compile the binary classifier used in this notebook.

    The network is the module-level ``feature_layer`` followed by three ReLU hidden
    layers (128, 64 and 32 units) and a single sigmoid output unit. It is compiled
    with the Adam optimizer, binary cross-entropy on probabilities
    (``from_logits=False``) and the accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    hidden_units = (128, 64, 32)

    # The feature layer is the input to the model.
    layer_stack = [feature_layer]
    for units in hidden_units:
        layer_stack.append(tf.keras.layers.Dense(units, activation='relu'))
    layer_stack.append(tf.keras.layers.Dense(1, activation="sigmoid"))

    model = tf.keras.Sequential(layer_stack)
    bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=bce_loss, metrics=['accuracy'])
    return model


model = create_model()

##### Fit Model and View Training Loss With Epochs

In [None]:
# Train for a fixed number of epochs, evaluating on the validation set at the end of
# each epoch. `history` keeps the per-epoch metrics that are plotted further down.
epochs = 13
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Generate the model's output predictions for every example in the test dataset.
result = model.predict(test_ds)

In [None]:
# Plot the per-epoch training/validation accuracy and loss recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Derive the x-axis from the recorded history rather than from the `epochs` setting,
# so the curves still line up if training stopped early or `epochs` was edited after
# the model was fitted.
epochs_range = range(len(acc))

fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(8, 8))

acc_ax.plot(epochs_range, acc, label='Training Accuracy')
acc_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_title('Training and Validation Accuracy')

loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')

# Keep the two panels' titles and axis labels from overlapping.
fig.tight_layout()
plt.show()

#### Saving a TensorFlow model

The phrase "Saving a TensorFlow model" typically means one of two things:

    Checkpoints
    SavedModel

Checkpoints capture the exact value of all parameters (tf.Variable objects) used by a model. Checkpoints do not contain any description of the computation defined by the model and thus are typically only useful when source code that will use the saved parameter values is available.

The SavedModel format on the other hand includes a serialized description of the computation defined by the model in addition to the parameter values (checkpoint). Models in this format are independent of the source code that created the model. They are thus suitable for deployment via TensorFlow Serving, TensorFlow Lite, TensorFlow.js, or programs in other programming languages (the C, C++, Java, Go, Rust, C# etc. TensorFlow APIs).