In [None]:
## What-If Tool from scratch - From CSV to trained model to What-If Tool usage

# This notebook shows the process of loading up a dataset from CSV, training a very simple classifier to
# predict one of the columns, then using the What-If Tool (WIT) to analyze the training dataset and the trained
# model.

# It is shown with both the UCI census binary classification task and the UCI iris multiclass classification task.

## Setup (install Jupyter, TF, and TF Serving in a virtualenv).
# NOTE: Using a virtualenv, installing TensorFlow with pip, and running TF Serving through Docker isn't the
# only way to set all this up; it is just the setup I find simplest to use.

# virtualenv tf
# source tf/bin/activate
# pip install --upgrade pip
# pip install jupyter
# pip install pandas
# pip install tensorflow (or tensorflow-gpu)
# docker pull tensorflow/serving

In [None]:
## Define helper functions

import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import data

# Writes a pandas dataframe to disk as a tfrecord file of tf.Example protos,
# using only the dataframe columns specified. Non-numeric columns are treated
# as strings.
def write_df_as_tfrecord(df, filename, columns):
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    writer = tf.python_io.TFRecordWriter(filename)
    for index, row in df.iterrows():
        example = tf.train.Example()
        for col in columns:
            if df[col].dtype is np.dtype(np.int64):
                example.features.feature[col].int64_list.value.append(row[col])
            elif df[col].dtype is np.dtype(np.float64):
                example.features.feature[col].float_list.value.append(row[col])
            else:
                example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))
        writer.write(example.SerializeToString())
    writer.close()


# Creates a tf feature spec from the dataframe and columns specified.
def create_feature_spec(df, columns):
    feature_spec = {}
    for f in columns:
        if df[f].dtype is np.dtype(np.int64):
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.int64)
        elif df[f].dtype is np.dtype(np.float64):
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.float32)
        else:
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.string)
    return feature_spec

# Parses a serialized tf.Example into input features and target feature from 
# the provided label feature name and feature spec.
def parse_tf_example(example_proto, label, feature_spec):
    parsed_features = tf.parse_example(serialized=example_proto, features=feature_spec)
    target = parsed_features.pop(label)
    return parsed_features, target

# An input function for providing input to a model from tf.Examples from tf record files.
def tfrecords_input_fn(files_name_pattern, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None, 
                       batch_size=64):
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TFRecordDataset(filenames=file_names)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda tf_example: parse_tf_example(tf_example, label, feature_spec))
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, target

# Creates simple numeric and categorical feature columns from a feature spec and a
# list of columns from that spec to use.
#
# NOTE: Models might perform better with some feature engineering such as bucketed
# numeric columns and hash-bucket/embedding columns for categorical features.
def create_feature_columns(columns, feature_spec):
    ret = []
    for col in columns:
        if feature_spec[col].dtype is tf.int64 or feature_spec[col].dtype is tf.float32:
            ret.append(tf.feature_column.numeric_column(col))
        else:
            ret.append(tf.feature_column.categorical_column_with_vocabulary_list(col, list(df[col].unique())))
    return ret
    

In [None]:
## UCI Census - Prepare for model training - This is the only cell that is dataset-specific.

tfrecord_path = './data/adult.tfrecord'
label_col = 'Target'
model_path = './uci_model'
n_classes = 2

# Read data from CSV to dataframe
csv_columns = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital-Status",
               "Occupation", "Relationship", "Race", "Sex", "Capital-Gain", "Capital-Loss",
               "Hours-per-week", "Country", "Target"]
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=csv_columns,
    skipinitialspace=True)

# Make the label column numeric (0 and 1), for use in our model
df[label_col] = np.where(df[label_col] == '<=50K', 0, 1)

# Get list of all columns from the dataset we will use for model input or output.
# We will ignore the fnlwgt column in the dataset for training this model.
features_and_labels = [f for f in df.columns.values.tolist() if f != 'fnlwgt']


In [None]:
## Iris - Prepare for model training - This is the only cell that is dataset-specific

tfrecord_path = './data/iris.tfrecord'
label_col = 'class'
model_path = './iris_model'
n_classes = 3

# Read data from CSV to dataframe
csv_columns = ["sepal-length", "sepal-width", "pedal-length", "pedal-width", "class-str"]
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    names=csv_columns)

# Make the label column numeric for use in our model
df[label_col] = np.where(df["class-str"] == 'Iris-setosa', 0, np.where(df["class-str"] == 'Iris-versicolor', 1, 2)).astype(int)

# Get list of all columns from the dataset we will use for model input or output.
features_and_labels = [f for f in df.columns.values.tolist() if f != 'class-str']

In [None]:
## Create and train the classifier

import functools

# Write the records to disk as tf.Example protos in tf record file, for use in model training
# and later for use by WIT.
write_df_as_tfrecord(df, tfrecord_path, features_and_labels)

# Create a feature spec for the classifier
feature_spec = create_feature_spec(df, features_and_labels)

print feature_spec

# Create a list of just the input features the classifier will use (removing the label feature)
features = [f for f in features_and_labels if f != label_col]

# Define and train the classifier
train_inpf = functools.partial(tfrecords_input_fn, tfrecord_path, feature_spec, label_col)
classifier = tf.estimator.LinearClassifier(feature_columns=create_feature_columns(features, feature_spec),
                                           n_classes=n_classes)
classifier.train(train_inpf, steps=10000)

In [None]:
## Save the classifier to disk for serving

# Uses a parsing serving input receiver function so that it can classify from serialized tf.Examples
# using the TensorFlow Serving Classify API.

serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
classifier.export_savedmodel(model_path, serving_input_fn)

In [None]:
## What-If Tool usage instructions (serve model, launch TensorBoard, configure What-If Tool)

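# Serve the exported model. Replace [model_path] with the absolute path of the model_path directory
# used above (docker bind mounts require an absolute source path):
# sudo docker run -p 8500:8500 --mount type=bind,source=[model_path],target=/models/my_model/ -e MODEL_NAME=my_model -t tensorflow/serving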
# tensorboard --logdir .
# Navigate to http://localhost:6006/#whatif&inferenceAddress=localhost%3A8500&modelName=my_model
# Set examples path to tfrecord_path and click accept button