# You must run the Airflow `taxi` pipeline before you continue!

![Completed Airflow DAG](https://raw.githubusercontent.com/PipelineAI/site/master/assets/img/airflow-dag-complete.png)

# Feature Analysis

Use the code below to run TensorFlow Transform on some example data using the schema from your pipeline. Start by importing the required libraries and opening the pipeline's metadata store.

In [2]:
!pip install tfx matplotlib networkx pandas

Collecting tfx
[?25l  Downloading https://files.pythonhosted.org/packages/87/46/16a252013f055ec7c9f8956c081551ac69c5316cc45c56649cace173f8d5/tfx-0.13.0-py2.py3-none-any.whl (212kB)
[K    100% |████████████████████████████████| 215kB 8.5MB/s ta 0:00:01
[?25hCollecting matplotlib
[?25l  Downloading https://files.pythonhosted.org/packages/da/83/d989ee20c78117c737ab40e0318ea221f1aed4e3f5a40b4f93541b369b93/matplotlib-3.1.0-cp36-cp36m-manylinux1_x86_64.whl (13.1MB)
[K    100% |████████████████████████████████| 13.1MB 3.1MB/s eta 0:00:01
[?25hCollecting networkx
[?25l  Downloading https://files.pythonhosted.org/packages/85/08/f20aef11d4c343b557e5de6b9548761811eb16e438cee3d32b1c66c8566b/networkx-2.3.zip (1.7MB)
[K    100% |████████████████████████████████| 1.8MB 12.2MB/s ta 0:00:01
Collecting ml-metadata<0.14,>=0.13.2 (from tfx)
[?25l  Downloading https://files.pythonhosted.org/packages/e7/7f/0e0eb09e0191bd439fb7bbaf06908d6f5b403bd2b2d812949cdb54a985fe/ml_metadata-0.13.2-cp36-cp36m-ma

Building wheels for collected packages: networkx


  Building wheel for networkx (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/de/63/64/3699be2a9d0ccdb37c7f16329acf3863fd76eda58c39c737af
Successfully built networkx
Installing collected packages: ml-metadata, tensorflow-model-analysis, tfx, cycler, kiwisolver, matplotlib, networkx
  Found existing installation: tensorflow-model-analysis 0.13.1
    Uninstalling tensorflow-model-analysis-0.13.1:
      Successfully uninstalled tensorflow-model-analysis-0.13.1
Successfully installed cycler-0.10.0 kiwisolver-1.1.0 matplotlib-3.1.0 ml-metadata-0.13.2 networkx-2.3 tensorflow-model-analysis-0.13.2 tfx-0.13.0
[33mYou are using pip version 19.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
from __future__ import print_function

import os
import tempfile
import pandas as pd

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform import beam as tft_beam
import tfx_utils
from tfx.utils import io_utils
from tensorflow_metadata.proto.v0 import schema_pb2

# For DatasetMetadata boilerplate
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import schema_utils

def _make_default_sqlite_uri(pipeline_name):
    return os.path.join('/mnt/pipelineai/users/airflow-dags/tfx/metadata', pipeline_name, 'metadata.db')

def get_metadata_store(pipeline_name):
    """Open the read-only TFX metadata store of a pipeline.

    Args:
      pipeline_name: name of the pipeline whose default SQLite metadata
        database should be opened.

    Returns:
      The `tfx_utils.TFXReadonlyMetadataStore` created from that database.
    """
    sqlite_uri = _make_default_sqlite_uri(pipeline_name)
    return tfx_utils.TFXReadonlyMetadataStore.from_sqlite_db(sqlite_uri)

# Name of the pipeline whose metadata is inspected; it selects the
# sub-directory that holds the pipeline's metadata.db.
pipeline_name = 'taxi'

# Print the database path so it can be checked before the store is opened.
pipeline_db_path = _make_default_sqlite_uri(pipeline_name)
print('Pipeline DB:\n{}'.format(pipeline_db_path))

# Read-only metadata store handle; later cells query it for artifacts.
store = get_metadata_store(pipeline_name)

Pipeline DB:
/mnt/pipelineai/users/airflow-dags/tfx/metadata/taxi/metadata.db


Get the schema URI from the metadata store

In [5]:
# Get the schema URI from the metadata store
schemas = store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.SCHEMA)
# Exactly one schema artifact is expected; fail with a readable message
# otherwise instead of silently picking the first one.
assert len(schemas.URI) == 1, (
    'Expected exactly one schema artifact, found {}'.format(len(schemas.URI)))
# Use os.path.join rather than string concatenation so the path is correct
# whether or not the artifact URI ends with a trailing slash.
schema_uri = os.path.join(schemas.URI.iloc[0], 'schema.pbtxt')
print('Schema URI:\n{}'.format(schema_uri))

Schema URI:
/mnt/pipelineai/users/airflow-dags/tfx/pipelines/taxi/SchemaGen/output/3/schema.pbtxt


Get the schema that was inferred by TensorFlow Data Validation

In [6]:
# TODO: This cell only works once /mnt/pipelineai/users is mounted, because
# schema_uri points to a file under that directory.

# Parse the schema file at schema_uri into a Schema proto.
schema_proto = io_utils.parse_pbtxt_file(file_name=schema_uri, message=schema_pb2.Schema())
# Derive the feature spec and the feature domains from the schema proto.
feature_spec, domains = schema_utils.schema_as_feature_spec(schema_proto)
# Wrap them in the DatasetMetadata that is fed to AnalyzeAndTransformDataset
# together with the raw examples.
legacy_metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(feature_spec, domains))

Define features and create functions for TensorFlow Transform

In [7]:
# Categorical features are assumed to each have a maximum value in the dataset.
_MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]

_CATEGORICAL_FEATURE_KEYS = [
    'trip_start_hour', 'trip_start_day', 'trip_start_month',
    'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
    'dropoff_community_area'
]

_DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

# Number of buckets used by tf.transform for encoding each feature.
_FEATURE_BUCKET_COUNT = 10

_BUCKET_FEATURE_KEYS = [
    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
    'dropoff_longitude'
]

# Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
_VOCAB_SIZE = 1000

# Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
_OOV_SIZE = 10

_VOCAB_FEATURE_KEYS = [
    'payment_type',
    'company',
]

# Keys
_LABEL_KEY = 'tips'
_FARE_KEY = 'fare'


def _transformed_name(key):
  return key + '_xf'

def _transformed_names(keys):
  """Return the transformed feature name for every key in `keys`.

  Args:
    keys: iterable of raw feature keys.

  Returns:
    A list with `_transformed_name(key)` for each key, in the same order.
  """
  return list(map(_transformed_name, keys))

def _fill_in_missing(x):
  """Replace missing values in a SparseTensor.

  Fills in missing values of `x` with '' or 0, and converts to a dense tensor.

  Args:
    x: A `SparseTensor` of rank 2.  Its dense shape should have size at most 1
      in the second dimension.

  Returns:
    A rank 1 tensor where missing values of `x` have been filled in.
  """
  # Strings are padded with the empty string, every other dtype with 0.
  default_value = '' if x.dtype == tf.string else 0
  # `tf.sparse_to_dense` is deprecated; as its deprecation notice instructs,
  # build the SparseTensor explicitly and densify it with
  # `tf.sparse.to_dense`.  The dense shape is forced to [rows, 1] so that the
  # second dimension can be squeezed away below.
  sparse = tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1])
  return tf.squeeze(tf.sparse.to_dense(sparse, default_value), axis=1)

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Every output is stored under the raw feature key with the '_xf' suffix
  (see `_transformed_name`).

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Densify the feature (missing values are filled with 0 by
    # _fill_in_missing) and scale the result to a z-score.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature and map each value to its index;
    # unrecognized values are hashed into the out-of-vocabulary buckets.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    # Replace the value by the index of the bucket it falls into.
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    # Categorical features are only densified, not transformed further.
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      # `tf.math.is_nan` is the non-deprecated name of `tf.is_nan`.
      tf.math.is_nan(taxi_fare),
      # A NaN fare always yields label 0.
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs


Display the results of transforming some example data

In [8]:
from IPython.display import display

# One hand-written raw example; every feature value is a one-element list.
raw_examples = [{
    "fare": [100.0],
    "trip_start_hour": [12],
    "pickup_census_tract": ['abcd'],
    "dropoff_census_tract": [12345.0],  # given as a float here; reason unknown
    "company": ['taxi inc.'],
    "trip_start_timestamp": [123456],
    "pickup_longitude": [12.0],
    "trip_start_month": [5],
    "trip_miles": [8.0],
    "dropoff_longitude": [12.05],
    "dropoff_community_area": [123],
    "pickup_community_area": [123],
    "payment_type": ['visa'],
    "trip_seconds": [600.0],
    "trip_start_day": [12],
    "tips": [10.0],
    "pickup_latitude": [80.0],
    "dropoff_latitude": [80.01],
}]

# tf.Transform writes its intermediate files into a fresh temporary directory.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Analyze the raw examples and apply preprocessing_fn to them in one step.
    (transformed_examples, transformed_metadata), transform_fn = (
        (raw_examples, legacy_metadata)
        | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))
    # Show the transformed features as a table.
    display(pd.DataFrame(transformed_examples))

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


W0615 20:39:20.869966 140498720569152 deprecation.py:323] From <ipython-input-7-d1598abb6e5e>:57: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


Instructions for updating:
Use tf.cast instead.


W0615 20:39:20.971487 140498720569152 deprecation.py:323] From /opt/conda/lib/python3.6/site-packages/tensorflow_transform/mappers.py:1027: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


W0615 20:39:21.122573 140498720569152 deprecation.py:323] From /opt/conda/lib/python3.6/site-packages/tensorflow/python/saved_model/signature_def_utils_impl.py:205: build_tensor_info (from tensorflow.python.saved_model.utils_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.


INFO:tensorflow:Assets added to graph.


I0615 20:39:21.130954 140498720569152 builder_impl.py:654] Assets added to graph.


INFO:tensorflow:No assets to write.


I0615 20:39:21.133599 140498720569152 builder_impl.py:449] No assets to write.


INFO:tensorflow:SavedModel written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/7ff7e38154c443c39bef030d005abc27/saved_model.pb


I0615 20:39:21.168106 140498720569152 builder_impl.py:414] SavedModel written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/7ff7e38154c443c39bef030d005abc27/saved_model.pb


INFO:tensorflow:Assets added to graph.


I0615 20:39:23.401020 140498720569152 builder_impl.py:654] Assets added to graph.


INFO:tensorflow:No assets to write.


I0615 20:39:23.403881 140498720569152 builder_impl.py:449] No assets to write.


INFO:tensorflow:SavedModel written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/00a1653653b54f92944ca0fa43dcac39/saved_model.pb


I0615 20:39:23.437595 140498720569152 builder_impl.py:414] SavedModel written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/00a1653653b54f92944ca0fa43dcac39/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0615 20:39:28.398238 140498720569152 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0615 20:39:31.761635 140498720569152 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


I0615 20:39:31.791769 140498720569152 builder_impl.py:654] Assets added to graph.


INFO:tensorflow:Assets written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/1ce171579e2c4d29abf4aad938997394/assets


I0615 20:39:31.794975 140498720569152 builder_impl.py:763] Assets written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/1ce171579e2c4d29abf4aad938997394/assets


INFO:tensorflow:SavedModel written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/1ce171579e2c4d29abf4aad938997394/saved_model.pb


I0615 20:39:31.839801 140498720569152 builder_impl.py:414] SavedModel written to: /var/tmp/tmp6pa8lflc/tftransform_tmp/1ce171579e2c4d29abf4aad938997394/saved_model.pb


value: "\n\013\n\tConst_9:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



W0615 20:39:31.929422 140498720569152 ops.py:6153] Expected binary or unicode string, got type_url: "type.googleapis.com/tensorflow.AssetFileDef"
value: "\n\013\n\tConst_9:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



W0615 20:39:31.932003 140498720569152 ops.py:6153] Expected binary or unicode string, got type_url: "type.googleapis.com/tensorflow.AssetFileDef"
value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0615 20:39:31.933852 140498720569152 saver.py:1483] Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_9:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



W0615 20:39:32.517777 140498720569152 ops.py:6153] Expected binary or unicode string, got type_url: "type.googleapis.com/tensorflow.AssetFileDef"
value: "\n\013\n\tConst_9:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



W0615 20:39:32.519882 140498720569152 ops.py:6153] Expected binary or unicode string, got type_url: "type.googleapis.com/tensorflow.AssetFileDef"
value: "\n\014\n\nConst_10:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0615 20:39:32.522027 140498720569152 saver.py:1483] Saver not created because there are no variables in the graph to restore


Unnamed: 0,company_xf,dropoff_census_tract_xf,dropoff_community_area_xf,dropoff_latitude_xf,dropoff_longitude_xf,fare_xf,payment_type_xf,pickup_census_tract_xf,pickup_community_area_xf,pickup_latitude_xf,pickup_longitude_xf,tips_xf,trip_miles_xf,trip_seconds_xf,trip_start_day_xf,trip_start_hour_xf,trip_start_month_xf
0,0,12345.0,123.0,1,1,0.0,0,b'abcd',123,1,1,0,0.0,0.0,12,12,5
