In [1]:
import tensorflow as tf
import pandas as pd
import shutil

# Record the TensorFlow version in the cell output so readers can see which
# release produced the results below.
print(tf.__version__)

1.15.0


In [2]:
# Copy every CSV under the taxifare/small prefix of the GCS bucket into the
# current working directory.
!gsutil cp gs://cloud-training-demos/taxifare/small/*.csv .
# List the CSV files now present locally to confirm the copy.
!ls -l *.csv

Copying gs://cloud-training-demos/taxifare/small/taxi-test.csv...
Copying gs://cloud-training-demos/taxifare/small/taxi-train.csv...              
Copying gs://cloud-training-demos/taxifare/small/taxi-valid.csv...              
- [3 files][ 10.9 MiB/ 10.9 MiB]                                                
Operation completed over 3 objects/10.9 MiB.                                     
-rw-r--r-- 1 jupyter jupyter 1799474 Dec  5 11:50 taxi-test.csv
-rw-r--r-- 1 jupyter jupyter 7986353 Dec  5 11:50 taxi-train.csv
-rw-r--r-- 1 jupyter jupyter 1673742 Dec  5 11:50 taxi-valid.csv


In [3]:
# Read the train / validation / test splits downloaded into the working directory.
df_train = pd.read_csv("./taxi-train.csv")
df_valid = pd.read_csv("./taxi-valid.csv")
df_test = pd.read_csv("./taxi-test.csv")

# Column names come from the header row of the training file.
CSV_COLUMN_NAMES = df_train.columns.tolist()
print(CSV_COLUMN_NAMES)

# The first column is the label; every remaining column is an input feature.
LABEL_NAME, *FEATURE_NAMES = CSV_COLUMN_NAMES

['fare_amount', 'dayofweek', 'hourofday', 'pickuplon', 'pickuplat', 'dropofflon', 'dropofflat']


# Feature Columns

In [4]:
# Treat every input feature as a plain numeric column; the feature name is
# passed as the column's `key` (the first parameter of numeric_column).
feature_columns = list(map(tf.feature_column.numeric_column, FEATURE_NAMES))
feature_columns

[NumericColumn(key='dayofweek', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='hourofday', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='pickuplon', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='pickuplat', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='dropofflon', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='dropofflat', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [5]:
def train_input_fn(df, batch_size = 128):
    """Build the training input pipeline for the Estimator API.

    Args:
        df: DataFrame holding every column in FEATURE_NAMES plus LABEL_NAME.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset of (features dict, label) batches. Examples are
        shuffled through a 1000-element buffer and repeated without limit,
        so the caller has to bound training (e.g. with `steps`).
    """
    # Split the frame into the (features, label) pair the Estimator expects.
    features = dict(df[FEATURE_NAMES])
    labels = df[LABEL_NAME]
    examples = tf.data.Dataset.from_tensor_slices(tensors = (features, labels))

    # Shuffle, then repeat, then batch -- in that order.
    examples = examples.shuffle(buffer_size = 1000)
    examples = examples.repeat(count = None)
    return examples.batch(batch_size = batch_size)

def eval_input_fn(df, batch_size = 128):
    """Build the evaluation input pipeline for the Estimator API.

    Args:
        df: DataFrame holding every column in FEATURE_NAMES plus LABEL_NAME.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset of (features dict, label) batches, in row order
        and without shuffling or repetition.
    """
    # Split the frame into the (features, label) pair the Estimator expects.
    features = dict(df[FEATURE_NAMES])
    labels = df[LABEL_NAME]
    examples = tf.data.Dataset.from_tensor_slices(tensors = (features, labels))

    return examples.batch(batch_size = batch_size)

def predict_input_fn(df, batch_size = 128):
    """Build the prediction input pipeline for the Estimator API.

    Args:
        df: DataFrame holding every column in FEATURE_NAMES.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset of feature-dict batches; no label is attached.
    """
    # Only the feature columns are needed at prediction time.
    features = dict(df[FEATURE_NAMES])
    return tf.data.Dataset.from_tensor_slices(tensors = features).batch(batch_size = batch_size)

# Define the Estimator

In [7]:
# Directory where the estimator writes its checkpoints.
OUTDIR = "taxi_trained"

# Fix the TensorFlow random seed for reproducibility.
run_config = tf.estimator.RunConfig(tf_random_seed = 1)

# Linear regression over the numeric feature columns.
model = tf.estimator.LinearRegressor(
    feature_columns = feature_columns,
    model_dir = OUTDIR,
    config = run_config,
)

INFO:tensorflow:Using config: {'_master': '', '_session_creation_timeout_secs': 7200, '_num_worker_replicas': 1, '_task_id': 0, '_experimental_max_worker_delay_secs': None, '_is_chief': True, '_global_id_in_cluster': 0, '_protocol': None, '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_task_type': 'worker', '_evaluation_master': '', '_tf_random_seed': 1, '_experimental_distribute': None, '_num_ps_replicas': 0, '_train_distribute': None, '_model_dir': 'taxi_trained', '_eval_distribute': None, '_save_checkpoints_secs': 600, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa6909f97f0>, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_device_fn': None}


# Train the Model

In [8]:
# Start from an empty model directory. Estimator.train() resumes from any
# checkpoint already present in OUTDIR and counts `steps` on top of it, so
# without this a re-run of the cell would continue the previous run (or fail
# on checkpoints left by a different model) instead of training from scratch.
shutil.rmtree(path = OUTDIR, ignore_errors = True)

# Train for 500 steps on shuffled batches of the training split. The call
# returns the estimator itself, which the notebook displays as the cell result.
model.train(
    input_fn = lambda: train_input_fn(df = df_train), 
    steps = 500)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into taxi_trained/model.ckpt.
INFO:tensorflow:loss = 22720.79, step = 1
INFO:tensorflow:global_step/sec: 156.921
INFO:tensorflow:loss = 6976.6094, step = 101 (0.640 sec)
INFO:tensorflow:global_step/sec: 237.201
INFO:tensorflow:loss = 14335.657, step = 201 (0.421 sec)
INFO:

<tensorflow_estimator.python.estimator.canned.linear.LinearRegressor at 0x7fa6909f95c0>

# Evaluate

In [10]:
def print_rmse(model, df):
    """Evaluate `model` on `df` and print the root-mean-squared error.

    Args:
        model: Object exposing `evaluate(input_fn=...)` that returns a dict
            containing an "average_loss" entry.
        df: DataFrame handed to `eval_input_fn` to build the evaluation data.

    Returns:
        None; the RMSE is written to stdout.
    """
    eval_metrics = model.evaluate(input_fn = lambda: eval_input_fn(df))
    # RMSE is the square root of the reported average loss.
    rmse = eval_metrics["average_loss"] ** 0.5
    print("RMSE on dataset = {}".format(rmse))
# Report the current model's RMSE on the validation split.
print_rmse(model = model, df = df_valid)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-12-05T11:56:19Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from taxi_trained/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-12-05-11:56:20
INFO:tensorflow:Saving dict for global step 500: average_loss = 89.48336, global_step = 500, label/mean = 11.229713, loss = 11435.183, prediction/mean = 12.608461
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 500: taxi_trained/model.ckpt-500
RMSE on dataset = 9.459564487360257


# Predict

In [11]:
# Predict on the first ten rows of the test split only.
test_sample = df_test[:10]
predictions = model.predict(input_fn = lambda: predict_input_fn(df = test_sample))

# `predictions` is consumed lazily; print each per-example result as it arrives.
for items in predictions:
    print(items)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from taxi_trained/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'predictions': array([9.537243], dtype=float32)}
{'predictions': array([9.708608], dtype=float32)}
{'predictions': array([9.596281], dtype=float32)}
{'predictions': array([9.537495], dtype=float32)}
{'predictions': array([9.537642], dtype=float32)}
{'predictions': array([9.879642], dtype=float32)}
{'predictions': array([9.879233], dtype=float32)}
{'predictions': array([9.70439], dtype=float32)}
{'predictions': array([9.535185], dtype=float32)}
{'predictions': array([9.877537], dtype=float32)}


# Change the Model

In [14]:
# The LinearRegressor above already wrote checkpoints into OUTDIR. An estimator
# restores from whatever checkpoint it finds in its model_dir, and the linear
# model's variables do not match the DNN graph, so clear the directory before
# building and training the new model.
shutil.rmtree(path = OUTDIR, ignore_errors = True)

# Swap the linear model for a small fully connected network; the feature
# columns, output directory and random seed are the same as before.
model = tf.estimator.DNNRegressor(
    hidden_units = [10,10], # specify neural architecture
    feature_columns = feature_columns, 
    model_dir = OUTDIR,
    config = tf.estimator.RunConfig(tf_random_seed = 1)
)

# Same training budget as the linear model (500 steps), so the two
# validation RMSE values can be compared directly.
model.train(
    input_fn = lambda: train_input_fn(df = df_train), 
    steps = 500)
print_rmse(model = model, df = df_valid)

INFO:tensorflow:Using config: {'_master': '', '_session_creation_timeout_secs': 7200, '_num_worker_replicas': 1, '_task_id': 0, '_experimental_max_worker_delay_secs': None, '_is_chief': True, '_global_id_in_cluster': 0, '_protocol': None, '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_task_type': 'worker', '_evaluation_master': '', '_tf_random_seed': 1, '_experimental_distribute': None, '_num_ps_replicas': 0, '_train_distribute': None, '_model_dir': 'taxi_trained', '_eval_distribute': None, '_save_checkpoints_secs': 600, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa67c154e48>, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_device_fn': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create Check