In [2]:
%bash
git clone https://github.com/GoogleCloudPlatform/training-data-analyst

Cloning into 'training-data-analyst'...


In [5]:
%bash

ls training-data-analyst/courses/machine_learning/deepdive/03_tensorflow

a_tfstart.ipynb
b_estimator.ipynb
c_dataset.ipynb
debug_demo.ipynb
diagrams
d_traineval.ipynb
e_cloudmle.ipynb
taxifare
taxi-test.csv
taxi-train.csv
taxi-valid.csv


In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np

  from ._conv import register_converters as _register_converters


In [21]:
path = 'training-data-analyst/courses/machine_learning/deepdive/03_tensorflow/'

COLUMN_NAMES = ['fare_amount', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'number_of_passengers', 'key']
df_train = pd.read_csv(path + 'taxi-train.csv', names=COLUMN_NAMES, header=None)
df_validation = pd.read_csv(path + 'taxi-valid.csv', names=COLUMN_NAMES, header=None)
df_test = pd.read_csv(path + 'taxi-test.csv', names=COLUMN_NAMES, header=None)

In [23]:
df_train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,number_of_passengers,key
0,12.0,-73.987625,40.750617,-73.971163,40.78518,1,0
1,4.5,-73.96362,40.774363,-73.953485,40.772665,1,1
2,4.5,-73.989649,40.756633,-73.985597,40.765662,1,2
3,10.0,-73.99395,40.727524,-74.006584,40.74424,1,3
4,2.5,-73.950223,40.66896,-73.948112,40.668872,6,4


In [24]:
df_train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,number_of_passengers,key
count,7770.0,7770.0,7770.0,7770.0,7770.0,7770.0,7770.0
mean,11.091293,-73.97532,40.751353,-73.974881,40.751501,1.733719,3884.5
std,9.076088,0.039807,0.030734,0.039604,0.034357,1.354334,2243.150129
min,2.5,-74.417107,40.27725,-74.417107,40.303627,1.0,0.0
25%,6.0,-73.992332,40.737163,-73.9919,40.735094,1.0,1942.25
50%,8.1,-73.982062,40.753661,-73.981085,40.754028,1.0,3884.5
75%,12.5,-73.968422,40.767864,-73.966838,40.768631,2.0,5826.75
max,82.25,-73.137393,41.366138,-73.137393,41.366138,6.0,7769.0


In [27]:
FEATURES = COLUMN_NAMES[1:len(FEATURES) - 1]
LABEL = COLUMN_NAMES[0]

In [29]:
def get_feature_columns():
  features = [tf.feature_column.numeric_column(f) for f in FEATURES]
  return features

def train_input_fn(df, num_epochs):
  return tf.estimator.inputs.pandas_input_fn(
    x = df,
    y = df[LABEL],
    batch_size = 128,
    num_epochs = num_epochs,
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )

def predict_input_fn(df, num_epochs):
  return tf.estimator.inputs.pandas_input_fn(
    x = df,
    y = None,
    batch_size = 128,
    num_epochs = num_epochs,
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )

In [42]:
log_dir = './taxi-trained'

tf.logging.set_verbosity(tf.logging.INFO)
model = tf.estimator.LinearRegressor(feature_columns=get_feature_columns(), model_dir=log_dir)

model.train(
  input_fn = train_input_fn(df_train, 10)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0c0759ba50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': './taxi-trained', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./taxi-trained/model.ckpt-1824
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoin

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x7f0c0759bc10>

In [32]:
metrics = model.evaluate(
  input_fn = train_input_fn(df_validation, 1)
)
print("The average loss on validation dataset is {}".format(metrics['average_loss']))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-11-03:20:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./taxi-trained/model.ckpt-608
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-06-11-03:20:24
INFO:tensorflow:Saving dict for global step 608: average_loss = 109.34633, global_step = 608, loss = 13004.403
The average loss on validation dataset is 109.346328735


In [38]:
predictions = model.predict(
  input_fn = predict_input_fn(df_test, 1)
)

In [39]:
for _ in range(10):
  print(predictions.next())

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./taxi-trained/model.ckpt-608
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'predictions': array([10.938421], dtype=float32)}
{'predictions': array([10.940023], dtype=float32)}
{'predictions': array([10.939257], dtype=float32)}
{'predictions': array([10.938503], dtype=float32)}
{'predictions': array([10.94262], dtype=float32)}
{'predictions': array([10.939071], dtype=float32)}
{'predictions': array([10.938959], dtype=float32)}
{'predictions': array([10.934711], dtype=float32)}
{'predictions': array([10.93951], dtype=float32)}
{'predictions': array([10.939154], dtype=float32)}


In [46]:
import shutil

log_dir = 'taxi-dnn-trained'
shutil.rmtree(log_dir)
model = tf.estimator.DNNRegressor(
  hidden_units = [4, 2],
  feature_columns = get_feature_columns(),
  model_dir = log_dir
)

model.train(input_fn = train_input_fn(df_train, 10))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0c06b94a10>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'taxi-dnn-trained', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into taxi-dnn-trained/model.ckpt.
INFO:tensorflow:loss = 18779

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f0c06b94910>

In [47]:
metrics = model.evaluate(
  input_fn = train_input_fn(df_validation, 1)
)
print("The average loss on validation dataset is {}".format(metrics['average_loss']))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-11-03:27:35
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from taxi-dnn-trained/model.ckpt-608
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-06-11-03:27:35
INFO:tensorflow:Saving dict for global step 608: average_loss = 109.116875, global_step = 608, loss = 12977.114
The average loss on validation dataset is 109.116874695


In [48]:
predictions = model.predict(
  input_fn = predict_input_fn(df_test, 1)
)

for _ in range(10):
  print(predictions.next())

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from taxi-dnn-trained/model.ckpt-608
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'predictions': array([11.132646], dtype=float32)}
{'predictions': array([11.133021], dtype=float32)}
{'predictions': array([11.135671], dtype=float32)}
{'predictions': array([11.120423], dtype=float32)}
{'predictions': array([11.126996], dtype=float32)}
{'predictions': array([11.126618], dtype=float32)}
{'predictions': array([11.12947], dtype=float32)}
{'predictions': array([11.109124], dtype=float32)}
{'predictions': array([11.132747], dtype=float32)}
{'predictions': array([11.133456], dtype=float32)}
