In [1]:
%bash
git clone https://github.com/GoogleCloudPlatform/training-data-analyst

Cloning into 'training-data-analyst'...


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [3]:
df = pd.read_csv("https://storage.googleapis.com/ml_universities/california_housing_train.csv", delimiter=',')

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [5]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [16]:
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
df_valid = df[~msk]

In [17]:
print("Training set size {}".format(len(df_train)))
print("Validation set size {}".format(len(df_valid)))

Training set size 13669
Validation set size 3331


In [19]:
def add_new_column(df):
  df['num_rooms'] = df['total_rooms'] / df['households']
  return df

In [26]:
def make_input_fn(df, num_epochs):
  return tf.estimator.inputs.pandas_input_fn(
    x=add_new_column(df),
    y=df['median_house_value'],
    batch_size=128,
    num_epochs=num_epochs,
    shuffle=True,
    queue_capacity=1000
  )

In [35]:
def get_feature_cols():
  return [
    tf.feature_column.numeric_column('housing_median_age'),
    tf.feature_column.numeric_column('num_rooms'),
    tf.feature_column.numeric_column('median_income'),
    tf.feature_column.bucketized_column(tf.feature_column.numeric_column('latitude'), boundaries=range(32, 42, 1))
  ]

In [47]:
def train_and_evaluate(log_dir, max_train_steps):
  model = tf.estimator.LinearRegressor(feature_columns=get_feature_cols(), model_dir=log_dir)
  train_spec = tf.estimator.TrainSpec(
    input_fn=make_input_fn(df_train, 8),
    max_steps=max_train_steps
  )
  eval_spec = tf.estimator.EvalSpec(
    input_fn=make_input_fn(df_valid, 1),
    steps=None,
    start_delay_secs=1,
    throttle_secs=100
  )
  tf.estimator.train_and_evaluate(model, train_spec, eval_spec)

In [48]:
import shutil

OUTDIR = './housing-trained'
shutil.rmtree(OUTDIR, ignore_errors=True)
train_and_evaluate(OUTDIR, 2000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0a0ef0cd50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': './housing-trained', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 100 secs (eval_spec.throttle_secs) or training is finished.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ./housing-trained/model.ckpt.
INFO:tensorflow:loss = 5139248000000.0, step = 1
INFO:tensorflow:global_step/sec: 116.925
INFO:tensorflow:loss = 5074064500000.0, step = 101 (0.861 sec)
INFO:tensorflow:global_step/sec: 175.252
INFO:tensorflow:loss = 6897536600000.0, step = 201 (0.567 sec)
INFO:tensorflow:global_step/sec: 181.134
INFO:tensorflow:loss = 11117085000000.0, step = 301 (0.552 sec)
INFO:tensorflow:global_step/sec: 184.885
INFO:tensorflow:loss = 9005033000000.0, step = 401 (0.541 sec)
INFO:tensorflow:global_step/sec: 194.364
INFO:tensorflow:loss = 6566523000000.0, step = 501 (0.514 sec)
INFO:tensorflow:global_step/sec: 192.584
INFO:tensorflow:loss = 3776005500000.0, step = 601 (0.520 sec)
INFO:tensorfl

In [49]:
from google.datalab.ml import TensorBoard

TensorBoard().start(OUTDIR)

16241

In [59]:
for p in TensorBoard().list()['pid']:
  TensorBoard().stop(p)

In [60]:
TensorBoard().list()