## Set Up
In this first cell, we'll load the necessary libraries, print the TensorFlow version, and configure logging verbosity and pandas display options.

In [78]:
import tensorflow as tf 
import pandas as pd
import numpy as np
import shutil

# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

# Emit INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)

# Keep DataFrame output compact: at most 10 rows, floats with one decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

1.8.0


In [79]:
# Read train.csv and discard every row that contains a missing value.
df = pd.read_csv("train.csv").dropna()

In [80]:
# df["latitude"] = df["latitude"].astype(float)
# df["longitude"] = df["longitude"].astype(float)
# df["price"] = df["price"].astype(float)

In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,price
count,28575.0,28575.0,28575.0,28575.0
mean,30002.8,37.7,-122.2,10205.0
std,17285.9,0.4,1.1,377371.2
min,0.0,17.1,-124.1,0.0
25%,15078.5,37.4,-122.4,2158.0
50%,30043.0,37.8,-122.2,2775.0
75%,45010.5,37.8,-122.0,3600.0
max,59873.0,49.2,-54.2,34754255.0


In [82]:
# Preview the first rows of the frame (index gaps come from the dropna above).
df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,district,price
0,0,38.0,-122.1,concord / pleasant hill / martinez,2395.0
1,1,37.4,-122.0,santa clara,2100.0
3,3,37.8,-122.5,sunset / parkside,1900.0
4,4,37.4,-121.9,san jose north,2205.0
7,7,38.0,-122.3,"hercules, pinole, san pablo, el sob",2500.0


In [83]:
# Inspect the dtype of each column.
df.dtypes

Unnamed: 0      int64
latitude      float64
longitude     float64
district       object
price         float64
dtype: object

Now, split the data into two parts -- training and evaluation.

In [84]:
# Seed NumPy's global RNG so the random split below is reproducible.
np.random.seed(1)

# Boolean mask with one uniform draw per row; True (draw < 0.8) selects the
# row for training, False sends it to evaluation.
msk = np.random.rand(df.shape[0]) < 0.8
traindf, evaldf = df[msk], df[~msk]

## Training and Evaluation

In [85]:
def add_more_features(df):
    """Hook for feature engineering; currently returns ``df`` unchanged.

    The commented-out examples below reference columns (``total_rooms``,
    ``households``, ``population``) that are not among the columns listed by
    ``df.dtypes`` in this notebook, so they would need adapting before being
    enabled.
    """
    # df["avg_rooms_per_house"] = df["total_rooms"] / df["households"] #expect positive correlation
    # df["avg_persons_per_room"] = df["population"] / df["total_rooms"] #expect negative correlation
    return df

In [86]:
# Create pandas input function
def make_input_fn(df, num_epochs):
    """Build a pandas-backed estimator input function for ``df``.

    Args:
        df: DataFrame holding the feature columns and a 'price' column.
        num_epochs: forwarded unchanged to ``pandas_input_fn``.

    Returns:
        The callable produced by ``tf.estimator.inputs.pandas_input_fn``.
    """
    features = add_more_features(df)
    # The raw price is used as the label; dividing it by 100000 was left as a
    # commented-out alternative in the original cell.
    labels = df['price']
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        y=labels,
        batch_size=32,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=1000,
        num_threads=1,
    )

In [87]:
# Define your feature columns
def create_feature_cols():
    """Return the model's feature columns: bucketized latitude and longitude.

    Each coordinate is wrapped in a numeric column and then bucketized on
    boundaries spaced 0.1 apart (latitude from 37.0 up to but excluding 38.5,
    longitude from -122.78 up to but excluding -121.65).
    """
    bucket_specs = [
        ('latitude', np.arange(37.0, 38.5, 0.1)),
        ('longitude', np.arange(-122.78, -121.65, 0.1)),
    ]
    return [
        tf.feature_column.bucketized_column(
            tf.feature_column.numeric_column(column_name),
            boundaries=edges.tolist(),
        )
        for column_name, edges in bucket_specs
    ]

In [88]:
# Create estimator train and evaluate function
def train_and_evaluate(output_dir, num_train_steps):
    """Train a LinearRegressor on ``traindf`` while evaluating it on ``evaldf``.

    Args:
        output_dir: passed to the estimator as its ``model_dir``.
        num_train_steps: passed to ``TrainSpec`` as ``max_steps``.
    """
    regressor = tf.estimator.LinearRegressor(
        model_dir=output_dir,
        feature_columns=create_feature_cols(),
    )

    # Training input is built with num_epochs=None; max_steps bounds the run.
    training = tf.estimator.TrainSpec(
        input_fn=make_input_fn(traindf, None),
        max_steps=num_train_steps,
    )

    # Evaluation input makes a single pass (num_epochs=1) over evaldf.
    evaluation = tf.estimator.EvalSpec(
        input_fn=make_input_fn(evaldf, 1),
        steps=None,
        start_delay_secs=1,  # start evaluating after N seconds
        throttle_secs=5,     # evaluate every N seconds
    )

    tf.estimator.train_and_evaluate(regressor, training, evaluation)

In [89]:
# Launching TensorBoard through Datalab is disabled; kept here for reference:
# from google.datalab.ml import TensorBoard

# Directory handed to train_and_evaluate() as the estimator's output location.
OUTDIR = './trained_model'
# TensorBoard().start(OUTDIR)

# Instead, start TensorBoard from a shell: tensorboard --logdir=./trained_model

In [90]:
# Run the model
# Delete the output directory left by any earlier run so this run starts from
# an empty directory; ignore_errors=True keeps this from failing when the
# directory does not exist yet.
shutil.rmtree(OUTDIR, ignore_errors = True)
# Train with max_steps=2000, writing the estimator's output under OUTDIR.
train_and_evaluate(OUTDIR, 2000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7feb9f335050>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': './trained_model', '_global_id_in_cluster': 0, '_save_summary_steps': 100}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 5 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorfl