In [10]:
from __future__ import print_function
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows=10
pd.options.display.float_format = '{:.1f}'.format

In [11]:
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")

In [12]:
california_housing_dataframe.index

RangeIndex(start=0, stop=17000, step=1)

In [16]:
california_housing_dataframe=california_housing_dataframe.reindex(np.random.permutation(california_housing_dataframe.index))

In [18]:
california_housing_dataframe["median_house_value"] /= 1000

In [19]:
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
9327,-119.2,34.2,42.0,1411.0,300.0,1295.0,339.0,2.7,164.9
14976,-122.2,37.8,52.0,2513.0,502.0,1048.0,518.0,3.7,269.9
16444,-122.6,38.6,18.0,3753.0,752.0,1454.0,668.0,3.8,185.7
13934,-122.0,37.3,15.0,5132.0,1059.0,2156.0,982.0,5.7,404.8
9448,-119.2,34.3,35.0,2532.0,407.0,1338.0,422.0,4.8,219.0
...,...,...,...,...,...,...,...,...,...
10519,-120.4,37.0,16.0,1027.0,199.0,673.0,193.0,3.0,63.8
12142,-121.4,37.7,33.0,1875.0,363.0,970.0,381.0,3.5,141.7
7353,-118.3,34.2,42.0,1073.0,220.0,804.0,226.0,3.8,172.6
6215,-118.2,34.0,37.0,441.0,125.0,390.0,98.0,1.7,90.2


In [15]:
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207300.9
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,115983.8
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,14999.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119400.0
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180400.0
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265000.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500001.0


In [22]:
#define feature column
my_feature = california_housing_dataframe[["total_rooms"]]
feature_columns = [tf.feature_column.numeric_column("total_rooms")]

In [24]:
#define target(label)
targets = california_housing_dataframe["median_house_value"]

In [29]:
#configure LinearRegressor model in tf estimator api
#define optimizer a parameter of LineraRegressor
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0000001)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer,5.0)

#configure LineraRegressor model
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer = my_optimizer
)

In [31]:
 features = {key:np.array(value) for key,value in dict(my_feature).items()}

In [32]:
features

{'total_rooms': array([1411., 2513., 3753., ..., 1073.,  441., 3273.])}

In [36]:
def my_input_fn(features,targets,batch_size=1,shuffle=True,num_epochs=None):
    #convert pandas data into a dict of np arrays
    features = {key:np.array(value) for key,value in dict(features).items()}
    
    #construct dataset
    ds = Dataset.from_tensor_slices((features,targets))
    ds=ds.batch(batch_size).repeat(num_epochs)
    
    if shuffle:
        ds=ds.shuffle(buffer_size=10000)
        

    #return next batch data
    features, labels = ds.make_one_shot_iterator().get_next()
    return features,labels
    

In [38]:
linear_regressor.train(
    input_fn=lambda:my_input_fn(my_feature,targets),
    steps=100
)

<tensorflow_estimator.python.estimator.canned.linear.LinearRegressor at 0x1b9939b87b8>

In [44]:
predict_input_fn=lambda:my_input_fn(my_feature,targets,num_epochs=1,shuffle=False)
predictions=linear_regressor.predict(input_fn=predict_input_fn)


predictions = np.array([item['predictions'][0] for item in predictions])
print(predictions)

mean_squared_error = metrics.mean_squared_error(predictions,targets)
root_mean_squared_error = mean_squared_error ** (1/2)

print("mean_squared_error = {0} ".format(mean_squared_error))
print("root_mean_squared_error = {0} ".format(root_mean_squared_error))

[0.14109962 0.2512993  0.3752989  ... 0.10729973 0.04409993 0.32729903]
mean_squared_error = 56308.997950473895 
root_mean_squared_error = 237.29517051654022 


In [46]:
min_house_val = california_housing_dataframe["median_house_value"].min()
max_house_val = california_housing_dataframe["median_house_value"].max()
min_max_diff = max_house_val - min_house_val
print("min_house_val = {0} ".format(min_house_val))
print("max_house_val = {0} ".format(max_house_val))
print("min_max_diff = {0} ".format(min_max_diff))
print("root_mean_squared_error = {0} ".format(root_mean_squared_error))

min_house_val = 14.999 
max_house_val = 500.001 
min_max_diff = 485.00199999999995 
root_mean_squared_error = 237.29517051654022 


In [47]:
calibration_data = pd.DataFrame()
calibration_data["predictions"]= pd.Series(predictions)
calibration_data["targets"]=pd.Series(targets)
calibration_data

Unnamed: 0,predictions,targets
0,0.1,66.9
1,0.3,80.1
2,0.4,85.7
3,0.5,73.4
4,0.3,65.5
...,...,...
16995,0.1,111.4
16996,0.2,79.0
16997,0.1,103.6
16998,0.0,85.8


In [48]:
calibration_data.describe()

Unnamed: 0,predictions,targets
count,17000.0,17000.0
mean,0.3,207.3
std,0.2,116.0
min,0.0,15.0
25%,0.1,119.4
50%,0.2,180.4
75%,0.3,265.0
max,3.8,500.0
