In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

In [3]:
#Importing the data
housing_df = pd.read_csv("train_LR.csv", sep=",")
housing_df = shuffle(housing_df)

In [4]:
#Processing the data
processed_features = housing_df[["GrLivArea"]]

output_targets = pd.DataFrame(housing_df["SalePrice"])

In [7]:
#Splitting the data into training, validation, and test sets
training_examples = processed_features[0:1060]
training_targets = output_targets[0:1060]

val_examples = processed_features[1060:1260]
val_targets = output_targets[1060:1260]

test_examples = processed_features[1260:1460]
test_targets = output_targets[1260:1460]

In [8]:
# Configure a numeric feature column for total_rooms.
my_feature_columns = [tf.feature_column.numeric_column("GrLivArea")]

# Define the preferred optimizer: in this case lets use gradient descent
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

# here is the difference between previous linear regression class
# Configure the linear regression model with our feature columns, hidden layers, and optimizer.

# https://www.kaggle.com/usersumit/tensorflow-dnnregressor
model = tf.estimator.DNNRegressor(feature_columns=my_feature_columns,hidden_units=[12,12],optimizer=my_optimizer)

In [9]:
#Define the input function required for training
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):

# Convert pandas data into a dict of np arrays.
# dictionary comprehension
# https://www.datacamp.com/community/tutorials/python-dictionary-comprehension
    features = {key:np.array(value) for key,value in dict(features).items()}                                           
 
    # Construct a dataset, and configure batching/repeating.
    ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)
    
    # Shuffle the data, if specified.
    if shuffle:
      ds = ds.shuffle(buffer_size=10000)
      
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    
    return features, labels

In [10]:
#Train the model from the existing data
training = model.train(input_fn = lambda:my_input_fn(training_examples,training_targets["SalePrice"],batch_size=32),steps=2000)

In [11]:
# Evaluating the model with RMSE
train_predictions = model.predict(input_fn=lambda: my_input_fn(training_examples, training_targets, num_epochs=1, shuffle=False))
val_predictions = model.predict(input_fn=lambda: my_input_fn(val_examples, val_targets, num_epochs=1, shuffle=False))
test_predictions = model.predict(input_fn=lambda: my_input_fn(test_examples, test_targets, num_epochs=1, shuffle=False))

# Format predictions as a NumPy array, so we can calculate error metrics.
train_predictions = np.array([item['predictions'][0] for item in train_predictions])
val_predictions = np.array([item['predictions'][0] for item in val_predictions])
test_predictions = np.array([item['predictions'][0] for item in test_predictions])

# Print Mean Squared Error and Root Mean Squared Error.
mean_squared_error = metrics.mean_squared_error(train_predictions, training_targets)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error)
mean_squared_error = metrics.mean_squared_error(val_predictions, val_targets)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Root Mean Squared Error (on validation data): %0.3f" % root_mean_squared_error)
mean_squared_error = metrics.mean_squared_error(test_predictions, test_targets)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Root Mean Squared Error (on test data): %0.3f" % root_mean_squared_error)

Root Mean Squared Error (on training data): 56348.057
Root Mean Squared Error (on validation data): 53484.164
Root Mean Squared Error (on test data): 59448.892
