In [1]:
import pandas as pd

In [3]:
import os

# Location of the cleaned California housing CSV. Set the CAL_HOUSING_CSV
# environment variable to point at your own copy; the default keeps the
# original author's path so existing setups behave exactly as before.
HOUSING_CSV = os.environ.get('CAL_HOUSING_CSV', '/home/umairshah/cal_housing_clean.csv')
housing = pd.read_csv(HOUSING_CSV)

In [5]:
housing.head()  # Preview of the first five rows; all the values are continuous

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [6]:
# Inspect column dtypes and non-null counts before modelling.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 7 columns):
housingMedianAge    20640 non-null float64
totalRooms          20640 non-null float64
totalBedrooms       20640 non-null float64
population          20640 non-null float64
households          20640 non-null float64
medianIncome        20640 non-null float64
medianHouseValue    20640 non-null float64
dtypes: float64(7)
memory usage: 1.1 MB


In [8]:
# Train Test Split

from sklearn.model_selection import train_test_split

# Regression target: the median house value of each row.
y = housing['medianHouseValue']
# Feature matrix: every column except the target.
X = housing.drop(columns='medianHouseValue')

In [9]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [10]:
# MinMax Scaling

from sklearn.preprocessing import MinMaxScaler

In [11]:
# Rescale every feature to the [0, 1] range (MinMaxScaler's default, spelled out).
scaler = MinMaxScaler(feature_range=(0, 1))

In [12]:
# Learn the per-column minimum and maximum from the training split only; the
# test split is later transformed with these same statistics.
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [14]:
# Apply the fitted scaling to the training features. The result is re-wrapped
# as a labelled DataFrame in the following cell.
scaled_data = scaler.transform(X_train)

In [15]:
# Re-wrap the scaled values as a DataFrame, reusing the original row index and
# column names.
# NOTE: this rebinds X_train, so re-running the transform cell after this one
# would scale the already-scaled data a second time.
X_train = pd.DataFrame(
    scaled_data,
    index=X_train.index,
    columns=X_train.columns,
)

In [16]:
# Sanity check: (rows, columns) of the scaled training frame.
X_train.shape

(14448, 6)

In [17]:
# Peek at the first three scaled training rows.
X_train.head(3)

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome
6761,0.352941,0.069688,0.117163,0.048769,0.115442,0.142508
3010,0.607843,0.011242,0.015673,0.008367,0.014142,0.045027
7812,0.666667,0.02523,0.031347,0.020971,0.030258,0.212866


In [18]:
# Scale the test features with the scaler that was fitted on the training
# split, keeping the original row index and column names.
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [19]:
# Peek at the first three scaled test rows.
X_test.head(3)

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome
16086,0.686275,0.046264,0.045158,0.025873,0.048841,0.353133
8816,0.705882,0.027417,0.020795,0.012709,0.023187,0.770182
7175,0.901961,0.032326,0.040813,0.041662,0.042592,0.133626


In [59]:
## Steps: 1) Create TensorFlow feature columns 2) Create the input functions 3) Create the estimator model
## 4) Train the model 5) Create a prediction input function 6) Generate predictions 7) Calculate the RMSE

import tensorflow as tf

In [22]:
# List the feature names; one feature column is created per name below.
X_train.columns

Index(['housingMedianAge', 'totalRooms', 'totalBedrooms', 'population',
       'households', 'medianIncome'],
      dtype='object')

In [30]:
# One numeric feature column per predictor, keyed by the DataFrame column name.
# A comprehension replaces the append loop and avoids rebinding the loop
# variable from a column name (str) to a NumericColumn object.
feature_columns = [
    tf.feature_column.numeric_column(name) for name in X_train.columns
]

feature_columns

[NumericColumn(key='housingMedianAge', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='totalRooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='totalBedrooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='population', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='households', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='medianIncome', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [28]:
## 2) Input Function

# Training input: shuffled mini-batches of 10 rows, cycling through the
# training data for up to 10 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=10,
    shuffle=True,
)

In [31]:
## 3) Estimator Model

# Dense regressor with three hidden layers of six units each. No model_dir is
# passed, so checkpoints are written to a temporary folder (see the warning
# printed below this cell).
model = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[6, 6, 6],
)

W0905 12:53:16.140625 140381774497600 estimator.py:1811] Using temporary folder as model directory: /tmp/tmpb51h8mvr


In [33]:
## 4) Training the Model

# NOTE(review): steps=100 with the input function's batch_size=10 means at most
# 1,000 training rows (of 14,448) are consumed, far fewer than the 10 epochs
# the input function allows. Consider raising `steps` if the model underfits.
model.train(input_fn = input_func, steps = 100)

W0905 12:56:20.078932 140381774497600 deprecation.py:323] From /home/umairshah/anaconda3/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
W0905 12:56:20.136358 140381774497600 deprecation.py:323] From /home/umairshah/anaconda3/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.


<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor at 0x7facec86f048>

In [34]:
# Inference input: features only, a single pass over the test set, with the
# original row order preserved so predictions line up with y_test.
predict_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [36]:
# Build the prediction generator; it is consumed (and predictions are actually
# produced) when it is turned into a list in the next cell.
pred_gen = model.predict(input_fn = predict_input_func)

In [37]:
# Materialise the generator into a list of per-row prediction results.
predictions = list(pred_gen)

In [40]:
# Should equal the number of rows in the test split.
len(predictions)

6192

In [45]:
# Pull the 'predictions' entry out of each per-row result.
final_pred = [row['predictions'] for row in predictions]

In [46]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the held-out test set. Taking the square root of
# the MSE puts the error in the same units as medianHouseValue.
RMSE = mean_squared_error(y_test, final_pred) ** 0.5

# Backward-compatible alias: this cell originally stored the RMSE under the
# (misleading) name `MSE`. Prefer `RMSE` in new code.
MSE = RMSE

print(RMSE)

234424.38274360696


In [51]:
# Summary statistics; the medianHouseValue column gives the scale against which
# the RMSE above should be judged.
housing.describe()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [58]:
# Comparing the RMSE above (~234,424) with the statistics of medianHouseValue: it is larger than both the
# mean (~206,856) and the standard deviation (~115,396), so the model is not yet performing well. Simply
# predicting the mean for every row would give an RMSE of roughly the standard deviation. To get better results,
# you can try a number of things, such as changing batch_size, num_epochs, the number of training steps, or the
# training data, or even using LinearRegressor rather than DNNRegressor.