<a href="https://colab.research.google.com/github/vssood/WU_DL/blob/master/Assignments/WU_DL_AS5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 5
### Problem Statement

The median income by zip code is an additional feature that you should use in your predictions. To complete this assignment, perform the following steps:

1. Load the housing prices training data.

2. Join the median income by zipcode to the training data so that you gain the median income.

3. Train a model to predict house price when given the following inputs: 'bedrooms', 'bathrooms', 'garage', 'land', 'sqft', 'median_income'.
Load the housing prices test data. This data does not contain the house price; you must predict it.

4. Join the median income by zipcode to the test/submit data to gain the median income.

5. Predict prices for the test/submit data.

6. Create a submission dataset that contains the house id (from the test/submit data) and the predicted price for that house. Include no other fields.
Submit this dataset and see how close you are to the actual values.

https://github.com/jeffheaton/t81_558_deep_learning/blob/df29ce2413c1ef32acaf99764c54b1b529cd8779/assignments/assignment_yourname_class5.ipynb


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics

from scipy.stats import zscore 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## Read, validate, join & Scrub data 

In [None]:
# Remote CSV locations: training houses, test houses (no price), and the
# zip-code -> median-income lookup table.
train_path, test_path, zip_path = (
    "https://data.heatonresearch.com/data/t81-558/datasets/houses_train.csv",
    "https://data.heatonresearch.com/data/t81-558/datasets/houses_test.csv",
    "https://data.heatonresearch.com/data/t81-558/datasets/zips.csv",
)


In [None]:
# Load all three CSVs, treating the literal strings "NA" and "?" as missing values.
na_markers = ["NA", "?"]
df_train = pd.read_csv(train_path, na_values=na_markers)
df_test = pd.read_csv(test_path, na_values=na_markers)
df_zip = pd.read_csv(zip_path, na_values=na_markers)

In [None]:
# (rows, columns) of the train, test and zip frames, printed on one line.
print(*[frame.shape for frame in (df_train, df_test, df_zip)])

(10000, 8) (2000, 7) (50, 2)


In [None]:
# Peek at the first five raw training rows.
df_train.head(n=5)

Unnamed: 0,id,zip,bedrooms,bathrooms,garage,land,sqft,price
0,1,60019,9,2,3,2.198,4860,1005580
1,2,60049,5,2,2,4.517,2870,620278
2,3,60011,2,1,0,4.12,1220,265711
3,4,60027,6,4,2,3.201,3810,819916
4,5,60001,9,3,2,1.347,5061,1039491


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training columns.
df_train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,zip,bedrooms,bathrooms,garage,land,sqft,price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,60024.311,4.9988,2.3032,1.4467,2.586615,2961.4875,622229.4
std,2886.89568,14.505796,2.577918,1.623679,0.953336,1.366607,1550.644456,319141.6
min,1.0,60000.0,1.0,1.0,0.0,0.25,650.0,118364.0
25%,2500.75,60012.0,3.0,1.0,1.0,1.409,1676.0,354009.2
50%,5000.5,60024.0,5.0,2.0,2.0,2.5745,2899.5,613495.5
75%,7500.25,60037.0,7.0,3.0,2.0,3.771,4340.25,905456.5
max,10000.0,60049.0,9.0,7.0,3.0,4.999,5952.0,1270773.0


In [None]:
# Same summary for the test data, to compare its value ranges with the training data.
df_test.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,zip,bedrooms,bathrooms,garage,land,sqft
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,11000.5,60024.7895,5.0435,2.308,1.4455,2.653507,2983.922
std,577.494589,14.617798,2.634403,1.640269,0.970825,1.355154,1581.491528
min,10001.0,60000.0,1.0,1.0,0.0,0.26,650.0
25%,10500.75,60012.0,3.0,1.0,1.0,1.47725,1668.0
50%,11000.5,60025.0,5.0,2.0,2.0,2.678,2890.0
75%,11500.25,60038.0,7.0,3.0,2.0,3.798,4388.25
max,12000.0,60049.0,9.0,7.0,3.0,4.999,5952.0


In [None]:
# Total number of missing cells in the training data (only df_train is checked here).
df_train.isna().sum().sum()

0

In [None]:
# Peek at the zip-code -> median_income lookup table.
df_zip.head(n=5)

Unnamed: 0,zip,median_income
0,60000,75806
1,60001,205564
2,60002,307019
3,60003,145929
4,60004,135496


Join the median income (by zip code) onto the train and test data

In [None]:
# Attach median_income to every training row by looking up its zip code.
# - drop(..., errors='ignore') keeps the cell re-runnable: joining a second time
#   would otherwise fail because 'median_income' already exists on df_train.
# - validate='many_to_one' raises if df_zip lists a zip code more than once,
#   which would otherwise silently duplicate house rows.
# Row order is preserved by the left merge; the result carries a fresh 0..n-1 index.
df_train = (
    df_train
    .drop(columns=['median_income'], errors='ignore')
    .merge(df_zip, on='zip', how='left', validate='many_to_one')
)
# A zip code absent from df_zip would leave NaN here and turn the training loss into NaN.
assert df_train['median_income'].notna().all(), "some training rows have no median_income"

In [None]:
# Attach median_income to every test row by looking up its zip code.
# - drop(..., errors='ignore') keeps the cell re-runnable: joining a second time
#   would otherwise fail because 'median_income' already exists on df_test.
# - validate='many_to_one' raises if df_zip lists a zip code more than once,
#   which would otherwise silently duplicate house rows.
# Row order is preserved by the left merge; the result carries a fresh 0..n-1 index.
df_test = (
    df_test
    .drop(columns=['median_income'], errors='ignore')
    .merge(df_zip, on='zip', how='left', validate='many_to_one')
)
# A zip code absent from df_zip would leave NaN here and produce NaN predictions.
assert df_test['median_income'].notna().all(), "some test rows have no median_income"

### Set-up data for the model 

In [None]:
# Standardize the large-magnitude features ('sqft', 'median_income').
# 'land' is left raw: its mean and standard deviation are already single-digit.
#
# The mean/std are computed on the TRAINING data only and re-used for the test
# data. Z-scoring df_test with its own statistics would map the same raw sqft or
# income to a different model input than the one the network was trained on.
# NOTE: previously recorded test-set outputs further down change slightly once
# this cell is re-run.
for col in ['sqft', 'median_income']:
    col_mean = df_train[col].mean()
    col_std = df_train[col].std(ddof=0)  # ddof=0 matches scipy.stats.zscore's default
    df_train[col] = (df_train[col] - col_mean) / col_std
    df_test[col] = (df_test[col] - col_mean) / col_std

In [None]:
# First five training rows after the join and standardization steps.
df_train.head(n=5)

Unnamed: 0,id,zip,bedrooms,bathrooms,garage,land,sqft,price,median_income
0,1,60019,9,2,3,2.198,1.224399,1005580,-0.820375
1,2,60049,5,2,2,4.517,-0.059003,620278,0.095794
2,3,60011,2,1,0,4.12,-1.12313,265711,-0.117203
3,4,60027,6,4,2,3.201,0.547227,819916,1.599117
4,5,60001,9,3,2,1.347,1.354029,1039491,-0.163032


In [None]:
# Model inputs are every column except the row identifier, the zip code
# (only used as the join key) and the prediction target.
non_feature_cols = ['id', 'zip', 'price']
x_columns = df_train.columns.drop(non_feature_cols)

In [None]:
# Convert to numpy: feature matrix (columns in x_columns order) and price target.
# (The stray bare `x_columns` statement was removed: it was not the last
# expression of the cell, so it displayed nothing.)
x = df_train[x_columns].values
y = df_train['price'].values

In [None]:
# Eyeball the feature matrix and the target vector (numpy abbreviates long arrays).
print(x, y, sep=" ")

[[ 9.          2.          3.          2.198       1.22439894 -0.82037489]
 [ 5.          2.          2.          4.517      -0.05900261  0.09579356]
 [ 2.          1.          0.          4.12       -1.12312953 -0.11720343]
 ...
 [ 7.          2.          2.          2.011       0.63364727 -0.65511305]
 [ 9.          7.          3.          1.768       1.88286778 -0.71704911]
 [ 4.          1.          1.          2.377      -0.49045771 -1.52139269]] [1005580  620278  265711 ...  813447 1208882  446179]


In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split itself reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Shapes of the training and validation features/targets, printed on one line.
print(*[arr.shape for arr in (x_train, y_train, x_val, y_val)])

(8000, 6) (8000,) (2000, 6) (2000,)


### Create Model 

In [None]:
# Feed-forward regressor: three ReLU hidden layers (100 -> 50 -> 25 units)
# followed by a single linear output unit for the price.
model = Sequential([
    Dense(100, input_dim=x_train.shape[1], activation='relu', kernel_initializer='random_normal'),
    Dense(50, activation='relu', kernel_initializer='random_normal'),
    Dense(25, activation='relu', kernel_initializer='random_normal'),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop when val_loss has not improved by at least min_delta for 5 epochs, then
# restore the best weights seen.
# NOTE(review): the loss is MSE on raw prices (the recorded validation MSE is
# in the tens of millions), so min_delta=1e-3 is effectively zero - confirm
# that is intended.
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    mode='auto',
    restore_best_weights=True,
    verbose=1,
)


Fit the model

In [None]:
# Train for up to 1000 epochs; the EarlyStopping callback normally ends the run
# much sooner, based on the validation loss.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    callbacks=[monitor],
    epochs=1000,
    verbose=1,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 00057: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f14fa62af28>

In [None]:
# Predict prices for the held-out validation rows.
pred = model.predict(x_val)

# Mean squared error on the validation set (in squared price units).
score = metrics.mean_squared_error(y_true=y_val, y_pred=pred)
print("Final Score (MSE) : {}".format(score))

Final Score (MSE) : 23607183.514568724


In [None]:
# Root mean squared error on the validation set - same units as the price.
val_mse = metrics.mean_squared_error(y_true=y_val, y_pred=pred)
rmse_score = np.sqrt(val_mse)
print("Final Score (RMSE) : {}".format(rmse_score))

Final Score (RMSE) : 4858.722415879376


### File for Submission 

In [None]:
# Test-set feature matrix, using the same columns (and order) as the training features.
x_test = df_test[x_columns].to_numpy()

In [None]:
# First two test feature rows, as a quick sanity check.
x_test[:2]

array([[ 8.        ,  6.        ,  2.        ,  2.901     ,  1.42501015,
         1.60791168],
       [ 7.        ,  2.        ,  2.        ,  2.455     ,  0.59773596,
        -0.78391149]])

In [None]:
# Predicted price for every row of the test/submit data.
pred_test = model.predict(x=x_test)

In [None]:
# Display the raw predictions: a 2-D float32 array with one price per row.
pred_test

array([[1096533.1 ],
       [ 805605.94],
       [ 288208.1 ],
       ...,
       [1098467.5 ],
       [ 503296.8 ],
       [1120481.  ]], dtype=float32)

In [None]:
# Work on an independent copy so adding the price column leaves df_test untouched.
df_test_submit = df_test.copy(deep=True)

In [None]:
# pred_test is a single-column 2-D array; flatten it and assign by position.
# Wrapping it in a DataFrame first would make pandas align on index labels,
# which only works while df_test_submit still has the default 0..n-1 index.
# astype(int) truncates the fractional part, as before.
df_test_submit['price'] = pred_test.ravel().astype(int)

In [None]:
# Submission dataset: house id and predicted price only, no other fields.
# The result is kept in its own variable so it can actually be submitted or
# saved; df_test_submit itself is left unchanged.
df_submit = df_test_submit[['id', 'price']]
df_submit

Unnamed: 0,id,price
0,10001,1096533
1,10002,805605
2,10003,288208
3,10004,236514
4,10005,360632
...,...,...
1995,11996,265576
1996,11997,1154644
1997,11998,1098467
1998,11999,503296
