## Setup

In [1]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [2]:
data_df = pd.read_csv("../Resources/housingDataUpdatedandCleaned.csv")
data_df.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,...,zipcode_rank,zipcodeAVGcost,elementary_school,middle_school,high_school,hs_rank,hsAVGcost,district,district_rank,districtAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,...,29,412295.897059,Salish Pond,Reynolds,Reynolds,21,393627.258621,Reynolds,10,393627.258621
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,...,29,412295.897059,Margaret Scott,H.B. Lee,Reynolds,21,393627.258621,Reynolds,10,393627.258621
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,...,29,412295.897059,Salish Pond,Reynolds,Reynolds,21,393627.258621,Reynolds,10,393627.258621
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,...,29,412295.897059,Scott,H.B. Lee,Reynolds,21,393627.258621,Reynolds,10,393627.258621
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,...,29,412295.897059,Margaret Scott,H.B. Lee,Reynolds,21,393627.258621,Reynolds,10,393627.258621


## Data Preprocessing

In [3]:
# Make a copy of the original data frame to modify.
model_df = data_df

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms",
                            "bedrooms",
                            "built",
                            "lot_size",
                            "square_feet",
#                             "neighborhood",
#                             "county",
#                             "home_type",
#                             "hs_rank",
#                             "hsAVGcost",
                            "districtAVGcost",
                            "district_rank",
                            "zipcodeAVGcost",
                            "zipcode_rank",
                            "price"]
                       ]

# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

1943


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,districtAVGcost,district_rank,zipcodeAVGcost,zipcode_rank,price
0,1.0,1,1960,0.0,735,393627.258621,10,412295.897059,29,129500
1,2.0,2,1979,0.0,1073,393627.258621,10,412295.897059,29,160000
2,2.0,3,1945,0.0,1150,393627.258621,10,412295.897059,29,224500
3,2.0,2,1973,0.0,1638,393627.258621,10,412295.897059,29,229900
4,2.0,2,1986,0.0,1128,393627.258621,10,412295.897059,29,239000


In [4]:
# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,districtAVGcost,district_rank,zipcodeAVGcost,zipcode_rank,price_range
0,1.0,1,1960,0.0,735,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
1,2.0,2,1979,0.0,1073,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
2,2.0,3,1945,0.0,1150,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
3,2.0,2,1973,0.0,1638,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
4,2.0,2,1986,0.0,1128,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"


In [5]:
# # Get dummies for the values in home_type to use in the model.
# model_df = pd.get_dummies(model_df, columns=["home_type"])
# model_df.head()

In [6]:
# Assign X (input) and y (target).
X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [7]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Create a MinMaxScaler model and fit it to the training data
X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [9]:
# Transform the training and testing data using the X_scaler and y_scaler models.

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)

['label_encoder.bin']

In [11]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Run Random Forest Classifier

In [12]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.5967078189300411
[(0.35057865890148476, 'square_feet'), (0.1526336942057401, 'built'), (0.10555870296091262, 'lot_size'), (0.09699821929376133, 'zipcodeAVGcost'), (0.0967309718613422, 'zipcode_rank'), (0.0862491463308231, 'bathrooms'), (0.067326500773453, 'bedrooms'), (0.02197780930941564, 'districtAVGcost'), (0.021946296363067237, 'district_rank')]


## Create a Deep Learning Model

In [13]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=500, activation='relu', input_dim=9))
deep_model.add(Dense(units=200, activation='relu'))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [14]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

Train on 1457 samples
Epoch 1/50
1457/1457 - 2s - loss: 1.3869 - accuracy: 0.3981
Epoch 2/50
1457/1457 - 0s - loss: 1.1097 - accuracy: 0.5154
Epoch 3/50
1457/1457 - 0s - loss: 1.0571 - accuracy: 0.5278
Epoch 4/50
1457/1457 - 0s - loss: 1.0177 - accuracy: 0.5491
Epoch 5/50
1457/1457 - 0s - loss: 0.9951 - accuracy: 0.5566
Epoch 6/50
1457/1457 - 0s - loss: 0.9701 - accuracy: 0.5655
Epoch 7/50
1457/1457 - 0s - loss: 0.9473 - accuracy: 0.5834
Epoch 8/50
1457/1457 - 0s - loss: 0.9703 - accuracy: 0.5697
Epoch 9/50
1457/1457 - 0s - loss: 0.9285 - accuracy: 0.5923
Epoch 10/50
1457/1457 - 0s - loss: 0.9342 - accuracy: 0.5772
Epoch 11/50
1457/1457 - 0s - loss: 0.8945 - accuracy: 0.6074
Epoch 12/50
1457/1457 - 0s - loss: 0.8806 - accuracy: 0.6129
Epoch 13/50
1457/1457 - 0s - loss: 0.8989 - accuracy: 0.6081
Epoch 14/50
1457/1457 - 0s - loss: 0.8752 - accuracy: 0.6266
Epoch 15/50
1457/1457 - 0s - loss: 0.8736 - accuracy: 0.6198
Epoch 16/50
1457/1457 - 0s - loss: 0.8677 - accuracy: 0.6129
Epoch 17/50

<tensorflow.python.keras.callbacks.History at 0x24728707bc8>

## Quantify our Trained Model

In [15]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

486/486 - 0s - loss: 0.8595 - accuracy: 0.6399
Loss: 0.8594671875361062, Accuracy: 0.6399176716804504


## Make Predictions

In [16]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [Interval(609000.0, 825000.0, closed='right')
 Interval(609000.0, 825000.0, closed='right')
 Interval(348340.0, 449000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(348340.0, 449000.0, closed='right')
 Interval(609000.0, 825000.0, closed='right')
 Interval(123499.999, 348340.0, closed='right')
 Interval(123499.999, 348340.0, closed='right')
 Interval(609000.0, 825000.0, closed='right')]
Actual Labels: [Interval(609000.0, 825000.0, closed='right'), Interval(609000.0, 825000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(609000.0, 825000.0, closed='right'), Interval(123499.999, 348340.0, closed='right'), Interval(123499.999, 348340.0, closed='right'), Interval(449000.0, 609000.0, closed='right')]


## Save the trained model

In [17]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [18]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [19]:
# Input data for testing.
input_data = np.array(np.array([X.iloc[0]]))

In [20]:
X.iloc[0]

bathrooms               1.000000
bedrooms                1.000000
built                1960.000000
lot_size                0.000000
square_feet           735.000000
districtAVGcost    393627.258621
district_rank          10.000000
zipcodeAVGcost     412295.897059
zipcode_rank           29.000000
Name: 0, dtype: float64

In [21]:
encoded_predictions = model.predict_classes(scaler.transform(input_data))
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].left}, {prediction_labels[0].right}")

123499.999, 348340.0
