## Setup

In [1]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [2]:
# Fetch the data from the API.
listings_json = requests.get("http://127.0.0.1:5000/housingDataAPI/v1.0/listings").json()

# Examine the data.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "17452 NE GLISAN ST #7, Portland OR 97230",
    "bathrooms": 2.0,
    "bedrooms": 2,
    "built": 1988,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Hartley",
    "high_school": "Reynolds",
    "home_type": "Manufactured - Double Wide Manufact",
    "lot_size": null,
    "middle_school": "Reynolds",
    "neighborhood": "unknown",
    "price": 72000,
    "square_feet": 1152,
    "zipcode": 97230
}


In [3]:
# Create a dataframe to use for our model.
data_df = pd.DataFrame(listings_json)

print(len(data_df))
data_df.head()

1822


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"17452 NE GLISAN ST #7, Portland OR 97230",2.0,2,1988,Portland,Multnomah,Hartley,Reynolds,Manufactured - Double Wide Manufact,,Reynolds,unknown,72000,1152,97230
1,"16000 SE POWELL BLVD 75, Portland OR 97236",2.0,3,1990,Portland,Multnomah,Powell Butte,Centennial,Manufactured - Double Wide Manufact,,Centennial,unknown,79950,1404,97236
2,"12846 SE RAMONA ST 6, Portland OR 97236",2.0,3,1997,Portland,Multnomah,Gilbert Hts,David Douglas,Manufactured - Double Wide Manufact,,Alice Ott,unknown,93900,1297,97236
3,"7720 S Macadam AVE 7, Portland OR 97219",3.0,3,1988,Portland,Multnomah,Other,Other,Floating Home - Contemporary,,Other,unknown,125000,2432,97219
4,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating Home - Cabin,,Reynolds,unknown,129500,735,97230


## Data Preprocessing

In [5]:
# Make a copy of the original data frame to modify.
model_df = data_df

# Insert a lot value of 0 for condos and floating homes.
for index, row in model_df.iterrows():
    if ("Condo" in row["home_type"]) | ("Floating" in row["home_type"]):
        model_df.loc[index, "lot_size"] = 0
    else:
        pass

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms", "bedrooms", "built", "lot_size", "square_feet", "home_type", "price"]]
# Chose not to include high_school due to lousy random forest fitting.
# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

1725


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,price_range
3,3.0,3,1988,0.0,2432,Floating Home - Contemporary,"(124999.999, 349000.0]"
4,1.0,1,1960,0.0,735,Floating Home - Cabin,"(124999.999, 349000.0]"
5,1.0,1,1974,0.0,720,Condo - Traditional,"(124999.999, 349000.0]"
6,1.0,1,1927,0.0,382,Condo - Common Wall,"(124999.999, 349000.0]"
7,1.0,1,2004,0.0,513,Condo - Other,"(124999.999, 349000.0]"


In [8]:
# Simplify home types in model_df.
for i in model_df.index:
    if "Floating" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Floating"
    if "Condo" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Condo"
    if "Single Family" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Single Family"
    if "Manufactured" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Manufactured"
    
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,price_range
3,3.0,3,1988,0.0,2432,Floating,"(124999.999, 349000.0]"
4,1.0,1,1960,0.0,735,Floating,"(124999.999, 349000.0]"
5,1.0,1,1974,0.0,720,Condo,"(124999.999, 349000.0]"
6,1.0,1,1927,0.0,382,Condo,"(124999.999, 349000.0]"
7,1.0,1,2004,0.0,513,Condo,"(124999.999, 349000.0]"


In [11]:
# Get dummies for the values in home_type to use in the model.
model_df = pd.get_dummies(model_df, columns=["home_type"])
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,price_range,home_type_Condo,home_type_Floating,home_type_Manufactured,home_type_Single Family
3,3.0,3,1988,0.0,2432,"(124999.999, 349000.0]",0,1,0,0
4,1.0,1,1960,0.0,735,"(124999.999, 349000.0]",0,1,0,0
5,1.0,1,1974,0.0,720,"(124999.999, 349000.0]",1,0,0,0
6,1.0,1,1927,0.0,382,"(124999.999, 349000.0]",1,0,0,0
7,1.0,1,2004,0.0,513,"(124999.999, 349000.0]",1,0,0,0


In [12]:
# Assign X (input) and y (target).

X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [13]:
# Split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# Create a MinMaxScaler model and fit it to the training data

X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [15]:
# Transform the training and testing data using the X_scaler and y_scaler models.

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)

['label_encoder.bin']

In [17]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Run Random Forest Classifier

In [20]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.5300925925925926
[(0.44928706573335603, 'square_feet'), (0.23200249195892506, 'built'), (0.11439141434438334, 'lot_size'), (0.09574016278543077, 'bathrooms'), (0.08081807817507289, 'bedrooms'), (0.011965381304771861, 'home_type_Single Family'), (0.011559706140583869, 'home_type_Condo'), (0.0039034236117335068, 'home_type_Floating'), (0.0003322759457426507, 'home_type_Manufactured')]


## Create a Deep Learning Model

In [23]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=100, activation='relu', input_dim=9))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [24]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 1293 samples
Epoch 1/100
1293/1293 - 1s - loss: 1.5447 - accuracy: 0.2978
Epoch 2/100
1293/1293 - 0s - loss: 1.4399 - accuracy: 0.3790
Epoch 3/100
1293/1293 - 0s - loss: 1.3426 - accuracy: 0.4223
Epoch 4/100
1293/1293 - 0s - loss: 1.2589 - accuracy: 0.4532
Epoch 5/100
1293/1293 - 0s - loss: 1.2001 - accuracy: 0.4834
Epoch 6/100
1293/1293 - 0s - loss: 1.1654 - accuracy: 0.5073
Epoch 7/100
1293/1293 - 0s - loss: 1.1483 - accuracy: 0.5205
Epoch 8/100
1293/1293 - 0s - loss: 1.1234 - accuracy: 0.5228
Epoch 9/100
1293/1293 - 0s - loss: 1.1063 - accuracy: 0.5290
Epoch 10/100
1293/1293 - 0s - loss: 1.0943 - accuracy: 0.5305
Epoch 11/100
1293/1293 - 0s - loss: 1.0875 - accuracy: 0.5391
Epoch 12/100
1293/1293 - 0s - loss: 1.0737 - accuracy: 0.5553
Epoch 13/100
1293/1293 - 0s - loss: 1.0601 - accuracy: 0.5592
Epoch 14/100
1293/1293 - 0s - loss: 1.0541 - accuracy: 0.5615
Epoch 15/100
1293/1293 - 0s - loss: 1.0348 - accuracy: 0.5731
Epoch 16/100
1293/1293 - 0s - loss: 1.0326 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x1b1c01ec248>

## Quantify our Trained Model

In [None]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## Make Predictions

In [None]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

## Save the trained model

In [None]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [None]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [None]:
# Input data as bathrooms, bedrooms, built, lot_size, square_feet
input_data = np.array([[3, 4, 1920, 0.5, 1075]])

In [None]:
encoded_predictions = model.predict_classes(input_data)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].right}")