## Setup

In [1]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [2]:
# Fetch the data from the API.
listings_json = requests.get("http://127.0.0.1:5000/housingDataAPI/v1.0/listings").json()

# Examine the data.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "17452 NE GLISAN ST #7, Portland OR 97230",
    "bathrooms": 2.0,
    "bedrooms": 2,
    "built": 1988,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Hartley",
    "high_school": "Reynolds",
    "home_type": "Manufactured - Double Wide Manufact",
    "lot_size": null,
    "middle_school": "Reynolds",
    "neighborhood": "unknown",
    "price": 72000,
    "square_feet": 1152,
    "zipcode": 97230
}


In [3]:
# Create a dataframe to use for our model.
data_df = pd.DataFrame(listings_json)

print(len(data_df))
data_df.head()

1822


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"17452 NE GLISAN ST #7, Portland OR 97230",2.0,2,1988,Portland,Multnomah,Hartley,Reynolds,Manufactured - Double Wide Manufact,,Reynolds,unknown,72000,1152,97230
1,"16000 SE POWELL BLVD 75, Portland OR 97236",2.0,3,1990,Portland,Multnomah,Powell Butte,Centennial,Manufactured - Double Wide Manufact,,Centennial,unknown,79950,1404,97236
2,"12846 SE RAMONA ST 6, Portland OR 97236",2.0,3,1997,Portland,Multnomah,Gilbert Hts,David Douglas,Manufactured - Double Wide Manufact,,Alice Ott,unknown,93900,1297,97236
3,"7720 S Macadam AVE 7, Portland OR 97219",3.0,3,1988,Portland,Multnomah,Other,Other,Floating Home - Contemporary,,Other,unknown,125000,2432,97219
4,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating Home - Cabin,,Reynolds,unknown,129500,735,97230


## Data Preprocessing

In [35]:
# Make a copy of the original data frame to modify.
model_df = data_df

# Insert a lot value of 0 for condos and floating homes.
for index, row in model_df.iterrows():
    if ("Condo" in row["home_type"]) | ("Floating" in row["home_type"]):
        model_df.loc[index, "lot_size"] = 0
    else:
        pass

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms", "bedrooms", "built", "lot_size", "square_feet", "home_type", "zipcode", "price"]]
# Chose not to include high_school due to lousy random forest fitting.
# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

1725


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,zipcode,price
3,3.0,3,1988,0.0,2432,Floating Home - Contemporary,97219,125000
4,1.0,1,1960,0.0,735,Floating Home - Cabin,97230,129500
5,1.0,1,1974,0.0,720,Condo - Traditional,97236,141900
6,1.0,1,1927,0.0,382,Condo - Common Wall,97209,144900
7,1.0,1,2004,0.0,513,Condo - Other,97220,149900


In [36]:
# Simplify home types in model_df.
for i in model_df.index:
    if "Floating" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Floating"
    if "Condo" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Condo"
    if "Single Family" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Single Family"
    if "Manufactured" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Manufactured"
    
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,zipcode,price
3,3.0,3,1988,0.0,2432,Floating,97219,125000
4,1.0,1,1960,0.0,735,Floating,97230,129500
5,1.0,1,1974,0.0,720,Condo,97236,141900
6,1.0,1,1927,0.0,382,Condo,97209,144900
7,1.0,1,2004,0.0,513,Condo,97220,149900


In [37]:
# Rank the zipcodes in order of mean home price.
zipcode = model_df[["price","zipcode"]]
zipcodeAVG = zipcode.groupby(["zipcode"]).mean().sort_values(by=["price"], ascending=False)
zipcodeRanker = zipcodeAVG.reset_index(drop=False)

# Create a dictionary to rank the zipcode for a particular listing.
zipcode_ranker_dict = {}
for index, row in zipcodeRanker.iterrows():
    zipcode_ranker_dict[int(row["zipcode"])] = index
zipcode_ranker_dict

{97035: 0,
 97231: 1,
 97221: 2,
 97210: 3,
 97219: 4,
 97239: 5,
 97215: 6,
 97212: 7,
 97201: 8,
 97209: 9,
 97225: 10,
 97202: 11,
 97229: 12,
 97205: 13,
 97223: 14,
 97211: 15,
 97224: 16,
 97227: 17,
 97214: 18,
 97213: 19,
 97232: 20,
 97217: 21,
 97204: 22,
 97218: 23,
 97203: 24,
 97222: 25,
 97220: 26,
 97206: 27,
 97230: 28,
 97236: 29,
 97266: 30,
 97216: 31,
 97233: 32}

In [38]:
# Create a zipcode ranking for each listing.
model_df["zipcode_rank"] = [zipcode_ranker_dict[zipcode] for zipcode in model_df["zipcode"]]
# model_df["zipcode_rank"] = zipcode_ranks
# Drop the zipcode for each listing.
model_df.drop("zipcode", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,price,zipcode_rank
3,3.0,3,1988,0.0,2432,Floating,125000,4
4,1.0,1,1960,0.0,735,Floating,129500,28
5,1.0,1,1974,0.0,720,Condo,141900,29
6,1.0,1,1927,0.0,382,Condo,144900,9
7,1.0,1,2004,0.0,513,Condo,149900,26


In [39]:
# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,zipcode_rank,price_range
3,3.0,3,1988,0.0,2432,Floating,4,"(124999.999, 349000.0]"
4,1.0,1,1960,0.0,735,Floating,28,"(124999.999, 349000.0]"
5,1.0,1,1974,0.0,720,Condo,29,"(124999.999, 349000.0]"
6,1.0,1,1927,0.0,382,Condo,9,"(124999.999, 349000.0]"
7,1.0,1,2004,0.0,513,Condo,26,"(124999.999, 349000.0]"


In [40]:
# Get dummies for the values in home_type to use in the model.
model_df = pd.get_dummies(model_df, columns=["home_type"])
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,zipcode_rank,price_range,home_type_Condo,home_type_Floating,home_type_Manufactured,home_type_Single Family
3,3.0,3,1988,0.0,2432,4,"(124999.999, 349000.0]",0,1,0,0
4,1.0,1,1960,0.0,735,28,"(124999.999, 349000.0]",0,1,0,0
5,1.0,1,1974,0.0,720,29,"(124999.999, 349000.0]",1,0,0,0
6,1.0,1,1927,0.0,382,9,"(124999.999, 349000.0]",1,0,0,0
7,1.0,1,2004,0.0,513,26,"(124999.999, 349000.0]",1,0,0,0


In [41]:
# Assign X (input) and y (target).

X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [42]:
# Split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [43]:
# Create a MinMaxScaler model and fit it to the training data

X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [44]:
# Transform the training and testing data using the X_scaler and y_scaler models.

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)

['label_encoder.bin']

In [46]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Run Random Forest Classifier

In [47]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.6412037037037037
[(0.36228640426188263, 'square_feet'), (0.17819539119761849, 'zipcode_rank'), (0.17801253031373246, 'built'), (0.09290328880195768, 'lot_size'), (0.08952444358682148, 'bathrooms'), (0.06973802469857276, 'bedrooms'), (0.012989499123091754, 'home_type_Single Family'), (0.011735796678209925, 'home_type_Condo'), (0.004356576819133594, 'home_type_Floating'), (0.0002580445189792551, 'home_type_Manufactured')]


## Create a Deep Learning Model

In [48]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=100, activation='relu', input_dim=10))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [49]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 1293 samples
Epoch 1/100
1293/1293 - 1s - loss: 1.5319 - accuracy: 0.3241
Epoch 2/100
1293/1293 - 0s - loss: 1.3445 - accuracy: 0.4416
Epoch 3/100
1293/1293 - 0s - loss: 1.1995 - accuracy: 0.4795
Epoch 4/100
1293/1293 - 0s - loss: 1.1258 - accuracy: 0.5174
Epoch 5/100
1293/1293 - 0s - loss: 1.0942 - accuracy: 0.5159
Epoch 6/100
1293/1293 - 0s - loss: 1.0775 - accuracy: 0.5197
Epoch 7/100
1293/1293 - 0s - loss: 1.0507 - accuracy: 0.5344
Epoch 8/100
1293/1293 - 0s - loss: 1.0346 - accuracy: 0.5499
Epoch 9/100
1293/1293 - 0s - loss: 1.0289 - accuracy: 0.5499
Epoch 10/100
1293/1293 - 0s - loss: 1.0135 - accuracy: 0.5553
Epoch 11/100
1293/1293 - 0s - loss: 0.9999 - accuracy: 0.5623
Epoch 12/100
1293/1293 - 0s - loss: 0.9979 - accuracy: 0.5661
Epoch 13/100
1293/1293 - 0s - loss: 0.9964 - accuracy: 0.5654
Epoch 14/100
1293/1293 - 0s - loss: 0.9742 - accuracy: 0.5785
Epoch 15/100
1293/1293 - 0s - loss: 0.9667 - accuracy: 0.5793
Epoch 16/100
1293/1293 - 0s - loss: 0.9565 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x29a92ff5f88>

## Quantify our Trained Model

In [50]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

432/432 - 0s - loss: 0.8684 - accuracy: 0.6667
Loss: 0.8684401865358706, Accuracy: 0.6666666865348816


## Make Predictions

In [51]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [Interval(349000.0, 449500.0, closed='right')
 Interval(349000.0, 449500.0, closed='right')
 Interval(124999.999, 349000.0, closed='right')
 Interval(614994.0, 835200.0, closed='right')
 Interval(349000.0, 449500.0, closed='right')
 Interval(835200.0, 4495000.0, closed='right')
 Interval(124999.999, 349000.0, closed='right')
 Interval(449500.0, 614994.0, closed='right')
 Interval(835200.0, 4495000.0, closed='right')
 Interval(614994.0, 835200.0, closed='right')]
Actual Labels: [Interval(349000.0, 449500.0, closed='right'), Interval(349000.0, 449500.0, closed='right'), Interval(124999.999, 349000.0, closed='right'), Interval(835200.0, 4495000.0, closed='right'), Interval(349000.0, 449500.0, closed='right'), Interval(614994.0, 835200.0, closed='right'), Interval(124999.999, 349000.0, closed='right'), Interval(449500.0, 614994.0, closed='right'), Interval(835200.0, 4495000.0, closed='right'), Interval(614994.0, 835200.0, closed='right')]


## Save the trained model

In [52]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [53]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [54]:
# Input data as bathrooms, bedrooms, built, lot_size, square_feet, home_type_Condo, home_type_Floating, home_type_Manufactured, home_type_Single Family.
input_data = np.array([[3, 4, 1920, 0.5, 1075, 16, 0, 0, 1, 0]])

In [55]:
encoded_predictions = model.predict_classes(scaler.transform(input_data))
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].left}, {prediction_labels[0].right}")

124999.999, 349000.0
