## Setup

In [30]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [31]:
# Fetch the data from the API.
listings_json = requests.get("http://127.0.0.1:5000/housingDataAPI/v1.0/listings").json()

# Examine the data.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "3157 NE MARINE DR, Portland OR 97035",
    "bathrooms": 1.0,
    "bedrooms": 1,
    "built": 1964,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Faubion",
    "high_school": "Current Price:",
    "home_type": "Floating Home - 1 Story",
    "lot_size": null,
    "middle_school": "Jefferson",
    "neighborhood": "unknown",
    "price": 65000,
    "square_feet": 800,
    "zipcode": 97035
}


In [32]:
# Create a dataframe to use for our model.
housing_data = pd.DataFrame(listings_json)

print(len(housing_data))
housing_data.head()

2246


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"3157 NE MARINE DR, Portland OR 97035",1.0,1,1964,Portland,Multnomah,Faubion,Current Price:,Floating Home - 1 Story,,Jefferson,unknown,65000,800,97035
1,"17452 NE GLISAN ST #7, Portland OR 97230",2.0,2,1988,Portland,Multnomah,Hartley,Reynolds,Manufactured - Double Wide Manufact,,Reynolds,unknown,72000,1152,97230
2,"9034 SE 78TH PL, Portland OR 97206",2.0,3,1997,Portland,Clackamas,Whitman,Current Price:,Manufactured - Double Wide Manufact,,Milwaukie,unknown,79950,1344,97206
3,"16000 SE POWELL BLVD 75, Portland OR 97236",2.0,3,1990,Portland,Multnomah,Powell Butte,Centennial,Manufactured - Double Wide Manufact,,Centennial,unknown,79950,1404,97236
4,"12846 SE RAMONA ST 6, Portland OR 97236",2.0,3,1997,Portland,Multnomah,Gilbert Hts,David Douglas,Manufactured - Double Wide Manufact,,Alice Ott,unknown,93900,1297,97236


# Data Cleaning

In [33]:
# Simplify home types 
for i in housing_data.index:
    if "Floating" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Floating"
    if "Condo" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Condo"
    if "Single Family" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Single Family"
    if "Manufactured" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Manufactured"
    
housing_data.home_type.unique() 

array(['Floating', 'Manufactured', 'Condo', 'Single Family'], dtype=object)

In [34]:
# Print data to compare how many data points lost
print(f'Current Amount of Listings: {len(housing_data)}')

# Change lot size to 0 for floating homes and condos
for i in housing_data.index:
    if housing_data.at[i, "home_type"] == "Floating":
        housing_data.at[i, "lot_size"] = 0
    if housing_data.at[i, "home_type"] == "Condo":
        housing_data.at[i, "lot_size"] = 0

# Drop listing with null lot_size
cleaned_housing_data = housing_data.drop(housing_data[housing_data["lot_size"].isnull()].index)
      
# Print length of data
print(f'Updated Amount of Listings: {len(cleaned_housing_data)}')

Current Amount of Listings: 2246
Updated Amount of Listings: 2133


In [35]:
# Drop listings with unclear Highschool data
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Current Price:"].index, inplace = True)
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Other"].index, inplace = True)
cleaned_housing_data.shape

(2121, 15)

In [36]:
# Create a cost ranker based on zipcode
zipcode = cleaned_housing_data[["price","zipcode"]]
zipcodeAVG = zipcode.groupby(["zipcode"]).mean().sort_values(by=["price"], ascending=False)
zipcodeRanker = zipcodeAVG.reset_index(drop=False)
zipcodeRanker.reset_index(drop=False, inplace=True)
zipcodeRanker.rename(columns={"index":"zipcode_rank","price":"zipcodeAVGcost"}, inplace=True)
zipcodeRanker["zipcode_rank"]=zipcodeRanker["zipcode_rank"]+1


# Merge into df
cleaned_housing_data2 = pd.merge(cleaned_housing_data, zipcodeRanker, on="zipcode")
cleaned_housing_data2.rename(columns={"price_y":"zipcodeAVGcost"}, inplace = True)
cleaned_housing_data2.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,zipcode_rank,zipcodeAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,unknown,129500,735,97230,29,412757.415584
1,"3389 NE 162ND AVE, Portland OR 97230",2.0,2,1979,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,Fremont Village Park,160000,1073,97230,29,412757.415584
2,"19609 NE MARINE DR E1, Portland OR 97230",2.0,3,1945,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,Big Eddy Marina,224500,1150,97230,29,412757.415584
3,"15041 NE SISKIYOU CT, Portland OR 97230",2.0,2,1973,Portland,Multnomah,Scott,Reynolds,Condo,0.0,H.B. Lee,unknown,229900,1638,97230,29,412757.415584
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",2.0,2,1986,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,SUMMERPLACE,239000,1128,97230,29,412757.415584


In [37]:
# Create district df
school_dict = ({"high_school" : ['Reynolds', 'Parkrose', 'David Douglas', 'Centennial', 'Cleveland',
        'Lincoln', 'Madison', 'Jefferson', 'Roosevelt', 'Sunset','Westview', 'Liberty', 'Beaverton', 
        'Grant', 'Southridge', 'Tigard', 'Wilson', 'Riverdale', 'Lake Oswego', 'Franklin',
        'Tualatin', 'Milwaukie', 'Scappoose'], "district" : ['Reynolds', 'Parkrose','David Douglas',
        'Centennial', 'Portland Public', 'Portland Public', 'Portland Public', 'Portland Public',
        'Portland Public', 'Beaverton', 'Beaverton', 'Hillsboro', 'Beaverton', 'Portland Public',
        'Beaverton', 'Tigard-Tualatin', 'Portland Public', 'Riverdale', 'Lake Oswego', 'Portland Public',
        'Tigard-Tualatin', 'North Clackamas', 'Scappose']})
district_df = pd.DataFrame (school_dict)

# Merge into OG df
cleaned_housing_data3 = pd.merge(cleaned_housing_data2, district_df, on="high_school")
cleaned_housing_data3.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,zipcode_rank,zipcodeAVGcost,district
0,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,unknown,129500,735,97230,29,412757.415584,Reynolds
1,"3389 NE 162ND AVE, Portland OR 97230",2.0,2,1979,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,Fremont Village Park,160000,1073,97230,29,412757.415584,Reynolds
2,"19609 NE MARINE DR E1, Portland OR 97230",2.0,3,1945,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,Big Eddy Marina,224500,1150,97230,29,412757.415584,Reynolds
3,"15041 NE SISKIYOU CT, Portland OR 97230",2.0,2,1973,Portland,Multnomah,Scott,Reynolds,Condo,0.0,H.B. Lee,unknown,229900,1638,97230,29,412757.415584,Reynolds
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",2.0,2,1986,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,SUMMERPLACE,239000,1128,97230,29,412757.415584,Reynolds


In [38]:
# Create a cost ranker based on high schools
hs = cleaned_housing_data3[["price","high_school"]]
hsAVG = hs.groupby(["high_school"]).mean().sort_values(by=["price"], ascending=False)
hsRanker = hsAVG.reset_index(drop=False)
hsRanker.reset_index(drop=False, inplace=True)
hsRanker.rename(columns={"index":"hs_rank","price":"hsAVGcost"}, inplace=True)
hsRanker["hs_rank"]= hsRanker["hs_rank"]+1

# Create a cost ranker based on districts
district = cleaned_housing_data3[["price","district"]]
districtAVG = district.groupby(["district"]).mean().sort_values(by=["price"], ascending=False)
districtRanker = districtAVG.reset_index(drop=False)
districtRanker.reset_index(drop=False, inplace=True)
districtRanker.rename(columns={"index":"district_rank","price":"districtAVGcost"}, inplace=True)
districtRanker["district_rank"]= districtRanker["district_rank"]+1

In [39]:
# Merge high school and district rankers 
cleaned_housing_data4 = pd.merge(cleaned_housing_data3, hsRanker, on="high_school")
cleaned_housing_data_5 = pd.merge(cleaned_housing_data4, districtRanker, on="district")
cleaned_housing_data_final = cleaned_housing_data_5[['address', 'price', 'home_type', 'bedrooms', 
                                'bathrooms', 'square_feet', 'built', 'lot_size', 'neighborhood', 
                                'county', 'city', 'zipcode', 'zipcode_rank', 'zipcodeAVGcost',
                                'elementary_school', 'middle_school', 'high_school','hs_rank', 
                                'hsAVGcost', 'district', 'district_rank', 'districtAVGcost']]

cleaned_housing_data_final.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,...,zipcode_rank,zipcodeAVGcost,elementary_school,middle_school,high_school,hs_rank,hsAVGcost,district,district_rank,districtAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,...,29,412757.415584,Salish Pond,Reynolds,Reynolds,21,396434.078125,Reynolds,10,396434.078125
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,...,29,412757.415584,Margaret Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,...,29,412757.415584,Salish Pond,Reynolds,Reynolds,21,396434.078125,Reynolds,10,396434.078125
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,...,29,412757.415584,Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,...,29,412757.415584,Margaret Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125


## Prepare Data for Model

In [40]:
# Make a copy of the original data frame to modify.
model_df = cleaned_housing_data_final

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms",
                            "bedrooms",
                            "built",
                            "lot_size",
                            "square_feet",
#                             "neighborhood",
#                             "county",
#                             "home_type",
#                             "hs_rank",
#                             "hsAVGcost",
                            "districtAVGcost",
                            "district_rank",
                            "zipcodeAVGcost",
                            "zipcode_rank",
                            "price"]
                       ]

# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

2121


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,districtAVGcost,district_rank,zipcodeAVGcost,zipcode_rank,price
0,1.0,1,1960,0.0,735,396434.078125,10,412757.415584,29,129500
1,2.0,2,1979,0.0,1073,396434.078125,10,412757.415584,29,160000
2,2.0,3,1945,0.0,1150,396434.078125,10,412757.415584,29,224500
3,2.0,2,1973,0.0,1638,396434.078125,10,412757.415584,29,229900
4,2.0,2,1986,0.0,1128,396434.078125,10,412757.415584,29,239000


In [41]:
# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,districtAVGcost,district_rank,zipcodeAVGcost,zipcode_rank,price_range
0,1.0,1,1960,0.0,735,396434.078125,10,412757.415584,29,"(123499.999, 349500.0]"
1,2.0,2,1979,0.0,1073,396434.078125,10,412757.415584,29,"(123499.999, 349500.0]"
2,2.0,3,1945,0.0,1150,396434.078125,10,412757.415584,29,"(123499.999, 349500.0]"
3,2.0,2,1973,0.0,1638,396434.078125,10,412757.415584,29,"(123499.999, 349500.0]"
4,2.0,2,1986,0.0,1128,396434.078125,10,412757.415584,29,"(123499.999, 349500.0]"


In [42]:
# # Get dummies for the values in home_type to use in the model.
# model_df = pd.get_dummies(model_df, columns=["home_type"])
# model_df.head()

In [43]:
# Assign X (input) and y (target).
X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [44]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [45]:
# Create a MinMaxScaler model and fit it to the training data
X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [46]:
# Transform the training and testing data using the X_scaler and y_scaler models.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [47]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)

['label_encoder.bin']

In [48]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Run Random Forest Classifier

In [49]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.6346516007532956
[(0.3578699685496072, 'square_feet'), (0.1512734458154296, 'built'), (0.10458889988808215, 'lot_size'), (0.09600732459330048, 'zipcode_rank'), (0.09598144891407617, 'zipcodeAVGcost'), (0.08268383292616352, 'bathrooms'), (0.06744864870238483, 'bedrooms'), (0.022178771139326148, 'districtAVGcost'), (0.021967659471629784, 'district_rank')]


## Create a Deep Learning Model

In [50]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=500, activation='relu', input_dim=9))
deep_model.add(Dense(units=200, activation='relu'))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [51]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=200,
    shuffle=True,
    verbose=2
)

Train on 1590 samples
Epoch 1/200
1590/1590 - 1s - loss: 1.3576 - accuracy: 0.4069
Epoch 2/200
1590/1590 - 0s - loss: 1.1051 - accuracy: 0.4899
Epoch 3/200
1590/1590 - 0s - loss: 1.0376 - accuracy: 0.5377
Epoch 4/200
1590/1590 - 0s - loss: 1.0127 - accuracy: 0.5453
Epoch 5/200
1590/1590 - 0s - loss: 0.9798 - accuracy: 0.5610
Epoch 6/200
1590/1590 - 0s - loss: 0.9538 - accuracy: 0.5767
Epoch 7/200
1590/1590 - 0s - loss: 0.9337 - accuracy: 0.5843
Epoch 8/200
1590/1590 - 0s - loss: 0.9345 - accuracy: 0.5736
Epoch 9/200
1590/1590 - 0s - loss: 0.9394 - accuracy: 0.6113
Epoch 10/200
1590/1590 - 0s - loss: 0.9173 - accuracy: 0.5981
Epoch 11/200
1590/1590 - 0s - loss: 0.8953 - accuracy: 0.6088
Epoch 12/200
1590/1590 - 0s - loss: 0.8888 - accuracy: 0.5981
Epoch 13/200
1590/1590 - 0s - loss: 0.8877 - accuracy: 0.5994
Epoch 14/200
1590/1590 - 0s - loss: 0.8900 - accuracy: 0.6164
Epoch 15/200
1590/1590 - 0s - loss: 0.8710 - accuracy: 0.6176
Epoch 16/200
1590/1590 - 0s - loss: 0.8640 - accuracy: 0.

Epoch 133/200
1590/1590 - 0s - loss: 0.5538 - accuracy: 0.7686
Epoch 134/200
1590/1590 - 0s - loss: 0.5659 - accuracy: 0.7597
Epoch 135/200
1590/1590 - 0s - loss: 0.5527 - accuracy: 0.7660
Epoch 136/200
1590/1590 - 0s - loss: 0.5527 - accuracy: 0.7428
Epoch 137/200
1590/1590 - 0s - loss: 0.5378 - accuracy: 0.7667
Epoch 138/200
1590/1590 - 0s - loss: 0.5733 - accuracy: 0.7522
Epoch 139/200
1590/1590 - 0s - loss: 0.5659 - accuracy: 0.7465
Epoch 140/200
1590/1590 - 0s - loss: 0.5497 - accuracy: 0.7648
Epoch 141/200
1590/1590 - 0s - loss: 0.5388 - accuracy: 0.7704
Epoch 142/200
1590/1590 - 0s - loss: 0.5394 - accuracy: 0.7616
Epoch 143/200
1590/1590 - 0s - loss: 0.5968 - accuracy: 0.7409
Epoch 144/200
1590/1590 - 0s - loss: 0.5611 - accuracy: 0.7736
Epoch 145/200
1590/1590 - 0s - loss: 0.5747 - accuracy: 0.7535
Epoch 146/200
1590/1590 - 0s - loss: 0.5290 - accuracy: 0.7717
Epoch 147/200
1590/1590 - 0s - loss: 0.5694 - accuracy: 0.7503
Epoch 148/200
1590/1590 - 0s - loss: 0.5459 - accuracy:

<tensorflow.python.keras.callbacks.History at 0x1a4aec2cd0>

## Quantify our Trained Model

In [52]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

531/1 - 0s - loss: 1.0152 - accuracy: 0.6968
Loss: 1.0429716554738708, Accuracy: 0.6967985033988953


## Make Predictions

In [53]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [Interval(809900.0, 4495000.0, closed='right')
 Interval(449000.0, 599900.0, closed='right')
 Interval(809900.0, 4495000.0, closed='right')
 Interval(449000.0, 599900.0, closed='right')
 Interval(123499.999, 349500.0, closed='right')
 Interval(449000.0, 599900.0, closed='right')
 Interval(809900.0, 4495000.0, closed='right')
 Interval(599900.0, 809900.0, closed='right')
 Interval(599900.0, 809900.0, closed='right')
 Interval(809900.0, 4495000.0, closed='right')]
Actual Labels: [Interval(809900.0, 4495000.0, closed='right'), Interval(449000.0, 599900.0, closed='right'), Interval(809900.0, 4495000.0, closed='right'), Interval(449000.0, 599900.0, closed='right'), Interval(123499.999, 349500.0, closed='right'), Interval(599900.0, 809900.0, closed='right'), Interval(809900.0, 4495000.0, closed='right'), Interval(599900.0, 809900.0, closed='right'), Interval(599900.0, 809900.0, closed='right'), Interval(809900.0, 4495000.0, closed='right')]


## Save the trained model

In [54]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [55]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [56]:
# Input data for testing.
input_data = np.array(np.array([X.iloc[0]]))

In [57]:
X.iloc[0]

bathrooms               1.000000
bedrooms                1.000000
built                1960.000000
lot_size                0.000000
square_feet           735.000000
districtAVGcost    396434.078125
district_rank          10.000000
zipcodeAVGcost     412757.415584
zipcode_rank           29.000000
Name: 0, dtype: float64

In [58]:
encoded_predictions = model.predict_classes(scaler.transform(input_data))
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].left}, {prediction_labels[0].right}")

123499.999, 349500.0
