## Setup

In [1]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [2]:
# Fetch the data from the API.
listings_json = requests.get("http://127.0.0.1:5000/housingDataAPI/v1.0/listings").json()

# Examine the data.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "17452 NE GLISAN ST #7, Portland OR 97230",
    "bathrooms": 2.0,
    "bedrooms": 2,
    "built": 1988,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Hartley",
    "high_school": "Reynolds",
    "home_type": "Manufactured - Double Wide Manufact",
    "lot_size": null,
    "middle_school": "Reynolds",
    "neighborhood": "unknown",
    "price": 72000,
    "square_feet": 1152,
    "zipcode": 97230
}


In [3]:
# Create a dataframe to use for our model.
housing_data = pd.DataFrame(listings_json)

print(len(housing_data))
housing_data.head()

2056


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"17452 NE GLISAN ST #7, Portland OR 97230",2.0,2,1988,Portland,Multnomah,Hartley,Reynolds,Manufactured - Double Wide Manufact,,Reynolds,unknown,72000,1152,97230
1,"16000 SE POWELL BLVD 75, Portland OR 97236",2.0,3,1990,Portland,Multnomah,Powell Butte,Centennial,Manufactured - Double Wide Manufact,,Centennial,unknown,79950,1404,97236
2,"12846 SE RAMONA ST 6, Portland OR 97236",2.0,3,1997,Portland,Multnomah,Gilbert Hts,David Douglas,Manufactured - Double Wide Manufact,,Alice Ott,unknown,93900,1297,97236
3,"7720 S Macadam AVE 7, Portland OR 97219",3.0,3,1988,Portland,Multnomah,Other,Other,Floating Home - Contemporary,,Other,unknown,125000,2432,97219
4,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating Home - Cabin,,Reynolds,unknown,129500,735,97230


# Data Cleaning

In [4]:
# Simplify home types 
for i in housing_data.index:
    if "Floating" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Floating"
    if "Condo" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Condo"
    if "Single Family" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Single Family"
    if "Manufactured" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Manufactured"
    
housing_data.home_type.unique() 

array(['Manufactured', 'Floating', 'Condo', 'Single Family'], dtype=object)

In [5]:
# Print data to compare how many data points lost
print(f'Current Amount of Listings: {len(housing_data)}')

# Change lot size to 0 for floating homes and condos
for i in housing_data.index:
    if housing_data.at[i, "home_type"] == "Floating":
        housing_data.at[i, "lot_size"] = 0
    if housing_data.at[i, "home_type"] == "Condo":
        housing_data.at[i, "lot_size"] = 0

# Drop listing with null lot_size
cleaned_housing_data = housing_data.drop(housing_data[housing_data["lot_size"].isnull()].index)
      
# Print length of data
print(f'Updated Amount of Listings: {len(cleaned_housing_data)}')

Current Amount of Listings: 2056
Updated Amount of Listings: 1947


In [6]:
# Drop listings with unclear Highschool data
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Current Price:"].index, inplace = True)
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Other"].index, inplace = True)
cleaned_housing_data.shape

(1943, 15)

In [7]:
# Create a cost ranker based on zipcode
zipcode = cleaned_housing_data[["price","zipcode"]]
zipcodeAVG = zipcode.groupby(["zipcode"]).mean().sort_values(by=["price"], ascending=False)
zipcodeRanker = zipcodeAVG.reset_index(drop=False)
zipcodeRanker.reset_index(drop=False, inplace=True)
zipcodeRanker.rename(columns={"index":"zipcode_rank","price":"zipcodeAVGcost"}, inplace=True)
zipcodeRanker["zipcode_rank"]=zipcodeRanker["zipcode_rank"]+1


# Merge into df
cleaned_housing_data2 = pd.merge(cleaned_housing_data, zipcodeRanker, on="zipcode")
cleaned_housing_data2.rename(columns={"price_y":"zipcodeAVGcost"}, inplace = True)
cleaned_housing_data2.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,zipcode_rank,zipcodeAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,unknown,129500,735,97230,29,412295.897059
1,"3389 NE 162ND AVE, Portland OR 97230",2.0,2,1979,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,Fremont Village Park,160000,1073,97230,29,412295.897059
2,"19609 NE MARINE DR E1, Portland OR 97230",2.0,3,1945,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,Big Eddy Marina,224500,1150,97230,29,412295.897059
3,"15041 NE SISKIYOU CT, Portland OR 97230",2.0,2,1973,Portland,Multnomah,Scott,Reynolds,Condo,0.0,H.B. Lee,unknown,229900,1638,97230,29,412295.897059
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",2.0,2,1986,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,SUMMERPLACE,239000,1128,97230,29,412295.897059


In [8]:
# Create district df
school_dict = ({"high_school" : ['Reynolds', 'Parkrose', 'David Douglas', 'Centennial', 'Cleveland',
        'Lincoln', 'Madison', 'Jefferson', 'Roosevelt', 'Sunset','Westview', 'Liberty', 'Beaverton', 
        'Grant', 'Southridge', 'Tigard', 'Wilson', 'Riverdale', 'Lake Oswego', 'Franklin',
        'Tualatin', 'Milwaukie', 'Scappoose'], "district" : ['Reynolds', 'Parkrose','David Douglas',
        'Centennial', 'Portland Public', 'Portland Public', 'Portland Public', 'Portland Public',
        'Portland Public', 'Beaverton', 'Beaverton', 'Hillsboro', 'Beaverton', 'Portland Public',
        'Beaverton', 'Tigard-Tualatin', 'Portland Public', 'Riverdale', 'Lake Oswego', 'Portland Public',
        'Tigard-Tualatin', 'North Clackamas', 'Scappose']})
district_df = pd.DataFrame (school_dict)

# Merge into OG df
cleaned_housing_data3 = pd.merge(cleaned_housing_data2, district_df, on="high_school")
cleaned_housing_data3.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,zipcode_rank,zipcodeAVGcost,district
0,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,unknown,129500,735,97230,29,412295.897059,Reynolds
1,"3389 NE 162ND AVE, Portland OR 97230",2.0,2,1979,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,Fremont Village Park,160000,1073,97230,29,412295.897059,Reynolds
2,"19609 NE MARINE DR E1, Portland OR 97230",2.0,3,1945,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,Big Eddy Marina,224500,1150,97230,29,412295.897059,Reynolds
3,"15041 NE SISKIYOU CT, Portland OR 97230",2.0,2,1973,Portland,Multnomah,Scott,Reynolds,Condo,0.0,H.B. Lee,unknown,229900,1638,97230,29,412295.897059,Reynolds
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",2.0,2,1986,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,SUMMERPLACE,239000,1128,97230,29,412295.897059,Reynolds


In [9]:
# Create a cost ranker based on high schools
hs = cleaned_housing_data3[["price","high_school"]]
hsAVG = hs.groupby(["high_school"]).mean().sort_values(by=["price"], ascending=False)
hsRanker = hsAVG.reset_index(drop=False)
hsRanker.reset_index(drop=False, inplace=True)
hsRanker.rename(columns={"index":"hs_rank","price":"hsAVGcost"}, inplace=True)
hsRanker["hs_rank"]= hsRanker["hs_rank"]+1

# Create a cost ranker based on districts
district = cleaned_housing_data3[["price","district"]]
districtAVG = district.groupby(["district"]).mean().sort_values(by=["price"], ascending=False)
districtRanker = districtAVG.reset_index(drop=False)
districtRanker.reset_index(drop=False, inplace=True)
districtRanker.rename(columns={"index":"district_rank","price":"districtAVGcost"}, inplace=True)
districtRanker["district_rank"]= districtRanker["district_rank"]+1

In [10]:
# Merge high school and district rankers 
cleaned_housing_data4 = pd.merge(cleaned_housing_data3, hsRanker, on="high_school")
cleaned_housing_data_5 = pd.merge(cleaned_housing_data4, districtRanker, on="district")
cleaned_housing_data_final = cleaned_housing_data_5[['address', 'price', 'home_type', 'bedrooms', 
                                'bathrooms', 'square_feet', 'built', 'lot_size', 'neighborhood', 
                                'county', 'city', 'zipcode', 'zipcode_rank', 'zipcodeAVGcost',
                                'elementary_school', 'middle_school', 'high_school','hs_rank', 
                                'hsAVGcost', 'district', 'district_rank', 'districtAVGcost']]

cleaned_housing_data_final.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,...,zipcode_rank,zipcodeAVGcost,elementary_school,middle_school,high_school,hs_rank,hsAVGcost,district,district_rank,districtAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,...,29,412295.897059,Salish Pond,Reynolds,Reynolds,21,393627.258621,Reynolds,10,393627.258621
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,...,29,412295.897059,Margaret Scott,H.B. Lee,Reynolds,21,393627.258621,Reynolds,10,393627.258621
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,...,29,412295.897059,Salish Pond,Reynolds,Reynolds,21,393627.258621,Reynolds,10,393627.258621
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,...,29,412295.897059,Scott,H.B. Lee,Reynolds,21,393627.258621,Reynolds,10,393627.258621
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,...,29,412295.897059,Margaret Scott,H.B. Lee,Reynolds,21,393627.258621,Reynolds,10,393627.258621


## Prepare Data for Model

In [11]:
# Make a copy of the original data frame to modify.
model_df = cleaned_housing_data_final

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms",
                            "bedrooms",
                            "built",
                            "lot_size",
                            "square_feet",
#                             "neighborhood",
#                             "county",
#                             "home_type",
#                             "hs_rank",
#                             "hsAVGcost",
                            "districtAVGcost",
                            "district_rank",
                            "zipcodeAVGcost",
                            "zipcode_rank",
                            "price"]
                       ]

# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

1943


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,districtAVGcost,district_rank,zipcodeAVGcost,zipcode_rank,price
0,1.0,1,1960,0.0,735,393627.258621,10,412295.897059,29,129500
1,2.0,2,1979,0.0,1073,393627.258621,10,412295.897059,29,160000
2,2.0,3,1945,0.0,1150,393627.258621,10,412295.897059,29,224500
3,2.0,2,1973,0.0,1638,393627.258621,10,412295.897059,29,229900
4,2.0,2,1986,0.0,1128,393627.258621,10,412295.897059,29,239000


In [12]:
# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,districtAVGcost,district_rank,zipcodeAVGcost,zipcode_rank,price_range
0,1.0,1,1960,0.0,735,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
1,2.0,2,1979,0.0,1073,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
2,2.0,3,1945,0.0,1150,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
3,2.0,2,1973,0.0,1638,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"
4,2.0,2,1986,0.0,1128,393627.258621,10,412295.897059,29,"(123499.999, 348340.0]"


In [13]:
# # Get dummies for the values in home_type to use in the model.
# model_df = pd.get_dummies(model_df, columns=["home_type"])
# model_df.head()

In [14]:
# Assign X (input) and y (target).
X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [15]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# Create a MinMaxScaler model and fit it to the training data
X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [17]:
# Transform the training and testing data using the X_scaler and y_scaler models.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)

['label_encoder.bin']

In [19]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Run Random Forest Classifier

In [20]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.5967078189300411
[(0.3505526037574974, 'square_feet'), (0.15424433583665914, 'built'), (0.10541034532431656, 'lot_size'), (0.09762572519472738, 'zipcode_rank'), (0.0966156566150146, 'zipcodeAVGcost'), (0.08500182173109963, 'bathrooms'), (0.066672396299825, 'bedrooms'), (0.022039595019232767, 'district_rank'), (0.021837520221627504, 'districtAVGcost')]


## Create a Deep Learning Model

In [21]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=500, activation='relu', input_dim=9))
deep_model.add(Dense(units=200, activation='relu'))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [22]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=200,
    shuffle=True,
    verbose=2
)

Train on 1457 samples
Epoch 1/200
1457/1457 - 1s - loss: 1.3682 - accuracy: 0.4118
Epoch 2/200
1457/1457 - 0s - loss: 1.1021 - accuracy: 0.5093
Epoch 3/200
1457/1457 - 0s - loss: 1.0658 - accuracy: 0.5312
Epoch 4/200
1457/1457 - 0s - loss: 1.0211 - accuracy: 0.5422
Epoch 5/200
1457/1457 - 0s - loss: 0.9938 - accuracy: 0.5642
Epoch 6/200
1457/1457 - 0s - loss: 0.9681 - accuracy: 0.5724
Epoch 7/200
1457/1457 - 0s - loss: 0.9606 - accuracy: 0.5683
Epoch 8/200
1457/1457 - 0s - loss: 0.9212 - accuracy: 0.5916
Epoch 9/200
1457/1457 - 0s - loss: 0.9224 - accuracy: 0.5854
Epoch 10/200
1457/1457 - 0s - loss: 0.8997 - accuracy: 0.6122
Epoch 11/200
1457/1457 - 0s - loss: 0.8851 - accuracy: 0.6047
Epoch 12/200
1457/1457 - 0s - loss: 0.8881 - accuracy: 0.6205
Epoch 13/200
1457/1457 - 0s - loss: 0.8695 - accuracy: 0.6342
Epoch 14/200
1457/1457 - 0s - loss: 0.8623 - accuracy: 0.6218
Epoch 15/200
1457/1457 - 0s - loss: 0.8537 - accuracy: 0.6307
Epoch 16/200
1457/1457 - 0s - loss: 0.9314 - accuracy: 0.

Epoch 133/200
1457/1457 - 0s - loss: 0.5936 - accuracy: 0.7467
Epoch 134/200
1457/1457 - 0s - loss: 0.5833 - accuracy: 0.7509
Epoch 135/200
1457/1457 - 0s - loss: 0.5581 - accuracy: 0.7632
Epoch 136/200
1457/1457 - 0s - loss: 0.5607 - accuracy: 0.7742
Epoch 137/200
1457/1457 - 0s - loss: 0.5673 - accuracy: 0.7646
Epoch 138/200
1457/1457 - 0s - loss: 0.5801 - accuracy: 0.7522
Epoch 139/200
1457/1457 - 0s - loss: 0.5547 - accuracy: 0.7673
Epoch 140/200
1457/1457 - 0s - loss: 0.5573 - accuracy: 0.7763
Epoch 141/200
1457/1457 - 0s - loss: 0.5761 - accuracy: 0.7536
Epoch 142/200
1457/1457 - 0s - loss: 0.5630 - accuracy: 0.7618
Epoch 143/200
1457/1457 - 0s - loss: 0.5819 - accuracy: 0.7584
Epoch 144/200
1457/1457 - 0s - loss: 0.5678 - accuracy: 0.7584
Epoch 145/200
1457/1457 - 0s - loss: 0.5871 - accuracy: 0.7426
Epoch 146/200
1457/1457 - 0s - loss: 0.5503 - accuracy: 0.7653
Epoch 147/200
1457/1457 - 0s - loss: 0.5557 - accuracy: 0.7536
Epoch 148/200
1457/1457 - 0s - loss: 0.5484 - accuracy:

<tensorflow.python.keras.callbacks.History at 0x1a4aab72d0>

## Quantify our Trained Model

In [23]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

486/1 - 0s - loss: 0.8680 - accuracy: 0.6667
Loss: 1.0733362809130194, Accuracy: 0.6666666865348816


## Make Predictions

In [24]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [Interval(449000.0, 609000.0, closed='right')
 Interval(609000.0, 825000.0, closed='right')
 Interval(348340.0, 449000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(609000.0, 825000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(123499.999, 348340.0, closed='right')
 Interval(123499.999, 348340.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')]
Actual Labels: [Interval(609000.0, 825000.0, closed='right'), Interval(609000.0, 825000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(609000.0, 825000.0, closed='right'), Interval(123499.999, 348340.0, closed='right'), Interval(123499.999, 348340.0, closed='right'), Interval(449000.0, 609000.0, closed='right')]


## Save the trained model

In [25]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [26]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [27]:
# Input data for testing.
input_data = np.array(np.array([X.iloc[0]]))

In [28]:
X.iloc[0]

bathrooms               1.000000
bedrooms                1.000000
built                1960.000000
lot_size                0.000000
square_feet           735.000000
districtAVGcost    393627.258621
district_rank          10.000000
zipcodeAVGcost     412295.897059
zipcode_rank           29.000000
Name: 0, dtype: float64

In [29]:
encoded_predictions = model.predict_classes(scaler.transform(input_data))
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].left}, {prediction_labels[0].right}")

123499.999, 348340.0
