## Setup

In [1]:
# Import Dependencies.
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Fetch the data from the API.
# A timeout keeps this cell from hanging indefinitely when the local API server
# is not running, and raise_for_status() surfaces HTTP errors (4xx/5xx) here
# rather than as a confusing JSON-decoding failure.
LISTINGS_URL = "http://127.0.0.1:5000/housingDataAPI/v1.0/listings"
listings_response = requests.get(LISTINGS_URL, timeout=30)
listings_response.raise_for_status()
listings_json = listings_response.json()

# Examine the data: pretty-print the first listing to see the available fields.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "255 SW HARRISON ST 19b, Portland OR 97201",
    "bathrooms": 1.0,
    "bedrooms": 1,
    "built": 1965,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Ainsworth",
    "high_school": "Lincoln",
    "home_type": "Condo - Contemporary",
    "lot_size": null,
    "middle_school": "West Sylvan",
    "neighborhood": "HARRISON WEST PSU OHSU",
    "price": 250000,
    "square_feet": 744,
    "zipcode": 97201
}


In [3]:
# Load the fetched listing records into a DataFrame; every preprocessing step
# below works on this frame.
data_df = pd.DataFrame(data=listings_json)

# Report the number of listings, then preview the first few rows.
print(data_df.shape[0])
data_df.head()

1619


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"255 SW HARRISON ST 19b, Portland OR 97201",1.0,1,1965,Portland,Multnomah,Ainsworth,Lincoln,Condo - Contemporary,,West Sylvan,HARRISON WEST PSU OHSU,250000,744,97201
1,"1811 NW COUCH ST 410, Portland OR 97209",1.0,1,1916,Portland,Multnomah,Chapman,Lincoln,Condo - Tudor,,West Sylvan,unknown,250000,519,97209
2,"2211 SW PARK PL 102, Portland OR 97205",1.0,1,1965,Portland,Multnomah,Chapman,Lincoln,Condo - Contemporary,,West Sylvan,KINGS HILL,250000,849,97205
3,"501 NE Bridgeton RD 5, Portland OR 97211",1.0,2,1964,Portland,Multnomah,Faubion,Jefferson,Floating Home - 1 Story,,Other,BRIDGETON,250000,791,97211
4,"8712 N DECATUR ST 301, Portland OR 97203",1.0,2,2007,Portland,Multnomah,James John,Roosevelt,Condo - Contemporary,,George,unknown,252000,934,97203


## Data Preprocessing

In [7]:
# Simplify home types: collapse detailed labels such as "Condo - Contemporary"
# into their broad category.
#
# The categories are listed in priority order: when a label mentions more than
# one of them, the earliest entry wins. Labels that match none of them, and
# missing values, are left untouched.
home_type_categories = ["Floating", "Condo", "Single Family", "Manufactured"]

raw_home_type = data_df["home_type"]
simplified_home_type = raw_home_type.copy()

# Apply the lowest-priority category first so higher-priority ones overwrite it.
# regex=False performs a plain substring test; na=False treats a missing value
# as "no match" instead of raising, which a row-by-row `in` test would do.
for category in reversed(home_type_categories):
    category_matches = raw_home_type.str.contains(category, regex=False, na=False)
    simplified_home_type = simplified_home_type.mask(category_matches, category)

data_df["home_type"] = simplified_home_type

data_df.home_type.unique()

array(['Condo', 'Floating', 'Single Family'], dtype=object)

In [9]:
# Print data to compare how many data points lost
print(f'Current Amount of Listings: {len(data_df)}')

# Change lot size to 0 for floating homes and condos, so the null-lot filter
# below does not discard them. A single vectorized assignment replaces the
# row-by-row loop.
no_lot_mask = data_df["home_type"].isin(["Floating", "Condo"])
data_df.loc[no_lot_mask, "lot_size"] = 0

# Drop the remaining listings whose lot_size is still null.
data_df.dropna(subset=["lot_size"], inplace=True)

# Print length of data
print(f'Updated Amount of Listings: {len(data_df)}')

Current Amount of Listings: 1619
Updated Amount of Listings: 1523


In [10]:
# Report the listing count before filtering, for comparison with the count after.
print(f'Current Amount of Listings: {len(data_df)}')

# Remove listings whose high school is recorded as "Other"; those rows cannot
# be compared by high school.
is_other_school = data_df["high_school"].eq("Other")
data_df.drop(index=data_df.loc[is_other_school].index, inplace=True)

# Report the listing count after filtering.
print(f'Updated Amount of Listings: {len(data_df)}')

Current Amount of Listings: 1523
Updated Amount of Listings: 1521


In [12]:
# Create district df
# Each high school is paired with its school district. Keeping the pairs side
# by side in one mapping (rather than two parallel lists) makes a misaligned or
# misspelled entry easy to spot.
high_school_to_district = {
    "Reynolds": "Reynolds",
    "Parkrose": "Parkrose",
    "David Douglas": "David Douglas",
    "Centennial": "Centennial",
    "Cleveland": "Portland Public",
    "Lincoln": "Portland Public",
    "Madison": "Portland Public",
    "Jefferson": "Portland Public",
    "Roosevelt": "Portland Public",
    "Sunset": "Beaverton",
    "Westview": "Beaverton",
    "Liberty": "Hillsboro",
    "Beaverton": "Beaverton",
    "Grant": "Portland Public",
    "Southridge": "Beaverton",
    "Tigard": "Tigard-Tualatin",
    "Wilson": "Portland Public",
    "Riverdale": "Riverdale",
    "Lake Oswego": "Lake Oswego",
    "Franklin": "Portland Public",
    "Tualatin": "Tigard-Tualatin",
    "Milwaukie": "North Clackamas",
    "Scappoose": "Scappoose",  # district was previously misspelled "Scappose"
}

# Same two-column layout as before: one "high_school" list and one "district" list.
school_dict = {
    "high_school": list(high_school_to_district),
    "district": list(high_school_to_district.values()),
}
district_df = pd.DataFrame(school_dict)

# Merge into OG df
# NOTE: this is an inner join, so listings whose high school is not in the
# mapping above are dropped from data_df.
data_df = pd.merge(data_df, district_df, on="high_school")
data_df.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,district
0,"255 SW HARRISON ST 19b, Portland OR 97201",1.0,1,1965,Portland,Multnomah,Ainsworth,Lincoln,Condo,0.0,West Sylvan,HARRISON WEST PSU OHSU,250000,744,97201,Portland Public
1,"1811 NW COUCH ST 410, Portland OR 97209",1.0,1,1916,Portland,Multnomah,Chapman,Lincoln,Condo,0.0,West Sylvan,unknown,250000,519,97209,Portland Public
2,"2211 SW PARK PL 102, Portland OR 97205",1.0,1,1965,Portland,Multnomah,Chapman,Lincoln,Condo,0.0,West Sylvan,KINGS HILL,250000,849,97205,Portland Public
3,"111 SW HARRISON ST 7G, Portland OR 97201",1.0,1,1965,Portland,Multnomah,Ainsworth,Lincoln,Condo,0.0,West Sylvan,unknown,253000,600,97201,Portland Public
4,"1500 SW SKYLINE BLVD 16, Portland OR 97221",2.0,2,1966,Portland,Multnomah,Ainsworth,Lincoln,Condo,0.0,West Sylvan,Sylvan/Skyline,254900,1156,97221,Portland Public


In [13]:
# Derive the age of each home from its build year, measured against a fixed
# reference year of 2020.
REFERENCE_YEAR = 2020
data_df["house_age"] = data_df["built"].rsub(REFERENCE_YEAR)
data_df.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,district,house_age
0,"255 SW HARRISON ST 19b, Portland OR 97201",1.0,1,1965,Portland,Multnomah,Ainsworth,Lincoln,Condo,0.0,West Sylvan,HARRISON WEST PSU OHSU,250000,744,97201,Portland Public,55
1,"1811 NW COUCH ST 410, Portland OR 97209",1.0,1,1916,Portland,Multnomah,Chapman,Lincoln,Condo,0.0,West Sylvan,unknown,250000,519,97209,Portland Public,104
2,"2211 SW PARK PL 102, Portland OR 97205",1.0,1,1965,Portland,Multnomah,Chapman,Lincoln,Condo,0.0,West Sylvan,KINGS HILL,250000,849,97205,Portland Public,55
3,"111 SW HARRISON ST 7G, Portland OR 97201",1.0,1,1965,Portland,Multnomah,Ainsworth,Lincoln,Condo,0.0,West Sylvan,unknown,253000,600,97201,Portland Public,55
4,"1500 SW SKYLINE BLVD 16, Portland OR 97221",2.0,2,1966,Portland,Multnomah,Ainsworth,Lincoln,Condo,0.0,West Sylvan,Sylvan/Skyline,254900,1156,97221,Portland Public,54


## Prepare Data for Model

In [14]:
# Feature columns plus the target ("price") that the deep learning model uses.
model_columns = [
    "bathrooms",
    "bedrooms",
    "house_age",
    "lot_size",
    "square_feet",
    "home_type",
    "district",
    "zipcode",
    "price",
]

# Keep only those columns and discard any row that has a NaN entry.
model_df = data_df[model_columns].dropna()

# Check the model data.
print(len(model_df))
model_df.head()

1519


Unnamed: 0,bathrooms,bedrooms,house_age,lot_size,square_feet,home_type,district,zipcode,price
0,1.0,1,55,0.0,744,Condo,Portland Public,97201,250000
1,1.0,1,104,0.0,519,Condo,Portland Public,97209,250000
2,1.0,1,55,0.0,849,Condo,Portland Public,97205,250000
3,1.0,1,55,0.0,600,Condo,Portland Public,97201,253000
4,2.0,2,54,0.0,1156,Condo,Portland Public,97221,254900


In [None]:
# Assign X (input) and y (target).

# home_type and district hold text labels (e.g. "Condo", "Portland Public"),
# which the scaler fitted below cannot convert to numbers, so they are one-hot
# encoded into numeric indicator columns. Encoding before the train/test split
# guarantees that both splits share the same set of columns.
# NOTE(review): zipcode is still fed in as a plain number, as before; consider
# whether it should be treated as a category as well.
categorical_features = ["home_type", "district"]
X = pd.get_dummies(
    model_df.drop("price", axis=1),
    columns=categorical_features,
    dtype=float,
)

# The target is reshaped into a single column, i.e. shape (n_samples, 1).
y = model_df.loc[:, "price"].values.reshape(-1, 1)

In [None]:
# Hold out part of the data for testing, using scikit-learn's default
# proportions; the fixed random_state keeps the split the same between runs.

split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create StandardScaler models (one for the inputs, one for the target) and fit
# them to the training data only; the test data is not seen during fitting.

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [None]:
# Apply the fitted scalers to both splits: X_scaler for the inputs and
# y_scaler for the targets (training part first, then testing part).

X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))
y_train_scaled, y_test_scaled = (y_scaler.transform(part) for part in (y_train, y_test))

## Model Creation, Training, and Testing

In [None]:
# Create an MLP regression model (price is a continuous target).
# MLPRegressor comes from sklearn.neural_network and is imported in the setup
# cell alongside the other dependencies.
model = MLPRegressor(
    hidden_layer_sizes=(100, 100),  # two hidden layers of 100 units each
    random_state=42,                # fixed seed so runs are repeatable
    max_iter=1000,                  # upper bound on training iterations
)

In [None]:
# Tune the model's alpha hyperparameter with a cross-validated grid search.
alpha_grid = {"alpha": 10.0 ** -np.arange(1, 7)}  # 1e-1 down to 1e-6
grid = GridSearchCV(model, alpha_grid, verbose=2)

# The scaled target is a single column; flatten it to 1-D for fitting.
grid.fit(X_train_scaled, y_train_scaled.ravel())

In [None]:
# Show the best hyperparameters found by the search, then the best score,
# each on its own line.
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Make predictions using the fitted (grid-searched) MLPRegressor model.
# Residual plot: the difference between predicted and actual y values on the
# y-axis, versus the predicted values on the x-axis, all in scaled units.

# Predict once per split and reuse the result instead of calling predict() repeatedly.
train_predictions = grid.predict(X_train_scaled)
test_predictions = grid.predict(X_test_scaled)

fig, ax = plt.subplots()
ax.scatter(train_predictions, train_predictions - np.ravel(y_train_scaled), c="blue", label="Training Data")
ax.scatter(test_predictions, test_predictions - np.ravel(y_test_scaled), c="orange", label="Testing Data")

# Zero-residual reference line. The x-axis shows predictions, so the line spans
# the range of the plotted predictions rather than the range of the actual values.
x_min = min(train_predictions.min(), test_predictions.min())
x_max = max(train_predictions.max(), test_predictions.max())
ax.hlines(y=0, xmin=x_min, xmax=x_max)

ax.set_xlabel("Predicted price (scaled)")
ax.set_ylabel("Residual: predicted - actual (scaled)")
ax.set_title("MLP Residual Plot")
ax.legend()
plt.show()

In [None]:
# Evaluate on the held-out test split, in scaled units: MSE is computed from
# the predictions, and R2 comes from the fitted search object's score method.

grid_predictions = grid.predict(X_test_scaled)
grid_r2 = grid.score(X_test_scaled, y_test_scaled)
grid_MSE = mean_squared_error(y_true=y_test_scaled, y_pred=grid_predictions)

print(f"MSE: {grid_MSE}, R2: {grid_r2}")