In [14]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import distutils as _distutils
import tensorflow as tf
import tensorflow.compat.v1 as tf
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [15]:
# Import and read the cleaned_state_market_tracker.csv.
import pandas as pd 
market_df = pd.read_csv("../Resources/cleaned_state_market_tracker.csv")
market_df.head()

Unnamed: 0.1,Unnamed: 0,table_id,state,state_code,property_type,property_type_id,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,avg_sale_to_list,sold_above_list,parent_metro_region,year,month
0,0,25,Hawaii,HI,Townhouse,13,776200,485000.0,484.0,398.0,16,10.0,10.0,76.0,4.8,1.002073,0.3125,West Region,2019,12
1,1,6,Nebraska,NE,Multi-Family (2-4 Unit),4,109800,138500.0,56.0,80.0,15,13.0,33.0,125.0,8.3,0.916258,0.133333,Midwest Region,2012,8
2,2,42,Virginia,VA,Condo/Co-op,3,275700,440600.0,264.0,258.0,1256,1140.0,1521.0,2712.0,2.2,0.994101,0.308121,South Region,2020,10
3,3,21,Georgia,GA,All Residential,-1,237100,254800.0,117.0,125.0,11366,9352.0,12986.0,50603.0,4.5,0.97647,0.178075,South Region,2019,11
4,4,10,New Hampshire,NH,Townhouse,13,200000,262300.0,135.0,148.0,173,141.0,105.0,442.0,2.6,0.981605,0.277457,Northeast Region,2017,11


In [16]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
market_df.drop(columns = ['table_id','property_type_id', 'state_code', 'median_ppsf', "median_list_ppsf", "sold_above_list", "year", "months_of_supply", "state", "parent_metro_region"], inplace=True)
market_df

Unnamed: 0.1,Unnamed: 0,property_type,median_sale_price,median_list_price,homes_sold,pending_sales,new_listings,inventory,avg_sale_to_list,month
0,0,Townhouse,776200,485000.0,16,10.0,10.0,76.0,1.002073,12
1,1,Multi-Family (2-4 Unit),109800,138500.0,15,13.0,33.0,125.0,0.916258,8
2,2,Condo/Co-op,275700,440600.0,1256,1140.0,1521.0,2712.0,0.994101,10
3,3,All Residential,237100,254800.0,11366,9352.0,12986.0,50603.0,0.976470,11
4,4,Townhouse,200000,262300.0,173,141.0,105.0,442.0,0.981605,11
...,...,...,...,...,...,...,...,...,...,...
35418,35418,Multi-Family (2-4 Unit),77200,137000.0,299,221.0,459.0,1829.0,0.926174,5
35419,35419,Single Family Residential,273000,296600.0,138,59.0,197.0,940.0,0.973945,2
35420,35420,Multi-Family (2-4 Unit),236300,324500.0,226,236.0,485.0,1095.0,0.995301,7
35421,35421,Townhouse,373000,460900.0,187,255.0,263.0,226.0,1.007853,1


In [17]:
null_market_df = market_df.dropna(how='any', axis= 0)
null_market_df

Unnamed: 0.1,Unnamed: 0,property_type,median_sale_price,median_list_price,homes_sold,pending_sales,new_listings,inventory,avg_sale_to_list,month
0,0,Townhouse,776200,485000.0,16,10.0,10.0,76.0,1.002073,12
1,1,Multi-Family (2-4 Unit),109800,138500.0,15,13.0,33.0,125.0,0.916258,8
2,2,Condo/Co-op,275700,440600.0,1256,1140.0,1521.0,2712.0,0.994101,10
3,3,All Residential,237100,254800.0,11366,9352.0,12986.0,50603.0,0.976470,11
4,4,Townhouse,200000,262300.0,173,141.0,105.0,442.0,0.981605,11
...,...,...,...,...,...,...,...,...,...,...
35418,35418,Multi-Family (2-4 Unit),77200,137000.0,299,221.0,459.0,1829.0,0.926174,5
35419,35419,Single Family Residential,273000,296600.0,138,59.0,197.0,940.0,0.973945,2
35420,35420,Multi-Family (2-4 Unit),236300,324500.0,226,236.0,485.0,1095.0,0.995301,7
35421,35421,Townhouse,373000,460900.0,187,255.0,263.0,226.0,1.007853,1


In [18]:
# Determine the number of unique values in each column.
category_values = null_market_df.nunique()
category_values

Unnamed: 0           34012
property_type            5
median_sale_price     6061
median_list_price     5553
homes_sold            8834
pending_sales         7719
new_listings          9472
inventory            14159
avg_sale_to_list     33875
month                   12
dtype: int64

In [19]:
# Split our preprocessed data into our features and target arrays
y = null_market_df["property_type"].astype('category').cat.codes
X = null_market_df.drop(["property_type"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=55)
y

0        4
1        2
2        1
3        0
4        4
        ..
35418    2
35419    3
35420    2
35421    4
35422    0
Length: 34012, dtype: int8

In [20]:
# Create a StandardScaler instances
scaler = MinMaxScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])

nn_model = tf.keras.models.Sequential()

hidden_nodes_layer1 = 4
hidden_nodes_layer2 = 4

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='sigmoid'))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='sigmoid'))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation= "sigmoid"))

# Check the structure of the model
nn_model.summary()

print(number_input_features)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


9


In [22]:
# Compile the model
nn_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [23]:
# Train the model
fit_model = nn_model.fit(X_train_scaled,y_train, epochs=50)

Epoch 1/50


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2056 - loss: 0.0000e+00
Epoch 2/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 932us/step - accuracy: 0.1962 - loss: 0.0000e+00
Epoch 3/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 817us/step - accuracy: 0.2027 - loss: 0.0000e+00
Epoch 4/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 894us/step - accuracy: 0.2029 - loss: 0.0000e+00
Epoch 5/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.2029 - loss: 0.0000e+00
Epoch 6/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 754us/step - accuracy: 0.1973 - loss: 0.0000e+00
Epoch 7/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 910us/step - accuracy: 0.1996 - loss: 0.0000e+00
Epoch 8/50
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.2004 - loss: 0.0000e+00
Epoch 9/5

In [24]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

266/266 - 1s - 3ms/step - accuracy: 0.2004 - loss: 0.0000e+00
Loss: 0.0, Accuracy: 0.20039986073970795
