In [24]:
import os
import time

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from pathlib import Path
import tensorflow as tf
import matplotlib.pyplot as plt

In [26]:
# Download raw data from Postgres for stage 1 ETL.
#
# The connection URL (user, password, host, database) is read from the
# DATABASE_URL environment variable so that credentials never live in the
# notebook or its version history. A missing variable raises KeyError here.
# NOTE(review): the password that used to be hard-coded in this cell has been
# exposed and should be rotated.
conn_string = os.environ["DATABASE_URL"]

# SQLAlchemy 1.4+ rejects the legacy "postgres://" scheme that some hosting
# providers still hand out; normalise it to "postgresql://".
if conn_string.startswith("postgres://"):
    conn_string = "postgresql://" + conn_string[len("postgres://"):]

db = create_engine(conn_string)

# The context manager releases the connection even if the query raises.
with db.connect() as conn:
    # Time only the query itself, not the connection set-up.
    start_time = time.time()
    merge1 = pd.read_sql_query('select * from "merged_errors_corrected"', con=conn)
    print("PostGres Download Duration: {} seconds".format(time.time() - start_time))

PostGres Download Duration: 1.0202064514160156 seconds


In [27]:
# Collect the names of every column of merge1 whose dtype is 'object'.
objects = [
    column_name
    for column_name, column_dtype in merge1.dtypes.items()
    if column_dtype == 'object'
]

In [28]:
# One-hot encode every object-dtype column of merge1 and replace the original
# columns with the encoded indicator columns.
# (OneHotEncoder is imported with the other sklearn.preprocessing names in the
# imports cell near the top of the notebook.)
objects = merge1.dtypes[merge1.dtypes == 'object'].index.tolist()

# Guard: once this cell has run, merge1 has no object columns left, so a
# re-run becomes a no-op instead of trying to fit on an empty selection.
if objects:
    # drop='if_binary' keeps a single indicator column for two-level features.
    enc = OneHotEncoder(drop='if_binary')

    # The encoder returns a SciPy sparse matrix by default. Densify it here
    # rather than through the constructor flag, whose name changed between
    # scikit-learn releases (`sparse` -> `sparse_output`).
    encoded_values = enc.fit_transform(merge1[objects]).toarray()

    # `get_feature_names` was superseded by `get_feature_names_out`; use the
    # newer method when the installed scikit-learn provides it.
    if hasattr(enc, "get_feature_names_out"):
        encoded_names = enc.get_feature_names_out(objects)
    else:
        encoded_names = enc.get_feature_names(objects)

    # Reuse merge1's own index so the index-based merge below pairs each
    # encoded row with the row it came from, whatever index merge1 carries.
    encode_df = pd.DataFrame(encoded_values, columns=encoded_names, index=merge1.index)

    # Merge one-hot encoded features and drop the originals
    merge1 = merge1.merge(encode_df, left_index=True, right_index=True)
    merge1 = merge1.drop(columns=objects)

In [29]:
# merge1.loc[(merge1['host_listings_count']<101),'host_listings_count'] = 0
# merge1.loc[(merge1['host_listings_count']>100),'host_listings_count'] = 1

In [30]:
# Split the preprocessed data into the target array (price) and the feature
# matrix (every other column).
y = merge1["price"].values
# Use the explicit `columns=` keyword: passing the axis positionally
# (`drop("price", 1)`) is rejected by pandas 2.0 and later.
X = merge1.drop(columns="price").values

# Split the preprocessed data into a training and testing dataset.
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Next: fit a random forest regressor (RFR) to evaluate feature importance

In [31]:
# Fit a random forest regressor on the training split; it is the model whose
# feature importances are measured later with rfpimp.
from sklearn.ensemble import RandomForestRegressor

# Labels of the feature columns (everything except the target), in the same
# order as the columns of the X array built from merge1.
X_columns = merge1.drop(columns='price').columns

# Random forest with 64 trees, depth capped at 32, and a fixed seed.
random_forest_regression_model = RandomForestRegressor(
    n_estimators=64,
    max_depth=32,
    random_state=78,
)
random_forest_regression_model.fit(X_train, y_train)

In [32]:
# Display the feature column labels (every column of merge1 except price).
X_columns

Index(['host_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
       'security_deposit', 'cleaning_fee', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy',
       ...
       'cancellation_policy_flexible', 'cancellation_policy_moderate',
       'cancellation_policy_strict',
       'cancellation_policy_strict_14_with_grace_period',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',
       'require_guest_profile_picture_t', 'require_guest_phone_verification_t',
       'has_availability_t', 'outlier'],
      dtype='object', length=265)

In [33]:
# Keep the full column index of merge1, target column included.
# NOTE(review): despite the name this is not just the target column, and no
# later use is visible in this part of the notebook — confirm it is needed.
y_columns = merge1.columns

In [34]:

# Measuring feature importance using permutation via rfpimp library
from sklearn.metrics import r2_score
from rfpimp import permutation_importances

# rfpimp is given labelled data here, so wrap the training arrays in
# DataFrames that carry the feature / target column names.
X_train_df = pd.DataFrame(X_train, columns=X_columns)
y_train_df = pd.DataFrame(y_train, columns=['price'])


def r2(random_forest_regression_model, X_train, y_train):
    """Return the R^2 score of the model on the data it is handed.

    This is the metric callback passed to ``permutation_importances``.

    Parameters
    ----------
    random_forest_regression_model : fitted estimator exposing ``predict``.
    X_train : feature frame to score, as supplied by the caller.
    y_train : target values matching ``X_train``.

    The score must be computed from the arguments. The earlier version
    ignored them and always scored the global ``X_train_df`` / ``y_train_df``,
    which only produces meaningful importances if the library happens to
    shuffle those global frames in place.
    """
    return r2_score(y_train, random_forest_regression_model.predict(X_train))


perm_importances_rfpimp = permutation_importances(random_forest_regression_model, X_train_df, y_train_df, r2)

In [35]:
# Raise the row limit to 300 and print floats with six decimals, then show
# the permutation-importance table.
pd.options.display.max_rows = 300
pd.options.display.float_format = '{:.6f}'.format
perm_importances_rfpimp

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
host_listings_count,0.59611
bathrooms,0.294561
cleaning_fee,0.141705
accommodates,0.085826
reviews_per_month,0.072002
bedrooms,0.069341
neighbourhood_cleansed_District 19,0.040496
availability_365,0.028085
guests_included,0.026417
days_host,0.023961


In [36]:
# Flag features whose permutation importance is below 0.001, then collect
# their labels (the index of the importance table) for removal.
low_importance = perm_importances_rfpimp['Importance'] < 0.001
cols_to_drop = perm_importances_rfpimp.loc[low_importance].index

In [37]:
# Build the reduced modelling frame: merge1 without the columns listed in
# cols_to_drop. merge1 itself is left unchanged.
merge2 = merge1.drop(columns=cols_to_drop)

In [38]:
# Display the columns that remain after the low-importance ones were dropped.
merge2.columns

Index(['host_listings_count', 'accommodates', 'bathrooms', 'bedrooms', 'price',
       'security_deposit', 'cleaning_fee', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'guests_included', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'reviews_per_month', 'days_host', 'internet',
       'free_parking_on_premises', 'heating', 'family/kid_friendly',
       'smoke_detector', 'carbon_monoxide_detector', 'first_aid_kit',
       'safety_card', 'fire_extinguisher', 'essentials', 'shampoo',
       'lock_on_bedroom_door', 'hair_dryer', 'iron', 'bathtub', 'hot_water',
       'bed_linens', 'extra_pillows_and_blankets', 'long_term_stays_allowed',
       'cable_tv', 'free_street_parking', 'indoor_fireplace',
       'paid_parking_off_premises', 'elevator', 'self_check-in', 'smart_lock',
       'private_entrance', 'keypad', 'bbq_grill', 'suitable_for_events',
       'pool', 'gym', 'Kitchen_Grou

In [39]:
# Split the reduced data (merge2) into the target array (price) and the
# feature matrix (every other column).
y = merge2["price"].values
# Use the explicit `columns=` keyword: passing the axis positionally
# (`drop("price", 1)`) is rejected by pandas 2.0 and later.
X = merge2.drop(columns="price").values

# Split the preprocessed data into a training and testing dataset.
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [40]:
from sklearn.preprocessing import RobustScaler

# Fit the RobustScaler on the training features only, so nothing about the
# test split influences the scaling.
scaler = RobustScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [41]:
# Define the model - a deep neural net with two ReLU hidden layers and a
# single linear output unit for the predicted price.

# Seed TensorFlow's global random generator so that weight initialisation is
# repeatable between runs (same value as the random_state used for the splits).
tf.random.set_seed(78)

# Number of input features = number of columns in the scaled training matrix.
number_in_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,kernel_initializer='normal', input_dim=number_in_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2,kernel_initializer='normal', activation="relu"))

# Output layer: one unit with linear activation, as this is a regression.
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 80)                5760      
_________________________________________________________________
dense_4 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 31        
Total params: 8,221
Trainable params: 8,221
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Compile the model: Adam optimiser minimising mean absolute error (MAE).
# The same MAE is also registered as the only metric, so the loss and the
# metric report the same number.
nn.compile(loss="mean_absolute_error", optimizer="adam", metrics=['mean_absolute_error'])

In [43]:
# Train the model on the scaled training features for 100 epochs.
# The value returned by fit() is kept in fit_model.
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [44]:
# Evaluate the model using the test data
# evaluate() is unpacked into the loss and the single compiled metric. Both
# are mean absolute error here, so `model_accuracy` actually holds an MAE
# (not a classification accuracy), which is how it is labelled in the print.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, MAE: {model_accuracy}")

44/44 - 0s - loss: 58.6332 - mean_absolute_error: 58.6332
Loss: 58.6331787109375, MAE: 58.6331787109375
