In [20]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
import time

In [21]:
# Download raw data from postgres for stage 1 ETL

conn_string = 'postgres://whnpmxwsiccrtg:53c453893549d2b1e6a4ff92e626a2a08ebcaff66678e50d33e3742f66e3e4f4@ec2-52-4-171-132.compute-1.amazonaws.com/d2ajro4cjr10lb'

db = create_engine(conn_string)
conn = db.connect()

start_time = time.time()
clean_listing = pd.read_sql_query('select * from "clean_listing_remove_somereviews"',con=conn)
amenities = pd.read_sql_query('select * from "amenities_bucketed"',con=conn)
print("PostGres Download Duration: {} seconds".format(time.time() - start_time))

PostGres Download Duration: 1.0560097694396973 seconds


In [22]:
# Drop duplicated variables.
listing = clean_listing.drop(columns = ['last_scraped', 'host_since', 'latitude', 'longitude'])

In [23]:
# Merge amenities table to full listings.
merged = listing.merge(amenities, how='left', on ='id')

In [24]:
# Merge tables and drop 'id' (unique identifier - not relevant)
merged = merged.drop(columns = 'id')

In [25]:
objects = merged.dtypes[merged.dtypes == 'object'].index.tolist()

In [26]:
# Create a OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False, drop='if_binary')

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(merged[objects]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(objects)

In [27]:
# Merge one-hot encoded features and drop the originals
merged = merged.merge(encode_df,left_index=True, right_index=True)
merged = merged.drop(columns=objects)

In [28]:
# Set erroneous 30 bedroom listings for apartments to 1
merged.loc[merged['bedrooms'] > 29, 'bedrooms'] = 1

In [29]:
# Convert zero bedrooms with more than 4 accommodates to 2 bedrooms
merged.loc[(merged['bedrooms'] == 0) & (merged['accommodates'] > 4), 'bedrooms'] = 2

In [30]:
# Convert zero bedrooms with more than 4 accommodates to 1 bedroom
merged.loc[(merged['bedrooms'] == 0) & (merged['accommodates'] < 5), 'bedrooms'] = 1

In [31]:
from scipy import stats
merged['accommodates_logs'] = np.log(merged['accommodates'])
merged = merged[(np.abs(stats.zscore(merged['accommodates_logs'])) < 2)]
merged.shape

(5899, 264)

In [32]:
merged.loc[merged.bathrooms == 0, 'bathrooms'] = .001
merged['baths_logs'] = np.log(merged['bathrooms'])
merged = merged[(np.abs(stats.zscore(merged['baths_logs'])) < 2)]
merged.shape

(5831, 265)

In [33]:
merged.drop(columns=['baths_logs', 'accommodates_logs'], inplace = True)

In [34]:
merged.shape

(5831, 263)

In [38]:
merged['review_scores_rating'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: review_scores_rating, dtype: float64

In [37]:
# For loop to delete any rows with outliers in any row (3 SD) 
from scipy import stats
log_column_list = []

for column in merged.columns:
    log_col_name = column + "_logs"
    # Ignore columns with max less than or equal to 1 (binary)
    if merged[column].max() > 1:
        # natural log transform (+1 to handle 0 values)
        merged[log_col_name] = np.log(merged[column]+1)
        merged = merged[(np.abs(stats.zscore(merged[log_col_name])) < 3)]
        log_column_list.append(log_col_name)
        print(log_col_name)
        print(merged.shape)

merged.drop(columns=log_column_list, inplace=True)

host_listings_count_logs
(5831, 264)
accommodates_logs
(5831, 265)
bathrooms_logs
(5831, 266)
bedrooms_logs
(5820, 267)
price_logs
(5799, 268)
security_deposit_logs
(5799, 269)
cleaning_fee_logs
(5799, 270)
review_scores_rating_logs
(0, 271)


In [18]:
# Drop additional outliers using IsolationForest
from sklearn.ensemble import IsolationForest
X = merged.drop(columns=['price']).values
iso = IsolationForest(contamination='auto')
yhat = iso.fit_predict(X)
merged['outlier'] = yhat

ValueError: Found array with 0 sample(s) (shape=(0, 262)) while a minimum of 1 is required.

In [17]:
# Delete any rows with outliers in any row (3 SD) using calculated field log(price/accommodates)
merged = merged[(np.abs(stats.zscore(np.log(merged['price']/merged['accommodates']))) < 3)]

  mns = a.mean(axis=axis, keepdims=True)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)


In [19]:
X = merged.drop(columns=['price']).values

In [44]:
merged = merged[merged['outlier']!=-1]

In [45]:
# Upload Merged dataset with errors corrected to PostGres

start_time = time.time()
merged.to_sql('merged_errors_corrected', con=conn, if_exists='replace', index=False)
print("PostGres Upload Duration: {} seconds".format(time.time() - start_time))
conn.close ()

PostGres Upload Duration: 140.9793496131897 seconds
