In [1]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [2]:
import pandas as pd
import IPython.display
import matplotlib.pyplot as plt
import csv
import re
import numpy as np
from datetime import datetime
import statsmodels.api as sm

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
# File path for cities: San Francisco(sf), Okaland(ok), Santa Clara(sc), 
# Santa Cruze(scz)

file_url = '/Users/xzhou/github/project_archives/files_airbnb/{}.csv'

sf_file = file_url.format('sf_airbnb/listings_details')
ok_file = file_url.format('ok_airbnb/listings_details')
sc_file = file_url.format('sc_airbnb/listings_details')
scz_file = file_url.format('scz_airbnb/listings_details')

In [4]:
# Loaded files as DataFrame

date_to_parse = ['last_scraped','host_since', 'first_review', 'last_review']

listings_sf = pd.read_csv(sf_file, parse_dates=date_to_parse)
listings_ok = pd.read_csv(ok_file, parse_dates=date_to_parse)
listings_sc = pd.read_csv(sc_file, parse_dates=date_to_parse)
listings_scz = pd.read_csv(scz_file, parse_dates=date_to_parse)

print('sf file dimensions: ', listings_sf.shape) 
print('ok file dimensions: ', listings_ok.shape) 
print('sc file dimensions: ', listings_sc.shape) 
print('scz file dimensions: ', listings_scz.shape) 

sf file dimensions:  (6633, 96)
ok file dimensions:  (2898, 96)
sc file dimensions:  (5668, 96)
scz file dimensions:  (1570, 96)


In [5]:
# Concatenate four files into one
# Created keys for each file for future references

frames = [listings_sf, listings_ok, listings_sc, listings_scz]
keys = ['sf', 'ok', 'sc', 'scz']

df_listings = pd.concat(frames, keys=keys)
df_listings.reset_index()

df_listings.head()

Unnamed: 0,Unnamed: 1,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
sf,0,958,https://www.airbnb.com/rooms/958,20180800000000.0,2018-08-06,"Bright, Modern Garden Unit - 1BR/1B",Our bright garden unit overlooks a grassy back...,"Newly remodeled, modern, and bright garden uni...",Our bright garden unit overlooks a grassy back...,none,*Quiet cul de sac in friendly neighborhood *St...,...,t,STR-0001256,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38
sf,1,5858,https://www.airbnb.com/rooms/5858,20180800000000.0,2018-08-06,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,none,I love how our neighborhood feels quiet but is...,...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99
sf,2,7918,https://www.airbnb.com/rooms/7918,20180800000000.0,2018-08-06,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,Room rental-sunny view room/sink/Wi Fi (inner ...,Nice and good public transportation. 7 minute...,none,"Shopping old town, restaurants, McDonald, Whol...",...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16
sf,3,8142,https://www.airbnb.com/rooms/8142,20180800000000.0,2018-08-06,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,Room rental Sunny view Rm/Wi-Fi/TV/sink/large ...,Nice and good public transportation. 7 minute...,none,,...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15
sf,4,8339,https://www.airbnb.com/rooms/8339,20180800000000.0,2018-08-06,Historic Alamo Square Victorian,Pls email before booking. Interior featured i...,Please send us a quick message before booking ...,Pls email before booking. Interior featured i...,none,,...,t,STR-0000264,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24


In [6]:
# Added additional column to keep track of original district

df_listings['district'] = ''

df_listings.loc['sf']['district'] = 'San Francisco'
df_listings.loc['ok']['district'] = 'Oakland'
df_listings.loc['sc']['district'] = 'Santa Clara'
df_listings.loc['scz']['district'] = 'Santa Cruz'

In [7]:
df_district =  df_listings.loc[:, ['id', 'district']]
df_district.head()

Unnamed: 0,Unnamed: 1,id,district
sf,0,958,San Francisco
sf,1,5858,San Francisco
sf,2,7918,San Francisco
sf,3,8142,San Francisco
sf,4,8339,San Francisco


In [8]:
# Check for Null vales within data

display(df_listings.columns)
display(df_listings.isnull().sum())

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

id                                      0
listing_url                             0
scrape_id                               0
last_scraped                            0
name                                    2
summary                               468
space                                3489
description                           124
experiences_offered                     0
neighborhood_overview                5398
notes                                7321
transit                              5550
access                               5261
interaction                          5880
house_rules                          4658
thumbnail_url                       16769
medium_url                          16769
picture_url                             0
xl_picture_url                      16769
host_id                                 0
host_url                                0
host_name                               0
host_since                              0
host_location                     

In [9]:
# Columns removed/dropped:
# Removed columns with >90% null values: 'experience_offered', 'host_acceptance_rate', 
#'neighbourhood_group_cleansed', 'square_feet'
# Removed 'jurisdiction_names', 'neighbourhood', kept'neighbourhood_cleansed'
# Noted'city' and 'smart location' provide duplicate information, removed 'smart location'
# Removed 'Country code' and 'country', as all instances locate within US


columns_to_keep =[
       'id','host_since','host_response_time', 'host_response_rate',
        'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 
       'host_has_profile_pic', 'host_identity_verified', 
        'neighbourhood_cleansed',
       'city', 
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type','amenities', 
       'price', 'security_deposit',
       'cleaning_fee', 'guests_included',  'minimum_nights','number_of_reviews',
       'first_review', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'jurisdiction_names', 'instant_bookable',
       'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month', 'district']


df_smaller_listings = df_listings[columns_to_keep]
display ('Dimensions after clean up: ' ,df_smaller_listings.shape)
df_smaller_listings.head()

'Dimensions after clean up: '

(16769, 43)

Unnamed: 0,Unnamed: 1,id,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,...,review_scores_value,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,district
sf,0,958,2008-07-31,within an hour,92%,t,1,1,t,t,Western Addition,...,10.0,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38,San Francisco
sf,1,5858,2009-03-02,within an hour,100%,f,2,2,t,t,Bernal Heights,...,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99,San Francisco
sf,2,7918,2009-06-17,within a few hours,100%,f,10,10,t,t,Haight Ashbury,...,8.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16,San Francisco
sf,3,8142,2009-06-17,within a few hours,100%,f,10,10,t,t,Haight Ashbury,...,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15,San Francisco
sf,4,8339,2009-07-02,within an hour,100%,f,2,2,t,t,Western Addition,...,10.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24,San Francisco


In [10]:
# Noted columns with percentage information are formated as strings, 
# and need to convert to float

def percent2float(string):
    """
    Convert strings with percentage sign to float
    """
    percentage = float(str(string).strip('%'))/100.0
    return percentage

In [11]:
# Converted host_response_rate to float

df_smaller_listings['host_response_rate'] = df_smaller_listings['host_response_rate']  \
                                            .apply(lambda x: percent2float(x))

In [12]:
# Noted columns with currency information are formated as strings, 
# and need to convert to float

def currency2float(string):
    """
    Convert currency to float
    """
    string = str(string).strip(' ').replace(' \\n', '')
    float_ = float(string.strip('$').replace(',',''))
    return float_

In [13]:
# Converted currency features to float

df_smaller_listings['price'] = df_smaller_listings['price'].apply(lambda x: currency2float(x))
df_smaller_listings['security_deposit'] = df_smaller_listings['security_deposit'].apply(lambda x: currency2float(x))
df_smaller_listings['cleaning_fee'] = df_smaller_listings['cleaning_fee'].apply(lambda x: currency2float(x))

In [14]:
df_smaller_listings.price.describe(percentiles=[.05, .10, .25, .50, .75, .90, .95])

count    16769.000000
mean       188.023555
std        274.564274
min          0.000000
5%          41.000000
10%         52.000000
25%         79.000000
50%        125.000000
75%        201.000000
90%        350.000000
95%        499.000000
max      10000.000000
Name: price, dtype: float64

In [15]:
# Removed outliers. Kept values between 5% and 95% percentiles

five_percentitle = np.percentile(df_smaller_listings['price'], 5)
ninety_five_percentitle = np.percentile(df_smaller_listings['price'], 95)

df_smaller_listings = df_smaller_listings[
                     (df_smaller_listings['price']<=ninety_five_percentitle) 
                      & (df_smaller_listings['price']>=five_percentitle)]

display(df_smaller_listings.shape)

(15096, 43)

In [16]:
# Removed special characters within Amenities

df_smaller_listings['jurisdiction_names'] = df_smaller_listings['jurisdiction_names'].str.replace('[^\w\s,]', '')
df_smaller_listings['amenities'] = df_smaller_listings['amenities'].str.replace('[^\w\s,]', '')

In [20]:
# Converted list of amenities to a dataframe

display ('Original list of amenities: ', df_smaller_listings.amenities.head())

df_amenities = df_smaller_listings  \
               .amenities  \
               .str.get_dummies(sep = ",")  \
               .add_prefix('amenities_')

print('df_ammenities dimension: ', df_amenities.shape)
display(df_amenities.head())

'Original list of amenities: '

sf  0    TV,Cable TV,Internet,Wifi,Kitchen,Pets live on...
    1    Internet,Wifi,Kitchen,Heating,Familykid friend...
    2    TV,Internet,Wifi,Kitchen,Free street parking,H...
    3    TV,Internet,Wifi,Kitchen,Free street parking,H...
    5    TV,Cable TV,Internet,Wifi,Kitchen,Free parking...
Name: amenities, dtype: object

df_ammenities dimension:  (15096, 184)


Unnamed: 0,Unnamed: 1,amenities_ toilet,amenities_24hour checkin,amenities_Accessibleheight bed,amenities_Accessibleheight toilet,amenities_Air conditioning,amenities_Air purifier,amenities_Amazon Echo,amenities_BBQ grill,amenities_Baby bath,amenities_Baby monitor,...,amenities_Wide clearance to bed,amenities_Wide clearance to shower,amenities_Wide doorway,amenities_Wide entryway,amenities_Wide hallway clearance,amenities_Wifi,amenities_Window guards,amenities_Wine cooler,amenities_translation missing enhosting_amenity_49,amenities_translation missing enhosting_amenity_50
sf,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
sf,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
sf,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
sf,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
sf,5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:
# There are 100+ types of amenities. 
# This is to filter down to common owned amenities.

common_amenities = []

for column in df_amenities.columns:
    value = df_amenities[column].quantile(0.75)
    if value > 0:
        common_amenities.append(column)

df_common_amenities = df_amenities[common_amenities]

display (common_amenities)
display (df_common_amenities.head())

['amenities_Air conditioning',
 'amenities_Bed linens',
 'amenities_Cable TV',
 'amenities_Carbon monoxide detector',
 'amenities_Coffee maker',
 'amenities_Cooking basics',
 'amenities_Dishes and silverware',
 'amenities_Dryer',
 'amenities_Essentials',
 'amenities_Extra pillows and blankets',
 'amenities_Familykid friendly',
 'amenities_Fire extinguisher',
 'amenities_First aid kit',
 'amenities_Free parking on premises',
 'amenities_Free street parking',
 'amenities_Hair dryer',
 'amenities_Hangers',
 'amenities_Heating',
 'amenities_Hot water',
 'amenities_Internet',
 'amenities_Iron',
 'amenities_Kitchen',
 'amenities_Laptop friendly workspace',
 'amenities_Lock on bedroom door',
 'amenities_Long term stays allowed',
 'amenities_Luggage dropoff allowed',
 'amenities_Microwave',
 'amenities_Oven',
 'amenities_Private entrance',
 'amenities_Refrigerator',
 'amenities_Self checkin',
 'amenities_Shampoo',
 'amenities_Smoke detector',
 'amenities_Stove',
 'amenities_TV',
 'amenities_Wa

Unnamed: 0,Unnamed: 1,amenities_Air conditioning,amenities_Bed linens,amenities_Cable TV,amenities_Carbon monoxide detector,amenities_Coffee maker,amenities_Cooking basics,amenities_Dishes and silverware,amenities_Dryer,amenities_Essentials,amenities_Extra pillows and blankets,...,amenities_Oven,amenities_Private entrance,amenities_Refrigerator,amenities_Self checkin,amenities_Shampoo,amenities_Smoke detector,amenities_Stove,amenities_TV,amenities_Washer,amenities_Wifi
sf,0,0,0,1,1,0,0,0,1,1,0,...,0,1,0,1,1,1,0,1,1,1
sf,1,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,1,1,0,0,1,1
sf,2,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,1
sf,3,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,1
sf,5,0,0,1,1,1,1,1,1,1,0,...,1,0,1,0,1,1,1,1,1,1


In [22]:
# Added common amenities dataframe to original dataframe
# Drop original amenties columns, which is a list of amenities

df_listings_refined = pd.concat([df_smaller_listings, df_common_amenities], axis=1)

df_listings_refined.drop('amenities', axis=1, inplace=True)
df_listings_refined.columns

Index(['id', 'host_since', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'host_listings_count', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'is_location_exact', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'minimum_nights', 'number_of_reviews',
       'first_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'jurisdiction_names', 'instant_bookable',
       'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month', 'district',
       'amenities_Air conditioning', 

In [23]:
# This is to convert timestamp features to duration
# Duration calcuation: number of years between original timestamp and current year

def duration(starting_time):
    """
    Calculate duration or number of years between current year and starting_time
    """
    starting_year = starting_time.year
    current_year = datetime.now().year
    duration = current_year - starting_year + 1
    
    return duration

In [24]:
# Converted timestamp features to duration

df_listings_refined['host_yrs'] = df_listings_refined['host_since']  \
                                  .apply(lambda x: duration(x))
df_listings_refined['yrs_since_first_review'] = df_listings_refined['first_review']  \
                                                .apply(lambda x: duration(x))

# Drop original timestamp features

time_stamps =['host_since', 'first_review']
df_listings_refined.drop(time_stamps, axis=1, inplace=True)

In [25]:
# Check for null values

df_listings_refined.isnull().sum()

id                                         0
host_response_time                      1763
host_response_rate                      1763
host_is_superhost                          0
host_listings_count                        0
host_total_listings_count                  0
host_has_profile_pic                       0
host_identity_verified                     0
neighbourhood_cleansed                     0
city                                      14
is_location_exact                          0
property_type                              0
room_type                                  0
accommodates                               0
bathrooms                                 29
bedrooms                                   4
beds                                      14
bed_type                                   0
price                                      0
security_deposit                        3669
cleaning_fee                            1991
guests_included                            0
minimum_ni

In [26]:
# Filled null values with average for security deposits and cleaning fee

df_listings_refined['security_deposit'].fillna(
                    value=df_listings_refined['security_deposit'].mean(),
                    inplace=True)

df_listings_refined['cleaning_fee'].fillna(
                    value=df_listings_refined['cleaning_fee'].mean(),
                    inplace=True)

In [27]:
# Noted 'security deposit', 'minimum nights' can vary considerably among postings 
# Converted them to boolean variables

df_listings_refined['security_deposit'] = (df_listings_refined['security_deposit']>0) \
                                          .astype(int)

df_listings_refined['minimum_nights'] = (df_listings_refined['minimum_nights']>1)  \
                                        .astype(int)

sf  0    0
    1    1
    2    1
    3    1
    5    1
Name: minimum_nights, dtype: int64

In [28]:
df_listings_refined.isnull().sum()

id                                         0
host_response_time                      1763
host_response_rate                      1763
host_is_superhost                          0
host_listings_count                        0
host_total_listings_count                  0
host_has_profile_pic                       0
host_identity_verified                     0
neighbourhood_cleansed                     0
city                                      14
is_location_exact                          0
property_type                              0
room_type                                  0
accommodates                               0
bathrooms                                 29
bedrooms                                   4
beds                                      14
bed_type                                   0
price                                      0
security_deposit                           0
cleaning_fee                               0
guests_included                            0
minimum_ni

In [29]:
# Dropped remaining null values

df_listings_cleaned = df_listings_refined.dropna()

display('Dimensions after drop null values: ', df_listings_cleaned.shape)
df_listings_cleaned.isnull().sum()

'Dimensions after drop null values: '

(10106, 79)

id                                      0
host_response_time                      0
host_response_rate                      0
host_is_superhost                       0
host_listings_count                     0
host_total_listings_count               0
host_has_profile_pic                    0
host_identity_verified                  0
neighbourhood_cleansed                  0
city                                    0
is_location_exact                       0
property_type                           0
room_type                               0
accommodates                            0
bathrooms                               0
bedrooms                                0
beds                                    0
bed_type                                0
price                                   0
security_deposit                        0
cleaning_fee                            0
guests_included                         0
minimum_nights                          0
number_of_reviews                 

In [30]:
df_listings_cleaned.columns

Index(['id', 'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'is_location_exact', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'minimum_nights', 'number_of_reviews',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'jurisdiction_names', 'instant_bookable',
       'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month', 'district',
       'amenities_Air conditioning', 'amenities_Bed linens',
      

In [31]:
df_jurisdiction = df_listings_cleaned.loc[:, ['id', 'jurisdiction_names']]

In [32]:
# Selected all non-numeric variables and converted to dummy varialbes

categorical_variables = df_listings_cleaned.select_dtypes(include=['object']).columns
df_model = pd.get_dummies(data=df_listings_cleaned,columns=categorical_variables)

display(df_listings_cleaned[categorical_variables].head())
print('Dimension after converted categorical features to dummy values: ', df_model.shape)

Unnamed: 0,Unnamed: 1,host_response_time,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,city,is_location_exact,property_type,room_type,bed_type,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,district
sf,0,within an hour,t,t,t,Western Addition,San Francisco,t,Apartment,Entire home/apt,Real Bed,SAN FRANCISCO,f,f,moderate,f,f,San Francisco
sf,1,within an hour,f,t,t,Bernal Heights,San Francisco,t,Apartment,Entire home/apt,Real Bed,SAN FRANCISCO,f,f,strict_14_with_grace_period,f,f,San Francisco
sf,2,within a few hours,f,t,t,Haight Ashbury,San Francisco,t,Apartment,Private room,Real Bed,SAN FRANCISCO,f,f,strict_14_with_grace_period,f,f,San Francisco
sf,3,within a few hours,f,t,t,Haight Ashbury,San Francisco,t,Apartment,Private room,Real Bed,SAN FRANCISCO,f,f,strict_14_with_grace_period,f,f,San Francisco
sf,5,within a day,f,t,f,Western Addition,San Francisco,t,Apartment,Entire home/apt,Real Bed,SAN FRANCISCO,f,f,moderate,f,f,San Francisco


Dimension after converted categorical features to dummy values:  (10106, 349)


In [34]:
# Identified features with high correlations, and dropped accordingly

# Created correlation matrix
corr_matrix = df_model.corr().abs()

# Selected upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Identified index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

display (to_drop)

# Dropped features with high corrleation 
df_model.drop(columns = to_drop, inplace = True)
display('Dimension after drop high correlated fetures', df_model.shape)

['host_total_listings_count',
 'amenities_Refrigerator',
 'amenities_Stove',
 'amenities_Washer',
 'host_is_superhost_t',
 'host_has_profile_pic_t',
 'host_identity_verified_t',
 'city_Cupertino',
 'city_Palo Alto',
 'city_San Jose',
 'city_Santa Clara',
 'city_Sunnyvale',
 'is_location_exact_t',
 'room_type_Private room',
 'jurisdiction_names_Cupertino, CA',
 'jurisdiction_names_OAKLAND',
 'jurisdiction_names_PALO ALTO',
 'jurisdiction_names_SAN FRANCISCO',
 'jurisdiction_names_SAN JOSE',
 'jurisdiction_names_SANTA CLARA',
 'jurisdiction_names_Sunnyvale, CA',
 'instant_bookable_t',
 'require_guest_profile_picture_t',
 'require_guest_phone_verification_t',
 'district_Oakland',
 'district_San Francisco',
 'district_Santa Cruz']

'Dimension after drop high correlated fetures'

(10106, 322)

In [47]:
# Stored the complete set of features

pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'

pkl_file_name = pkl_file.format('all_listings')

df_model.to_pickle(pkl_file_name)

In [35]:
# Performed quick regression models to get a big picture of performance, 
# and prepare for feature slections

X = df_model.drop(['id','price'], axis=1)
y = df_model['price']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state=42)

In [37]:
# Normalize features 

ss = StandardScaler()
X_train_trans = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [38]:
model = sm.OLS(y_train, sm.add_constant(X_train_trans))
fit = model.fit()
fit.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.686
Model:,OLS,Adj. R-squared:,0.672
Method:,Least Squares,F-statistic:,49.28
Date:,"Mon, 11 Feb 2019",Prob (F-statistic):,0.0
Time:,19:31:13,Log-Likelihood:,-38183.0
No. Observations:,7074,AIC:,76970.0
Df Residuals:,6773,BIC:,79030.0
Df Model:,300,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,153.4071,0.649,236.214,0.000,152.134,154.680
x1,0.4875,1.105,0.441,0.659,-1.678,2.653
x2,4.3053,0.906,4.754,0.000,2.530,6.081
x3,26.7757,1.570,17.055,0.000,23.698,29.853
x4,4.2322,0.818,5.177,0.000,2.630,5.835
x5,21.6507,1.234,17.552,0.000,19.233,24.069
x6,-4.1823,1.344,-3.111,0.002,-6.817,-1.547
x7,-0.8905,0.706,-1.262,0.207,-2.274,0.493
x8,11.5215,0.980,11.756,0.000,9.600,13.443

0,1,2,3
Omnibus:,1139.428,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3918.299
Skew:,0.8,Prob(JB):,0.0
Kurtosis:,6.276,Cond. No.,5.8e+16


In [39]:
rf = RandomForestRegressor(n_estimators=500, 
                               criterion='mse', 
                               random_state=3, 
                               n_jobs=-1)
rf.fit(X_train_trans, y_train)
y_train_pred = rf.predict(X_train_trans)
y_test_pred = rf.predict(X_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 377.840, test: 2737.552
R^2 train: 0.958, test: 0.706


In [40]:
# Sorted feature by importance

feature_labels = X.columns
feature_tuples = zip (feature_labels, rf.feature_importances_)
feature_lists = [list(feature_tuple) for feature_tuple in feature_tuples]

feature_lists.sort(key=lambda x: abs(x[1]), reverse=True)

display (feature_lists)

[['bedrooms', 0.42247850647698537],
 ['cleaning_fee', 0.08812988385946914],
 ['room_type_Entire home/apt', 0.05263985152197117],
 ['reviews_per_month', 0.02889764332299181],
 ['city_Oakland', 0.02737539776177687],
 ['accommodates', 0.02587345350142158],
 ['bathrooms', 0.023148057530810753],
 ['city_San Francisco', 0.02065578096909538],
 ['number_of_reviews', 0.02012091977299176],
 ['host_listings_count', 0.017862027627042667],
 ['review_scores_rating', 0.01594824266587264],
 ['host_yrs', 0.013714675114382444],
 ['guests_included', 0.01189451909889466],
 ['calculated_host_listings_count', 0.009588434434626496],
 ['review_scores_location', 0.009531015918441105],
 ['yrs_since_first_review', 0.008940946595058437],
 ['beds', 0.00686669893459572],
 ['host_response_rate', 0.005042234948329899],
 ['amenities_Shampoo', 0.004503452149361616],
 ['review_scores_value', 0.004442191043530934],
 ['neighbourhood_cleansed_Chinatown', 0.004412725849683528],
 ['jurisdiction_names_Santa Cruz County, CA', 

In [42]:
# Kept features that are above 0.001 thresold

features_selected = [feature for feature, importance in feature_lists
                    if importance >= 0.001]

nums_features_selected = len(features_selected)

print ('Number of features kept: ', nums_features_selected)
display(feature_lists[0:nums_features_selected])

Number of features kept:  88


[['bedrooms', 0.42247850647698537],
 ['cleaning_fee', 0.08812988385946914],
 ['room_type_Entire home/apt', 0.05263985152197117],
 ['reviews_per_month', 0.02889764332299181],
 ['city_Oakland', 0.02737539776177687],
 ['accommodates', 0.02587345350142158],
 ['bathrooms', 0.023148057530810753],
 ['city_San Francisco', 0.02065578096909538],
 ['number_of_reviews', 0.02012091977299176],
 ['host_listings_count', 0.017862027627042667],
 ['review_scores_rating', 0.01594824266587264],
 ['host_yrs', 0.013714675114382444],
 ['guests_included', 0.01189451909889466],
 ['calculated_host_listings_count', 0.009588434434626496],
 ['review_scores_location', 0.009531015918441105],
 ['yrs_since_first_review', 0.008940946595058437],
 ['beds', 0.00686669893459572],
 ['host_response_rate', 0.005042234948329899],
 ['amenities_Shampoo', 0.004503452149361616],
 ['review_scores_value', 0.004442191043530934],
 ['neighbourhood_cleansed_Chinatown', 0.004412725849683528],
 ['jurisdiction_names_Santa Cruz County, CA', 

In [43]:
# Perform another model(Random Forest) based on selected features

X_updated = df_model[features_selected]
y_updated = df_model['price']

In [44]:
X_updated_train, X_updated_test, y_updated_train, y_updated_test = train_test_split(
    X_updated, y_updated, test_size =0.3, random_state=42)

In [45]:
ssX2 = StandardScaler()
X_updated_train_trans = ssX2.fit_transform(X_updated_train)
X_updated_test_trans = ssX2.transform (X_updated_test)

In [46]:
rf_updated = RandomForestRegressor(n_estimators=500, 
                                   criterion='mse', 
                                   random_state=3, 
                                   n_jobs=-1)
rf_updated.fit(X_updated_train_trans, y_updated_train)
y_updated_train_pred = rf_updated.predict(X_updated_train_trans)
y_updated_test_pred = rf_updated.predict(X_updated_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_updated_train, y_updated_train_pred),
        mean_squared_error(y_updated_test, y_updated_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_updated_train, y_updated_train_pred),
        r2_score(y_updated_test, y_updated_test_pred)))

MSE train: 379.237, test: 2757.359
R^2 train: 0.958, test: 0.703


In [47]:
# Kept 'id', and all selected features

columns =['id'] + features_selected
df_listings_top_features = df_model[columns]
df_listings_top_features.head()

Unnamed: 0,Unnamed: 1,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_Oakland,accommodates,bathrooms,city_San Francisco,number_of_reviews,...,amenities_Essentials,review_scores_checkin,amenities_Microwave,neighbourhood_cleansed_Downtown/Civic Center,neighbourhood_cleansed_Unincorporated Areas,neighbourhood_cleansed_Noe Valley,amenities_Oven,amenities_Dishes and silverware,amenities_Hangers,neighbourhood_cleansed_Sunnyvale
sf,0,958,1.0,100.0,1,1.38,0,3,1.0,1,152,...,1,10.0,0,0,0,0,0,0,1,0
sf,1,5858,2.0,100.0,1,0.99,0,5,1.0,1,112,...,1,10.0,0,0,0,0,0,0,1,0
sf,2,7918,1.0,50.0,0,0.16,0,2,4.0,1,17,...,0,9.0,0,0,0,0,0,0,1,0
sf,3,8142,1.0,50.0,0,0.15,0,2,4.0,1,7,...,0,9.0,0,0,0,0,0,0,0,0
sf,5,8567,2.0,125.0,1,0.27,0,6,1.0,1,30,...,1,10.0,1,0,0,0,1,1,0,0


In [48]:
# Added District and jurisdiction informaiton for future use

df_listings_top_features = pd.merge(left=df_listings_top_features, 
                                    right=df_district, 
                                    how ='left', 
                                    left_on='id',
                                    right_on='id')


df_listings_top_features = pd.merge(left=df_listings_top_features, 
                                    right=df_jurisdiction, 
                                    how ='left', 
                                    left_on='id',
                                    right_on='id')


# Renamed columns used for furture reference to avoid confusion

df_listings_top_features.rename(columns={'district':'ref_district',
                                        'jurisdiction_names': 'ref_jurisdiction'},
                               inplace=True)
df_listings_top_features

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_Oakland,accommodates,bathrooms,city_San Francisco,number_of_reviews,...,amenities_Microwave,neighbourhood_cleansed_Downtown/Civic Center,neighbourhood_cleansed_Unincorporated Areas,neighbourhood_cleansed_Noe Valley,amenities_Oven,amenities_Dishes and silverware,amenities_Hangers,neighbourhood_cleansed_Sunnyvale,ref_district,ref_jurisdiction
0,958,1.0,100.000000,1,1.38,0,3,1.0,1,152,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO
1,5858,2.0,100.000000,1,0.99,0,5,1.0,1,112,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO
2,7918,1.0,50.000000,0,0.16,0,2,4.0,1,17,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO
3,8142,1.0,50.000000,0,0.15,0,2,4.0,1,7,...,0,0,0,0,0,0,0,0,San Francisco,SAN FRANCISCO
4,8567,2.0,125.000000,1,0.27,0,6,1.0,1,30,...,1,0,0,0,1,1,0,0,San Francisco,SAN FRANCISCO
5,8739,1.0,50.000000,0,5.41,0,3,1.0,1,594,...,1,0,0,0,0,1,1,0,San Francisco,SAN FRANCISCO
6,9225,1.0,50.000000,0,3.84,0,2,1.0,1,411,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO
7,10251,2.0,100.000000,1,2.69,0,6,1.0,1,291,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO
8,10578,0.0,75.000000,1,0.22,0,2,1.0,1,18,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO
9,10819,3.0,208.000000,1,0.34,0,4,2.0,1,22,...,0,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO


In [49]:
# Checked for null values before data storage

df_listings_top_features.isnull().sum()

id                                              0
bedrooms                                        0
cleaning_fee                                    0
room_type_Entire home/apt                       0
reviews_per_month                               0
city_Oakland                                    0
accommodates                                    0
bathrooms                                       0
city_San Francisco                              0
number_of_reviews                               0
host_listings_count                             0
review_scores_rating                            0
host_yrs                                        0
guests_included                                 0
calculated_host_listings_count                  0
review_scores_location                          0
yrs_since_first_review                          0
beds                                            0
host_response_rate                              0
amenities_Shampoo                               0


In [50]:
# Stored data post feature selection

pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'

pkl_path = pkl_file.format('listings_post_feature_selection')

df_listings_top_features.to_pickle(pkl_path)