In [18]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import csv
import re
import numpy as np
from datetime import datetime
import statsmodels.api as sm

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [20]:
# File path for cities: San Francisco(sf), Okaland(ok), Santa Clara(sc), 
# Santa Cruze(scz)

file_url = '/Users/xzhou/github/project_archives/files_airbnb/{}.csv'

sf_file = file_url.format('sf_airbnb/listings_details')
ok_file = file_url.format('ok_airbnb/listings_details')
sc_file = file_url.format('sc_airbnb/listings_details')
scz_file = file_url.format('scz_airbnb/listings_details')

In [21]:
# Load files as DataFrame

date_to_parse = ['last_scraped','host_since', 'first_review', 'last_review']

listings_sf = pd.read_csv(sf_file, parse_dates=date_to_parse)
listings_ok = pd.read_csv(ok_file, parse_dates=date_to_parse)
listings_sc = pd.read_csv(sc_file, parse_dates=date_to_parse)
listings_scz = pd.read_csv(scz_file, parse_dates=date_to_parse)

print('sf file dimensions: ', listings_sf.shape) 
print('ok file dimensions: ', listings_ok.shape) 
print('sc file dimensions: ', listings_sc.shape) 
print('scz file dimensions: ', listings_scz.shape) 

sf file dimensions:  (6633, 96)
ok file dimensions:  (2898, 96)
sc file dimensions:  (5668, 96)
scz file dimensions:  (1570, 96)


In [22]:
# Concatenate four files into one
# Created keys for each file for future references

frames = [listings_sf, listings_ok, listings_sc, listings_scz]
keys = ['sf', 'ok', 'sc', 'scz']
df_listings = pd.concat(frames, keys=keys)
df_listings.reset_index()

df_listings.head()

Unnamed: 0,Unnamed: 1,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
sf,0,958,https://www.airbnb.com/rooms/958,20180800000000.0,2018-08-06,"Bright, Modern Garden Unit - 1BR/1B",Our bright garden unit overlooks a grassy back...,"Newly remodeled, modern, and bright garden uni...",Our bright garden unit overlooks a grassy back...,none,*Quiet cul de sac in friendly neighborhood *St...,...,t,STR-0001256,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38
sf,1,5858,https://www.airbnb.com/rooms/5858,20180800000000.0,2018-08-06,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,none,I love how our neighborhood feels quiet but is...,...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99
sf,2,7918,https://www.airbnb.com/rooms/7918,20180800000000.0,2018-08-06,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,Room rental-sunny view room/sink/Wi Fi (inner ...,Nice and good public transportation. 7 minute...,none,"Shopping old town, restaurants, McDonald, Whol...",...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16
sf,3,8142,https://www.airbnb.com/rooms/8142,20180800000000.0,2018-08-06,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,Room rental Sunny view Rm/Wi-Fi/TV/sink/large ...,Nice and good public transportation. 7 minute...,none,,...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15
sf,4,8339,https://www.airbnb.com/rooms/8339,20180800000000.0,2018-08-06,Historic Alamo Square Victorian,Pls email before booking. Interior featured i...,Please send us a quick message before booking ...,Pls email before booking. Interior featured i...,none,,...,t,STR-0000264,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24


In [23]:
# Added additional column to keep track of original district

df_listings['district'] = ''

df_listings.loc['sf']['district'] = 'San Francisco'
df_listings.loc['ok']['district'] = 'Oakland'
df_listings.loc['sc']['district'] = 'Santa Clara'
df_listings.loc['scz']['district'] = 'Santa Cruz'

In [24]:
df_district =  df_listings.loc[:, ['id', 'district']]
df_district.head()

Unnamed: 0,Unnamed: 1,id,district
sf,0,958,San Francisco
sf,1,5858,San Francisco
sf,2,7918,San Francisco
sf,3,8142,San Francisco
sf,4,8339,San Francisco


In [25]:
# Check Null vales within data

df_listings.isnull().sum()

id                                      0
listing_url                             0
scrape_id                               0
last_scraped                            0
name                                    2
summary                               468
space                                3489
description                           124
experiences_offered                     0
neighborhood_overview                5398
notes                                7321
transit                              5550
access                               5261
interaction                          5880
house_rules                          4658
thumbnail_url                       16769
medium_url                          16769
picture_url                             0
xl_picture_url                      16769
host_id                                 0
host_url                                0
host_name                               0
host_since                              0
host_location                     

In [26]:
df_listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [27]:
# Columns removed/dropped:
# Noted >90% null values in 'experience_offered', 'host_acceptance_rate', 
#'neighbourhood_group_cleansed', 'square_feet'
# Removed 'jurisdiction_names', 'neighbourhood',  kept'neighbourhood_cleansed'
# Noted'city' and 'smart location' provide duplicate information, removed 'smart location'
# Dropped 'Country code' and 'country', as all instances locate within US


columns_to_keep =[
       'id','host_since','host_response_time', 'host_response_rate',
        'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 
       'host_has_profile_pic', 'host_identity_verified', 
        'neighbourhood_cleansed',
       'city', 
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type','amenities', 
       'price', 'security_deposit',
       'cleaning_fee', 'guests_included',  'minimum_nights','number_of_reviews',
       'first_review', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'jurisdiction_names', 'instant_bookable',
       'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month', 'district']


df_smaller_listings = df_listings[columns_to_keep]
df_smaller_listings.head()

Unnamed: 0,Unnamed: 1,id,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,...,review_scores_value,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,district
sf,0,958,2008-07-31,within an hour,92%,t,1,1,t,t,Western Addition,...,10.0,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38,San Francisco
sf,1,5858,2009-03-02,within an hour,100%,f,2,2,t,t,Bernal Heights,...,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99,San Francisco
sf,2,7918,2009-06-17,within a few hours,100%,f,10,10,t,t,Haight Ashbury,...,8.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16,San Francisco
sf,3,8142,2009-06-17,within a few hours,100%,f,10,10,t,t,Haight Ashbury,...,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15,San Francisco
sf,4,8339,2009-07-02,within an hour,100%,f,2,2,t,t,Western Addition,...,10.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24,San Francisco


In [28]:
# Noted columns with percentage information are formated as strings, 
# but need to convert to float

def percent2float(string):
    """
    Convert strings with percentage sign to float
    """
    percentage = float(str(string).strip('%'))/100.0
    return percentage

In [29]:
# Converted host_response_rate to float

df_smaller_listings['host_response_rate'] = df_smaller_listings['host_response_rate']  \
                                            .apply(lambda x: percent2float(x))

In [40]:
# Noted columns with currency information are formated as strings, 
# but need to convert to float

def currency2float(string):
    """
    Convert currency to float
    """
    string = str(string).strip(' ').replace(' \\n', '')
    float_ = float(string.strip('$').replace(',',''))
    return float_

In [42]:
# Converted currency features to float

df_smaller_listings['price'] = df_smaller_listings['price'].apply(lambda x: currency2float(x))
df_smaller_listings['security_deposit'] = df_smaller_listings['security_deposit'].apply(lambda x: currency2float(x))
df_smaller_listings['cleaning_fee'] = df_smaller_listings['cleaning_fee'].apply(lambda x: currency2float(x))

In [44]:
# Converted currency features to float

# columns_w_currency = ['price', 'security_deposit', 'cleaning_fee']

# df_smaller_listings[columns_w_currency] = df_smaller_listings[columns_w_currency]  \
#                                           .apply(lambda x: currency2float(x))

In [43]:
df_smaller_listings[['price', 'security_deposit', 'cleaning_fee']]

Unnamed: 0,Unnamed: 1,price,security_deposit,cleaning_fee
sf,0,170.0,100.0,100.0
sf,1,235.0,,100.0
sf,2,65.0,200.0,50.0
sf,3,65.0,200.0,50.0
sf,4,675.0,0.0,225.0
sf,5,255.0,0.0,125.0
sf,6,139.0,0.0,50.0
sf,7,135.0,,50.0
sf,8,265.0,500.0,100.0
sf,9,120.0,500.0,75.0


In [20]:
df_smaller_listings.shape

(16769, 43)

In [21]:
df_smaller_listings.price.describe(percentiles=[.05, .10, .25, .5, .75, .90, .95])

count    16769.000000
mean       188.023555
std        274.564274
min          0.000000
5%          41.000000
10%         52.000000
25%         79.000000
50%        125.000000
75%        201.000000
90%        350.000000
95%        499.000000
max      10000.000000
Name: price, dtype: float64

In [22]:
# Remove outliers. Use 5% and 95% percentiles as cutoff

df_smaller_listings = df_smaller_listings[(df_smaller_listings['price']<=500) 
                                         & (df_smaller_listings['price']>=40)]

In [23]:
df_smaller_listings.shape

(15379, 43)

In [24]:
# Remove special characters within Amenities

df_smaller_listings['jurisdiction_names'] = df_smaller_listings['jurisdiction_names'].str.replace('[^\w\s,]', '')
df_smaller_listings['amenities'] = df_smaller_listings['amenities'].str.replace('[^\w\s,]', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [25]:
df_smaller_listings.amenities.head()

listings_sf  0    TV,Cable TV,Internet,Wifi,Kitchen,Pets live on...
             1    Internet,Wifi,Kitchen,Heating,Familykid friend...
             2    TV,Internet,Wifi,Kitchen,Free street parking,H...
             3    TV,Internet,Wifi,Kitchen,Free street parking,H...
             5    TV,Cable TV,Internet,Wifi,Kitchen,Free parking...
Name: amenities, dtype: object

In [26]:
df_amenities = df_smaller_listings  \
               .amenities  \
               .str.get_dummies(sep = ",")  \
               .add_prefix('amenities_')

In [27]:
df_amenities.head()

Unnamed: 0,Unnamed: 1,amenities_ toilet,amenities_24hour checkin,amenities_Accessibleheight bed,amenities_Accessibleheight toilet,amenities_Air conditioning,amenities_Air purifier,amenities_Amazon Echo,amenities_BBQ grill,amenities_Baby bath,amenities_Baby monitor,...,amenities_Wide clearance to bed,amenities_Wide clearance to shower,amenities_Wide doorway,amenities_Wide entryway,amenities_Wide hallway clearance,amenities_Wifi,amenities_Window guards,amenities_Wine cooler,amenities_translation missing enhosting_amenity_49,amenities_translation missing enhosting_amenity_50
listings_sf,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
listings_sf,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
listings_sf,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
listings_sf,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
listings_sf,5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
# There is a long list of amenities. This is to filter down to common amenities.

common_amenities=[]

for column in df_amenities.columns:
    value = df_amenities[column].quantile(0.75)
    if value>0:
        common_amenities.append(column)

common_amenities

['amenities_Air conditioning',
 'amenities_Bed linens',
 'amenities_Cable TV',
 'amenities_Carbon monoxide detector',
 'amenities_Coffee maker',
 'amenities_Cooking basics',
 'amenities_Dishes and silverware',
 'amenities_Dryer',
 'amenities_Essentials',
 'amenities_Extra pillows and blankets',
 'amenities_Familykid friendly',
 'amenities_Fire extinguisher',
 'amenities_First aid kit',
 'amenities_Free parking on premises',
 'amenities_Free street parking',
 'amenities_Hair dryer',
 'amenities_Hangers',
 'amenities_Heating',
 'amenities_Hot water',
 'amenities_Internet',
 'amenities_Iron',
 'amenities_Kitchen',
 'amenities_Laptop friendly workspace',
 'amenities_Lock on bedroom door',
 'amenities_Long term stays allowed',
 'amenities_Microwave',
 'amenities_Oven',
 'amenities_Private entrance',
 'amenities_Refrigerator',
 'amenities_Self checkin',
 'amenities_Shampoo',
 'amenities_Smoke detector',
 'amenities_Stove',
 'amenities_TV',
 'amenities_Washer',
 'amenities_Wifi']

In [29]:
df_common_amenities = df_amenities[common_amenities]
df_common_amenities.head()

Unnamed: 0,Unnamed: 1,amenities_Air conditioning,amenities_Bed linens,amenities_Cable TV,amenities_Carbon monoxide detector,amenities_Coffee maker,amenities_Cooking basics,amenities_Dishes and silverware,amenities_Dryer,amenities_Essentials,amenities_Extra pillows and blankets,...,amenities_Oven,amenities_Private entrance,amenities_Refrigerator,amenities_Self checkin,amenities_Shampoo,amenities_Smoke detector,amenities_Stove,amenities_TV,amenities_Washer,amenities_Wifi
listings_sf,0,0,0,1,1,0,0,0,1,1,0,...,0,1,0,1,1,1,0,1,1,1
listings_sf,1,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,1,1,0,0,1,1
listings_sf,2,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,1
listings_sf,3,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,1
listings_sf,5,0,0,1,1,1,1,1,1,1,0,...,1,0,1,0,1,1,1,1,1,1


In [30]:
df_listings_refined = pd.concat([df_smaller_listings, df_common_amenities], axis=1)

In [31]:
df_listings_refined.shape

(15379, 79)

In [32]:
df_listings_refined.drop('amenities', axis=1, inplace=True)

In [33]:
df_listings_refined.columns

Index(['id', 'host_since', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'host_listings_count', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'is_location_exact', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'minimum_nights', 'number_of_reviews',
       'first_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'jurisdiction_names', 'instant_bookable',
       'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month', 'district',
       'amenities_Air conditioning', 

In [34]:
# This is to convert timestamp feature to duration

def duration(starting_time):
    """
    Calculate duration or number of years between current year and input years
    """
    starting_year = starting_time.year
    current_year = datetime.now().year
    duration = current_year - starting_year + 1
    
    return duration

In [35]:
# Calculate durations for timestamp columns

df_listings_refined['host_yrs'] = df_listings_refined['host_since'].apply(lambda x: duration(x))
df_listings_refined['yrs_since_first_review'] = df_listings_refined['first_review'].apply(lambda x: duration(x))

In [36]:
# Drop original and un-used timestamp features

time_stamps =['host_since', 'first_review']
df_listings_refined.drop(time_stamps, axis=1, inplace=True)

In [37]:
df_listings_refined.isnull().sum()

id                                         0
host_response_time                      1803
host_response_rate                      1803
host_is_superhost                          0
host_listings_count                        0
host_total_listings_count                  0
host_has_profile_pic                       0
host_identity_verified                     0
neighbourhood_cleansed                     0
city                                      14
is_location_exact                          0
property_type                              0
room_type                                  0
accommodates                               0
bathrooms                                 30
bedrooms                                   4
beds                                      15
bed_type                                   0
price                                      0
security_deposit                        3763
cleaning_fee                            2051
guests_included                            0
minimum_ni

In [38]:
# Filled null values with average for security deposits and cleaning fee

df_listings_refined['security_deposit'].fillna(
    value=df_listings_refined['security_deposit'].mean(),
    inplace=True)

df_listings_refined['cleaning_fee'].fillna(
    value=df_listings_refined['cleaning_fee'].mean(),
    inplace=True)

In [39]:
# Security deposit can vary in big range. Convert it to boolean varialbes

df_listings_refined['security_deposit'] = (df_listings_refined['security_deposit']>0).astype(int)

In [40]:
df_listings_refined.describe(include='all')

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,city,...,amenities_Refrigerator,amenities_Self checkin,amenities_Shampoo,amenities_Smoke detector,amenities_Stove,amenities_TV,amenities_Washer,amenities_Wifi,host_yrs,yrs_since_first_review
count,15379.0,13576,13576.0,15379,15379.0,15379.0,15379,15379,15379,15365,...,15379.0,15379.0,15379.0,15379.0,15379.0,15379.0,15379.0,15379.0,15379.0,12876.0
unique,,4,,2,,,2,2,178,64,...,,,,,,,,,,
top,,within an hour,,f,,,t,t,San Jose,San Francisco,...,,,,,,,,,,
freq,,8950,,9888,,,15330,9031,1777,6168,...,,,,,,,,,,
mean,14700610.0,,0.961138,,25.504194,25.504194,,,,,...,0.370765,0.433123,0.825281,0.922037,0.292412,0.729046,0.726965,0.977177,4.873659,2.927695
std,8398490.0,,0.124995,,122.547116,122.547116,,,,,...,0.483025,0.495523,0.379739,0.268123,0.454885,0.444467,0.445533,0.149345,2.123994,1.733904
min,958.0,,0.0,,0.0,0.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,7396749.0,,1.0,,1.0,1.0,,,,,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,2.0
50%,15759960.0,,1.0,,2.0,2.0,,,,,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0,3.0
75%,21981580.0,,1.0,,5.0,5.0,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,4.0


In [41]:
df_listings_refined.review_scores_rating.describe(include='all')

count    12764.000000
mean        95.435914
std          6.903849
min         20.000000
25%         94.000000
50%         97.000000
75%        100.000000
max        100.000000
Name: review_scores_rating, dtype: float64

In [42]:
df_listings_refined.review_scores_accuracy.describe(include='all')

count    12755.000000
mean         9.742140
std          0.690268
min          2.000000
25%         10.000000
50%         10.000000
75%         10.000000
max         10.000000
Name: review_scores_accuracy, dtype: float64

In [43]:
# Those varialbes have big range. Need to narrow down range to enable better modeling (take log10)


df_listings_refined['cleaning_fee'] = df_listings_refined['cleaning_fee'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_rating_log10'] = df_listings_refined['review_scores_rating'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_accuracy_log10'] = df_listings_refined['review_scores_accuracy'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_checkin_log10'] = df_listings_refined['review_scores_checkin'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_cleanliness_log10'] = df_listings_refined['review_scores_cleanliness'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_communication_log10'] = df_listings_refined['review_scores_communication'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_location_log10'] = df_listings_refined['review_scores_location'].apply(lambda x: np.log10(x) if x>0 else 0)
df_listings_refined['review_scores_value_log10'] = df_listings_refined['review_scores_value'].apply(lambda x: np.log10(x) if x>0 else 0)


In [44]:
# Drop original ratings

review_scores =['review_scores_rating','review_scores_accuracy',
                'review_scores_checkin','review_scores_cleanliness', 
                'review_scores_communication', 'review_scores_location', 
                'review_scores_value']
df_listings_refined.drop(review_scores, axis=1, inplace=True)

In [45]:
# Minimum nights can vary in big range. Convert it to boolean varialbes

df_listings_refined['minimum_nights'] = (df_listings_refined['minimum_nights']>1).astype(int)
df_listings_refined.minimum_nights.head()

listings_sf  0    0
             1    1
             2    1
             3    1
             5    1
Name: minimum_nights, dtype: int64

In [46]:
df_listings_refined.isnull().sum()

id                                        0
host_response_time                     1803
host_response_rate                     1803
host_is_superhost                         0
host_listings_count                       0
host_total_listings_count                 0
host_has_profile_pic                      0
host_identity_verified                    0
neighbourhood_cleansed                    0
city                                     14
is_location_exact                         0
property_type                             0
room_type                                 0
accommodates                              0
bathrooms                                30
bedrooms                                  4
beds                                     15
bed_type                                  0
price                                     0
security_deposit                          0
cleaning_fee                              0
guests_included                           0
minimum_nights                  

In [47]:
# Drop rest of Null values

df_listings_cleaned = df_listings_refined.dropna()

In [48]:
df_listings_cleaned.shape

(10352, 78)

In [49]:
df_listings_cleaned.isnull().sum()

id                                     0
host_response_time                     0
host_response_rate                     0
host_is_superhost                      0
host_listings_count                    0
host_total_listings_count              0
host_has_profile_pic                   0
host_identity_verified                 0
neighbourhood_cleansed                 0
city                                   0
is_location_exact                      0
property_type                          0
room_type                              0
accommodates                           0
bathrooms                              0
bedrooms                               0
beds                                   0
bed_type                               0
price                                  0
security_deposit                       0
cleaning_fee                           0
guests_included                        0
minimum_nights                         0
number_of_reviews                      0
jurisdiction_nam

In [50]:
df_listings_cleaned.columns

Index(['id', 'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'is_location_exact', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'minimum_nights', 'number_of_reviews',
       'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification', 'calculated_host_listings_count',
       'reviews_per_month', 'district', 'amenities_Air conditioning',
       'amenities_Bed linens', 'amenities_Cable TV',
       'amenities_Carbon monoxide detector', 'amenities_Coffee maker',
       'amenities_Cooking basics', 'amenities_Dishes and silverware',
       'amenities_Dryer', 'amenities_Essentials'

In [51]:
df_jurisdiction = df_listings_cleaned.loc[:, ['id', 'jurisdiction_names']]

In [52]:
# select non-numeric variables and create dummies

categorical_variables = df_listings_cleaned.select_dtypes(include=['object']).columns
df_listings_cleaned[categorical_variables].head()

Unnamed: 0,Unnamed: 1,host_response_time,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,city,is_location_exact,property_type,room_type,bed_type,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,district
listings_sf,0,within an hour,t,t,t,Western Addition,San Francisco,t,Apartment,Entire home/apt,Real Bed,SAN FRANCISCO,f,f,moderate,f,f,San Francisco
listings_sf,1,within an hour,f,t,t,Bernal Heights,San Francisco,t,Apartment,Entire home/apt,Real Bed,SAN FRANCISCO,f,f,strict_14_with_grace_period,f,f,San Francisco
listings_sf,2,within a few hours,f,t,t,Haight Ashbury,San Francisco,t,Apartment,Private room,Real Bed,SAN FRANCISCO,f,f,strict_14_with_grace_period,f,f,San Francisco
listings_sf,3,within a few hours,f,t,t,Haight Ashbury,San Francisco,t,Apartment,Private room,Real Bed,SAN FRANCISCO,f,f,strict_14_with_grace_period,f,f,San Francisco
listings_sf,5,within a day,f,t,f,Western Addition,San Francisco,t,Apartment,Entire home/apt,Real Bed,SAN FRANCISCO,f,f,moderate,f,f,San Francisco


In [53]:
categorical_variables.shape

(17,)

In [55]:
df_model = pd.get_dummies(data=df_listings_cleaned,columns=categorical_variables)

In [56]:
df_model.shape

(10352, 349)

In [57]:
df_model.columns

Index(['id', 'host_response_rate', 'host_listings_count',
       'host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit',
       ...
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',
       'require_guest_profile_picture_f', 'require_guest_profile_picture_t',
       'require_guest_phone_verification_f',
       'require_guest_phone_verification_t', 'district_Oakland',
       'district_San Francisco', 'district_Santa Clara',
       'district_Santa Cruz'],
      dtype='object', length=349)

In [58]:
# Create correlation matrix
corr_matrix = df_model.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

In [59]:
to_drop

['host_total_listings_count',
 'amenities_Refrigerator',
 'amenities_Stove',
 'amenities_Washer',
 'review_scores_checkin_log10',
 'review_scores_cleanliness_log10',
 'review_scores_communication_log10',
 'host_is_superhost_t',
 'host_has_profile_pic_t',
 'host_identity_verified_t',
 'city_Cupertino',
 'city_Palo Alto',
 'city_San Jose',
 'city_Santa Clara',
 'city_Sunnyvale',
 'is_location_exact_t',
 'room_type_Private room',
 'jurisdiction_names_Cupertino, CA',
 'jurisdiction_names_OAKLAND',
 'jurisdiction_names_PALO ALTO',
 'jurisdiction_names_SAN FRANCISCO',
 'jurisdiction_names_SAN JOSE',
 'jurisdiction_names_SANTA CLARA',
 'jurisdiction_names_Sunnyvale, CA',
 'instant_bookable_t',
 'require_guest_profile_picture_t',
 'require_guest_phone_verification_t',
 'district_Oakland',
 'district_San Francisco',
 'district_Santa Cruz']

In [60]:
# Drop features 
df_model.drop(to_drop, axis=1, inplace=True)

In [61]:
df_model.dropna(inplace=True)

In [62]:
df_model.columns

Index(['id', 'host_response_rate', 'host_listings_count', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'price', 'security_deposit',
       'cleaning_fee',
       ...
       'is_business_travel_ready_f', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_strict_14_with_grace_period',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',
       'require_guest_profile_picture_f', 'require_guest_phone_verification_f',
       'district_Santa Clara'],
      dtype='object', length=319)

In [63]:
# Store as csv file for feature investigation

csv_path = '/Users/xzhou/github/project_archives/files_airbnb/columns_check.csv'

df_model.to_csv(csv_path)

In [64]:
# Store the complete set of features

pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.csv'

pkl_listings = pkl_file.format('all_listings')

df_model.to_pickle(pkl_listings)

In [65]:
# Perform initial modelings to get a sense of performance, and for future selection

X = df_model.drop(['id','price'], axis=1)
y = df_model['price']

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state=42)

In [67]:
ss = StandardScaler()
X_train_trans = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [73]:
model = sm.OLS(y_train, sm.add_constant(X_train_trans))
fit = model.fit()
fit.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.678
Model:,OLS,Adj. R-squared:,0.664
Method:,Least Squares,F-statistic:,49.07
Date:,"Mon, 17 Sep 2018",Prob (F-statistic):,0.0
Time:,00:40:11,Log-Likelihood:,-39468.0
No. Observations:,7246,AIC:,79530.0
Df Residuals:,6947,BIC:,81590.0
Df Model:,298,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,153.6855,0.674,228.156,0.000,152.365,155.006
x1,1.4151,1.181,1.198,0.231,-0.900,3.730
x2,3.4840,0.965,3.609,0.000,1.592,5.376
x3,24.3231,1.621,15.007,0.000,21.146,27.500
x4,5.2574,0.861,6.107,0.000,3.570,6.945
x5,26.9007,1.278,21.041,0.000,24.394,29.407
x6,-2.3378,1.390,-1.682,0.093,-5.063,0.387
x7,-1.2307,0.737,-1.669,0.095,-2.676,0.214
x8,4.6833,0.873,5.366,0.000,2.972,6.394

0,1,2,3
Omnibus:,1610.101,Durbin-Watson:,2.018
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6654.485
Skew:,1.041,Prob(JB):,0.0
Kurtosis:,7.208,Cond. No.,4.56e+16


In [74]:
X_train_trans

array([[ 0.29889922, -0.13925227, -0.09659807, ...,  0.2152993 ,
         0.2359082 , -0.60786131],
       [ 0.29889922, -0.13925227, -0.59126208, ...,  0.2152993 ,
         0.2359082 , -0.60786131],
       [ 0.29889922, -0.16103505, -0.09659807, ...,  0.2152993 ,
         0.2359082 ,  1.64511211],
       ...,
       [ 0.29889922, -0.12836088, -1.0859261 , ...,  0.2152993 ,
         0.2359082 , -0.60786131],
       [ 0.29889922, -0.13925227, -0.59126208, ...,  0.2152993 ,
         0.2359082 , -0.60786131],
       [ 0.29889922, -0.13925227, -0.59126208, ...,  0.2152993 ,
         0.2359082 ,  1.64511211]])

In [75]:
rf = RandomForestRegressor(n_estimators=500, 
                               criterion='mse', 
                               random_state=3, 
                               n_jobs=-1)
rf.fit(X_train_trans, y_train)
y_train_pred = rf.predict(X_train_trans)
y_test_pred = rf.predict(X_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 405.718, test: 2884.882
R^2 train: 0.959, test: 0.697


In [76]:
feature_labels = X.columns
feature_tuples = zip (feature_labels, rf.feature_importances_)
feature_lists = [list(feature_tuple) for feature_tuple in feature_tuples]

In [77]:
feature_lists.sort(key=lambda x: abs(x[1]), reverse=True)

In [78]:
feature_lists

[['bedrooms', 0.4395051810776777],
 ['cleaning_fee', 0.07548634741665643],
 ['room_type_Entire home/apt', 0.05210216196428506],
 ['reviews_per_month', 0.030616943326388334],
 ['city_San Francisco', 0.023855576753102584],
 ['city_Oakland', 0.021439991857963716],
 ['bathrooms', 0.020062816705203445],
 ['accommodates', 0.019963979294194033],
 ['number_of_reviews', 0.019683442343555495],
 ['host_listings_count', 0.016574790484920176],
 ['review_scores_rating_log10', 0.016220977738156356],
 ['host_yrs', 0.015688093642184035],
 ['guests_included', 0.012770873289267017],
 ['calculated_host_listings_count', 0.010171681964214344],
 ['review_scores_location_log10', 0.009198141142719852],
 ['beds', 0.008426475074116814],
 ['yrs_since_first_review', 0.007850830851113651],
 ['host_response_rate', 0.006379545300177394],
 ['amenities_Air conditioning', 0.0060425767787575845],
 ['review_scores_value_log10', 0.005128262103464437],
 ['jurisdiction_names_Santa Cruz County, CA', 0.004891423000971399],
 ['

In [79]:
# Keep features that have above 0.003 thresold

features_to_keep=[]

for (feature, importance) in feature_lists:
    if importance>=0.003:
        features_to_keep.append(feature)
    else:
        break

features_to_keep

['bedrooms',
 'cleaning_fee',
 'room_type_Entire home/apt',
 'reviews_per_month',
 'city_San Francisco',
 'city_Oakland',
 'bathrooms',
 'accommodates',
 'number_of_reviews',
 'host_listings_count',
 'review_scores_rating_log10',
 'host_yrs',
 'guests_included',
 'calculated_host_listings_count',
 'review_scores_location_log10',
 'beds',
 'yrs_since_first_review',
 'host_response_rate',
 'amenities_Air conditioning',
 'review_scores_value_log10',
 'jurisdiction_names_Santa Cruz County, CA',
 'amenities_Cable TV',
 'neighbourhood_cleansed_Palo Alto',
 'host_is_superhost_f',
 'amenities_Lock on bedroom door',
 'amenities_Private entrance',
 'amenities_Familykid friendly',
 'district_Santa Clara',
 'neighbourhood_cleansed_San Jose',
 'host_identity_verified_f']

In [80]:
len(features_to_keep)

30

In [81]:
no_features_kept = len(features_to_keep)
features_w_importance = feature_lists[0: no_features_kept]
features_w_importance

[['bedrooms', 0.4395051810776777],
 ['cleaning_fee', 0.07548634741665643],
 ['room_type_Entire home/apt', 0.05210216196428506],
 ['reviews_per_month', 0.030616943326388334],
 ['city_San Francisco', 0.023855576753102584],
 ['city_Oakland', 0.021439991857963716],
 ['bathrooms', 0.020062816705203445],
 ['accommodates', 0.019963979294194033],
 ['number_of_reviews', 0.019683442343555495],
 ['host_listings_count', 0.016574790484920176],
 ['review_scores_rating_log10', 0.016220977738156356],
 ['host_yrs', 0.015688093642184035],
 ['guests_included', 0.012770873289267017],
 ['calculated_host_listings_count', 0.010171681964214344],
 ['review_scores_location_log10', 0.009198141142719852],
 ['beds', 0.008426475074116814],
 ['yrs_since_first_review', 0.007850830851113651],
 ['host_response_rate', 0.006379545300177394],
 ['amenities_Air conditioning', 0.0060425767787575845],
 ['review_scores_value_log10', 0.005128262103464437],
 ['jurisdiction_names_Santa Cruz County, CA', 0.004891423000971399],
 ['

In [82]:
# Perform another model(Random Forest) based on selected features

X_updated = df_model[features_to_keep]
y_updated = df_model['price']

In [83]:
X_updated_train, X_updated_test, y_updated_train, y_updated_test = train_test_split(
    X_updated, y_updated, test_size =0.3, random_state=42)

In [84]:
ssX2 = StandardScaler()
X_updated_train_trans = ssX2.fit_transform(X_updated_train)
X_updated_test_trans = ssX2.transform (X_updated_test)

In [85]:
rf_updated = RandomForestRegressor(n_estimators=500, 
                                   criterion='mse', 
                                   random_state=3, 
                                   n_jobs=-1)
rf_updated.fit(X_updated_train_trans, y_updated_train)
y_updated_train_pred = rf_updated.predict(X_updated_train_trans)
y_updated_test_pred = rf_updated.predict(X_updated_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_updated_train, y_updated_train_pred),
        mean_squared_error(y_updated_test, y_updated_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_updated_train, y_updated_train_pred),
        r2_score(y_updated_test, y_updated_test_pred)))

MSE train: 415.273, test: 3053.745
R^2 train: 0.958, test: 0.679


In [109]:
columns =['id']
columns.extend(features_to_keep)

In [110]:
df_listings_top_features = df_model[columns]
df_listings_top_features.head()

Unnamed: 0,Unnamed: 1,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,"jurisdiction_names_Santa Cruz County, CA",amenities_Cable TV,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f
listings_sf,0,958,1.0,2.0,1,1.38,1,0,1.0,3,152,...,0,1,0,0,0,1,1,0,0,0
listings_sf,1,5858,2.0,2.0,1,0.99,1,0,1.0,5,112,...,0,0,0,1,0,1,1,0,0,0
listings_sf,2,7918,1.0,1.69897,0,0.16,1,0,4.0,2,17,...,0,0,0,1,1,1,0,0,0,0
listings_sf,3,8142,1.0,1.69897,0,0.15,1,0,4.0,2,7,...,0,0,0,1,1,1,1,0,0,0
listings_sf,5,8567,2.0,2.09691,1,0.27,1,0,1.0,6,30,...,0,1,0,1,0,0,1,0,0,1


In [111]:
# Add District informaiton for future use

df_listings_top_features = pd.merge(left=df_listings_top_features, 
                                    right=df_district, 
                                    how ='left', 
                                    left_on='id',
                                    right_on='id')

df_listings_top_features.head()

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,amenities_Cable TV,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f,district
0,958,1.0,2.0,1,1.38,1,0,1.0,3,152,...,1,0,0,0,1,1,0,0,0,San Francisco
1,5858,2.0,2.0,1,0.99,1,0,1.0,5,112,...,0,0,1,0,1,1,0,0,0,San Francisco
2,7918,1.0,1.69897,0,0.16,1,0,4.0,2,17,...,0,0,1,1,1,0,0,0,0,San Francisco
3,8142,1.0,1.69897,0,0.15,1,0,4.0,2,7,...,0,0,1,1,1,1,0,0,0,San Francisco
4,8567,2.0,2.09691,1,0.27,1,0,1.0,6,30,...,1,0,1,0,0,1,0,0,1,San Francisco


In [112]:
# Add Jurisdition informaiton for future use

df_listings_top_features = pd.merge(left=df_listings_top_features, 
                                    right=df_jurisdiction, 
                                    how ='left', 
                                    left_on='id',
                                    right_on='id')

df_listings_top_features.head()

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f,district,jurisdiction_names
0,958,1.0,2.0,1,1.38,1,0,1.0,3,152,...,0,0,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
1,5858,2.0,2.0,1,0.99,1,0,1.0,5,112,...,0,1,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
2,7918,1.0,1.69897,0,0.16,1,0,4.0,2,17,...,0,1,1,1,0,0,0,0,San Francisco,SAN FRANCISCO
3,8142,1.0,1.69897,0,0.15,1,0,4.0,2,7,...,0,1,1,1,1,0,0,0,San Francisco,SAN FRANCISCO
4,8567,2.0,2.09691,1,0.27,1,0,1.0,6,30,...,0,1,0,0,1,0,0,1,San Francisco,SAN FRANCISCO


In [113]:
# Check for null values before storing

df_listings_top_features.isnull().sum()

id                                          0
bedrooms                                    0
cleaning_fee                                0
room_type_Entire home/apt                   0
reviews_per_month                           0
city_San Francisco                          0
city_Oakland                                0
bathrooms                                   0
accommodates                                0
number_of_reviews                           0
host_listings_count                         0
review_scores_rating_log10                  0
host_yrs                                    0
guests_included                             0
calculated_host_listings_count              0
review_scores_location_log10                0
beds                                        0
yrs_since_first_review                      0
host_response_rate                          0
amenities_Air conditioning                  0
review_scores_value_log10                   0
jurisdiction_names_Santa Cruz Coun

In [114]:
# Rename columns used for furture reference to avoid confusion

df_listings_top_features.rename(columns={'district':'ref_district',
                                        'jurisdiction_names': 'ref_jurisdiction'},
                               inplace=True)
df_listings_top_features

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f,ref_district,ref_jurisdiction
0,958,1.0,2.000000,1,1.38,1,0,1.0,3,152,...,0,0,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
1,5858,2.0,2.000000,1,0.99,1,0,1.0,5,112,...,0,1,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
2,7918,1.0,1.698970,0,0.16,1,0,4.0,2,17,...,0,1,1,1,0,0,0,0,San Francisco,SAN FRANCISCO
3,8142,1.0,1.698970,0,0.15,1,0,4.0,2,7,...,0,1,1,1,1,0,0,0,San Francisco,SAN FRANCISCO
4,8567,2.0,2.096910,1,0.27,1,0,1.0,6,30,...,0,1,0,0,1,0,0,1,San Francisco,SAN FRANCISCO
5,8739,1.0,1.698970,0,5.41,1,0,1.0,3,594,...,0,0,0,0,1,0,0,1,San Francisco,SAN FRANCISCO
6,9225,1.0,1.698970,0,3.84,1,0,1.0,2,411,...,0,1,0,0,0,0,0,0,San Francisco,SAN FRANCISCO
7,10251,2.0,2.000000,1,2.69,1,0,1.0,6,291,...,0,0,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
8,10578,0.0,1.875061,1,0.22,1,0,1.0,2,18,...,0,1,0,0,0,0,0,0,San Francisco,SAN FRANCISCO
9,10819,3.0,2.318063,1,0.34,1,0,2.0,4,22,...,0,1,0,1,1,0,0,0,San Francisco,SAN FRANCISCO


In [116]:
# Store processed data with selected features for future use

pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'

pkl_path = pkl_file.format('listings_with_selected_features')

df_listings_top_features.to_pickle(pkl_path)