In [71]:
import pandas as pd
import csv
import re
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor

In [4]:
file_url = '/Users/xzhou/github/project_archives/files_airbnb/{}/calendar.csv'

sf_file = file_url.format('sf_airbnb')
ok_file = file_url.format('ok_airbnb')
sc_file = file_url.format('sc_airbnb')
scz_file = file_url.format('scz_airbnb')

In [5]:
pricing_sf = pd.read_csv(sf_file, parse_dates=['date'])
pricing_ok = pd.read_csv(ok_file, parse_dates=['date'])
pricing_sc = pd.read_csv(sc_file, parse_dates=['date'])
pricing_scz = pd.read_csv(scz_file, parse_dates=['date'])

In [6]:
print(pricing_sf.shape, pricing_ok.shape, pricing_sc.shape, pricing_scz.shape)

(2420680, 4) (1057770, 4) (2068820, 4) (573050, 4)


In [7]:
frames = [pricing_sf, pricing_ok, pricing_sc, pricing_scz]
keys = ['pricing_sf', 'pricing_ok', 'pricing_sc', 'pricing_scz']

In [8]:
df_pricing = pd.concat(frames, keys=keys)

In [9]:
df_pricing.listing_id.nunique()

16768

In [10]:
df_pricing.head(20)

Unnamed: 0,Unnamed: 1,listing_id,date,available,price
pricing_sf,0,958,2019-08-05,f,
pricing_sf,1,958,2019-08-04,f,
pricing_sf,2,958,2019-08-03,f,
pricing_sf,3,958,2019-08-02,f,
pricing_sf,4,958,2019-08-01,f,
pricing_sf,5,958,2019-07-31,f,
pricing_sf,6,958,2019-07-30,f,
pricing_sf,7,958,2019-07-29,f,
pricing_sf,8,958,2019-07-28,f,
pricing_sf,9,958,2019-07-27,f,


In [11]:
# Filter down to houses that are available and have listing prices

df_pricing_smaller = df_pricing[df_pricing['available']=='t']
df_pricing_smaller = df_pricing_smaller.copy()

In [12]:
df_pricing_smaller.shape

(2853357, 4)

In [13]:
df_pricing_smaller.listing_id.nunique()

14770

In [14]:
df_pricing_smaller.date.describe(include='all')

count                 2853357
unique                    389
top       2018-11-03 00:00:00
freq                    10393
first     2018-08-06 00:00:00
last      2019-08-29 00:00:00
Name: date, dtype: object

In [15]:
df_pricing_smaller.columns

Index(['listing_id', 'date', 'available', 'price'], dtype='object')

In [16]:
def currency_to_float(string):
    """
    Convert currency to float
    """
    float_ = float(str(string).strip('$').replace(',',''))
    return float_

In [17]:
# Convert price to float format

df_pricing_smaller['price'] = df_pricing_smaller['price'].apply(lambda x: currency_to_float(x))

In [18]:
# Add a month column based on date

df_pricing_smaller['month'] = df_pricing_smaller['date'].dt.month

In [19]:
# Add a day_of_week column based on Date. Use default format: Monday=0, Sunday=6

df_pricing_smaller['day_of_week'] = df_pricing_smaller['date'].dt.weekday

In [20]:
df_pricing_smaller.head()

Unnamed: 0,Unnamed: 1,listing_id,date,available,price,month,day_of_week
pricing_sf,63,958,2019-05-02,t,181.0,5,3
pricing_sf,64,958,2019-05-01,t,181.0,5,2
pricing_sf,65,958,2019-04-30,t,181.0,4,1
pricing_sf,66,958,2019-04-29,t,181.0,4,0
pricing_sf,67,958,2019-04-28,t,181.0,4,6


In [21]:
df_pricing_smaller.shape

(2853357, 6)

In [22]:
df_pricing_smaller.describe(include ='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0
unique,,389,1,,,
top,,2018-11-03 00:00:00,t,,,
freq,,10393,2853357,,,
first,,2018-08-06 00:00:00,,,,
last,,2019-08-29 00:00:00,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646
std,8563144.0,,,703.5042,3.629179,2.003061
min,958.0,,,10.0,1.0,0.0
25%,6453774.0,,,80.0,3.0,1.0


In [23]:
# Count how many months each house was listed in the last year

df_month_count = df_pricing_smaller.groupby(['listing_id']).month.nunique().reset_index()
df_month_count.rename(columns={'month':'month_count'}, inplace=True)
df_month_count.head()

Unnamed: 0,listing_id,month_count
0,958,10
1,3083,5
2,3264,3
3,5739,2
4,5858,12


In [24]:
# Count how many days of week each house was listed in the last year

df_day_of_week_count = df_pricing_smaller.groupby(['listing_id']).day_of_week.nunique().reset_index()
df_day_of_week_count.rename(columns={'day_of_week':'day_of_week_count'}, inplace=True)
df_day_of_week_count.head()

Unnamed: 0,listing_id,day_of_week_count
0,958,7
1,3083,7
2,3264,7
3,5739,7
4,5858,7


In [25]:
df_pricing_with_counts = pd.merge(left=df_pricing_smaller, 
                                  right=df_month_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')
df_pricing_with_counts.head()

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count
0,958,2019-05-02,t,181.0,5,3,10
1,958,2019-05-01,t,181.0,5,2,10
2,958,2019-04-30,t,181.0,4,1,10
3,958,2019-04-29,t,181.0,4,0,10
4,958,2019-04-28,t,181.0,4,6,10


In [26]:
df_pricing_with_counts = pd.merge(left=df_pricing_with_counts, 
                                  right=df_day_of_week_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')
df_pricing_with_counts.head()

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
0,958,2019-05-02,t,181.0,5,3,10,7
1,958,2019-05-01,t,181.0,5,2,10,7
2,958,2019-04-30,t,181.0,4,1,10,7
3,958,2019-04-29,t,181.0,4,0,10,7
4,958,2019-04-28,t,181.0,4,6,10,7


In [27]:
df_pricing_with_counts.describe(include='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0,2853357.0,2853357.0
unique,,389,1,,,,,
top,,2018-11-03 00:00:00,t,,,,,
freq,,10393,2853357,,,,,
first,,2018-08-06 00:00:00,,,,,,
last,,2019-08-29 00:00:00,,,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646,10.20866,6.995392
std,8563144.0,,,703.5042,3.629179,2.003061,2.696574,0.1206497
min,958.0,,,10.0,1.0,0.0,1.0,1.0
25%,6453774.0,,,80.0,3.0,1.0,9.0,7.0


In [28]:
df_pricing_with_counts.price.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count    2.853357e+06
mean     2.132333e+02
std      7.035042e+02
min      1.000000e+01
5%       4.100000e+01
10%      5.400000e+01
25%      8.000000e+01
50%      1.350000e+02
75%      2.250000e+02
90%      3.850000e+02
95%      5.500000e+02
max      1.024520e+05
Name: price, dtype: float64

In [29]:
# Remove outliers for pricing. Use 5% and 95% percentiles as cutoff

df_pricing_with_counts = df_pricing_with_counts[(df_pricing_with_counts.price>=40)
                                                & (df_pricing_with_counts.price<=550)]

In [30]:
df_pricing_with_counts.shape

(2599154, 8)

In [31]:
# Drill down to houses that price available in 12 months and 7 days a week

df_pricing_available = df_pricing_with_counts[
                                              (df_pricing_with_counts['month_count']==12)
                                              & (df_pricing_with_counts['day_of_week_count']==7)
                                             ]

In [32]:
df_pricing_available.shape

(1497091, 8)

In [33]:
df_pricing_available.price.describe()

count    1.497091e+06
mean     1.652814e+02
std      1.077519e+02
min      4.000000e+01
25%      8.500000e+01
50%      1.350000e+02
75%      2.130000e+02
max      5.500000e+02
Name: price, dtype: float64

In [34]:
df_pricing_available.isnull().sum()

listing_id           0
date                 0
available            0
price                0
month                0
day_of_week          0
month_count          0
day_of_week_count    0
dtype: int64

In [35]:
# Group by month by weekday, and calculate an average pricing

df_pricing_by_month_weekday = df_pricing_available.groupby(['listing_id', 'month', 'day_of_week']) \
                              .price.mean()  \
                              .reset_index()
df_pricing_by_month_weekday.head()

Unnamed: 0,listing_id,month,day_of_week,price
0,5858,1,0,235.0
1,5858,1,1,235.0
2,5858,1,2,235.0
3,5858,1,3,235.0
4,5858,1,4,235.0


In [36]:
df_pricing_by_month_weekday.describe(include='all')

Unnamed: 0,listing_id,month,day_of_week,price
count,370857.0,370857.0,370857.0,370857.0
mean,13507310.0,6.455588,3.004762,165.770394
std,8696651.0,3.462933,2.003016,107.889496
min,5858.0,1.0,0.0,40.0
25%,5632672.0,3.0,1.0,85.0
50%,13782240.0,6.0,3.0,135.0
75%,21261870.0,10.0,5.0,215.0
max,28152470.0,12.0,6.0,550.0


In [37]:
# Check for null values if any

df_pricing_by_month_weekday.isnull().sum()

listing_id     0
month          0
day_of_week    0
price          0
dtype: int64

In [38]:
# Rename the average price column to avoid confusion

df_pricing_by_month_weekday.rename(columns={'price':'price_by_month_week'}, inplace=True)

In [39]:
df_pricing_by_month_weekday.shape

(370857, 4)

In [40]:
def get_month(number):
    """
    Given a number, return the name of month.
    
    """
    month_lookup = {1:'Jan.', 2:'Feb.', 3:'Mar.', 4:'Apr.', 5:'May.', 6:'Jun.',
                    7:'Jul.', 8:'Aug.', 9:'Sep.', 10:'Oct.', 11:'Nov.', 12:'Dec.'}
    
    return month_lookup[number]

In [41]:
def get_weekdays(number):
    """
    Given a number, return the name of month.
    
    """
    weekday_lookup = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 
                      4:'Friday', 5:'Saturday', 6:'Sunday'}
    
    return weekday_lookup[number]

In [42]:
df_pricing_by_month_weekday['month'] = df_pricing_by_month_weekday['month']  \
                                       .apply(lambda x: get_month(x))

In [43]:
df_pricing_by_month_weekday['day_of_week'] = df_pricing_by_month_weekday['day_of_week']  \
                                             .apply(lambda x: get_weekdays(x))

In [57]:
# This is to create visual to show seasonality

df_visual = pd.merge(left=df_pricing_by_month_weekday, 
                    right=df_listings_reviews,
                    how='inner',
                    left_on='listing_id',
                    right_on='id')

In [58]:
df_visual.shape

(245629, 37)

In [59]:
df_visual.columns

Index(['listing_id', 'month', 'day_of_week', 'price_by_month_week', 'id',
       'bedrooms', 'cleaning_fee', 'room_type_Entire home/apt',
       'reviews_per_month', 'city_San Francisco', 'city_Oakland', 'bathrooms',
       'accommodates', 'number_of_reviews', 'host_listings_count',
       'review_scores_rating_log10', 'host_yrs', 'guests_included',
       'calculated_host_listings_count', 'review_scores_location_log10',
       'beds', 'yrs_since_first_review', 'host_response_rate',
       'amenities_Air conditioning', 'review_scores_value_log10',
       'jurisdiction_names_Santa Cruz County, CA', 'amenities_Cable TV',
       'neighbourhood_cleansed_Palo Alto', 'host_is_superhost_f',
       'amenities_Lock on bedroom door', 'amenities_Private entrance',
       'amenities_Familykid friendly', 'district_Santa Clara',
       'neighbourhood_cleansed_San Jose', 'host_identity_verified_f',
       'ref_district', 'ref_jurisdiction'],
      dtype='object')

In [60]:
df_visual_entire_apartment = df_visual[df_visual['room_type_Entire home/apt']==1 ]
df_visual_single_room = df_visual[df_visual['room_type_Entire home/apt']!=1 ]

print(df_visual_entire_apartment.shape, df_visual_single_room.shape )

(135866, 37) (109763, 37)


In [61]:
df_visual_entire_apartment.ref_district.describe

<bound method NDFrame.describe of 0         San Francisco
1         San Francisco
2         San Francisco
3         San Francisco
4         San Francisco
5         San Francisco
6         San Francisco
7         San Francisco
8         San Francisco
9         San Francisco
10        San Francisco
11        San Francisco
12        San Francisco
13        San Francisco
14        San Francisco
15        San Francisco
16        San Francisco
17        San Francisco
18        San Francisco
19        San Francisco
20        San Francisco
21        San Francisco
22        San Francisco
23        San Francisco
24        San Francisco
25        San Francisco
26        San Francisco
27        San Francisco
28        San Francisco
29        San Francisco
              ...      
245599      Santa Clara
245600      Santa Clara
245601      Santa Clara
245602      Santa Clara
245603      Santa Clara
245604      Santa Clara
245605      Santa Clara
245606      Santa Clara
245607      Santa Clara
245608

In [62]:
df_visual_entire_apartment_sf = df_visual_entire_apartment[df_visual_entire_apartment['ref_district']=='San Francisco']
df_visual_entire_apartment_scz = df_visual_entire_apartment[df_visual_entire_apartment['ref_district']=='Santa Clara']

print(df_visual_entire_apartment_sf.shape, df_visual_entire_apartment_scz.shape )

(56047, 37) (35034, 37)


In [63]:
df_visual_entire_apartment_sf.id.nunique()

695

In [64]:
df_visual_entire_apartment_scz.id.nunique()

422

In [66]:
# Weekly pattern for entire apartment

df_week_entire_apartment = df_visual_entire_apartment.groupby(['ref_district', 'day_of_week'])  \
                                                     .price_by_month_week.mean()
df_week_entire_apartment

ref_district   day_of_week
Oakland        Friday         159.474027
               Monday         154.944938
               Saturday       159.725316
               Sunday         155.161974
               Thursday       155.060959
               Tuesday        154.571370
               Wednesday      154.744414
San Francisco  Friday         211.870899
               Monday         204.897288
               Saturday       211.966729
               Sunday         205.295141
               Thursday       204.853625
               Tuesday        205.165253
               Wednesday      205.049197
Santa Clara    Friday         209.905996
               Monday         208.723240
               Saturday       209.942656
               Sunday         208.205175
               Thursday       209.189709
               Tuesday        209.198270
               Wednesday      209.493972
Santa Cruz     Friday         279.173827
               Monday         265.707357
               Saturday       

In [69]:
# Weekly pattern for single room

df_week_single_room = df_visual_single_room.groupby(['ref_district', 'day_of_week'])  \
                                           .price_by_month_week.mean()
df_week_single_room

ref_district   day_of_week
Oakland        Friday          77.408285
               Monday          75.695416
               Saturday        77.407270
               Sunday          75.643985
               Thursday        75.771951
               Tuesday         75.718651
               Wednesday       75.716536
San Francisco  Friday         121.290236
               Monday         115.501022
               Saturday       121.449297
               Sunday         115.908191
               Thursday       115.840256
               Tuesday        115.686374
               Wednesday      115.818529
Santa Clara    Friday          82.181104
               Monday          80.824664
               Saturday        82.253229
               Sunday          80.811711
               Thursday        80.821351
               Tuesday         80.761868
               Wednesday       80.724975
Santa Cruz     Friday         127.845906
               Monday         122.235648
               Saturday       

In [70]:
# Montly Pattern for entire apartment

df_visual_entire_apartment.groupby(['ref_district', 'month']).price_by_month_week.mean()

ref_district   month
Oakland        Apr.     158.153970
               Aug.     157.264184
               Dec.     153.305877
               Feb.     156.082218
               Jan.     153.080845
               Jul.     159.672266
               Jun.     159.677606
               Mar.     157.705606
               May.     159.250697
               Nov.     151.698238
               Oct.     151.726567
               Sep.     156.919118
San Francisco  Apr.     207.472221
               Aug.     209.898199
               Dec.     203.178067
               Feb.     207.013189
               Jan.     205.855001
               Jul.     209.035882
               Jun.     208.817503
               Mar.     207.347120
               May.     207.832853
               Nov.     202.567577
               Oct.     206.984157
               Sep.     208.986456
Santa Clara    Apr.     211.766231
               Aug.     208.364194
               Dec.     205.188419
               Feb.     210.138842

In [71]:
# Montly Pattern for single room

df_visual_single_room.groupby(['ref_district', 'month']).price_by_month_week.mean()

ref_district   month
Oakland        Apr.      77.677205
               Aug.      76.846752
               Dec.      74.175045
               Feb.      76.617210
               Jan.      74.548583
               Jul.      77.792769
               Jun.      77.792769
               Mar.      77.669665
               May.      77.878307
               Nov.      73.508439
               Oct.      74.034876
               Sep.      75.448894
San Francisco  Apr.     118.221302
               Aug.     120.163396
               Dec.     112.868533
               Feb.     116.927029
               Jan.     114.475622
               Jul.     120.691553
               Jun.     120.156573
               Mar.     118.046094
               May.     119.168788
               Nov.     113.456209
               Oct.     116.937953
               Sep.     117.467900
Santa Clara    Apr.      82.440170
               Aug.      80.775148
               Dec.      79.769163
               Feb.      82.211255

In [46]:
df_pricing_final = pd.get_dummies(df_pricing_by_month_weekday, 
                                  prefix='',
                                  prefix_sep='',
                                  columns=['month', 'day_of_week'])

In [47]:
df_pricing_final.columns

Index(['listing_id', 'price_by_month_week', 'Apr.', 'Aug.', 'Dec.', 'Feb.',
       'Jan.', 'Jul.', 'Jun.', 'Mar.', 'May.', 'Nov.', 'Oct.', 'Sep.',
       'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday'],
      dtype='object')

In [48]:
df_pricing_final.shape

(370857, 21)

In [52]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'

pkl_path = pkl_file.format('listings_with_selected_features')

df_listings_reviews = pd.read_pickle(pkl_path)
df_listings_reviews.head()

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f,ref_district,ref_jurisdiction
0,958,1.0,2.0,1,1.38,1,0,1.0,3,152,...,0,0,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
1,5858,2.0,2.0,1,0.99,1,0,1.0,5,112,...,0,1,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
2,7918,1.0,1.69897,0,0.16,1,0,4.0,2,17,...,0,1,1,1,0,0,0,0,San Francisco,SAN FRANCISCO
3,8142,1.0,1.69897,0,0.15,1,0,4.0,2,7,...,0,1,1,1,1,0,0,0,San Francisco,SAN FRANCISCO
4,8567,2.0,2.09691,1,0.27,1,0,1.0,6,30,...,0,1,0,0,1,0,0,1,San Francisco,SAN FRANCISCO


In [53]:
df_listings_reviews.shape

(10352, 33)

In [54]:
df_final = pd.merge(left=df_pricing_final, 
                    right=df_listings_reviews,
                    how='inner',
                    left_on='listing_id',
                    right_on='id')

In [55]:
df_final.shape

(245629, 54)

In [56]:
df_final.isnull().sum()

listing_id                                  0
price_by_month_week                         0
Apr.                                        0
Aug.                                        0
Dec.                                        0
Feb.                                        0
Jan.                                        0
Jul.                                        0
Jun.                                        0
Mar.                                        0
May.                                        0
Nov.                                        0
Oct.                                        0
Sep.                                        0
Friday                                      0
Monday                                      0
Saturday                                    0
Sunday                                      0
Thursday                                    0
Tuesday                                     0
Wednesday                                   0
id                                

In [50]:
df_final.shape

(245629, 54)

In [52]:
df_final.columns

Index(['listing_id', 'price_by_month_week', 'Apr.', 'Aug.', 'Dec.', 'Feb.',
       'Jan.', 'Jul.', 'Jun.', 'Mar.', 'May.', 'Nov.', 'Oct.', 'Sep.',
       'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday', 'id', 'bedrooms', 'cleaning_fee',
       'room_type_Entire home/apt', 'reviews_per_month', 'city_San Francisco',
       'city_Oakland', 'bathrooms', 'accommodates', 'number_of_reviews',
       'host_listings_count', 'review_scores_rating_log10', 'host_yrs',
       'guests_included', 'calculated_host_listings_count',
       'review_scores_location_log10', 'beds', 'yrs_since_first_review',
       'host_response_rate', 'amenities_Air conditioning',
       'review_scores_value_log10', 'jurisdiction_names_Santa Cruz County, CA',
       'amenities_Cable TV', 'neighbourhood_cleansed_Palo Alto',
       'host_is_superhost_f', 'amenities_Lock on bedroom door',
       'amenities_Private entrance', 'amenities_Familykid friendly',
       'district_Santa Clara', 'n

In [53]:
# Drop duplicate columns

df_final.drop(['id'], axis=1, inplace=True)

In [54]:
df_final.shape

(245629, 53)

In [55]:
# Create correlation matrix
corr_matrix = df_final.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

to_drop

[]

In [56]:
# Drop features that have high correlation

df_final.drop(to_drop, axis=1, inplace=True)

In [57]:
df_final.shape

(245629, 53)

In [58]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'
pkl_path = pkl_file.format('listings_seasonality')

try: 
    with open(pkl_path, 'rb') as file:
        df_final = pd.read_pickle(file)
except:
    with open(pkl_path, 'wb') as file:
        pd.to_pickle(df_final, file)

In [108]:
# Perform multiple modeling for price prediction

X = df_final.drop(['price_by_month_week', 'ref_district', 'ref_jurisdiction'], axis=1)
y = df_final.loc[:, ['listing_id', 'price_by_month_week']]

In [77]:
grp_by_listing_ids = df_final.groupby('listing_id').first().reset_index().listing_id

In [100]:
num_of_grps = len(grp_by_listing_ids)

training_ids = np.random.choice(listing_ids, int(num_of_grps*0.75), replace=False)
print(len(training_ids))
training_ids

2254


array([13618517, 24839902, 14310743, ..., 24808372, 12012121, 15500119])

In [91]:
testing_ids = set(grp_by_listing_ids) - set (training_ids)
len(testing_ids)

752

In [129]:
X_train = X[X['listing_id'].isin(training_ids)].drop('listing_id', axis=1)
X_test = X[X['listing_id'].isin(testing_ids)].drop('listing_id', axis=1)
y_train = y[y['listing_id'].isin(training_ids)].drop('listing_id', axis=1).values.ravel()
y_test = y[y['listing_id'].isin(testing_ids)].drop('listing_id', axis=1).values.ravel()

print(len(X_train), len(X_test), len(y_train), len(y_test))

184229 61188 184229 61188


In [130]:
ss = StandardScaler()
X_train_trans = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [131]:
# Model-LR

model=sm.OLS(y_train, sm.add_constant(X_train_trans))
fit=model.fit()
fit.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.616
Model:,OLS,Adj. R-squared:,0.616
Method:,Least Squares,F-statistic:,5916.0
Date:,"Wed, 19 Sep 2018",Prob (F-statistic):,0.0
Time:,16:17:29,Log-Likelihood:,-1027400.0
No. Observations:,184229,AIC:,2055000.0
Df Residuals:,184178,BIC:,2055000.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,159.4184,0.149,1070.019,0.000,159.126,159.710
x1,1.64e+11,5.15e+11,0.319,0.750,-8.45e+11,1.17e+12
x2,1.597e+11,5.01e+11,0.319,0.750,-8.23e+11,1.14e+12
x3,1.633e+11,5.13e+11,0.319,0.750,-8.41e+11,1.17e+12
x4,1.64e+11,5.15e+11,0.319,0.750,-8.45e+11,1.17e+12
x5,1.637e+11,5.14e+11,0.319,0.750,-8.43e+11,1.17e+12
x6,1.636e+11,5.14e+11,0.319,0.750,-8.43e+11,1.17e+12
x7,1.638e+11,5.14e+11,0.319,0.750,-8.44e+11,1.17e+12
x8,1.64e+11,5.15e+11,0.319,0.750,-8.45e+11,1.17e+12

0,1,2,3
Omnibus:,42360.28,Durbin-Watson:,0.066
Prob(Omnibus):,0.0,Jarque-Bera (JB):,149264.804
Skew:,1.138,Prob(JB):,0.0
Kurtosis:,6.777,Cond. No.,86800000000000.0


In [133]:
# Model-Random Forest

rf = RandomForestRegressor(n_estimators=500, 
                           criterion='mse', 
                           random_state=42, 
                           n_jobs=-1)
rf.fit(X_train_trans, y_train.ravel())

y_train_pred = rf.predict(X_train_trans)
y_test_pred = rf.predict(X_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 5.611, test: 1043.202
R^2 train: 0.999, test: 0.900


In [135]:
RMSE = 1043.202**0.5
RMSE

32.29863774217111

In [69]:
df_final.listing_id.nunique()

3006

In [70]:
# Double check on price range. The range is as expected

y.describe(include='all')

count    245629.000000
mean        161.184664
std         104.196466
min          40.000000
25%          85.000000
50%         129.000000
75%         210.000000
max         550.000000
Name: price_by_month_week, dtype: float64

In [71]:
# Map feature importance with labels

feature_labels = X.columns
feature_tuples = zip (feature_labels, rf.feature_importances_)
feature_lists = [list(feature_tuple) for feature_tuple in feature_tuples]

In [72]:
# Order features based on importance

feature_lists.sort(key=lambda x: abs(x[1]), reverse=True)
feature_lists

[['bedrooms', 0.4156324736485746],
 ['cleaning_fee', 0.07034290324560516],
 ['room_type_Entire home/apt', 0.06705745536425838],
 ['accommodates', 0.061629916824698594],
 ['reviews_per_month', 0.04828570528371509],
 ['host_listings_count', 0.04310400183024202],
 ['number_of_reviews', 0.030599769313474728],
 ['bathrooms', 0.026573235736360914],
 ['review_scores_rating_log10', 0.02514303952043061],
 ['calculated_host_listings_count', 0.02201171941033038],
 ['host_yrs', 0.01789310282225505],
 ['city_San Francisco', 0.017425373050930167],
 ['host_response_rate', 0.01506220643000437],
 ['yrs_since_first_review', 0.011552273510368243],
 ['guests_included', 0.010881599402600497],
 ['jurisdiction_names_Santa Cruz County, CA', 0.010464262456094046],
 ['beds', 0.009429364298248035],
 ['city_Oakland', 0.009412498980338828],
 ['host_is_superhost_f', 0.008795707369316634],
 ['host_identity_verified_f', 0.007822633962941604],
 ['review_scores_location_log10', 0.0075009318936435695],
 ['review_scores_

In [73]:
features = [list_[0] for list_ in feature_lists]
features

['bedrooms',
 'cleaning_fee',
 'room_type_Entire home/apt',
 'accommodates',
 'reviews_per_month',
 'host_listings_count',
 'number_of_reviews',
 'bathrooms',
 'review_scores_rating_log10',
 'calculated_host_listings_count',
 'host_yrs',
 'city_San Francisco',
 'host_response_rate',
 'yrs_since_first_review',
 'guests_included',
 'jurisdiction_names_Santa Cruz County, CA',
 'beds',
 'city_Oakland',
 'host_is_superhost_f',
 'host_identity_verified_f',
 'review_scores_location_log10',
 'review_scores_value_log10',
 'amenities_Cable TV',
 'amenities_Lock on bedroom door',
 'amenities_Familykid friendly',
 'amenities_Air conditioning',
 'neighbourhood_cleansed_San Jose',
 'amenities_Private entrance',
 'Saturday',
 'Friday',
 'Sep.',
 'Nov.',
 'neighbourhood_cleansed_Palo Alto',
 'Dec.',
 'Oct.',
 'Aug.',
 'Jul.',
 'Jun.',
 'district_Santa Clara',
 'Jan.',
 'May.',
 'Feb.',
 'Mar.',
 'Apr.',
 'Sunday',
 'Tuesday',
 'Thursday',
 'Monday',
 'Wednesday']

In [74]:
columns =['listing_id']
columns.extend(features)
columns.extend(['ref_district', 'ref_jurisdiction', 'price_by_month_week'])

In [76]:
df_data = df_final[columns]
df_data.head()

Unnamed: 0,listing_id,bedrooms,cleaning_fee,room_type_Entire home/apt,accommodates,reviews_per_month,host_listings_count,number_of_reviews,bathrooms,review_scores_rating_log10,...,Mar.,Apr.,Sunday,Tuesday,Thursday,Monday,Wednesday,ref_district,ref_jurisdiction,price_by_month_week
0,5858,2.0,2.0,1,5,0.99,2,112,1.0,1.991226,...,0,0,0,0,0,1,0,San Francisco,SAN FRANCISCO,235.0
1,5858,2.0,2.0,1,5,0.99,2,112,1.0,1.991226,...,0,0,0,1,0,0,0,San Francisco,SAN FRANCISCO,235.0
2,5858,2.0,2.0,1,5,0.99,2,112,1.0,1.991226,...,0,0,0,0,0,0,1,San Francisco,SAN FRANCISCO,235.0
3,5858,2.0,2.0,1,5,0.99,2,112,1.0,1.991226,...,0,0,0,0,1,0,0,San Francisco,SAN FRANCISCO,235.0
4,5858,2.0,2.0,1,5,0.99,2,112,1.0,1.991226,...,0,0,0,0,0,0,0,San Francisco,SAN FRANCISCO,235.0


In [77]:
df_data.shape

(245629, 53)

In [78]:
df_data.rename(columns={'price_by_month_week':'price'}, inplace=True)

In [79]:
df_data.isnull().sum()

listing_id                                  0
bedrooms                                    0
cleaning_fee                                0
room_type_Entire home/apt                   0
accommodates                                0
reviews_per_month                           0
host_listings_count                         0
number_of_reviews                           0
bathrooms                                   0
review_scores_rating_log10                  0
calculated_host_listings_count              0
host_yrs                                    0
city_San Francisco                          0
host_response_rate                          0
yrs_since_first_review                      0
guests_included                             0
jurisdiction_names_Santa Cruz County, CA    0
beds                                        0
city_Oakland                                0
host_is_superhost_f                         0
host_identity_verified_f                    0
review_scores_location_log10      

In [93]:
temp_df = df_data[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']].copy()

days = temp_df.apply(lambda x: x.idxmax(), axis=1)
days

0            Monday
1           Tuesday
2         Wednesday
3          Thursday
4            Friday
5          Saturday
6            Sunday
7            Monday
8           Tuesday
9         Wednesday
10         Thursday
11           Friday
12         Saturday
13           Sunday
14           Monday
15          Tuesday
16        Wednesday
17         Thursday
18           Friday
19         Saturday
20           Sunday
21           Monday
22          Tuesday
23        Wednesday
24         Thursday
25           Friday
26         Saturday
27           Sunday
28           Monday
29          Tuesday
            ...    
245599     Saturday
245600       Sunday
245601       Monday
245602      Tuesday
245603    Wednesday
245604     Thursday
245605       Friday
245606     Saturday
245607       Sunday
245608       Monday
245609      Tuesday
245610    Wednesday
245611     Thursday
245612       Friday
245613     Saturday
245614       Sunday
245615       Monday
245616      Tuesday
245617    Wednesday


In [None]:
csv_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.csv'

csv_path = csv_file.format('airbnb_data')

df_data.to_csv(csv_path)