In [1]:
import pandas as pd
import csv
import re
import statsmodels.api as sm


from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, root
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
file_url = '/Users/xzhou/github/project_archives/files_airbnb/{}/calendar.csv'

sf_file = file_url.format('sf_airbnb')
ok_file = file_url.format('ok_airbnb')
sc_file = file_url.format('sc_airbnb')
scz_file = file_url.format('scz_airbnb')

In [3]:
pricing_sf = pd.read_csv(sf_file, parse_dates=['date'])
pricing_ok = pd.read_csv(ok_file, parse_dates=['date'])
pricing_sc = pd.read_csv(sc_file, parse_dates=['date'])
pricing_scz = pd.read_csv(scz_file, parse_dates=['date'])

In [4]:
print(pricing_sf.shape, pricing_ok.shape, pricing_sc.shape, pricing_scz.shape)

(2420680, 4) (1057770, 4) (2068820, 4) (573050, 4)


In [5]:
frames = [pricing_sf, pricing_ok, pricing_sc, pricing_scz]

In [6]:
df_pricing = pd.concat(frames, keys=['sf', 'ok', 'sc', 'scz'])

In [7]:
df_pricing.listing_id.nunique()

16768

In [8]:
df_pricing.head(100)

Unnamed: 0,Unnamed: 1,listing_id,date,available,price
sf,0,958,2019-08-05,f,
sf,1,958,2019-08-04,f,
sf,2,958,2019-08-03,f,
sf,3,958,2019-08-02,f,
sf,4,958,2019-08-01,f,
sf,5,958,2019-07-31,f,
sf,6,958,2019-07-30,f,
sf,7,958,2019-07-29,f,
sf,8,958,2019-07-28,f,
sf,9,958,2019-07-27,f,


In [9]:
df_test = df_pricing[df_pricing['available']=='f']

In [10]:
df_pricing_smaller = df_pricing[df_pricing['available']=='t']

In [11]:
df_pricing_smaller.shape

(2853357, 4)

In [12]:
df_pricing_smaller.listing_id.nunique()

14770

In [13]:
df_pricing_smaller.date.describe(include='all')

count                 2853357
unique                    389
top       2018-11-03 00:00:00
freq                    10393
first     2018-08-06 00:00:00
last      2019-08-29 00:00:00
Name: date, dtype: object

In [14]:
df_pricing_smaller.columns

Index(['listing_id', 'date', 'available', 'price'], dtype='object')

In [15]:
def currency_to_float(string):
    """
    Convert currency to float
    """
    float_ = float(str(string).strip('$').replace(',',''))
    return float_

In [16]:
# Convert price to float format

df_pricing_smaller['price'] = df_pricing_smaller['price'].apply(lambda x: currency_to_float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
# Add a month column based on date

df_pricing_smaller['month'] = df_pricing_smaller['date'].dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# Add a day_of_week column based on Date. Use default format: Monday=0, Sunday=6

df_pricing_smaller['day_of_week'] = df_pricing_smaller['date'].dt.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
df_pricing_smaller.head()

Unnamed: 0,Unnamed: 1,listing_id,date,available,price,month,day_of_week
sf,63,958,2019-05-02,t,181.0,5,3
sf,64,958,2019-05-01,t,181.0,5,2
sf,65,958,2019-04-30,t,181.0,4,1
sf,66,958,2019-04-29,t,181.0,4,0
sf,67,958,2019-04-28,t,181.0,4,6


In [20]:
df_pricing_smaller.shape

(2853357, 6)

In [21]:
df_pricing_smaller.describe(include ='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0
unique,,389,1,,,
top,,2018-11-03 00:00:00,t,,,
freq,,10393,2853357,,,
first,,2018-08-06 00:00:00,,,,
last,,2019-08-29 00:00:00,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646
std,8563144.0,,,703.5042,3.629179,2.003061
min,958.0,,,10.0,1.0,0.0
25%,6453774.0,,,80.0,3.0,1.0


In [23]:
df_pricing_smaller.loc['sf'].describe(include='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,1073584.0,1073584,1073584,1073584.0,1073584.0,1073584.0
unique,,365,1,,,
top,,2018-11-03 00:00:00,t,,,
freq,,3953,1073584,,,
first,,2018-08-06 00:00:00,,,,
last,,2019-08-05 00:00:00,,,,
mean,13228730.0,,,227.7898,6.511572,2.99672
std,9057012.0,,,1061.485,3.6334,2.003876
min,958.0,,,10.0,1.0,0.0
25%,4097700.0,,,100.0,3.0,1.0


In [24]:
df_pricing_smaller.loc['ok'].describe(include='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,337501.0,337501,337501,337501.0,337501.0,337501.0
unique,,365,1,,,
top,,2018-11-12 00:00:00,t,,,
freq,,1415,337501,,,
first,,2018-08-16 00:00:00,,,,
last,,2019-08-15 00:00:00,,,,
mean,13191260.0,,,140.81283,6.635752,2.987301
std,8759368.0,,,196.83452,3.644244,2.002487
min,3083.0,,,20.0,1.0,0.0
25%,4777769.0,,,69.0,3.0,1.0


In [25]:
df_pricing_smaller.loc['sc'].describe(include='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,1177267.0,1177267,1177267,1177267.0,1177267.0,1177267.0
unique,,365,1,,,
top,,2018-11-09 00:00:00,t,,,
freq,,4336,1177267,,,
first,,2018-08-14 00:00:00,,,,
last,,2019-08-13 00:00:00,,,,
mean,15209840.0,,,203.9792,6.635694,2.991873
std,7909631.0,,,377.8313,3.619311,2.002146
min,11464.0,,,18.0,1.0,0.0
25%,9029655.0,,,66.0,3.0,1.0


In [26]:
# Count how many months each house was listed in the last year

df_month_count = df_pricing_smaller.groupby(['listing_id']).month.nunique().reset_index()
df_month_count.rename(columns={'month':'month_count'}, inplace=True)
df_month_count.head()

Unnamed: 0,listing_id,month_count
0,958,10
1,3083,5
2,3264,3
3,5739,2
4,5858,12


In [27]:
df_month_count.shape

(14770, 2)

In [28]:
# Count how many days of week each house was listed in the last year

df_day_of_week_count = df_pricing_smaller.groupby(['listing_id']).day_of_week.nunique().reset_index()
df_day_of_week_count.rename(columns={'day_of_week':'day_of_week_count'}, inplace=True)
df_day_of_week_count.head()

Unnamed: 0,listing_id,day_of_week_count
0,958,7
1,3083,7
2,3264,7
3,5739,7
4,5858,7


In [29]:
df_day_of_week_count.shape

(14770, 2)

In [30]:
df_pricing_with_counts = pd.merge(left=df_pricing_smaller, 
                                  right=df_month_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')
df_pricing_with_counts.head()

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count
0,958,2019-05-02,t,181.0,5,3,10
1,958,2019-05-01,t,181.0,5,2,10
2,958,2019-04-30,t,181.0,4,1,10
3,958,2019-04-29,t,181.0,4,0,10
4,958,2019-04-28,t,181.0,4,6,10


In [31]:
df_pricing_with_counts.shape

(2853357, 7)

In [32]:
df_pricing_with_counts = pd.merge(left=df_pricing_with_counts, 
                                  right=df_day_of_week_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')
df_pricing_with_counts.head()

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
0,958,2019-05-02,t,181.0,5,3,10,7
1,958,2019-05-01,t,181.0,5,2,10,7
2,958,2019-04-30,t,181.0,4,1,10,7
3,958,2019-04-29,t,181.0,4,0,10,7
4,958,2019-04-28,t,181.0,4,6,10,7


In [33]:
df_pricing_with_counts.shape

(2853357, 8)

In [34]:
df_pricing_with_counts.describe(include='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0,2853357.0,2853357.0
unique,,389,1,,,,,
top,,2018-11-03 00:00:00,t,,,,,
freq,,10393,2853357,,,,,
first,,2018-08-06 00:00:00,,,,,,
last,,2019-08-29 00:00:00,,,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646,10.20866,6.995392
std,8563144.0,,,703.5042,3.629179,2.003061,2.696574,0.1206497
min,958.0,,,10.0,1.0,0.0,1.0,1.0
25%,6453774.0,,,80.0,3.0,1.0,9.0,7.0


In [43]:
df_pricing_with_counts.price.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count    2.581280e+06
mean     1.603721e+02
std      1.016462e+02
min      4.000000e+01
5%       5.000000e+01
10%      6.000000e+01
25%      8.400000e+01
50%      1.310000e+02
75%      2.050000e+02
90%      3.020000e+02
95%      3.840000e+02
max      5.000000e+02
Name: price, dtype: float64

In [46]:
# Use 5% and 95% percentiles as cutoff

df_pricing_with_counts = df_pricing_with_counts[(df_pricing_with_counts.price>=50)
                                                & (df_pricing_with_counts.price<=400)]

In [47]:
df_pricing_with_counts.shape

(2377562, 8)

In [48]:
# (df_pricing_with_counts['month_count']==12)
#                   & (df_pricing_with_counts['day_of_week_count']==7)

df_pricing_available = df_pricing_with_counts[(df_pricing_with_counts['month_count']==12)
                                 & (df_pricing_with_counts['day_of_week_count']==7)]

In [49]:
df_pricing_available.shape

(1362356, 8)

In [50]:
df_pricing_available.listing_id.nunique()

4253

In [51]:
df_pricing_available.isnull().sum()

listing_id           0
date                 0
available            0
price                0
month                0
day_of_week          0
month_count          0
day_of_week_count    0
dtype: int64

In [53]:
df_pricing_available.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

Unnamed: 0,listing_id,price,month,day_of_week,month_count,day_of_week_count
count,1362356.0,1362356.0,1362356.0,1362356.0,1362356.0,1362356.0
mean,13503600.0,157.8347,6.318691,2.986385,12.0,7.0
std,8753801.0,86.73772,3.496964,2.005098,0.0,0.0
min,5858.0,50.0,1.0,0.0,12.0,7.0
5%,565701.2,59.0,1.0,0.0,12.0,7.0
10%,1299242.0,65.0,2.0,0.0,12.0,7.0
25%,5406477.0,89.0,3.0,1.0,12.0,7.0
50%,13732540.0,136.0,6.0,3.0,12.0,7.0
75%,21399940.0,200.0,10.0,5.0,12.0,7.0
90%,25370850.0,295.0,11.0,6.0,12.0,7.0


In [56]:
df_pricing_by_month_weekday = df_pricing_available.groupby(['listing_id', 'month', 'day_of_week']) \
                              .price.mean()  \
                              .reset_index()
df_pricing_by_month_weekday.head()

Unnamed: 0,listing_id,month,day_of_week,price
0,5858,1,0,235.0
1,5858,1,1,235.0
2,5858,1,2,235.0
3,5858,1,3,235.0
4,5858,1,4,235.0


In [57]:
df_pricing_by_month_weekday.describe(include='all')

Unnamed: 0,listing_id,month,day_of_week,price
count,338460.0,338460.0,338460.0,338460.0
mean,13422730.0,6.453646,3.001971,158.217563
std,8750020.0,3.464062,2.003586,86.813187
min,5858.0,1.0,0.0,50.0
25%,5286782.0,3.0,1.0,89.0
50%,13656490.0,6.0,3.0,137.0
75%,21291770.0,10.0,5.0,202.0
max,28152470.0,12.0,6.0,400.0


In [58]:
df_pricing_by_month_weekday.isnull().sum()

listing_id     0
month          0
day_of_week    0
price          0
dtype: int64

In [60]:
df_pricing_by_month_weekday.rename(columns={'price':'price_by_month_week'}, inplace=True)

In [61]:
df_pricing_by_month_weekday.shape

(338460, 4)

In [62]:
df_pricing_final = pd.get_dummies(df_pricing_by_month_weekday, columns=['month', 'day_of_week'])

In [63]:
df_pricing_final.columns

Index(['listing_id', 'price_by_month_week', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6'],
      dtype='object')

In [64]:
df_pricing_final.shape

(338460, 21)

In [68]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.csv'

pkl = pkl_file.format('all_listings')

df_listings_reviews = pd.read_pickle(pkl)
df_listings_reviews.head()

Unnamed: 0,id,host_response_rate,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,...,instant_bookable_f,is_business_travel_ready_f,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,require_guest_profile_picture_f,require_guest_phone_verification_f
0,958,0.92,1,3,1.0,1.0,2.0,170.0,1,2.0,...,1,1,0,1,0,0,0,0,1,1
1,5858,1.0,2,5,1.0,2.0,3.0,235.0,1,2.0,...,1,1,0,0,0,1,0,0,1,1
2,7918,1.0,10,2,4.0,1.0,1.0,65.0,1,1.69897,...,1,1,0,0,0,1,0,0,1,1
3,8142,1.0,10,2,4.0,1.0,1.0,65.0,1,1.69897,...,1,1,0,0,0,1,0,0,1,1
5,8567,0.8,1,6,1.0,2.0,3.0,255.0,0,2.09691,...,1,1,0,1,0,0,0,0,1,1


In [69]:
df_listings_reviews.shape

(11700, 336)

In [76]:
df_final = pd.merge(left=df_pricing_final, 
                    right=df_listings_reviews,
                    how='inner',
                    left_on='listing_id',
                    right_on='id')

In [77]:
df_final.shape

(260392, 357)

In [78]:
df_final.isnull().sum()

listing_id                                         0
price_by_month_week                                0
month_1                                            0
month_2                                            0
month_3                                            0
month_4                                            0
month_5                                            0
month_6                                            0
month_7                                            0
month_8                                            0
month_9                                            0
month_10                                           0
month_11                                           0
month_12                                           0
day_of_week_0                                      0
day_of_week_1                                      0
day_of_week_2                                      0
day_of_week_3                                      0
day_of_week_4                                 

In [80]:
df_final.shape

(260392, 357)

In [81]:
df_final.columns

Index(['listing_id', 'price_by_month_week', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       ...
       'instant_bookable_f', 'is_business_travel_ready_f',
       'cancellation_policy_flexible', 'cancellation_policy_moderate',
       'cancellation_policy_strict',
       'cancellation_policy_strict_14_with_grace_period',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',
       'require_guest_profile_picture_f',
       'require_guest_phone_verification_f'],
      dtype='object', length=357)

In [82]:
df_final.drop(['id', 'price'], axis=1, inplace=True)

In [83]:
df_final.shape

(260392, 355)

In [84]:
# Create correlation matrix
corr_matrix = df_final.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

to_drop

In [85]:
to_drop

['review_scores_value_log10']

In [87]:
# Drop features that have high correlation

df_final.drop(to_drop, axis=1, inplace=True)

In [88]:
df_final.shape

(260392, 354)

In [89]:
# word count

X = df_final.drop(['listing_id', 'price_by_month_week'], axis=1)
y = df_final['price_by_month_week']

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state=42)

In [91]:
ss = StandardScaler()
X_train_trans = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [92]:
X_train_trans.shape

(182274, 352)

In [93]:
X_test_trans.shape

(78118, 352)

In [94]:
lr = linear_model.LinearRegression()
lr.fit(X_train_trans, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [95]:
lr.score(X_test_trans, y_test)

0.695006850955054

In [96]:
y_predict = lr.predict(X_test_trans)

In [97]:
r2_score(y_predict, y_test)

0.5618587800155335

In [98]:
mean_squared_error(y_predict, y_test)

2248.8278807217257

In [99]:
model=sm.OLS(y_train, sm.add_constant(X_train_trans))
fit=model.fit()
fit.summary()

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,price_by_month_week,R-squared:,0.696
Model:,OLS,Adj. R-squared:,0.695
Method:,Least Squares,F-statistic:,1474.0
Date:,"Sun, 16 Sep 2018",Prob (F-statistic):,0.0
Time:,16:23:46,Log-Likelihood:,-961810.0
No. Observations:,182274,AIC:,1924000.0
Df Residuals:,181991,BIC:,1927000.0
Df Model:,282,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,153.2869,0.112,1373.115,0.000,153.068,153.506
x1,-3.555e+11,4.67e+11,-0.761,0.447,-1.27e+12,5.6e+11
x2,-3.565e+11,4.68e+11,-0.761,0.447,-1.27e+12,5.62e+11
x3,-3.562e+11,4.68e+11,-0.761,0.447,-1.27e+12,5.61e+11
x4,-3.557e+11,4.67e+11,-0.761,0.447,-1.27e+12,5.61e+11
x5,-3.557e+11,4.67e+11,-0.761,0.447,-1.27e+12,5.61e+11
x6,-3.555e+11,4.67e+11,-0.761,0.447,-1.27e+12,5.6e+11
x7,-3.552e+11,4.67e+11,-0.761,0.447,-1.27e+12,5.6e+11
x8,-3.482e+11,4.58e+11,-0.761,0.447,-1.25e+12,5.49e+11

0,1,2,3
Omnibus:,24111.829,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62643.231
Skew:,0.747,Prob(JB):,0.0
Kurtosis:,5.453,Cond. No.,2.29e+17


In [101]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=100, 
                               criterion='mse', 
                               random_state=3, 
                               n_jobs=-1)
forest.fit(X_train_trans, y_train)
y_train_pred = forest.predict(X_train_trans)
y_test_pred = forest.predict(X_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 4.032, test: 26.640
R^2 train: 0.999, test: 0.996


In [103]:
feature_labels = X.columns

In [104]:
feature_labels

Index(['month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10',
       ...
       'instant_bookable_f', 'is_business_travel_ready_f',
       'cancellation_policy_flexible', 'cancellation_policy_moderate',
       'cancellation_policy_strict',
       'cancellation_policy_strict_14_with_grace_period',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',
       'require_guest_profile_picture_f',
       'require_guest_phone_verification_f'],
      dtype='object', length=352)

In [115]:
feature_tuples = zip (feature_labels, forest.feature_importances_)
features_list = [list(feature_tuple) for feature_tuple in feature_tuples]

In [116]:
features_list.sort(key=lambda x: abs(x[1]), reverse=True)

In [117]:
features_list

[['bedrooms', 0.3779611219774654],
 ['room_type_Entire home/apt', 0.08621581584273132],
 ['cleaning_fee', 0.05495123226541247],
 ['accommodates', 0.05008309977841667],
 ['host_listings_count', 0.04106299177196036],
 ['reviews_per_month', 0.03560719197434737],
 ['number_of_reviews', 0.021142160229082926],
 ['review_scores_rating_log10', 0.016351073166625863],
 ['calculated_host_listings_count', 0.01564030509043598],
 ['city_Oakland', 0.014027913273177421],
 ['host_duration', 0.013569153353121963],
 ['city_San Francisco', 0.012945384417496772],
 ['beds', 0.010727598449813802],
 ['bathrooms', 0.009965611501310395],
 ['guests_included', 0.009112864784728539],
 ['host_response_rate', 0.0089809295887684],
 ['yrs_since_first_review', 0.008191202472606168],
 ['Air conditioning', 0.006680119223938971],
 ['Self checkin', 0.00663697762074117],
 ['host_is_superhost_f', 0.006129690121940262],
 ['review_scores_location_log10', 0.005442357654315655],
 ['neighbourhood_cleansed_Downtown/Civic Center', 