In [1]:
import pandas as pd
import csv
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Location of the Airbnb CSV files. The directory defaults to the original
# author's local checkout, but can be redirected by setting the
# AIRBNB_DATA_DIR environment variable before the kernel starts, so the
# notebook can run on another machine without editing this cell.
import os

data_dir = os.environ.get(
    'AIRBNB_DATA_DIR',
    '/Users/xzhou/github/project_archives/files_airbnb/sf_airbnb',
)
# Path template; '{}' is filled with the file stem (e.g. 'listings_details').
file_url = os.path.join(data_dir, '{}.csv')

listings_file = file_url.format('listings_details')
reviews_file = file_url.format('reviews_details')
calendar_file = file_url.format('calendar_details')

In [3]:
# Load the listings table; the four date-like columns are parsed into
# datetimes at read time instead of being left as strings.
date_columns = ['last_scraped', 'host_since', 'first_review', 'last_review']
listings = pd.read_csv(listings_file, parse_dates=date_columns)

In [4]:
# Show every column name available in the raw listings table.
listings.columns.values

array(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name',
       'summary', 'space', 'description', 'experiences_offered',
       'neighborhood_overview', 'notes', 'transit', 'access',
       'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
       'picture_url', 'xl_picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode',
       'market', 'smart_location', 'country_code', 'country', 'latitude',
       'longitude', 'is_location_exact', 'property_type', 'room_type',
       'accommodates', 'bath

In [5]:
# Columns kept for modelling: host attributes, listing attributes, review
# statistics, and the target column "price".
model_columns = [
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "zipcode",
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "price",
    "number_of_reviews",
    "review_scores_rating",
    "cancellation_policy",
    "reviews_per_month",
]
df = listings[model_columns]

In [6]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,cancellation_policy,reviews_per_month
0,92%,,t,1,94117.0,Apartment,Entire home/apt,3,1.0,1.0,2.0,$170.00,152,97.0,moderate,1.38
1,100%,,f,2,94110.0,Apartment,Entire home/apt,5,1.0,2.0,3.0,$235.00,112,98.0,strict_14_with_grace_period,0.99
2,100%,,f,10,94117.0,Apartment,Private room,2,4.0,1.0,1.0,$65.00,17,85.0,strict_14_with_grace_period,0.16
3,100%,,f,10,94117.0,Apartment,Private room,2,4.0,1.0,1.0,$65.00,7,91.0,strict_14_with_grace_period,0.15
4,100%,,f,2,94117.0,House,Entire home/apt,5,1.5,2.0,2.0,$675.00,26,97.0,strict_14_with_grace_period,0.24


In [7]:
# Share of missing (NaN) values per column, in percent
missing_counts = df.isnull().sum()
missing_counts / len(df) * 100

host_response_rate        8.668777
host_acceptance_rate    100.000000
host_is_superhost         0.000000
host_listings_count       0.000000
zipcode                   2.487562
property_type             0.000000
room_type                 0.000000
accommodates              0.000000
bathrooms                 0.391979
bedrooms                  0.015076
beds                      0.075381
price                     0.000000
number_of_reviews         0.000000
review_scores_rating     17.850143
cancellation_policy       0.000000
reviews_per_month        17.216946
dtype: float64

In [8]:
# host_acceptance_rate is missing for every row (100% NaN above), so it
# carries no information and would wipe out the table in the NaN-row drop.
df = df.drop(columns=['host_acceptance_rate'])

In [9]:
# Keep only the rows that have a value in every remaining column.
df2 = df.dropna(axis=0, how='any')

In [10]:
# % of rows dropped by the NaN filter
rows_dropped = len(df) - len(df2)
rows_dropped / len(df) * 100

25.569124076586764

In [11]:
# (rows, columns) left after dropping the incomplete rows.
df2.shape

(4937, 15)

In [12]:
# Silence SettingWithCopyWarning for the column assignments on df2.
pd.set_option('mode.chained_assignment', None)  # default='warn'

# Make sure both columns are plain strings before the .str clean-up.
for col in ('host_response_rate', 'price'):
    df2[col] = df2[col].astype(str)

In [13]:
# clean data: turn "$1,234.00"-style prices and "92%"-style rates into floats
pd.options.mode.chained_assignment = None  # default='warn'

# "[$, ]" is a regex character class (dollar sign, comma, space). regex=True
# must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the symbols in place and make astype fail.
df2['price'] = df2['price'].str.replace("[$, ]", "", regex=True).astype("float")
# "%" is a plain literal, so no regex is needed here.
df2['host_response_rate'] = df2['host_response_rate'].str.replace("%", "", regex=False).astype("float")

In [14]:
# Encode the 't'/'f' superhost flag as 1/0 and discard the string column.
is_superhost = df2['host_is_superhost'] == 't'
df2['superhost'] = np.where(is_superhost, 1, 0)
del df2['host_is_superhost']

In [15]:
# Columns still stored as strings (dtype object); these are the ones that
# get one-hot encoded in the next step.
object_columns = df2.select_dtypes(include=['object'])
non_num_vars = object_columns.columns
df2[non_num_vars].head()

Unnamed: 0,property_type,room_type,cancellation_policy
0,Apartment,Entire home/apt,moderate
1,Apartment,Entire home/apt,strict_14_with_grace_period
2,Apartment,Private room,strict_14_with_grace_period
3,Apartment,Private room,strict_14_with_grace_period
4,House,Entire home/apt,strict_14_with_grace_period


In [16]:
# One-hot encode the string columns. The dtype is pinned to uint8 (the 0/1
# integers shown in the recorded output) because newer pandas versions
# return bool columns by default, which turns the mixed design matrix into
# an object array that the OLS fit cannot consume.
dummy_vars = pd.get_dummies(df2[non_num_vars], dtype=np.uint8)

In [17]:
# Preview the one-hot encoded columns.
dummy_vars.head()

Unnamed: 0,property_type_Aparthotel,property_type_Apartment,property_type_Bed and breakfast,property_type_Boat,property_type_Boutique hotel,property_type_Bungalow,property_type_Bus,property_type_Cabin,property_type_Condominium,property_type_Cottage,...,property_type_Treehouse,property_type_Villa,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [18]:
# Replace the string columns with their one-hot encodings: drop them from
# df2, then attach the dummy columns by matching rows on the index.
numeric_part = df2.drop(columns=non_num_vars)
df3 = numeric_part.merge(dummy_vars, left_index=True, right_index=True)
df3.head()

Unnamed: 0,host_response_rate,host_listings_count,zipcode,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,...,property_type_Treehouse,property_type_Villa,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,92.0,1,94117.0,3,1.0,1.0,2.0,170.0,152,97.0,...,0,0,1,0,0,0,1,0,0,0
1,100.0,2,94110.0,5,1.0,2.0,3.0,235.0,112,98.0,...,0,0,1,0,0,0,0,1,0,0
2,100.0,10,94117.0,2,4.0,1.0,1.0,65.0,17,85.0,...,0,0,0,1,0,0,0,1,0,0
3,100.0,10,94117.0,2,4.0,1.0,1.0,65.0,7,91.0,...,0,0,0,1,0,0,0,1,0,0
4,100.0,2,94117.0,5,1.5,2.0,2.0,675.0,26,97.0,...,0,0,1,0,0,0,0,1,0,0


In [19]:
# split into test and training data (80% train / 20% test) using a seeded
# random permutation of the row positions
np.random.seed(1)
indices = np.random.permutation(len(df3))
train_size = int(round(0.8*len(df3)))
test_size = len(df3)-train_size

y = df3['price']
x = df3.drop('price', axis =1)

train_idx = indices[:train_size]
# The test slice starts at train_size, not train_size+1; the latter leaves
# one row out of both sets, so the test set would hold test_size-1 rows.
test_idx = indices[train_size:]

# Keep the splits in ordinary variables. Assigning them as attributes of the
# DataFrame/Series (x.train = ...) triggers a pandas warning and is easy to
# confuse with column access.
x_ols_train = x.iloc[train_idx]
y_ols_train = y.iloc[train_idx]
x_ols_test = x.iloc[test_idx]
y_ols_test = y.iloc[test_idx]
assert len(x_ols_train) + len(x_ols_test) == len(df3)
assert len(x_ols_test) == test_size

# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# The float cast guarantees a numeric array even if some columns are bool.
x2 = x_ols_train.values.astype(float)
y2 = y_ols_train.values.astype(float)

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [20]:
# Ordinary least squares of the training prices (y2) on the training feature
# matrix (x2). No constant column is added here, so the model uses exactly
# the columns of x2; the fit report is printed as plain text.
import statsmodels.api as sm

olsmod = sm.OLS(endog=y2, exog=x2)
olsres = olsmod.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.271
Method:                 Least Squares   F-statistic:                     38.58
Date:                Thu, 13 Sep 2018   Prob (F-statistic):          6.53e-243
Time:                        11:31:09   Log-Likelihood:                -27090.
No. Observations:                3950   AIC:                         5.426e+04
Df Residuals:                    3910   BIC:                         5.451e+04
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.1679      0.350      0.479      0.6

In [21]:
# different method: let scikit-learn shuffle and split the data
# (75% train / 25% test, seeded for reproducibility)
from sklearn import linear_model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 500 trees on the sklearn split, using all CPU cores.
# The split criterion is left at its default (squared error): the explicit
# criterion='mse' spelling was removed in scikit-learn 1.2 and raises there,
# while the default gives the same criterion on old and new versions.
forest = RandomForestRegressor(n_estimators=500,
                               random_state=3,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# Report train and test metrics side by side; a large gap between the two
# indicates overfitting.
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 8359.452, test: 17833.431
R^2 train: 0.895, test: 0.279
