In [1]:
import pandas as pd
import csv
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
import os

# Directory holding the Airbnb CSV exports. Set the AIRBNB_DATA_DIR environment
# variable to point somewhere else; when it is unset the original local path is
# used, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    'AIRBNB_DATA_DIR',
    '/Users/xzhou/github/project_archives/files_airbnb/scz_airbnb',
)

# Template with one `{}` placeholder for the CSV base name.
file_url = DATA_DIR.rstrip('/') + '/{}.csv'

listings_file = file_url.format('listings_details')
reviews_file = file_url.format('reviews_details')
calendar_file = file_url.format('calendar_details')

In [3]:
# Load the listings table, asking pandas to parse the date-like columns.
date_columns = ['last_scraped', 'host_since', 'first_review', 'last_review']
listings = pd.read_csv(listings_file, parse_dates=date_columns)

In [4]:
# Show every column name in the raw listings table as a NumPy array.
listings.columns.values

array(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name',
       'summary', 'space', 'description', 'experiences_offered',
       'neighborhood_overview', 'notes', 'transit', 'access',
       'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
       'picture_url', 'xl_picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode',
       'market', 'smart_location', 'country_code', 'country', 'latitude',
       'longitude', 'is_location_exact', 'property_type', 'room_type',
       'accommodates', 'bath

In [5]:
# Columns kept for modelling: host attributes, location, listing
# characteristics, review statistics, and `price` (used later as the target).
model_columns = [
    "host_response_rate", "host_acceptance_rate", "host_is_superhost",
    "host_listings_count", "zipcode", "property_type", "room_type",
    "accommodates", "bathrooms", "bedrooms", "beds", "price",
    "number_of_reviews", "review_scores_rating", "cancellation_policy",
    "reviews_per_month",
]
df = listings[model_columns]

In [6]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,cancellation_policy,reviews_per_month
0,,,f,1.0,90230,Condominium,Entire home/apt,6,2.0,2.0,3.0,$122.00,2,80.0,strict_14_with_grace_period,0.02
1,100%,,f,1.0,91505,House,Entire home/apt,6,1.0,3.0,3.0,$168.00,1,80.0,flexible,0.04
2,0%,,f,1.0,90292,Apartment,Shared room,1,,1.0,1.0,$85.00,0,,moderate,
3,100%,,f,1.0,90046,Apartment,Private room,1,1.5,1.0,1.0,$79.00,11,96.0,strict_14_with_grace_period,0.22
4,100%,,f,2.0,90405,Apartment,Private room,1,1.0,1.0,1.0,$140.00,17,96.0,strict_14_with_grace_period,0.19


In [7]:
# Percentage of missing (NaN) values per column:
# rows without a value divided by the total row count.
n_rows = len(df)
(n_rows - df.count()) / n_rows * 100

host_response_rate       17.722087
host_acceptance_rate    100.000000
host_is_superhost         0.007086
host_listings_count       0.007086
zipcode                   1.211706
property_type             0.000000
room_type                 0.000000
accommodates              0.000000
bathrooms                 0.085032
bedrooms                  0.047240
beds                      0.089756
price                     0.000000
number_of_reviews         0.000000
review_scores_rating     22.597255
cancellation_policy       0.000000
reviews_per_month        21.090299
dtype: float64

In [8]:
# Remove the host_acceptance_rate column from the working frame.
df = df.drop(columns=['host_acceptance_rate'])

In [9]:
# Drop every row that has at least one NaN.
# `.copy()` makes df2 an independent frame: later cells assign columns on df2,
# and without an explicit copy pandas may flag those writes as chained
# assignment on a slice of `df`.
df2 = df.dropna(axis=0).copy()

In [10]:
# Percentage of rows dropped by the NaN filter
rows_dropped = len(df) - len(df2)
rows_dropped / len(df) * 100

32.08777192526632

In [11]:
# (rows, columns) remaining after the NaN rows were dropped.
df2.shape

(28752, 15)

In [12]:
# Silence pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

# Make sure both columns are plain strings before the text clean-up step.
for col in ('host_response_rate', 'price'):
    df2[col] = df2[col].astype(str)

In [13]:
# clean data
pd.options.mode.chained_assignment = None  # default='warn'

# Strip currency formatting ("$", thousands separators, spaces) from price.
# "[$, ]" is a regex character class, so regex=True must be explicit: pandas
# >= 2.0 treats the pattern as a literal string by default, which would leave
# the "$" in place and make the float conversion fail.
df2['price'] = df2['price'].str.replace(r"[$, ]", "", regex=True).astype("float")

# "%" is a plain literal, so no regex is needed here.
df2['host_response_rate'] = df2['host_response_rate'].str.replace("%", "", regex=False).astype("float")

In [14]:
# Encode the 't' flag as a 0/1 integer column named `superhost`;
# pop() removes the original text column in the same step.
df2['superhost'] = df2.pop('host_is_superhost').eq('t').astype(int)

In [15]:
# Collect the names of the object-dtype (non-numeric) columns; these are the
# ones that get dummy-encoded next.
non_num_vars = df2.select_dtypes(include='object').columns
df2.loc[:, non_num_vars].head()

Unnamed: 0,zipcode,property_type,room_type,cancellation_policy
1,91505,House,Entire home/apt,flexible
3,90046,Apartment,Private room,strict_14_with_grace_period
4,90405,Apartment,Private room,strict_14_with_grace_period
6,90046,House,Entire home/apt,strict_14_with_grace_period
7,91604,House,Entire home/apt,strict_14_with_grace_period


In [16]:
# One-hot encode the categorical columns.
# The dtype is pinned to an unsigned integer so the dummies are numeric 0/1 on
# every pandas release; pandas >= 2.0 would otherwise return booleans, and
# mixing bool with float columns yields an object array when the frame is
# later converted to a NumPy matrix.
dummy_vars = pd.get_dummies(df2[non_num_vars], dtype=np.uint8)

In [17]:
# Preview the one-hot encoded columns.
dummy_vars.head()

Unnamed: 0,zipcode_90001,zipcode_90002,zipcode_90003,zipcode_90004,zipcode_90005,zipcode_90006,zipcode_90007,zipcode_90008,zipcode_90010,zipcode_90011,...,property_type_Yurt,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [18]:
# Swap the categorical columns for their dummy-encoded counterparts:
# drop them from df2, then attach the dummies by matching on the row index.
numeric_part = df2.drop(columns=non_num_vars)
df3 = numeric_part.merge(dummy_vars, left_index=True, right_index=True)
df3.head()

Unnamed: 0,host_response_rate,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,reviews_per_month,...,property_type_Yurt,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
1,100.0,1.0,6,1.0,3.0,3.0,168.0,1,80.0,0.04,...,0,1,0,0,1,0,0,0,0,0
3,100.0,1.0,1,1.5,1.0,1.0,79.0,11,96.0,0.22,...,0,0,1,0,0,0,0,1,0,0
4,100.0,2.0,1,1.0,1.0,1.0,140.0,17,96.0,0.19,...,0,0,1,0,0,0,0,1,0,0
6,88.0,3.0,3,1.0,1.0,2.0,76.0,18,80.0,0.3,...,0,1,0,0,0,0,0,1,0,0
7,70.0,3.0,5,1.0,2.0,3.0,225.0,15,94.0,0.17,...,0,1,0,0,0,0,0,1,0,0


In [19]:
# Split into training and test data (80% / 20%) for the statsmodels OLS fit.
# The seed fixes the permutation so the split is reproducible.
np.random.seed(1)
indices = np.random.permutation(len(df3))
train_size = int(round(0.8*len(df3)))
test_size = len(df3)-train_size

# Target is the nightly price; every other column is a feature.
y = df3['price']
x = df3.drop('price', axis=1)

train_idx = indices[:train_size]
# The test slice starts at train_size; the previous `train_size+1` start
# silently dropped one row from the test set.
test_idx = indices[train_size:]

# Plain variables rather than attributes set on the DataFrame/Series
# (`x.train = ...`), which pandas warns about and which can collide with
# column names.
x_train_ols = x.iloc[train_idx]
y_train_ols = y.iloc[train_idx]
x_test_ols = x.iloc[test_idx]
y_test_ols = y.iloc[test_idx]

# Every row must land in exactly one of the two partitions.
assert len(x_train_ols) == train_size
assert len(x_test_ols) == test_size

# NumPy arrays for statsmodels. `.values` replaces the deprecated
# `as_matrix()`; the float cast keeps the design matrix numeric even if some
# feature columns are stored as bool or small-int dummies.
x2 = x_train_ols.values.astype(float)
y2 = y_train_ols.values.astype(float)

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [20]:
import statsmodels.api as sm

# Ordinary least squares with y2 as the response and x2 as the design matrix.
# No constant column is added here; x2 is used exactly as built above.
olsmod = sm.OLS(endog=y2, exog=x2)
olsres = olsmod.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.419
Model:                            OLS   Adj. R-squared:                  0.410
Method:                 Least Squares   F-statistic:                     46.14
Date:                Thu, 13 Sep 2018   Prob (F-statistic):               0.00
Time:                        11:37:36   Log-Likelihood:            -1.5556e+05
No. Observations:               23002   AIC:                         3.118e+05
Df Residuals:                   22647   BIC:                         3.147e+05
Df Model:                         354                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.6543      0.125     -5.251      0.0

In [21]:
# Alternative approach: let scikit-learn do the split (75% train / 25% test),
# seeded for reproducibility.
from sklearn import linear_model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regression of price on all features.
# `criterion` is left at its default, which is mean squared error on every
# scikit-learn release. The explicit 'mse' spelling was removed in
# scikit-learn 1.2 and raises an error there, while its replacement
# 'squared_error' is not accepted by older releases.
forest = RandomForestRegressor(n_estimators=500,
                               random_state=3,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# Report in-sample and held-out error side by side so the train/test gap
# is visible.
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 6097.287, test: 25847.203
R^2 train: 0.919, test: 0.593
