In [1]:
import csv
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets, linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory that holds the Airbnb CSV exports. Set the AIRBNB_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original author's local checkout is used.
DATA_DIR = os.environ.get(
    'AIRBNB_DATA_DIR',
    '/Users/xzhou/github/project_archives/files_airbnb/sd_airbnb')

# Path template for one CSV inside DATA_DIR; '{}' is filled with the file stem.
file_url = os.path.join(DATA_DIR, '{}.csv')

listings_file = file_url.format('listings_details')
reviews_file = file_url.format('reviews_details')
calendar_file = file_url.format('calendar_details')

In [3]:
# Read the listings export; the four columns below are parsed as datetimes.
date_columns = ['last_scraped', 'host_since', 'first_review', 'last_review']
listings = pd.read_csv(listings_file, parse_dates=date_columns)

In [4]:
# Show the column names of the loaded listings frame as an array.
listings.columns.values

array(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name',
       'summary', 'space', 'description', 'experiences_offered',
       'neighborhood_overview', 'notes', 'transit', 'access',
       'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
       'picture_url', 'xl_picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode',
       'market', 'smart_location', 'country_code', 'country', 'latitude',
       'longitude', 'is_location_exact', 'property_type', 'room_type',
       'accommodates', 'bath

In [5]:
# Columns carried forward for the price analysis (order is preserved in df).
feature_columns = [
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "zipcode",
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "price",
    "number_of_reviews",
    "review_scores_rating",
    "cancellation_policy",
    "reviews_per_month",
]
df = listings[feature_columns]

In [6]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,cancellation_policy,reviews_per_month
0,100%,,t,1.0,92104,House,Entire home/apt,6,2.0,3.0,4.0,$295.00,145,96.0,strict_14_with_grace_period,1.17
1,100%,,f,3.0,92109,Condominium,Entire home/apt,6,1.0,2.0,4.0,"$2,050.00",0,,strict_14_with_grace_period,
2,,,f,1.0,92107,Guesthouse,Entire home/apt,2,1.0,0.0,1.0,$84.00,178,93.0,strict_14_with_grace_period,1.6
3,100%,,t,1.0,92126,House,Private room,2,1.0,1.0,1.0,$72.00,187,98.0,moderate,1.81
4,100%,,f,3.0,92106,House,Private room,2,1.0,1.0,1.0,$75.00,136,87.0,moderate,1.41


In [7]:
# Percentage of missing (NaN) entries per column.
n_rows = len(df)
missing_per_column = n_rows - df.count()
missing_per_column / n_rows * 100

host_response_rate       16.987890
host_acceptance_rate    100.000000
host_is_superhost         0.076753
host_listings_count       0.076753
zipcode                   1.287737
property_type             0.000000
room_type                 0.000000
accommodates              0.000000
bathrooms                 0.042640
bedrooms                  0.034112
beds                      0.042640
price                     0.000000
number_of_reviews         0.000000
review_scores_rating     19.503667
cancellation_policy       0.000000
reviews_per_month        18.326795
dtype: float64

In [8]:
# host_acceptance_rate is NaN for every row (100% missing above), so drop it.
df = df.drop(columns=['host_acceptance_rate'])

In [9]:
# Keep only the rows that have a value in every remaining column.
df2 = df.dropna(axis='index', how='any')

In [10]:
# Percentage of rows removed by the NaN filter.
rows_before = len(df)
rows_after = len(df2)
(rows_before - rows_after) / rows_before * 100

29.1488998806072

In [11]:
# (rows, columns) remaining after the NaN rows were dropped.
df2.shape

(8308, 15)

In [12]:
# Turn off pandas' chained-assignment warning for the column writes below.
pd.set_option('mode.chained_assignment', None)  # default='warn'

# Cast the two text-encoded numeric columns to str.
for text_col in ('host_response_rate', 'price'):
    df2[text_col] = df2[text_col].astype(str)

In [13]:
# clean data
pd.options.mode.chained_assignment = None  # default='warn'

# "[$, ]" is a regex character class (dollar sign, comma, space). regex=True is
# passed explicitly because pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave values such as "$2,050.00" untouched and make
# the float cast raise.
df2['price'] = df2['price'].str.replace("[$, ]", "", regex=True).astype("float")
# "%" is a plain literal, so a non-regex replace is enough.
df2['host_response_rate'] = df2['host_response_rate'].str.replace("%", "", regex=False).astype("float")

In [14]:
# Encode the superhost flag as an integer: 1 where host_is_superhost == 't', else 0,
# then remove the original string column.
is_superhost = df2['host_is_superhost'] == 't'
df2['superhost'] = np.where(is_superhost, 1, 0)
del df2['host_is_superhost']

In [15]:
# Collect the columns still stored as object dtype (the non-numeric variables)
# and preview them; these are the ones to turn into dummies.
object_frame = df2.select_dtypes(include='object')
non_num_vars = object_frame.columns
object_frame.head()

Unnamed: 0,zipcode,property_type,room_type,cancellation_policy
0,92104,House,Entire home/apt,strict_14_with_grace_period
3,92126,House,Private room,moderate
4,92106,House,Private room,moderate
5,92104,Tiny house,Entire home/apt,strict_14_with_grace_period
7,92037,House,Private room,flexible


In [16]:
# One-hot encode every non-numeric column. The dtype is pinned so the indicators
# stay 0/1 integers (as displayed in the output below) on every pandas version;
# pandas >= 2.0 would otherwise return bool columns.
dummy_vars = pd.get_dummies(df2[non_num_vars], dtype='uint8')

In [17]:
# Preview the one-hot encoded indicator columns.
dummy_vars.head()

Unnamed: 0,zipcode_12345,zipcode_22404,zipcode_91901,zipcode_91902,zipcode_91910,zipcode_91911,zipcode_91913,zipcode_91914,zipcode_91915,zipcode_91942,...,property_type_Villa,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [18]:
# Replace the non-numeric columns of df2 with their dummy indicators,
# matching rows on the shared index.
numeric_part = df2.drop(columns=non_num_vars)
df3 = numeric_part.merge(dummy_vars, left_index=True, right_index=True)
df3.head()

Unnamed: 0,host_response_rate,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,reviews_per_month,...,property_type_Villa,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,100.0,1.0,6,2.0,3.0,4.0,295.0,145,96.0,1.17,...,0,1,0,0,0,0,0,1,0,0
3,100.0,1.0,2,1.0,1.0,1.0,72.0,187,98.0,1.81,...,0,0,1,0,0,1,0,0,0,0
4,100.0,3.0,2,1.0,1.0,1.0,75.0,136,87.0,1.41,...,0,0,1,0,0,1,0,0,0,0
5,100.0,1.0,2,1.0,1.0,1.0,69.0,329,98.0,3.35,...,0,1,0,0,0,0,0,1,0,0
7,100.0,4.0,2,1.0,1.0,1.0,135.0,161,98.0,1.74,...,0,0,1,0,1,0,0,0,0,0


In [19]:
# Split into training and test data using a seeded random permutation of the
# row positions (80% train, remainder test).
np.random.seed(1)
indices = np.random.permutation(len(df3))
train_size = int(round(0.8*len(df3)))
test_size = len(df3)-train_size

# The test slice starts at train_size (not train_size+1) so that every row
# lands in exactly one of the two sets.
train_idx = indices[:train_size]
test_idx = indices[train_size:]
assert len(test_idx) == test_size

y = df3['price']
x = df3.drop('price', axis =1)

# The splits stay attached as attributes on x / y, which is how the rest of the
# notebook refers to them.
# NOTE(review): these are plain Python attributes, not columns, and do not
# survive a copy of x / y; consider standalone variables if nothing else reads them.
x.train = x.iloc[train_idx]
y.train = y.iloc[train_idx]
x.test = x.iloc[test_idx]
y.test = y.iloc[test_idx]

# Plain ndarrays for statsmodels. as_matrix() was deprecated in pandas 0.23 and
# removed in 1.0; .values yields the same array on every version.
x2 = x.train.values
y2 = y.train.values

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [20]:
# Fit ordinary least squares of the training prices (y2) on the training
# feature matrix (x2) and print the regression summary.
# NOTE(review): no explicit constant column is added to x2; confirm that is intended.
olsmod = sm.OLS(endog=y2, exog=x2)
olsres = olsmod.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.522
Model:                            OLS   Adj. R-squared:                  0.515
Method:                 Least Squares   F-statistic:                     77.78
Date:                Thu, 13 Sep 2018   Prob (F-statistic):               0.00
Time:                        11:50:19   Log-Likelihood:                -43108.
No. Observations:                6646   AIC:                         8.640e+04
Df Residuals:                    6553   BIC:                         8.703e+04
Df Model:                          92                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.4262      0.197      2.161      0.0

In [21]:
# different method
# Seeded scikit-learn split of x / y with 25% of the rows held out for testing.
split_parts = train_test_split(x, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Random forest regression of price on the 75/25 split.
# `criterion` is left at its default (mean squared error). The literal 'mse'
# was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2, so
# passing it explicitly fails on current releases, whereas the default selects
# the same criterion on every version.
forest = RandomForestRegressor(n_estimators=500,
                               random_state=3,
                               n_jobs=-1)
forest.fit(X_train, y_train)

# Predict on both splits so train and test error can be compared side by side.
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 3275.819, test: 32966.356
R^2 train: 0.941, test: 0.471
