In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

%matplotlib inline

In [39]:
df = pd.read_csv('../../Data/Airbnb/featurized.csv')

In [40]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)

In [41]:
df.drop('fewer_than_five', axis=1, inplace=True)

In [42]:
df.drop('comps_found', axis=1, inplace=True)

In [50]:
df.drop('maximum_nights', axis=1, inplace=True)

In [51]:
df.head()

Unnamed: 0,bathrooms,bedrooms,price,guests_included,extra_people,minimum_nights,latitude,longitude,acc_per_bed,Couch,Futon,Pull-out Sofa,Real Bed,comp_median_price
0,1,1,250,1,0,2,37.762674,-122.439195,2.0,0,0,0,1,170.0
1,1,1,170,1,0,2,37.790745,-122.415334,1.0,0,0,0,1,190.0
2,1,0,185,1,0,2,37.759449,-122.389447,1.5,0,0,0,1,147.5
3,1,0,125,1,0,15,37.792592,-122.421331,1.0,0,0,0,1,159.5
4,1,0,140,0,0,3,37.791648,-122.413633,2.0,0,0,0,1,175.0


In [11]:
def fit_score_model(X_train, X_test, y_train, y_test, model):
	'''

	'''
	model.fit(X_train.values, y_train.values)
	y_pred = model.predict(X_test.values)

	mse = mean_squared_error(y_test.values, y_pred)
	mae = mean_absolute_error(y_test.values, y_pred)

	return mse, mae

In [52]:
y = df.price

In [53]:
X = df.drop('price', axis=1)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [55]:
X_train_gps = X_train[['latitude', 'longitude']]
X_test_gps = X_test[['latitude', 'longitude']]
X_train.drop(['latitude', 'longitude'], axis=1, inplace=True)
X_test.drop(['latitude', 'longitude'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [56]:
rf = RandomForestRegressor(n_estimators=100)
lr = LinearRegression()
gdbr = GradientBoostingRegressor()
ada = AdaBoostRegressor()

models = [rf, lr, gdbr, ada]

In [57]:
for model in models:
    mse, mae = fit_score_model(X_train, X_test, y_train, y_test, model)
    print model.__class__.__name__
    print "MSE: ", mse
    print "MAE: ", mae
    print '-' * 30

RandomForestRegressor
MSE:  10788.6672817
MAE:  62.6395356106
------------------------------
LinearRegression
MSE:  8591.38803469
MAE:  55.3756131399
------------------------------
GradientBoostingRegressor
MSE:  8844.71533545
MAE:  56.530929383
------------------------------
AdaBoostRegressor
MSE:  44177.4963047
MAE:  192.446337312
------------------------------


In [59]:
X_train.columns

Index([u'bathrooms', u'bedrooms', u'guests_included', u'extra_people',
       u'minimum_nights', u'acc_per_bed', u'Couch', u'Futon', u'Pull-out Sofa',
       u'Real Bed', u'comp_median_price'],
      dtype='object')

In [60]:
gradient_boost_grid2 = {'learning_rate': [0.05, 0.075, 0.1, 0.125, 0.15], 
						'n_estimators': [25, 40, 50, 60, 75, 100]}
gb_gridsearch = GridSearchCV(GradientBoostingRegressor(max_depth=3, min_samples_leaf=1, max_features='sqrt'),
                             gradient_boost_grid2,
                             n_jobs=-1,
                             verbose=True,
                             scoring='mean_squared_error')
gb_gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  84 out of  90 | elapsed:    1.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features='sqrt', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             random_state=None, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'n_estimators': [25, 40, 50, 60, 75, 100], 'learning_rate': [0.05, 0.075, 0.1, 0.125, 0.15]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='mean_squared_error', verbose=True)

In [61]:
gb_val = gb_gridsearch.best_estimator_

In [62]:
fit_score_model(X_train, X_test, y_train, y_test, gb_val)

(8690.6258178583703, 56.636666920445471)

In [70]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [83]:
X_train_int = sm.add_constant(X_train)
lr2 = sm.OLS(y_train, X_train_int)
lr2 = lr2.fit()

In [84]:
lr2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.527
Model:,OLS,Adj. R-squared:,0.525
Method:,Least Squares,F-statistic:,224.9
Date:,"Tue, 29 Sep 2015",Prob (F-statistic):,0.0
Time:,13:39:55,Log-Likelihood:,-13456.0
No. Observations:,2233,AIC:,26940.0
Df Residuals:,2221,BIC:,27010.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-43.0363,53.008,-0.812,0.417,-146.986 60.914
bathrooms,64.9555,5.276,12.311,0.000,54.609 75.302
bedrooms,-2.9776,3.994,-0.746,0.456,-10.809 4.854
guests_included,-1.5555,1.822,-0.854,0.393,-5.128 2.017
extra_people,0.1342,0.102,1.312,0.190,-0.066 0.335
minimum_nights,-0.9968,0.383,-2.604,0.009,-1.748 -0.246
acc_per_bed,5.2501,2.675,1.962,0.050,0.004 10.496
Couch,321.2843,89.377,3.595,0.000,146.013 496.556
Futon,-15.2421,68.261,-0.223,0.823,-149.104 118.619

0,1,2,3
Omnibus:,1346.513,Durbin-Watson:,1.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25433.756
Skew:,2.483,Prob(JB):,0.0
Kurtosis:,18.77,Cond. No.,14300.0


In [103]:
X_train_int2 = X_train_int.drop(1861, axis=0) # couch outlier
X_train_int2 = X_train_int2.drop(['extra_people', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed'], axis=1)

In [106]:
y_train2 = y_train.drop(1861, axis=0) # couch outlier
X_test_int = sm.add_constant(X_test)
X_test_int2 = X_test_int.drop(['extra_people', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed'], axis=1)

In [108]:
lr3 = sm.OLS(y_train2, X_train_int2)
lr3 = lr3.fit()

In [110]:
lr3.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.527
Model:,OLS,Adj. R-squared:,0.526
Method:,Least Squares,F-statistic:,413.4
Date:,"Tue, 29 Sep 2015",Prob (F-statistic):,0.0
Time:,14:05:04,Log-Likelihood:,-13440.0
No. Observations:,2232,AIC:,26890.0
Df Residuals:,2225,BIC:,26930.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-30.7255,8.901,-3.452,0.001,-48.180 -13.271
bathrooms,64.8118,5.246,12.354,0.000,54.524 75.100
bedrooms,-3.0507,3.957,-0.771,0.441,-10.811 4.710
guests_included,-1.0309,1.604,-0.643,0.520,-4.176 2.114
minimum_nights,-0.9986,0.380,-2.625,0.009,-1.745 -0.253
acc_per_bed,5.4661,2.506,2.181,0.029,0.552 10.380
comp_median_price,0.8287,0.038,21.606,0.000,0.754 0.904

0,1,2,3
Omnibus:,1360.418,Durbin-Watson:,1.978
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26014.637
Skew:,2.516,Prob(JB):,0.0
Kurtosis:,18.95,Cond. No.,1120.0


In [111]:
X_train_int3 = X_train_int2.drop(['guests_included', 'bedrooms'], axis=1)

In [112]:
X_test_int3 = X_test_int2.drop(['guests_included', 'bedrooms'], axis=1)

In [113]:
lr4 = sm.OLS(y_train2, X_train_int3)
lr4 = lr4.fit()

In [114]:
lr4.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.527
Model:,OLS,Adj. R-squared:,0.526
Method:,Least Squares,F-statistic:,619.9
Date:,"Tue, 29 Sep 2015",Prob (F-statistic):,0.0
Time:,14:08:04,Log-Likelihood:,-13441.0
No. Observations:,2232,AIC:,26890.0
Df Residuals:,2227,BIC:,26920.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-29.7564,8.537,-3.486,0.001,-46.498 -13.015
bathrooms,63.1846,4.979,12.690,0.000,53.420 72.949
minimum_nights,-0.9952,0.380,-2.617,0.009,-1.741 -0.249
acc_per_bed,5.5771,2.499,2.231,0.026,0.676 10.478
comp_median_price,0.8036,0.027,29.488,0.000,0.750 0.857

0,1,2,3
Omnibus:,1366.679,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26144.399
Skew:,2.533,Prob(JB):,0.0
Kurtosis:,18.983,Cond. No.,1060.0


In [116]:
y_pred = lr4.predict(X_test_int3)

In [117]:
mean_absolute_error(y_test, y_pred)

56.019428994147617

In [120]:
rf2 = RandomForestRegressor(n_estimators=100)
rf2.fit(X_train_int3.drop('const', axis=1), y_train2)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [123]:
y_pred2 = rf2.predict(X_test_int3.drop('const', axis=1))


In [124]:
mean_absolute_error(y_test, y_pred2)

65.964973444106093

In [125]:
gdbr2 = GradientBoostingRegressor()
gdbr2.fit(X_train_int3.drop('const', axis=1), y_train2)
y_pred3 = gdbr2.predict(X_test_int3.drop('const', axis=1))
mean_absolute_error(y_test, y_pred3)

57.372872268431244

In [126]:
mean_absolute_error(X_test_int3.comp_median_price, y_test)

55.444096133751309

## Is it better to use KMeans Clustering?

In [85]:
df2 = pd.read_csv('../../Data/Airbnb/featurized_clusters.csv')

In [86]:
df2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)

In [87]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3190 entries, 0 to 3189
Data columns (total 14 columns):
bathrooms               3190 non-null float64
bedrooms                3190 non-null float64
price                   3190 non-null float64
guests_included         3190 non-null int64
extra_people            3190 non-null float64
minimum_nights          3190 non-null int64
maximum_nights          3190 non-null int64
acc_per_bed             3190 non-null float64
Couch                   3190 non-null float64
Futon                   3190 non-null float64
Pull-out Sofa           3190 non-null float64
Real Bed                3190 non-null float64
cluster                 3190 non-null int64
cluster_median_price    3190 non-null float64
dtypes: float64(10), int64(4)
memory usage: 373.8 KB


In [88]:
y2 = df2.price
X2 = df2.drop('price', axis=1)

In [89]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3)

In [90]:
for model in models:
    mse, mae = fit_score_model(X2_train, X2_test, y2_train, y2_test, model)
    print model.__class__.__name__
    print "MSE: ", mse
    print "MAE: ", mae
    print '-' * 30

RandomForestRegressor
MSE:  12956.9644413
MAE:  68.8599371635
------------------------------
LinearRegression
MSE:  12330.5331148
MAE:  68.5498276702
------------------------------
GradientBoostingRegressor
MSE:  11859.2862131
MAE:  64.7488683751
------------------------------
AdaBoostRegressor
MSE:  16072.7206139
MAE:  88.7193042473
------------------------------


In [91]:
models

[RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0, warm_start=False),
 AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
          n_estimators=50, random_state=None)]

In [100]:
y_train[X_train.Futon > 0]

272     125
981     115
1220    170
1266     87
2116    200
Name: price, dtype: float64