In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [2]:
df = pd.read_csv('../../Data/Airbnb/featurized.csv')

In [5]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)

In [7]:
df.drop('fewer_than_five', axis=1, inplace=True)

In [9]:
df.drop('comps_found', axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,bathrooms,bedrooms,price,guests_included,extra_people,minimum_nights,maximum_nights,latitude,longitude,acc_per_bed,comp_median_price
0,1,1,250,1,0,2,365,37.762674,-122.439195,2.0,170.0
1,1,1,170,1,0,2,365,37.790745,-122.415334,1.0,190.0
2,1,0,185,1,0,2,29,37.759449,-122.389447,1.5,147.5
3,1,0,125,1,0,15,38,37.792592,-122.421331,1.0,159.5
4,1,0,140,0,0,3,1125,37.791648,-122.413633,2.0,175.0


In [11]:
def fit_score_model(X_train, X_test, y_train, y_test, model):
	'''

	'''
	model.fit(X_train.values, y_train.values)
	y_pred = model.predict(X_test.values)

	mse = mean_squared_error(y_test.values, y_pred)
	mae = mean_absolute_error(y_test.values, y_pred)

	return mse, mae

In [12]:
y = df.price

In [13]:
X = df.drop('price', axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [18]:
rf = RandomForestRegressor(n_estimators=100)
lr = LinearRegression()
gdbr = GradientBoostingRegressor()
ada = AdaBoostRegressor()

models = [rf, lr, gdbr, ada]

In [19]:
for model in models:
    mse, mae = fit_score_model(X_train, X_test, y_train, y_test, model)
    print model.__class__.__name__
    print "MSE: ", mse
    print "MAE: ", mae
    print '-' * 30

RandomForestRegressor
MSE:  9889.81358109
MAE:  61.3266980146
------------------------------
LinearRegression
MSE:  8000.11434661
MAE:  54.7575146606
------------------------------
GradientBoostingRegressor
MSE:  9009.00599867
MAE:  56.5362345486
------------------------------
AdaBoostRegressor
MSE:  38668.0091212
MAE:  176.990088996
------------------------------


In [20]:
gradient_boost_grid2 = {'learning_rate': [0.05, 0.075, 0.1, 0.125, 0.15], 
						'n_estimators': [25, 40, 50, 60, 75, 100]}
gb_gridsearch = GridSearchCV(GradientBoostingRegressor(max_depth=3, min_samples_leaf=1, max_features='sqrt'),
                             gradient_boost_grid2,
                             n_jobs=-1,
                             verbose=True,
                             scoring='mean_squared_error')
gb_gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  84 out of  90 | elapsed:    1.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features='sqrt', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             random_state=None, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'n_estimators': [25, 40, 50, 60, 75, 100], 'learning_rate': [0.05, 0.075, 0.1, 0.125, 0.15]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='mean_squared_error', verbose=True)

In [21]:
gb_val = gb_gridsearch.best_estimator_

In [22]:
fit_score_model(X_train, X_test, y_train, y_test, gb_val)

(7967.8376041562524, 54.479398997593385)