In [254]:
from math import sqrt

import altair as alt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy.stats import zscore, uniform
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

alt.renderers.enable('notebook')
alt.data_transformers.enable('json')

In [133]:
# Load the raw listings and drop the columns that are excluded from modelling.
columns_to_drop = ['id','name', 'host_id', 'host_name','last_review']
airbnb_ny = pd.read_csv('../data/raw_data.csv').drop(columns=columns_to_drop)
# Missing reviews_per_month values are replaced with 0.
airbnb_ny = airbnb_ny.fillna({'reviews_per_month':0})
# Separate the target (price) from the predictors.
y = airbnb_ny.price
X = airbnb_ny.drop(columns=['price'])
# Hold out 20% of the rows as the test split; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=1,
)

In [134]:
# combine X and y for test and train respectively
# Re-attach the target column to each split so rows can be filtered together with their price.
full_train = pd.concat([X_train, y_train], axis='columns')
full_test = pd.concat([X_test, y_test], axis='columns')

In [135]:
# Preview the first rows of the combined (features + price) training frame.
full_train.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
21838,Manhattan,East Village,40.72208,-73.98109,Private room,1,116,4.07,2,238,73
47539,Manhattan,Nolita,40.72206,-73.99695,Entire home/apt,4,0,0.0,1,317,189
25386,Staten Island,Fort Wadsworth,40.59546,-74.06092,Entire home/apt,7,0,0.0,3,365,800
24173,Manhattan,Upper East Side,40.77854,-73.94984,Entire home/apt,31,7,0.3,33,345,265
27875,Brooklyn,Bedford-Stuyvesant,40.69854,-73.94069,Private room,2,4,0.21,1,0,50


In [136]:
# Show column dtypes and non-null counts of the training frame.
full_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39116 entries, 21838 to 33003
Data columns (total 11 columns):
neighbourhood_group               39116 non-null object
neighbourhood                     39116 non-null object
latitude                          39116 non-null float64
longitude                         39116 non-null float64
room_type                         39116 non-null object
minimum_nights                    39116 non-null int64
number_of_reviews                 39116 non-null int64
reviews_per_month                 39116 non-null float64
calculated_host_listings_count    39116 non-null int64
availability_365                  39116 non-null int64
price                             39116 non-null int64
dtypes: float64(3), int64(5), object(3)
memory usage: 3.6+ MB


In [137]:
# (rows, columns) of the training frame at this point in the notebook.
full_train.shape

(39116, 11)

In [138]:
# Keep only training rows whose price z-score is strictly below 3 in absolute value.
price_z = zscore(full_train['price'])
full_train = full_train.loc[np.abs(price_z) < 3]

In [139]:
# (rows, columns) of the training frame at this point in the notebook.
full_train.shape

(38799, 11)

In [140]:
# Split the combined frames back into predictors (X_*) and target (y_*).
y_train = full_train['price']
X_train = full_train.drop(columns=['price'])
y_test = full_test['price']
X_test = full_test.drop(columns=['price'])

In [141]:
# Names of the columns that are integer-encoded before model fitting.
categorical_features = ['neighbourhood_group', 'neighbourhood', 'room_type']

In [142]:
# Integer-encode every categorical column.
# The encoder is fitted on the training split only, so the mapping is defined
# purely by categories seen during training.
for feature in categorical_features:
  le = LabelEncoder()
  le.fit(X_train[feature])
  X_train[feature] = le.transform(X_train[feature])
  # LabelEncoder.transform raises ValueError on labels it was not fitted on.
  # A category can be missing from the training split, so encode the known
  # test labels and give every unseen one the sentinel code -1.
  known = X_test[feature].isin(le.classes_).values
  encoded = np.full(len(X_test), -1, dtype=np.int64)
  encoded[known] = le.transform(X_test.loc[known, feature])
  X_test[feature] = encoded

In [143]:
# Preview the first rows of the training features after categorical encoding.
X_train.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
21838,2,64,40.72208,-73.98109,1,1,116,4.07,2,238
47539,2,145,40.72206,-73.99695,0,4,0,0.0,1,317
25386,4,82,40.59546,-74.06092,0,7,0,0.0,3,365
24173,2,201,40.77854,-73.94984,0,31,7,0.3,33,345
27875,1,13,40.69854,-73.94069,1,2,4,0.21,1,0


In [144]:
# Baseline model: always predicts the mean of the training target.
null_model = DummyRegressor(strategy='mean')

In [145]:
# Fit the baseline on the training split.
null_model.fit(X_train, y_train)

DummyRegressor(constant=None, quantile=None, strategy='mean')

The RMSE (root mean squared error) of the null model on the test set is:

In [220]:
# Test-set RMSE of the baseline: square root of the mean squared error.
null_predictions = null_model.predict(X_test)
sqrt(mean_squared_error(y_test, null_predictions))

253.5912373012752

In [147]:
# Seed passed as `random_state` to the estimators built in later cells.
random_state = 0

In [167]:
# NOTE(review): looks like a leftover scratch cell — the value is neither assigned
# nor used by any cell visible here; consider deleting it.
list(range(1, 5))

[1, 2, 3, 4]

In [202]:
# Hyper-parameter grid searched for the random forest (3 x 2 x 2 = 12 candidates).
random_forest_tuning_parameters = dict(
  max_depth=[10, 50, None],
  min_samples_split=[5, 20],
  n_estimators=[600, 1500],
)

In [203]:
# Grid search for the random forest: 4-fold CV scored by negative mean squared error.
rf = GridSearchCV(
  estimator=RandomForestRegressor(random_state=random_state),
  param_grid=random_forest_tuning_parameters,
  scoring='neg_mean_squared_error',
  cv=4,
  n_jobs=-1,
  verbose=2,
)

In [204]:
# Run the random-forest grid search on the training split.
# Slow cell: the recorded run below took about 23.6 minutes.
rf.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 23.6min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=0,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [10, 50, None],
       

In [262]:
def print_model_info(grid_model, model_name, X=None, y=None):
  """Print evaluation metrics, best parameters and feature importances.

  Parameters
  ----------
  grid_model
      Fitted search object exposing ``best_estimator_`` and ``best_params_``
      (this notebook passes fitted ``GridSearchCV`` instances). The best
      estimator must expose ``feature_importances_``.
  model_name : str
      Label printed in the header line.
  X, y : optional
      Features and target to evaluate on. When omitted, the notebook-level
      ``X_test`` and ``y_test`` are used, which is what this function
      always evaluated on before these parameters existed.

  Returns
  -------
  None
      Everything is written to stdout.
  """
  if X is None:
    X = X_test
  if y is None:
    y = y_test

  best_model = grid_model.best_estimator_
  # Predict once and reuse the result for both metrics.
  predictions = best_model.predict(X)

  print(model_name + ' info:')
  print('')
  print('RMSE:')
  print(sqrt(mean_squared_error(y, predictions)))
  print('R2:')
  print(r2_score(y, predictions))
  print('')
  print('Best params: ')
  print(grid_model.best_params_)
  # Label each importance with the name of the column it belongs to.
  feature_importance_series = pd.Series(best_model.feature_importances_, index=X.columns)
  print('')
  print('Feature importance:')
  print(feature_importance_series.sort_values(ascending=False))

In [263]:
# Report metrics, best parameters and feature importances for the tuned random forest.
print_model_info(rf, 'Random Forest Regressor')

Random Forest Regressor info:

RMSE:
238.07890867012168
R2:
0.11633635322591229

Best params: 
{'max_depth': 50, 'min_samples_split': 20, 'n_estimators': 1500}

Feature importance:
room_type                         0.350602
longitude                         0.189770
latitude                          0.140359
availability_365                  0.094447
minimum_nights                    0.057287
reviews_per_month                 0.051639
number_of_reviews                 0.045903
calculated_host_listings_count    0.038926
neighbourhood                     0.029932
neighbourhood_group               0.001135
dtype: float64


In [237]:
# Hyper-parameter grid searched for LightGBM (5 x 5 x 3 = 75 candidates).
light_regressor_tuning_parameters = dict(
  min_data_in_leaf=[100, 300, 500, 1000, 1500],
  num_leaves=[15, 30, 40, 50, 60],
  max_depth=[15, 30, 45],
)

In [238]:
# Grid search for LightGBM: 4-fold CV scored by negative mean squared error.
light_reg = GridSearchCV(
  estimator=LGBMRegressor(random_state=random_state),
  param_grid=light_regressor_tuning_parameters,
  scoring='neg_mean_squared_error',
  cv=4,
  n_jobs=-1,
  verbose=2,
)

In [239]:
# Run the LightGBM grid search on the training split (recorded run below: about 37 seconds).
light_reg.fit(X_train, y_train)

Fitting 4 folds for each of 75 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   37.0s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=0,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [15, 30, 45],
                         'min_data_in_leaf': [100, 300, 500, 1000, 1500],
                         'num_leaves': [15, 30, 4

In [253]:
# Report metrics, best parameters and feature importances for the tuned LightGBM model.
print_model_info(light_reg, 'LightGBM Regressor')

LightGBM Regressor info:

RMSE:
238.11788393760136

Best params: 
{'max_depth': 15, 'min_data_in_leaf': 100, 'num_leaves': 50}

Feature importance:
longitude                         975
latitude                          832
availability_365                  764
minimum_nights                    471
reviews_per_month                 452
number_of_reviews                 427
calculated_host_listings_count    402
neighbourhood                     374
room_type                         178
neighbourhood_group                14
dtype: int64


In [247]:
# Hyper-parameter grid searched for XGBoost (3 x 3 x 3 = 27 candidates).
xgb_regressor_tuning_parameters = dict(
  max_depth=[5, 7, 10],
  colsample_bytree=[0.6, 0.7, 0.8],
  n_estimators=[500, 1000, 1500],
)

In [248]:
# Grid search for XGBoost: 4-fold CV scored by negative mean squared error.
xgb_reg = GridSearchCV(
  estimator=XGBRegressor(random_state=random_state),
  param_grid=xgb_regressor_tuning_parameters,
  scoring='neg_mean_squared_error',
  cv=4,
  n_jobs=-1,
  verbose=2,
)

In [249]:
# Run the XGBoost grid search on the training split.
# Slow cell: the recorded run below took about 19 minutes.
xgb_reg.fit(X_train, y_train)

Fitting 4 folds for each of 27 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 19.0min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.6, 0.7, 0.8],
                         'max_depth': [5, 7, 10],
            

In [250]:
# Report metrics, best parameters and feature importances for the tuned XGBoost model.
print_model_info(xgb_reg, 'XGBoost Regressor')

XGBoost Regressor info:

MSE:
237.98357948893192

Best params: 
{'colsample_bytree': 0.6, 'max_depth': 5, 'n_estimators': 500}

Feature importance:
room_type                         0.550407
neighbourhood_group               0.109330
longitude                         0.059549
calculated_host_listings_count    0.053424
availability_365                  0.051309
latitude                          0.046532
minimum_nights                    0.046059
number_of_reviews                 0.032204
neighbourhood                     0.029781
reviews_per_month                 0.021406
dtype: float32
