In [None]:
# !pip3 install scikit-learn pandas xgboost --no-cache

In [1]:
import pandas as pd

### Read and process data

In [58]:
# Load the listings table used by every later cell. Only the "_lag" file is
# read: loading "./Total_Listings.csv" first and immediately overwriting `data`
# was a dead read that cost I/O without affecting any result.
# index_col=False keeps the CSV's first column as ordinary data (it appears as
# "Unnamed: 0" in the column listing and is dropped later).
data = pd.read_csv("./Total_Listings_lag.csv", index_col=False)

In [59]:
# Show the column names of the loaded table (displayed as the cell output).
data.columns

Index(['Unnamed: 0', 'ID', 'Price', 'lnPrice', 'latitude', 'longitude',
       'neighbourhood_cleansed', 'accommodates', 'bathrooms', 'bedrooms',
       'amenities_num', 'number_of_reviews_l30d', 'review_scores_location',
       'review_scores_value', 'availability_60', 'instant_bookable',
       'host_is_superhost', 'host_identity_verified',
       'calculated_host_listings_count', 'restaurant_1km', 'mall_dist',
       'convenience_500m', 'bus_stop_500m', 'subway_entrance_dist',
       'railway_station_dist', 'attraction_dist', 'museum_dist',
       'theatre_dist', 'w_lnPrice', 'w_accommodates', 'w_bathrooms',
       'w_bedrooms', 'w_amenities_num', 'w_number_of_reviews_l30d',
       'w_review_scores_location', 'w_review_scores_value',
       'w_availability_60', 'w_instant_bookable', 'w_host_is_superhost',
       'w_host_identity_verified', 'w_calculated_host_listings_count'],
      dtype='object')

In [60]:
# Feature table that still carries `neighbourhood_cleansed`; the row-number
# column, ID, Price and the coordinates are removed. `lnPrice` stays in the
# table and is separated out as the target when the models are scored.
data_with_neighbor = data.drop(
    columns=["Unnamed: 0", "ID", "Price", "latitude", "longitude"]
)

# Same table with the neighbourhood label removed as well.
data_wo_neighbor = data_with_neighbor.drop(columns=["neighbourhood_cleansed"])

In [None]:
# from pandas import get_dummies
# data_with_neighbor = get_dummies(data_with_neighbor, columns=['neighbourhood_cleansed'], drop_first=True, dtype=int)
# data_with_neighbor

### Spatial CV setup

In [61]:
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_validate

# Spatial CV: split into 10 folds using `neighbourhood_cleansed` as the group
# label, so rows sharing a neighbourhood are never divided between the training
# and validation side of a fold.
groups = GroupKFold(n_splits=10)

# `split` yields (train_indices, validation_indices) pairs lazily. Materialise
# them right away: a bare generator is exhausted after one pass, which made the
# cell that unpacks `kfolds` fail whenever it was re-run without re-running
# this one first.
kfolds = list(
    groups.split(
        data_with_neighbor,
        groups=data_with_neighbor["neighbourhood_cleansed"].to_numpy(),
    )
)

In [62]:
# Transpose the sequence of (train, validation) pairs into two parallel lists:
# one holding every fold's training indices, one holding the validation indices.
train_idx, val_idx = map(list, zip(*kfolds))

In [63]:
# Re-pair the per-fold index lists into a reusable list of (train, validation)
# tuples that can be handed to `cross_validate` through its `cv` argument.
cv_partition = list(zip(train_idx, val_idx))

In [64]:
def print_results(results: dict):
    """Print the mean of every ``test_*`` entry of a results mapping.

    Parameters
    ----------
    results : dict
        Mapping from string keys to objects exposing ``.mean()`` (the cells in
        this notebook pass the output of ``cross_validate``). Keys that do not
        start with ``"test_"`` are skipped.

    Each selected entry is printed as ``<key>: <mean>`` with five decimals.
    Nothing is returned.
    """
    test_means = (
        (name, scores.mean())
        for name, scores in results.items()
        if name.startswith("test_")
    )
    for name, mean_score in test_means:
        print(f"{name}: {mean_score:.5f}")

### ML with sklearn

In [65]:
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

#### Lasso

In [66]:
# Lasso regressor with a small fixed penalty; the seed is pinned for repeatable fits.
lasso = Lasso(alpha=5e-5, random_state=17)

In [67]:
# Lasso scored with cv=10 (an integer fold count, no neighbourhood grouping).
# Features are every column of `data_wo_neighbor` except the target `lnPrice`.
lasso_res_cv = cross_validate(
    estimator=lasso,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=10,
    n_jobs=-1,
)

print_results(lasso_res_cv)

test_neg_mean_absolute_error: -0.28501
test_neg_mean_squared_error: -0.14170
test_neg_root_mean_squared_error: -0.37621
test_r2: 0.58928


In [68]:
# Same Lasso model, scored on the precomputed neighbourhood-grouped folds
# stored in `cv_partition` instead of an integer fold count.
lasso_res = cross_validate(
    estimator=lasso,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=cv_partition,
    n_jobs=-1,
)

print_results(lasso_res)

test_neg_mean_absolute_error: -0.28534
test_neg_mean_squared_error: -0.14207
test_neg_root_mean_squared_error: -0.37636
test_r2: 0.53971


#### Decision Tree

In [73]:
# Single regression tree constrained only through `ccp_alpha`.
# Alternative constraints left disabled in this cell:
#   min_samples_split=20, max_depth=7, max_features=0.8
decision_tree = DecisionTreeRegressor(ccp_alpha=0.00017, random_state=17)

In [74]:
# Decision tree scored with cv=10 (integer fold count, no neighbourhood grouping).
decision_tree_res_cv = cross_validate(
    estimator=decision_tree,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=10,
    n_jobs=-1,
)

print_results(decision_tree_res_cv)

test_neg_mean_absolute_error: -0.30305
test_neg_mean_squared_error: -0.15877
test_neg_root_mean_squared_error: -0.39836
test_r2: 0.53936


In [75]:
# Decision tree scored on the neighbourhood-grouped folds in `cv_partition`.
decision_tree_res = cross_validate(
    estimator=decision_tree,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=cv_partition,
    n_jobs=-1,
)

print_results(decision_tree_res)

test_neg_mean_absolute_error: -0.30801
test_neg_mean_squared_error: -0.16389
test_neg_root_mean_squared_error: -0.40411
test_r2: 0.46961


#### Bagging

In [76]:
# Bagging ensemble: 100 estimators, each fitted on a 70% sample of the rows.
# NOTE(review): the ensemble asks for n_jobs=-1 and the cross_validate calls
# that use it do too — confirm the nested parallelism is intended.
bagging = BaggingRegressor(
    n_estimators=100,
    max_samples=0.7,
    n_jobs=-1,
    random_state=17,
)

In [77]:
# Bagging scored with cv=10 (integer fold count, no neighbourhood grouping).
bagging_res_cv = cross_validate(
    estimator=bagging,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=10,
    n_jobs=-1,
)

print_results(bagging_res_cv)

test_neg_mean_absolute_error: -0.26846
test_neg_mean_squared_error: -0.12704
test_neg_root_mean_squared_error: -0.35630
test_r2: 0.63150


In [78]:
# Bagging scored on the neighbourhood-grouped folds in `cv_partition`.
bagging_res = cross_validate(
    estimator=bagging,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=cv_partition,
    n_jobs=-1,
)

# test_r2 noted by the author for this cell: 0.56131
print_results(bagging_res)

test_neg_mean_absolute_error: -0.27845
test_neg_mean_squared_error: -0.13496
test_neg_root_mean_squared_error: -0.36702
test_r2: 0.56131


#### Random Forest

In [79]:
# Random forest of 1000 trees; each split considers 6 features and a node needs
# at least 11 samples to be split further.
# The variable keeps its existing spelling ("randon") because the evaluation
# cells below refer to it by that name.
randon_forest = RandomForestRegressor(
    n_estimators=1000,
    max_features=6,
    min_samples_split=11,
    random_state=17,
)

In [80]:
# Random forest scored with cv=10 (integer fold count, no neighbourhood grouping).
randon_forest_res_cv = cross_validate(
    estimator=randon_forest,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=10,
    n_jobs=-1,
)
print_results(randon_forest_res_cv)

test_neg_mean_absolute_error: -0.26828
test_neg_mean_squared_error: -0.12712
test_neg_root_mean_squared_error: -0.35640
test_r2: 0.63141


In [81]:
# Random forest scored on the neighbourhood-grouped folds in `cv_partition`.
randon_forest_res = cross_validate(
    estimator=randon_forest,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=cv_partition,
    n_jobs=-1,
)

# test_r2 noted from an earlier run: 0.569830 (the printed output is authoritative)
print_results(randon_forest_res)

test_neg_mean_absolute_error: -0.27787
test_neg_mean_squared_error: -0.13497
test_neg_root_mean_squared_error: -0.36697
test_r2: 0.56196


### XGBoost

In [69]:
from xgboost import XGBRegressor

In [70]:
# Gradient-boosted trees: 400 rounds at learning rate 0.05, histogram tree
# method with loss-guided growth; half of the columns are sampled per tree.
xgb = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    tree_method="hist",
    grow_policy="lossguide",
    max_bin=90,
    gamma=0.02,
    colsample_bytree=0.5,
    random_state=17,
)

In [71]:
# XGBoost scored with cv=10 (integer fold count, no neighbourhood grouping).
xgb_res_cv = cross_validate(
    estimator=xgb,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=10,
    n_jobs=-1,
)

print_results(xgb_res_cv)

test_neg_mean_absolute_error: -0.25589
test_neg_mean_squared_error: -0.11500
test_neg_root_mean_squared_error: -0.33903
test_r2: 0.66622


In [72]:
# XGBoost scored on the neighbourhood-grouped folds in `cv_partition`.
xgb_res = cross_validate(
    estimator=xgb,
    X=data_wo_neighbor.drop(columns="lnPrice"),
    y=data_wo_neighbor["lnPrice"],
    scoring=(
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    cv=cv_partition,
    n_jobs=-1,
)

# test_r2 noted from an earlier run: 0.609136 (the printed output is authoritative)
print_results(xgb_res)

test_neg_mean_absolute_error: -0.26220
test_neg_mean_squared_error: -0.11969
test_neg_root_mean_squared_error: -0.34576
test_r2: 0.60960
