In [None]:
import pickle
import numpy as np
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
import joblib
from sklearn import linear_model
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Negated-MSE scorer: greater_is_better=False tells scikit-learn to flip the sign
# so that "larger is better" still holds during model selection.
# NOTE(review): no cell visible in this notebook references `scorer`; the searches
# below pass scoring='neg_mean_squared_error' instead -- confirm it is still needed.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

def calculate_rrmse(model, x_test, y_test, y_train):
    """Return the relative RMSE, in percent, of ``model`` on one data split.

    The score is ``100 * sqrt(SSE_model / SSE_naive)`` where the naive baseline
    always predicts the mean of ``y_train``. A value below 100 means the model
    beats that baseline on the split; 100 means it is no better.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(x_test)``.
    x_test : array-like or sparse matrix
        Features of the split being scored (passed straight to ``predict``).
    y_test : array-like
        Targets of the split being scored. May be 1-D or a column vector.
    y_train : array-like
        Training targets; only their mean is used.

    Returns
    -------
    numpy.float64
        The relative RMSE. If every target equals the training mean the naive
        SSE is zero and the result is ``inf``/``nan`` (numpy emits a warning).
    """
    # Flatten both sides so a column-vector target cannot broadcast against the
    # 1-D predictions into an (n, n) matrix.
    preds = np.asarray(model.predict(x_test), dtype=float).ravel()
    targets = np.asarray(y_test, dtype=float).ravel()

    y_bar_train = np.mean(y_train)

    # np.sum is vectorised; the builtin sum() iterates element by element.
    naive_sse = np.sum((y_bar_train - targets) ** 2)
    model_sse = np.sum((preds - targets) ** 2)

    return 100 * np.sqrt(model_sse / naive_sse)

def get_results(model):
    """Print the relative RMSE of ``model`` on the train, validation and test splits.

    Reads the notebook-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test`` variables, so a data-loading cell must have run first.
    Each feature matrix is converted with ``.tocsr()`` before prediction, and
    ``y_train`` supplies the naive baseline for all three scores.
    """
    train_rrmse = calculate_rrmse(model, X_train.tocsr(), y_train, y_train)
    print(train_rrmse)

    val_rrmse = calculate_rrmse(model, X_val.tocsr(), y_val, y_train)
    print(val_rrmse)

    test_rrmse = calculate_rrmse(model, X_test.tocsr(), y_test, y_train)
    print(test_rrmse)

def _load_split(location, data_version, split):
    """Unpickle ``pkls/{location}_{data_version}{split}.pkl`` and return its contents."""
    path = 'pkls/{}_{}{}.pkl'.format(location, data_version, split)
    # pickle.load can execute arbitrary code: only point this at artefacts
    # produced by this project, never at files from an untrusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _model_factories():
    """Map each supported model name to the class that builds it with defaults."""
    return {
        'SVR': LinearSVR,
        'Linear_Regression': linear_model.LinearRegression,
        'XGBoost': XGBRegressor,
        'RandomForest': RandomForestRegressor,
    }


def train_model(location, data_version, models=None):
    """Fit default-configured regressors on one dataset variant and report rRMSE.

    For every requested model the function fits on the training split, saves
    the fitted estimator to ``model_results/{location}_{data_version}_{model}.pkl``
    and prints the relative RMSE on the train, validation and test splits
    (in that order).

    Parameters
    ----------
    location : str
        Dataset prefix used in the pickle filenames, e.g. ``'SG'``.
    data_version : str
        Variant tag spliced directly into the filenames, so it must include its
        trailing underscore (e.g. ``'no_words_'``). ``''`` selects the full
        ("all in") dataset.
    models : str or iterable of str, optional
        Names of the models to train, drawn from ``'SVR'``,
        ``'Linear_Regression'``, ``'XGBoost'`` and ``'RandomForest'``.
        ``None`` (the default) trains all four, in that order. Pass
        ``['RandomForest']`` to train only the random forest.

    Raises
    ------
    ValueError
        If ``models`` contains a name that is not supported. Raised before any
        data is loaded or any model is trained.
    """
    factories = _model_factories()

    if models is None:
        model_names = ['SVR', 'Linear_Regression', 'XGBoost', 'RandomForest']
    elif isinstance(models, str):
        # A bare string would otherwise be iterated character by character.
        model_names = [models]
    else:
        model_names = list(models)

    unknown = [name for name in model_names if name not in factories]
    if unknown:
        raise ValueError(
            'unknown model name(s) {}; expected any of {}'.format(unknown, sorted(factories))
        )

    X_train = _load_split(location, data_version, 'x_train')
    y_train = _load_split(location, data_version, 'y_train')
    X_val = _load_split(location, data_version, 'x_val')
    y_val = _load_split(location, data_version, 'y_val')
    X_test = _load_split(location, data_version, 'x_test')
    y_test = _load_split(location, data_version, 'y_test')

    for mod in model_names:
        estimator = factories[mod]()
        estimator.fit(X_train, y_train)
        joblib.dump(estimator, 'model_results/{}_{}_{}.pkl'.format(location, data_version, mod))

        print('training model {}'.format(mod))
        print("training location {}".format(location))
        if data_version != '':
            print("dataset version {}".format(data_version))
        else:
            print("dataset all in")

        # y_train is passed every time because it defines the naive baseline.
        print(calculate_rrmse(estimator, X_train.tocsr(), y_train, y_train))
        print(calculate_rrmse(estimator, X_val.tocsr(), y_val, y_train))
        print(calculate_rrmse(estimator, X_test.tocsr(), y_test, y_train))
        
def get_feature_impt(model,feat_dict):
    """Rank features by ``model.feature_importances_``, largest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a 1-D ``feature_importances_`` array.
    feat_dict : mapping
        Maps a feature's column index to its human-readable name.

    Returns
    -------
    list of [name, importance]
        One pair per feature, ordered from highest to lowest importance.
    """
    # Read the attribute once: it may be a computed property, so repeated
    # access can redo the whole importance calculation.
    importance = model.feature_importances_
    # Negating before argsort yields indices from highest to lowest importance.
    order = (-importance).argsort()
    return [[feat_dict[i], importance[i]] for i in order]
    
def get_coef_impt(model,feat_dict):
    """Rank features by ``model.coef_``, largest coefficient first.

    Parameters
    ----------
    model : object
        Fitted linear estimator exposing a 1-D ``coef_`` array.
    feat_dict : mapping
        Maps a feature's column index to its human-readable name.

    Returns
    -------
    list of [name, coefficient]
        One pair per feature, ordered from the largest to the smallest
        coefficient.
    """
    # Read the attribute once rather than re-fetching it for every lookup.
    importance = model.coef_
    # Negating before argsort yields indices from largest to smallest value.
    # NOTE(review): the ordering uses the signed coefficient, so strongly
    # negative weights sort last -- rank by abs() if magnitude is what matters.
    order = (-importance).argsort()
    return [[feat_dict[i], importance[i]] for i in order]

        

### Version 1: All in

In [None]:
train_model('SG','')

training model SVR
training location SG
dataset all in
42.69876455233248
52.834251553219346
58.25940223406434
training model Linear_Regression
training location SG
dataset all in
1.1536587972911667
140.73588736873003
124.56726525746191
training model XGBoost
training location SG
dataset all in
7.930099790663078
54.03484930159609
55.77631643540513
training model RandomForest
training location SG
dataset all in
21.984807612421996
53.05741847998628
58.38612143086532


In [None]:
# Index -> feature-name lookup for the SG dataset.
with open('pkls/SG_feat_dict.pkl', 'rb') as f:
    feat_dict = pickle.load(f)

# (heading, saved model, ranking function): the linear model is ranked by its
# coefficients, the tree ensembles by their feature importances.
sg_reports = (
    ("SVR top 10 features", 'model_results/SG__SVR.pkl', get_coef_impt),
    ("XGBoost top 10 features", 'model_results/SG__XGBoost.pkl', get_feature_impt),
    ("RandomForest top 10 features", 'model_results/SG__RandomForest.pkl', get_feature_impt),
)
for heading, model_path, rank_features in sg_reports:
    print(heading)
    model = joblib.load(model_path)
    print(rank_features(model, feat_dict)[:10])

SVR top 10 features
[['Categorical property_type entire villa', 1.946479745140088], ['Word bathtub', 1.3257295491073855], ['Word threebedroom', 1.2709613953755985], ['Categorical neighbourhood_cleansed southern island', 1.2700201489225331], ['Word universal', 1.213768451044867], ['Word holiday', 1.0981101564649631], ['Word rice', 1.0745539762402585], ['Word wisma', 1.0263933600324604], ['Categorical neighbourhood_cleansed marina south', 1.0], ['Word sqm', 0.9901574507109934]]
XGBoost top 10 features
[['Numerical bedrooms', 0.10920517], ['Categorical amenities_str gym', 0.04094915], ['Categorical amenities_str shower gel', 0.037675034], ['Word shared', 0.028727882], ['Word temperature', 0.023702735], ['Word mattress', 0.022554306], ['Numerical minimum_nights', 0.020395033], ['Categorical amenities_str conditioner', 0.016125493], ['Categorical amenities_str high chair', 0.01588239], ['Categorical amenities_str pool', 0.0150096845]]
RandomForest top 10 features
[['Numerical bedrooms', 0.2

In [None]:
train_model('NY','')

training model SVR
training location NY
dataset all in
61.85768001519122
65.93770235493153
66.4723869981451
training model Linear_Regression
training location NY
dataset all in
52.50263484948599
70.361566644142
71.6748231969928
training model XGBoost
training location NY
dataset all in
34.69684430400049
59.9712056339329
61.64000617018348


training model RandomForest
training location NY
dataset all in
24.006821043434616
60.900017307575105
63.20602957719934


In [None]:
# Index -> feature-name lookup for the NY dataset.
with open('pkls/NY_feat_dict.pkl', 'rb') as f:
    feat_dict = pickle.load(f)

# (heading, saved model, ranking function): the linear model is ranked by its
# coefficients, the tree ensembles by their feature importances.
ny_reports = (
    ("SVR top 10 features", 'model_results/NY__SVR.pkl', get_coef_impt),
    ("XGBoost top 10 features", 'model_results/NY__XGBoost.pkl', get_feature_impt),
    ("RandomForest top 10 features", 'model_results/NY__RandomForest.pkl', get_feature_impt),
)
for heading, model_path, rank_features in ny_reports:
    print(heading)
    model = joblib.load(model_path)
    print(rank_features(model, feat_dict)[:10])

SVR top 10 features
[['Categorical amenities_str 48 hdtv with hbo max', 2.0], ['Word blueground', 1.8986937997247548], ['Categorical amenities_str sauna', 1.5529511422270246], ['Categorical bathrooms_text 3.5', 1.5380230848709413], ['Categorical amenities_str paid valet parking on premise u2013 1 space', 1.4593436568273437], ['Categorical property_type entire villa', 1.4466212067988897], ['Categorical property_type private room in resort', 1.4453824158477557], ['Categorical amenities_str miele stainless steel stove', 1.2980077428360492], ['Word sale', 1.2253652068935263], ['Categorical amenities_str luxury body soap', 1.2127110667309107]]
XGBoost top 10 features
[['Categorical room_type private room', 0.07873815], ['Categorical bathrooms_text 1.5', 0.04275509], ['Categorical bathrooms_text 2.0', 0.028340925], ['Word complimentary', 0.01638157], ['Word madison', 0.01490174], ['Categorical bathrooms_text 2.5', 0.013573806], ['Categorical neighbourhood_group_cleansed manhattan', 0.0125383

## Hyperparameter tuning

In [None]:
# Load the NY "all in" splits into notebook-level variables; get_results()
# reads these names directly.
location = 'NY'
data_version = ''


def _read_split(path_template):
    """Fill ``path_template`` with location / data_version and unpickle that file."""
    with open(path_template.format(location, data_version), 'rb') as f:
        return pickle.load(f)


X_train = _read_split('pkls/{}_{}x_train.pkl')
y_train = _read_split('pkls/{}_{}y_train.pkl')

X_val = _read_split('pkls/{}_{}x_val.pkl')
y_val = _read_split('pkls/{}_{}y_val.pkl')

X_test = _read_split('pkls/{}_{}x_test.pkl')
y_test = _read_split('pkls/{}_{}y_test.pkl')

In [None]:
# Exhaustive grid search over LinearSVR's C and epsilon, scored by negated MSE.
svr = LinearSVR()
params = {
    'C': [1, 10, 100],
    'epsilon': [1, 0.1, 0.01, 0],
}
clf = GridSearchCV(
    estimator=svr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    verbose=1,
)
clf.fit(X_train, y_train)
print("Best parameters:", clf.best_params_)
# best_score_ is a negated MSE, so flip the sign before taking the square root.
print("Lowest RMSE: ", (-clf.best_score_) ** 0.5)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 1, 'epsilon': 1}
Lowest RMSE:  0.7444353860313172


In [None]:
get_results(clf.best_estimator_)

63.65518440079796
69.62225276046067
70.41916833808116


In [None]:
# LinearSVR with default hyperparameters, fitted on the training split and
# scored on train / val / test via get_results().
# NOTE(review): presumably the untuned baseline for the grid search above -- confirm.
svregressor = LinearSVR()
svregressor.fit(X_train, y_train)
get_results(svregressor)

In [None]:

# The triple-quoted string below is a disabled RandomizedSearchCV run over
# XGBRegressor: as a bare string expression it is evaluated and discarded, so
# none of it executes and `clf` is NOT reassigned by this cell.
'''
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100,250,500]}
xgbr = XGBRegressor()
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=10,
                         verbose=1)
clf.fit(X_train, y_train)
'''

# Hyperparameters are hard-coded here.
# NOTE(review): n_estimators=1000 lies outside the [100, 250, 500] grid of the
# disabled search above -- confirm where these values came from.
xgboost_tuned = XGBRegressor(subsample= 0.5, n_estimators= 1000, max_depth= 20, learning_rate= 0.01, colsample_bytree= 0.4, colsample_bylevel= 0.8999999999999999)
# Early stopping monitors RMSE on the validation split only.
eval_set = [(X_val, y_val)]
# NOTE(review): passing early_stopping_rounds / eval_metric to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0, where they are constructor
# arguments -- confirm the xgboost version this notebook is pinned to.
xgboost_tuned.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="rmse", eval_set=eval_set, verbose=True)

# Relative RMSE on train / val / test, read from the notebook-level splits.
get_results(xgboost_tuned)

[0]	validation_0-rmse:1.10642
[1]	validation_0-rmse:1.10020
[2]	validation_0-rmse:1.09401
[3]	validation_0-rmse:1.08801
[4]	validation_0-rmse:1.08193
[5]	validation_0-rmse:1.07516
[6]	validation_0-rmse:1.06918
[7]	validation_0-rmse:1.06291
[8]	validation_0-rmse:1.05681
[9]	validation_0-rmse:1.05191
[10]	validation_0-rmse:1.04591
[11]	validation_0-rmse:1.04080
[12]	validation_0-rmse:1.03524
[13]	validation_0-rmse:1.02968
[14]	validation_0-rmse:1.02400
[15]	validation_0-rmse:1.01825
[16]	validation_0-rmse:1.01259
[17]	validation_0-rmse:1.00704
[18]	validation_0-rmse:1.00144
[19]	validation_0-rmse:0.99643
[20]	validation_0-rmse:0.99102
[21]	validation_0-rmse:0.98631
[22]	validation_0-rmse:0.98181
[23]	validation_0-rmse:0.97699
[24]	validation_0-rmse:0.97194
[25]	validation_0-rmse:0.96685
[26]	validation_0-rmse:0.96182
[27]	validation_0-rmse:0.95722
[28]	validation_0-rmse:0.95229
[29]	validation_0-rmse:0.94724
[30]	validation_0-rmse:0.94301
[31]	validation_0-rmse:0.93815
[32]	validation_0-

In [None]:
# Persist the fitted XGBoost regressor itself. `clf` must not be used here: in a
# top-to-bottom run it still holds the LinearSVR grid search, because the
# XGBoost random search is disabled.
joblib.dump(xgboost_tuned, 'model_results/xgboost_tuned.pkl')

# Reload to check the artefact round-trips; as the cell's last expression the
# loaded estimator's repr becomes the cell output.
joblib.load("model_results/xgboost_tuned.pkl")

In [None]:
# Randomised hyperparameter search for RandomForestRegressor on the NY "all in"
# dataset. The splits are (re)loaded so this cell can run on its own.
# The pickles are unpickled as-is: only use files produced by this project.
location = 'NY'
data_version = ''
with open('pkls/{}_{}x_train.pkl'.format(location,data_version), 'rb') as f:
    X_train = pickle.load(f)

with open('pkls/{}_{}y_train.pkl'.format(location,data_version), 'rb') as f:
    y_train = pickle.load(f)

with open('pkls/{}_{}x_val.pkl'.format(location,data_version), 'rb') as f:
    X_val = pickle.load(f)

with open('pkls/{}_{}y_val.pkl'.format(location,data_version), 'rb') as f:
    y_val = pickle.load(f)

with open('pkls/{}_{}x_test.pkl'.format(location,data_version), 'rb') as f:
    X_test = pickle.load(f)

with open('pkls/{}_{}y_test.pkl'.format(location,data_version), 'rb') as f:
    y_test = pickle.load(f)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what the legacy 'auto' option meant for
# regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it raises an invalid-parameter error.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
params = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': max_depth,
          'min_samples_split': min_samples_split,
          'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap}
rf = RandomForestRegressor()
# Only n_iter=5 of the candidate combinations are sampled, each cross-validated.
clf = RandomizedSearchCV(estimator=rf,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=5,
                         verbose=1)
clf.fit(X_train, y_train)
print("Best parameters:", clf.best_params_)
# best_score_ is a negated MSE, so flip the sign before taking the square root.
print("Lowest RMSE: ", (-clf.best_score_)**(1/2))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
get_results(clf.best_estimator_)

In [None]:
# Fit a random forest and report its relative RMSE on train / val / test.
# RandomForestRegressor.fit() accepts only (X, y, sample_weight); XGBoost-style
# keywords such as early_stopping_rounds, eval_metric and eval_set raise
# TypeError, so none are passed. The same estimator object is created, fitted
# and evaluated.
# TODO(review): this uses default hyperparameters despite the name -- pass the
# tuned values (e.g. **clf.best_params_ from the random search) if intended.
rf_tuned = RandomForestRegressor()
rf_tuned.fit(X_train, y_train)

get_results(rf_tuned)

In [None]:
train_model('NY_SG','')

training model SVR
training location NY_SG
dataset all in
60.81667843809415
68.08456169934476
66.90917831893171
training model Linear_Regression
training location NY_SG
dataset all in
52.19104621722007
71.4445904817389
71.60209955998475
training model XGBoost
training location NY_SG
dataset all in
35.50519137170655
63.129515819727054
63.287806683101344


training model RandomForest
training location NY_SG
dataset all in
23.11030338158052
64.60016790265293
63.781575881205846


In [None]:
# Index -> feature-name lookup for the combined NY_SG dataset.
with open('pkls/NY_SG_feat_dict.pkl', 'rb') as f:
    feat_dict = pickle.load(f)

# (heading, saved model, ranking function): the linear model is ranked by its
# coefficients, the tree ensembles by their feature importances.
ny_sg_reports = (
    ("SVR top 10 features", 'model_results/NY_SG__SVR.pkl', get_coef_impt),
    ("XGBoost top 10 features", 'model_results/NY_SG__XGBoost.pkl', get_feature_impt),
    ("RandomForest top 10 features", 'model_results/NY_SG__RandomForest.pkl', get_feature_impt),
)
for heading, model_path, rank_features in ny_sg_reports:
    print(heading)
    model = joblib.load(model_path)
    print(rank_features(model, feat_dict)[:10])

SVR top 10 features
[['Word blueground', 2.1987163763482687], ['Categorical amenities_str 48 hdtv with hbo max', 1.9999999999999987], ['Categorical neighbourhood_cleansed marina south', 1.8481027831916612], ['Categorical amenities_str miele stainless steel stove', 1.385922474958555], ['Word induction', 1.351699965418099], ['Word spread', 1.3034772461691728], ['Word procedure', 1.2126009649946934], ['Categorical amenities_str 60 hdtv', 1.2123466682899036], ['Word perspective', 1.2077096630566229], ['Word entertaining', 1.165944724683502]]
XGBoost top 10 features
[['Categorical room_type private room', 0.07031396], ['Categorical bathrooms_text 1.5', 0.03797805], ['Categorical bathrooms_text 2.0', 0.028847337], ['Numerical bedrooms', 0.017944638], ['Categorical neighbourhood_group_cleansed manhattan', 0.014313521], ['Categorical amenities_str gym', 0.010753754], ['Categorical bathrooms_text 2.5', 0.008986365], ['Categorical amenities_str dishwasher', 0.0083546275], ['Categorical amenities

### Version 2: No Lat Lon

In [None]:
train_model('SG','no_lat_lon_')

training model SVR
training location SG
dataset version no_lat_lon_
43.0847982567231
53.00265179971617
58.44134663943078
training model Linear_Regression
training location SG
dataset version no_lat_lon_
1.1639541253527321
138.92163831337535
124.20811317080029
training model XGBoost
training location SG
dataset version no_lat_lon_
7.9082292725189545
59.14051255685292
57.855613693758244
training model RandomForest
training location SG
dataset version no_lat_lon_
22.047291022043687
53.9057690772574
57.42011007385977


In [None]:
train_model('NY','no_lat_lon_')

training model SVR
training location NY
dataset version no_lat_lon_
62.17113363104643
66.11980082501023
66.74867230038566
training model Linear_Regression
training location NY
dataset version no_lat_lon_
52.5490662760493
70.40614606634824
71.70870855163417
training model XGBoost
training location NY
dataset version no_lat_lon_
35.001012439452595
62.03804559686136
61.645015265569135
training model RandomForest
training location NY
dataset version no_lat_lon_
24.359332858575467
62.835992950969654
64.02984111084167


In [None]:
train_model('NY_SG','no_lat_lon_')

training model SVR
training location NY_SG
dataset version no_lat_lon_
61.257913495873915
68.43850029447975
67.3328875274818
training model Linear_Regression
training location NY_SG
dataset version no_lat_lon_
52.19188056224119
71.46521052814506
71.59896217256112
training model XGBoost
training location NY_SG
dataset version no_lat_lon_
35.865580910054604
63.41456111540573
63.357634651859584
training model RandomForest
training location NY_SG
dataset version no_lat_lon_
23.552517416081674
64.97591309186123
64.27510190730207


### Version 3: No Superhost

In [None]:
train_model('SG','no_superhost_')

training model SVR
training location SG
dataset version no_superhost_
42.93095739008736
52.758903995340624
58.36604048913898
training model Linear_Regression
training location SG
dataset version no_superhost_
1.1645736334927346
138.88856677602314
124.05474291799523
training model XGBoost
training location SG
dataset version no_superhost_
7.9082292725189545
59.14051255685292
57.855613693758244
training model RandomForest
training location SG
dataset version no_superhost_
21.8446150936419
54.78734072628492
58.104896513355165


In [None]:
train_model('NY','no_superhost_')

training model SVR
training location NY
dataset version no_superhost_
61.71925639458053
65.82348371479051
66.37503025809669
training model Linear_Regression
training location NY
dataset version no_superhost_
52.54908603287253
70.40858055398755
71.71648577159111
training model XGBoost
training location NY
dataset version no_superhost_
35.001012439452595
62.03804559686136
61.645015265569135
training model RandomForest
training location NY
dataset version no_superhost_
24.389507662539586
62.90617203077535
63.99815012849519


In [None]:
train_model('NY_SG','no_superhost_')

training model SVR
training location NY_SG
dataset version no_superhost_
60.72064139026018
67.8842639871028
66.75043201447434
training model Linear_Regression
training location NY_SG
dataset version no_superhost_
52.19213433629538
71.45703207028652
71.59272939965427
training model XGBoost
training location NY_SG
dataset version no_superhost_
35.865580910054604
63.41456111540573
63.357634651859584


KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
#with randomforest
train_model('NY_SG','no_superhost_')

training model RandomForest
training location NY_SG
dataset version no_superhost_
23.54541053019607
64.60992175336425
64.2919548615404


### Version 4: No words

In [None]:
train_model('SG','no_words_')

training model SVR
training location SG
dataset version no_words_
62.134427730404575
63.6603558509257
70.4977275171625
training model Linear_Regression
training location SG
dataset version no_words_
55.67077375066558
68.87666599781882
70.3841491003544
training model XGBoost
training location SG
dataset version no_words_
9.483573593350668
53.468412184486766
56.081155588992004
training model RandomForest
training location SG
dataset version no_words_
21.674135051934694
51.40658283095234
60.24829803573428


In [None]:
train_model('NY','no_words_')

training model SVR
training location NY
dataset version no_words_
67.62215202126886
68.82143515868844
69.29955617767305
training model Linear_Regression
training location NY
dataset version no_words_
63.573453571136504
67.56886981327392
67.85954589814381
training model XGBoost
training location NY
dataset version no_words_
43.22045273966367
60.59507721062355
60.58867581004117
training model RandomForest
training location NY
dataset version no_words_
23.86686719734384
62.14639920597554
63.41828203685695


In [None]:
train_model('NY_SG','no_words_')

training model SVR
training location NY_SG
dataset version no_words_
67.8625029358394
71.01858683793377
70.00560175600336
training model Linear_Regression
training location NY_SG
dataset version no_words_
63.53330488406742
69.32029997399228
68.3478430910069
training model XGBoost
training location NY_SG
dataset version no_words_
43.53274683176059
62.853529564812696
60.6726770225955
training model RandomForest
training location NY_SG
dataset version no_words_
23.32982094625235
63.34540875886814
62.17937366085229


### Version 5: No review_scores_ratings

In [None]:
train_model('SG','no_review_ratings_')

training model SVR
training location SG
dataset version no_review_ratings_
42.8157826341687
53.124278897989655
58.3003215190529
training model Linear_Regression
training location SG
dataset version no_review_ratings_
1.1563439970644698
140.01566820062536
124.44509659347096
training model XGBoost
training location SG
dataset version no_review_ratings_
7.745404239492099
58.93251615842421
57.22923053637348


In [None]:
train_model('NY','no_review_ratings_')

training model SVR
training location NY
dataset version no_review_ratings_
62.027075210278134
66.08175882357939
66.56001958048716
training model Linear_Regression
training location NY
dataset version no_review_ratings_
52.555717197587
70.41672155953637
71.7140428691645
training model XGBoost
training location NY
dataset version no_review_ratings_
35.169024682144965
62.00666835134327
61.98630464846041


In [None]:
train_model('NY_SG','no_review_ratings_')

training model SVR
training location NY_SG
dataset version no_review_ratings_
60.78902643938995
67.96130867261448
66.83291689590567
training model Linear_Regression
training location NY_SG
dataset version no_review_ratings_
52.19366013108762
71.4742592545006
71.60707968246871
training model XGBoost
training location NY_SG
dataset version no_review_ratings_
35.59691781214302
63.42998021585662
63.43590675163636


In [None]:
train_model('SG','no_review_ratings_')


training model RandomForest
training location SG
dataset version no_review_ratings_
22.53093987586526
54.323544008338786
58.17196123239702


In [None]:
train_model('NY','no_review_ratings_')


training model RandomForest
training location NY
dataset version no_review_ratings_
24.352629100961877
63.30279291497174
63.99974164372381


In [None]:
# The version tag is spliced straight into the pickle filenames
# ('pkls/{location}_{data_version}x_train.pkl'), so it needs its trailing
# underscore, matching the other 'no_review_ratings_' runs.
train_model('NY_SG','no_review_ratings_')

training model RandomForest
training location NY_SG
dataset version no_words_
23.13547074185266
63.166623545461356
61.99505800135846


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2a92d0af-cecf-4cde-96f5-c3db3a7f88f1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>