In [77]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [78]:
%run "featureSelection.ipynb"

----------------
               Columns        Score
31           sentiment  6773.682638
0   host_response_rate  6431.843258
29     amenities_count   263.548784
1    host_is_superhost   149.157935
26   reviews_per_month   120.950215
----------------
----------------
               Columns        Score
31           sentiment  5385.757684
0   host_response_rate  3450.414756
29     amenities_count   185.463246
1    host_is_superhost   119.675834
26   reviews_per_month    94.629085
----------------
----------------
               Columns        Score
0   host_response_rate  5711.801576
31           sentiment  4928.552034
29     amenities_count   258.664707
1    host_is_superhost   160.091961
26   reviews_per_month   157.658318
----------------
----------------
               Columns        Score
31           sentiment  4541.731696
0   host_response_rate  2455.297309
29     amenities_count   191.621710
----------------
----------------
                Columns        Score
31            sent

In [79]:
#Splitting the model into train, and test
from sklearn.model_selection import train_test_split
review_scores_rating_Xtrain, review_scores_rating_Xtest, review_scores_rating_ytrain, review_scores_rating_ytest = train_test_split(review_scores_rating_M2X,review_scores_rating_y)
review_scores_accuracy_Xtrain, review_scores_accuracy_Xtest, review_scores_accuracy_ytrain, review_scores_accuracy_ytest = train_test_split(review_scores_accuracy_M2X,review_scores_accuracy_y)
review_scores_cleanliness_Xtrain, review_scores_cleanliness_Xtest, review_scores_cleanliness_ytrain, review_scores_cleanliness_ytest = train_test_split(review_scores_cleanliness_M2X,review_scores_cleanliness_y)
review_scores_checkin_Xtrain, review_scores_checkin_Xtest, review_scores_checkin_ytrain, review_scores_checkin_ytest = train_test_split(review_scores_checkin_M2X,review_scores_checkin_y)
review_scores_communication_Xtrain, review_scores_communication_Xtest, review_scores_communication_ytrain, review_scores_communication_ytest = train_test_split(review_scores_communication_M2X,review_scores_communication_y)
review_scores_location_Xtrain, review_scores_location_Xtest, review_scores_location_ytrain, review_scores_location_ytest = train_test_split(review_scores_location_M2X,review_scores_location_y)
review_scores_value_Xtrain, review_scores_value_Xtest, review_scores_value_ytrain, review_scores_value_ytest = train_test_split(review_scores_value_M2X,review_scores_value_y)

In [65]:
def randomForest_GridSearchCV (Xtrain,Xtest,ytrain,ytest,cv = 3):
    from sklearn.model_selection import GridSearchCV
    param_grid = {
    'bootstrap': [True, False],
    'max_depth': [80, 90, 100, 110],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = cv, n_jobs = -1, verbose = 2)
    # Fit the grid search to the data
    grid_search.fit(Xtrain, ytrain)
    print(grid_search.best_params_)
    base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
    base_model.fit(Xtrain, ytrain)
    base_predictions = base_model.predict(Xtest)   
    best_grid = grid_search.best_estimator_
    best_pred = best_grid.predict(Xtest)
    print('Base Model_R2 score:',r2_score(ytest,base_predictions))
    print('Base Model_MAE:',mean_absolute_error(ytest,base_predictions))
    print('Best Model_R2 score:',r2_score(ytest,best_pred))
    print('Best Model_MAE:',mean_absolute_error(ytest,best_pred))

    return best_pred   

In [66]:
def randomForest_RandomSearchCV (Xtrain,Xtest,ytrain,ytest,cv = 3):
    from sklearn.model_selection import RandomizedSearchCV
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}
    # Create a based model
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # Fit the grid search to the data
    rf_random.fit(Xtrain, ytrain)
    print(rf_random.best_params_)
    base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
    base_model.fit(Xtrain, ytrain)
    base_predictions = base_model.predict(Xtest)   
    best_random = rf_random.best_estimator_
    best_pred = best_random.predict(review_scores_rating_Xtest)
    print('Base Model_R2 score:',r2_score(ytest,base_predictions))
    print('Base Model_MAE:',mean_absolute_error(ytest,base_predictions))
    print('Best Model_R2 score:',r2_score(ytest,best_pred))
    print('Best Model_MAE:',mean_absolute_error(ytest,best_pred))

    return best_pred   

In [43]:
review_scores_rating_pred = randomForest_GridSearchCV(review_scores_rating_Xtrain, review_scores_rating_Xtest, review_scores_rating_ytrain, review_scores_rating_ytest,3)
print('R2 score:',r2_score(review_scores_rating_ytest,review_scores_rating_pred))
print('MAE:',mean_absolute_error(review_scores_rating_ytest,review_scores_rating_pred))
print('MSE:',mean_squared_error(review_scores_rating_ytest,review_scores_rating_pred))
print('RMSE:',mean_squared_error(review_scores_rating_ytest,review_scores_rating_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


{'bootstrap': True, 'max_depth': 90, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.8774968387756906
Base Model_MAE: 0.1395605554075231
Best Model_R2 score: 0.8936611359501642
Best Model_MAE: 0.13086181833350516
R2 score: 0.8936611359501642
MAE: 0.13086181833350516
MSE: 0.06011886531471782
RMSE: 0.2451914870355776


In [67]:
review_scores_accuracy_pred = randomForest_GridSearchCV(review_scores_accuracy_Xtrain, review_scores_accuracy_Xtest, review_scores_accuracy_ytrain, review_scores_accuracy_ytest,3)
print('R2 score:',r2_score(review_scores_accuracy_ytest,review_scores_accuracy_pred))
print('MAE:',mean_absolute_error(review_scores_accuracy_ytest,review_scores_accuracy_pred))
print('MSE:',mean_squared_error(review_scores_accuracy_ytest,review_scores_accuracy_pred))
print('RMSE:',mean_squared_error(review_scores_accuracy_ytest,review_scores_accuracy_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': False, 'max_depth': 80, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 100}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.614797475724234
Base Model_MAE: 0.1569231228353774
Best Model_R2 score: 0.6756528471036317
Best Model_MAE: 0.14510409003475078
R2 score: 0.6756528471036317
MAE: 0.14510409003475078
MSE: 0.09528863853675995
RMSE: 0.3086885785654532


In [68]:
review_scores_cleanliness_pred = randomForest_GridSearchCV(review_scores_cleanliness_Xtrain, review_scores_cleanliness_Xtest, review_scores_cleanliness_ytrain, review_scores_cleanliness_ytest,3)
print('R2 score:',r2_score(review_scores_cleanliness_ytest,review_scores_cleanliness_pred))
print('MAE:',mean_absolute_error(review_scores_cleanliness_ytest,review_scores_cleanliness_pred))
print('MSE:',mean_squared_error(review_scores_cleanliness_ytest,review_scores_cleanliness_pred))
print('RMSE:',mean_squared_error(review_scores_cleanliness_ytest,review_scores_cleanliness_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': True, 'max_depth': 80, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.7785258371108962
Base Model_MAE: 0.20422318370373518
Best Model_R2 score: 0.7718876405396954
Best Model_MAE: 0.199416030623413
R2 score: 0.7718876405396954
MAE: 0.199416030623413
MSE: 0.14901591518026866
RMSE: 0.3860257960036721


In [69]:
review_scores_checkin_pred = randomForest_GridSearchCV(review_scores_checkin_Xtrain, review_scores_checkin_Xtest, review_scores_checkin_ytrain, review_scores_checkin_ytest,3)
print('R2 score:',r2_score(review_scores_checkin_ytest,review_scores_checkin_pred))
print('MAE:',mean_absolute_error(review_scores_checkin_ytest,review_scores_checkin_pred))
print('MSE:',mean_squared_error(review_scores_checkin_ytest,review_scores_checkin_pred))
print('RMSE:',mean_squared_error(review_scores_checkin_ytest,review_scores_checkin_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': False, 'max_depth': 80, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 200}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.529209074859232
Base Model_MAE: 0.14242139447382254
Best Model_R2 score: 0.6019629642856027
Best Model_MAE: 0.1333927298932113
R2 score: 0.6019629642856027
MAE: 0.1333927298932113
MSE: 0.07781165723749077
RMSE: 0.27894740944753504


In [70]:
review_scores_communication_pred = randomForest_GridSearchCV(review_scores_communication_Xtrain, review_scores_communication_Xtest, review_scores_communication_ytrain, review_scores_communication_ytest,3)
print('R2 score:',r2_score(review_scores_communication_ytest,review_scores_communication_pred))
print('MAE:',mean_absolute_error(review_scores_communication_ytest,review_scores_communication_pred))
print('MSE:',mean_squared_error(review_scores_communication_ytest,review_scores_communication_pred))
print('RMSE:',mean_squared_error(review_scores_communication_ytest,review_scores_communication_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': True, 'max_depth': 80, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.5957299223721756
Base Model_MAE: 0.12701096795518138
Best Model_R2 score: 0.6495892562272989
Best Model_MAE: 0.12364784340743266
R2 score: 0.6495892562272989
MAE: 0.12364784340743266
MSE: 0.06637753615564905
RMSE: 0.25763838253577254


In [80]:
review_scores_location_pred = randomForest_GridSearchCV(review_scores_location_Xtrain, review_scores_location_Xtest, review_scores_location_ytrain, review_scores_location_ytest,3)
print('R2 score:',r2_score(review_scores_location_ytest,review_scores_location_pred))
print('MAE:',mean_absolute_error(review_scores_location_ytest,review_scores_location_pred))
print('MSE:',mean_squared_error(review_scores_location_ytest,review_scores_location_pred))
print('RMSE:',mean_squared_error(review_scores_location_ytest,review_scores_location_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.3379642644268468
Base Model_MAE: 0.18872704261654352
Best Model_R2 score: 0.4629119892886431
Best Model_MAE: 0.1761791896787902
R2 score: 0.4629119892886431
MAE: 0.1761791896787902
MSE: 0.08625335107400856
RMSE: 0.2936892083036225


In [76]:
review_scores_value_pred = randomForest_GridSearchCV(review_scores_value_Xtrain, review_scores_value_Xtest, review_scores_value_ytrain, review_scores_value_ytest,3)
print('R2 score:',r2_score(review_scores_value_ytest,review_scores_value_pred))
print('MAE:',mean_absolute_error(review_scores_value_ytest,review_scores_value_pred))
print('MSE:',mean_squared_error(review_scores_value_ytest,review_scores_value_pred))
print('RMSE:',mean_squared_error(review_scores_value_ytest,review_scores_value_pred,squared=False))

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': False, 'max_depth': 80, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}


  base_model.fit(Xtrain, ytrain)


Base Model_R2 score: 0.7221885098651211
Base Model_MAE: 0.19232584239984976
Best Model_R2 score: 0.7522179901251045
Best Model_MAE: 0.18982850243218322
R2 score: 0.7522179901251045
MAE: 0.18982850243218322
MSE: 0.10619764933958258
RMSE: 0.32587980811885625


ref
1) <<https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74>>