In [22]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
from utils.data_loader import AirbnbLoader
from sklearn.model_selection import ShuffleSplit, GridSearchCV

loader = AirbnbLoader()
X, y = loader.load_airbnb('beds', normalized=True)

cv_split = ShuffleSplit(n_splits = 5, test_size = .3, train_size = .7, random_state = 42)

model = GradientBoostingRegressor()
base_results = cross_validate(model, X, y, cv = cv_split, scoring=['r2', 'neg_mean_squared_error'], return_train_score=True)
model.fit(X, y) #Just for defaults params

print('DEFAULT parameters: ', model.get_params())
print(f'DEFAULT train r2: {base_results["train_r2"].mean()}')
print(f'DEFAULT test r2: {base_results["test_r2"].mean()}')
print(f'DEFAULT train mse: {base_results["train_neg_mean_squared_error"].mean()}')
print(f'DEFAULT test mse: {base_results["test_neg_mean_squared_error"].mean()}')

DEFAULT parameters:  {'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
DEFAULT train r2: 0.7639052470119235
DEFAULT test r2: 0.399293247653757
DEFAULT train mse: -0.003002110301217997
DEFAULT test mse: -0.008684585267604225


Tuning for MSE

In [23]:
param_grid = {
  'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
  'min_samples_split': [2,3,4,5],
  'min_samples_leaf': [3, 5, 10, 20],
  'max_depth': [1, 2, 3]
  } 

tuned_model = GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv_split, return_train_score=True)
tuned_model.fit(X, y)


In [24]:

print('TUNED params: ', tuned_model.best_params_)
print(f'TUNED train mse: {tuned_model.cv_results_["mean_train_score"][tuned_model.best_index_]}')
print(f'TUNED test mse: {tuned_model.cv_results_["mean_test_score"][tuned_model.best_index_]}')

TUNED params:  {'loss': 'squared_error', 'max_depth': 1, 'min_samples_leaf': 10, 'min_samples_split': 2}
TUNED train mse: -0.006109360363316554
TUNED test mse: -0.0073379669771767835


In [25]:
print(f'DEFAULT train mse: {base_results["train_neg_mean_squared_error"].mean()}')
print(f'DEFAULT test mse: {base_results["test_neg_mean_squared_error"].mean()}')

DEFAULT train mse: -0.003002110301217997
DEFAULT test mse: -0.008684585267604225


Tuning for r2

In [26]:
tuned_model = GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid, scoring='r2', cv=cv_split, return_train_score=True)
tuned_model.fit(X, y)

print('TUNED params: ', tuned_model.best_params_)
print(f'TUNED train r2: {tuned_model.cv_results_["mean_train_score"][tuned_model.best_index_]}')
print(f'TUNED test r2: {tuned_model.cv_results_["mean_test_score"][tuned_model.best_index_]}')
print("")
print('DEFAULT parameters: ', model.get_params())
print(f'DEFAULT train r2: {base_results["train_r2"].mean()}')
print(f'DEFAULT test r2: {base_results["test_r2"].mean()}')

TUNED params:  {'loss': 'squared_error', 'max_depth': 1, 'min_samples_leaf': 10, 'min_samples_split': 3}
TUNED train mse: 0.5214980993600703
TUNED test mse: 0.4916581681465758

DEFAULT parameters:  {'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
DEFAULT train r2: 0.7639052470119235
DEFAULT test r2: 0.399293247653757
