In [8]:
import sys
sys.path.append('/Users/tompease/Documents/Coding/airbnb')
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import cross_validate
from utils.data_loader import AirbnbLoader
import pandas as pd
from scipy.stats import randint


loader = AirbnbLoader()
X, y = loader.load_airbnb('Price_Night', normalized=True)

cv_split = ShuffleSplit(n_splits = 5, test_size = .3, train_size = .7, random_state = 42)

In [9]:
grad_boost_param_dist = {
  'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
  'min_samples_split': randint(1,10),
  'min_samples_leaf': randint(1,20),
  'max_depth': randint(1,4)
  } 

random_forest_param_dist = {
  'bootstrap': [True, False],
  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
  'min_samples_leaf': randint(1,20),
  'min_samples_split': randint(1,10),
  'n_estimators': randint(1,1000)
 }

decision_tree_param_dist = {
  'criterion' : ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
  'splitter' : ['best', 'random'],
  'max_depth' : [1, 2, 4, 8, 16, None],
  'min_samples_split' : randint(2,10),
  'min_weight_fraction_leaf': [0.0, 0.0001, 0.001, 0.1],
  'max_features': ['sqrt', 'log2', None]
}

sgd_regressor_param_dist = {
  'loss' : ['squared_error', 'epsilon_insensitive', 'huber', 'squared_epsilon_insensitive'],
  'penalty' : ['l1', 'l2', 'elasticnet'],
  'alpha' : [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
  'shuffle' : [True, False],
  'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
  'max_iter' : randint(1000, 100000)
}


In [10]:

MLA = [
  [GradientBoostingRegressor(), grad_boost_param_dist],
  [SGDRegressor(max_iter=10000), sgd_regressor_param_dist],
  [RandomForestRegressor(), random_forest_param_dist],
  [DecisionTreeRegressor(), decision_tree_param_dist]
]


MLA_columns = ['MLA Name', 'MSE Parameters','Tuned for MSE Train', 'Tuned for MSE Test', 'r2 Parameters', 'Tuned for r2 Train', 'Tuned for r2 Test']
MLA_compare = pd.DataFrame(columns = MLA_columns)

row_index = 0

for el in MLA:
  alg = el[0]
  param_grid = el[1]
  MLA_name = alg.__class__.__name__
  MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
  
  mse_tuned_model = RandomizedSearchCV(alg, param_distributions=param_grid, n_iter=100, scoring='neg_mean_squared_error', cv=cv_split, return_train_score=True)
  mse_tuned_model.fit(X, y)
  
  MLA_compare.loc[row_index, 'MSE Parameters'] = str(mse_tuned_model.best_params_)
  MLA_compare.loc[row_index, 'Tuned for MSE Train'] = mse_tuned_model.cv_results_["mean_train_score"][mse_tuned_model.best_index_]
  MLA_compare.loc[row_index, 'Tuned for MSE Test'] = mse_tuned_model.cv_results_["mean_test_score"][mse_tuned_model.best_index_]

  
  r2_tuned_model = RandomizedSearchCV(alg, param_distributions=param_grid, n_iter=100, scoring='r2', cv=cv_split, return_train_score=True)
  r2_tuned_model.fit(X, y)
  MLA_compare.loc[row_index, 'r2 Parameters'] = str(r2_tuned_model.best_params_)
  MLA_compare.loc[row_index, 'Tuned for r2 Train'] = r2_tuned_model.cv_results_["mean_train_score"][r2_tuned_model.best_index_]
  MLA_compare.loc[row_index, 'Tuned for r2 Test'] = r2_tuned_model.cv_results_["mean_test_score"][r2_tuned_model.best_index_]

  row_index += 1



  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [11]:
MLA_compare.sort_values(by = ['Tuned for MSE Test'], ascending = False, inplace = True)

MLA_compare

Unnamed: 0,MLA Name,MSE Parameters,Tuned for MSE Train,Tuned for MSE Test,r2 Parameters,Tuned for r2 Train,Tuned for r2 Test
2,RandomForestRegressor,"{'bootstrap': True, 'max_depth': 60, 'max_feat...",-0.007197,-0.007714,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.505993,0.298174
0,GradientBoostingRegressor,"{'loss': 'squared_error', 'max_depth': 1, 'min...",-0.008449,-0.007892,"{'loss': 'squared_error', 'max_depth': 1, 'min...",0.415929,0.282628
1,SGDRegressor,"{'alpha': 0.0001, 'learning_rate': 'adaptive',...",-0.009923,-0.00839,"{'alpha': 0.0001, 'learning_rate': 'adaptive',...",0.318267,0.238569
3,DecisionTreeRegressor,"{'criterion': 'friedman_mse', 'max_depth': 2, ...",-0.009921,-0.008601,"{'criterion': 'squared_error', 'max_depth': 1,...",0.287742,0.197824
