In [2]:
import sys
sys.path.append('/Users/tompease/Documents/Coding/airbnb')
from sklearn.linear_model import SGDRegressor, LinearRegression, ElasticNet, BayesianRidge
from sklearn.model_selection import ShuffleSplit
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from utils.data_loader import AirbnbLoader
import pandas as pd

loader = AirbnbLoader()
X, y = loader.load_airbnb('Price_Night', normalized=True)

cv_split = ShuffleSplit(n_splits = 5, test_size = .3, train_size = .7, random_state = 42)

The code below loops over a number of popular machine learning algorithms, using cross validation, and returning a table ordered from best to worst in terms of train accuracy

In [3]:

MLA = [
  GradientBoostingRegressor(),
  LinearRegression(),
  SGDRegressor(),
  RandomForestRegressor(),
  ElasticNet(),
  SVR(),
  BayesianRidge(),
  KernelRidge(),
  XGBRegressor(),
  CatBoostRegressor(silent=True),
  DecisionTreeRegressor()
]


MLA_columns = ['MLA Name', 'MLA Parameters','Train MSE', 'Test MSE', 'Train r2', 'Test r2']
MLA_compare = pd.DataFrame(columns = MLA_columns)

row_index = 0

for alg in MLA:
  MLA_name = alg.__class__.__name__
  MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
  MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

  cv_results = cross_validate(alg, X=X, y=y, cv = cv_split , scoring=['r2', 'neg_mean_squared_error'], return_train_score=True) 

  MLA_compare.loc[row_index, 'Train MSE'] = cv_results['train_neg_mean_squared_error'].mean()
  MLA_compare.loc[row_index, 'Test MSE'] = cv_results['test_neg_mean_squared_error'].mean()
  MLA_compare.loc[row_index, 'Train r2'] = cv_results['train_r2'].mean()
  MLA_compare.loc[row_index, 'Test r2'] = cv_results['test_r2'].mean()

  row_index += 1

MLA_compare.sort_values(by = ['Test MSE'], ascending = False, inplace = True)

MLA_compare

Unnamed: 0,MLA Name,MLA Parameters,Train MSE,Test MSE,Train r2,Test r2
7,KernelRidge,"{'alpha': 1, 'coef0': 1, 'degree': 3, 'gamma':...",-0.008978,-0.008004,0.382619,0.269414
5,SVR,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",-0.008952,-0.008257,0.384078,0.244224
6,BayesianRidge,"{'alpha_1': 1e-06, 'alpha_2': 1e-06, 'alpha_in...",-0.00869,-0.00848,0.402483,0.224777
9,CatBoostRegressor,"{'loss_function': 'RMSE', 'silent': True}",-0.001164,-0.008772,0.919802,0.195019
0,GradientBoostingRegressor,"{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': ...",-0.003322,-0.009599,0.771564,0.12261
3,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",-0.001508,-0.009698,0.89623,0.109372
2,SGDRegressor,"{'alpha': 0.0001, 'average': False, 'early_sto...",-0.013515,-0.010353,0.070359,0.05359
4,ElasticNet,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.014538,-0.011007,0.0,-0.007133
8,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",-8.7e-05,-0.011485,0.99397,-0.0627
10,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'squared_error...",-2.4e-05,-0.017213,0.998329,-0.601051
