In [4]:
from sklearn.linear_model import SGDRegressor, LinearRegression, ElasticNet, BayesianRidge
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from utils.data_loader import AirbnbLoader
from utils.data_splitter import split_data
import pandas as pd


# Load the dataset through the project loader, asking for 'Price_Night' and the
# loader's normalized variant; the call returns a pair bound to X and y.
# NOTE(review): presumably X holds the features and y the nightly-price target,
# and what `normalized=True` rescales is defined in utils.data_loader — confirm there.
loader = AirbnbLoader()
X, y = loader.load_airbnb('Price_Night', normalized=True)
# `data` is indexed later in this notebook with the keys 'X_train', 'y_train',
# 'X_test' and 'y_test'.
data = split_data(X, y)


The code below loops over a number of popular machine learning regression algorithms, fitting each one on a more traditional train/test split, and returns a table of train and test MSE and R² ordered from best to worst in terms of test MSE (lowest first).

In [5]:

# Candidate regressors, each constructed with its library defaults
# (CatBoost only gets silent=True so it does not flood the cell output).
# NOTE(review): no random_state / seed is passed to any estimator, so scores of
# the randomized ones will likely differ between runs — consider seeding them.
MLA = [
  GradientBoostingRegressor(),
  LinearRegression(),
  SGDRegressor(),
  RandomForestRegressor(),
  ElasticNet(),
  SVR(),
  BayesianRidge(),
  KernelRidge(),
  XGBRegressor(),
  CatBoostRegressor(silent=True)
]


MLA_columns = ['MLA Name', 'MLA Parameters','Train MSE', 'Test MSE', 'Train r2', 'Test r2']

# One record per model; the frame is built once after the loop. Filling an empty
# frame cell-by-cell with .loc leaves every column (metrics included) as `object`
# dtype, which breaks numeric formatting and aggregation later on.
MLA_records = []

for alg in MLA:
  # Parameters are captured before fitting, so the table shows the configuration
  # each estimator was constructed with.
  MLA_parameters = str(alg.get_params())

  alg.fit(data['X_train'], data['y_train'])

  test_predictions = alg.predict(data['X_test'])
  train_predictions = alg.predict(data['X_train'])

  MLA_records.append({
    'MLA Name': alg.__class__.__name__,
    'MLA Parameters': MLA_parameters,
    'Train MSE': mean_squared_error(data['y_train'], train_predictions),
    'Test MSE': mean_squared_error(data['y_test'], test_predictions),
    'Train r2': r2_score(data['y_train'], train_predictions),
    'Test r2': r2_score(data['y_test'], test_predictions),
  })

# `columns=` fixes the column order (and yields an empty, correctly-labelled frame
# if MLA is empty). The row labels stay the position of each model in MLA, and the
# rows are ordered by test MSE, lowest (best) first. Sorting returns a new frame
# instead of mutating in place, so re-running the cell is idempotent.
MLA_compare = (
  pd.DataFrame(MLA_records, columns=MLA_columns)
  .sort_values(by=['Test MSE'], ascending=True)
)

MLA_compare

Unnamed: 0,MLA Name,MLA Parameters,Train MSE,Test MSE,Train r2,Test r2
7,KernelRidge,"{'alpha': 1, 'coef0': 1, 'degree': 3, 'gamma':...",0.007901,0.009545,0.37337,0.326081
6,BayesianRidge,"{'alpha_1': 1e-06, 'alpha_2': 1e-06, 'alpha_in...",0.007711,0.009667,0.388377,0.317471
1,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.007704,0.009725,0.38893,0.313375
3,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.001373,0.010489,0.891093,0.259438
9,CatBoostRegressor,"{'loss_function': 'RMSE', 'silent': True}",0.001362,0.010491,0.891988,0.259258
0,GradientBoostingRegressor,"{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': ...",0.003459,0.011309,0.725647,0.201571
8,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.000126,0.012332,0.99004,0.129289
5,SVR,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",0.012182,0.013217,0.033796,0.066805
4,ElasticNet,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",0.012608,0.01417,0.0,-0.00044
2,SGDRegressor,"{'alpha': 0.0001, 'average': False, 'early_sto...",1.4523720558613755e+18,1.8115214735581888e+18,-1.151931528194461e+20,-1.2790098372610272e+20
