# Model Selection

In [75]:
# import packages
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

In [76]:
# load the prepared dataset, then separate the target column from the features
df = pd.read_pickle('../data/data.pkl')
y = df['Price']
X = df.drop(columns='Price')

In [77]:
# evaluation function (root mean squared error)
def rmse(model):
    """Return the cross-validated root mean squared error of ``model``.

    The estimator is scored on the notebook-level ``X`` and ``y`` using a
    shuffled 5-fold split with a fixed seed (``random_state=0``).

    Returns the array produced by ``cross_val_score`` (one score per fold),
    converted from negated MSE to RMSE.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=0)
    neg_mse = cross_val_score(model, X, y, cv=folds, scoring='neg_mean_squared_error')
    # the scorer reports negated MSE, so flip the sign before taking the root
    return np.sqrt(-neg_mse)

In [78]:
# the list of models to evaluate (GridSearchCV was used to find the best parameters for each model type)
# the randomized estimators (forest, boosting, XGBoost column subsampling) are seeded with the
# same value as the KFold split so that re-running the notebook reproduces the same scores
models = [
    Lasso(alpha=2.173),
    Ridge(alpha=24.995),
    ElasticNet(alpha=.733, l1_ratio=.992),
    SVR(kernel='linear'),
    RandomForestRegressor(n_estimators=1800, max_features=.333, max_depth=None, random_state=0),
    AdaBoostRegressor(n_estimators=100, learning_rate=.05, random_state=0),
    GradientBoostingRegressor(n_estimators=600, max_depth=4, learning_rate=.05, max_features='sqrt', loss='squared_error', random_state=0),
    XGBRegressor(colsample_bytree=.1, learning_rate=.05, max_depth=4, n_estimators=1000, reg_alpha=.5, reg_lambda=.25, random_state=0)
]

In [79]:
# example of using GridSearchCV to tune hyperparameters

# params = {
#     'max_depth': (3, 4, None),
#     'n_estimators': (500, 1000, 2000),
#     'colsample_bytree': (.1, .333, .5),
#     'reg_alpha': (.25, .5, .75),
#     'reg_lambda': (.25, .5, .75)
# }
# reg = GridSearchCV(XGBRegressor(learning_rate=.05), params, n_jobs=-1)
# reg.fit(X, y)
# reg.best_estimator_

In [80]:
# create and evaluate models: print mean ± standard deviation of the per-fold RMSE
for model in models:
    score = rmse(model)
    # take the label from the estimator's class instead of parsing its repr string
    model_name = type(model).__name__
    print(f"{model_name} RMS Error: {score.mean()} \u00B1 {score.std()}")

Lasso RMS Error: 677.734774857363 ± 33.427346359761515
Ridge RMS Error: 676.4928888133803 ± 36.93320032272733
ElasticNet RMS Error: 675.3574861650643 ± 36.11773111759448
SVR RMS Error: 835.2549960323838 ± 38.13046119892962
RandomForestRegressor RMS Error: 621.3633780947794 ± 35.48021162876483
AdaBoostRegressor RMS Error: 771.1188945457418 ± 29.903099422023182
GradientBoostingRegressor RMS Error: 590.6028157085999 ± 31.311346276600183
XGBRegressor RMS Error: 576.4541533015773 ± 35.790300554675454


In [81]:
# summary statistics of the target (Price), to compare the regression errors above
# with the price range of the datapoints
y.describe()

count     3617.000000
mean      2245.439591
std        998.177401
min        600.000000
25%       1695.000000
50%       1995.000000
75%       2495.000000
max      10000.000000
Name: Price, dtype: float64

In [99]:
# analyze coefficients of regression to see which factors are weighted most heavily

# fit on the full dataset: only the fitted coefficients are inspected here
lasso = Lasso(alpha=2.173).fit(X, y)
# sorted from most positive to most negative, so the head holds the top
# factors and the tail holds the bottom ones
coef = pd.Series(lasso.coef_, index=X.columns).sort_values(ascending=False)
# derive both counts from the data so the message stays correct if the
# feature set changes
n_zero = int((coef == 0).sum())
print(f'Top Factors and Coefficients: \n{coef.head(5)}')
print(f'\nBottom Factors and Coefficients: \n{coef.tail(5)}')
print(f'\n{n_zero} Zero-Coefficient Factors (out of {len(coef)})')

Top Factors and Coefficients: 
Full_Baths       1190.423131
SQFT_Int         1145.226357
Median Income     722.216302
Age               539.660418
Grill             463.367039
dtype: float64

Bottom Factors and Coefficients: 
Style: Brick Front            -99.867705
Lot_Size_Num                 -102.058313
Amenities: Tennis Court(s)   -107.112452
Households                   -156.347660
Rooms: Game                  -271.265033
dtype: float64

74 Zero-Coefficient Factors (out of 177)


# Results
The XGBoost Regressor performed the best on this dataset, ahead of the Gradient Boosting Regressor (about 576 USD vs. 591 USD root mean square error). However, even an error of about 576 USD is roughly a quarter of the mean price of about 2,245 USD. While this model can give a reasonable estimate of the price range of a given rental, it should by no means be used as an exact price calculator. The standard deviation of the error across all folds is only about 36 USD, though, indicating that the model generalizes well.

As expected, the Lasso model assigns the largest weights to factors that were identified in the exploratory phase as having the greatest positive correlation with price, such as the number of full baths, the square footage of the home, and the median income of the area. Interestingly, the model assigns one of the largest negative coefficients to the size of the lot, perhaps indicating that home rentals on large estates go for lower prices. The model also zeroes out 74 of the 177 variables, indicating that at most about 100 of the columns were needed to predict pricing accurately.