# Model Selection

In [17]:
# import packages
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

In [18]:
# set up dataset: load the prepared frame, then split it into target and features
# NOTE: read_pickle can execute arbitrary code while loading; only use it on files you trust.
df = pd.read_pickle('../data/data.pkl')
y = df['Price']
X = df.drop(columns='Price')

In [19]:
# evaluation function (root mean squared error)
def rmse(model, features=None, target=None, n_splits=5, random_state=0):
    """Return the cross-validated root mean squared error of ``model``, one value per fold.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible regressor (or Pipeline) handed to ``cross_val_score``.
    features, target : optional
        Data to evaluate on. When omitted, the notebook-level ``X`` and ``y`` are used,
        which is the original behaviour of ``rmse(model)``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int, default 0
        Seed for the fold shuffling, so every model is scored on the same splits.

    Returns
    -------
    numpy.ndarray
        Array of RMSE scores as returned per split by ``cross_val_score``.
    """
    # fall back to the notebook globals at call time (not at definition time)
    features = X if features is None else features
    target = y if target is None else target

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    neg_mse = cross_val_score(model, features, target, scoring='neg_mean_squared_error', cv=kf)
    # the scorer reports negated MSE, so flip the sign before taking the square root
    return np.sqrt(-neg_mse)

In [20]:
# the list of models to evaluate (GridSearchCV was used to find the best parameters for each model type)
# The ensemble models are stochastic (bootstrap / feature subsampling), so they are seeded
# explicitly; without this the scores printed below change on every Restart & Run All.
SEED = 0

models = [
    Lasso(alpha=2.173),
    Ridge(alpha=24.995),
    ElasticNet(alpha=.733, l1_ratio=.992),
    # SVR is the only model here wrapped in a scaler, hence the Pipeline
    Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='linear'))]),
    RandomForestRegressor(n_estimators=400, max_features=.333, max_depth=None, random_state=SEED),
    AdaBoostRegressor(n_estimators=100, learning_rate=.05, random_state=SEED),
    GradientBoostingRegressor(n_estimators=600, max_depth=4, learning_rate=.05, max_features='sqrt',
                              loss='squared_error', random_state=SEED),
    XGBRegressor(colsample_bytree=.1, learning_rate=.05, max_depth=4, n_estimators=1000,
                 reg_alpha=.5, reg_lambda=.25, random_state=SEED)
]

In [21]:
# example of using GridSearchCV to tune hyperparameters

# params = {
#     'max_depth': (3, 4, None),
#     'n_estimators': (500, 1000, 2000),
#     'colsample_bytree': (.1, .333, .5),
#     'reg_alpha': (.25, .5, .75),
#     'reg_lambda': (.25, .5, .75)
# }
# reg = GridSearchCV(XGBRegressor(learning_rate=.05), params, n_jobs=-1)
# reg.fit(X, y)
# reg.best_estimator_

In [22]:
# create and evaluate models
def _display_name(model):
    """Return the class name to report for ``model``.

    For a Pipeline the name of its final step is used (e.g. the scaler+SVR pipeline is
    reported as 'SVR'); any other estimator is reported under its own class name.
    """
    estimator = model.steps[-1][1] if isinstance(model, Pipeline) else model
    return type(estimator).__name__


for model in models:
    score = rmse(model)  # array of per-fold RMSE values
    # Use the estimator type rather than the first letter of its repr: the old
    # "starts with 'P'" check would mislabel any estimator whose name begins with P.
    model_name = _display_name(model)
    print(f"{model_name} RMS Error: {score.mean()} \u00B1 {score.std()}")

Lasso RMS Error: 643.8068058994882 ± 25.97122056759165
Ridge RMS Error: 646.4154030121493 ± 24.026717424669933
ElasticNet RMS Error: 644.747005235093 ± 24.67139963088082
SVR RMS Error: 680.6346919525198 ± 24.2784569418987
RandomForestRegressor RMS Error: 593.7104661927427 ± 21.765800320238014
AdaBoostRegressor RMS Error: 733.4535651653056 ± 28.7929547409758
GradientBoostingRegressor RMS Error: 571.1583545109514 ± 33.86346885086319
XGBRegressor RMS Error: 561.8925051275608 ± 29.550546284032723


In [23]:
# Summary statistics of the target (Price), shown so the RMSE values above can be
# compared with the price range of the datapoints.
y.describe()

count     3543.000000
mean      2208.412645
std        918.063358
min        600.000000
25%       1695.000000
50%       1995.000000
75%       2450.000000
max      10000.000000
Name: Price, dtype: float64

In [24]:
# analyze coefficients of regression to see which factors are weighted most heavily

# fit on the full dataset with the same alpha used for the Lasso entry in `models`
lasso = Lasso(alpha=2.173).fit(X, y)
# one coefficient per feature column, largest (most positive) first
coef = pd.Series(lasso.coef_, index=X.columns).sort_values(ascending=False)
n_zero = int((coef == 0).sum())

print(f'Top Factors and Coefficients: \n{coef.head(5)}')
print(f'\nBottom Factors and Coefficients: \n{coef.tail(5)}')
# report the real number of factors instead of a hard-coded total, so the message
# stays correct if the feature set changes
print(f'\n{n_zero} Zero-Coefficient Factors (out of {len(coef)})')

Top Factors and Coefficients: 
Full_Baths       1256.193711
SQFT_Int          917.906860
Median Income     699.180921
Age               596.454562
Half_Baths        559.761821
dtype: float64

Bottom Factors and Coefficients: 
Rooms: Other          -95.543028
Style: Brick Front    -99.744197
Assoc                -117.129119
Households           -134.380591
Rooms: Game          -269.949401
dtype: float64

86 Zero-Coefficient Factors (out of 177)


# Results
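The XGBoost Regressor performed the best on this dataset, narrowly ahead of the Gradient Boosting Regressor (about 562 USD vs. 571 USD, a gap smaller than the fold-to-fold spread) and well ahead of the linear models (about 644–646 USD). However, even its root mean squared error was about 562 USD, roughly a quarter of the mean price (about 2,208 USD). While this model can give a reasonable estimate of the price range of a given rental, it should by no means be used as an exact price calculator. The standard deviation of the error across the five folds is only about 30 USD, though, suggesting that the model's performance is stable across folds and that it generalizes well.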

As expected, the Lasso model highly weights factors that were identified in the exploratory phase to have the greatest positive correlation with price, such as the number of full baths, the square footage of the home, and the median income of the area. Interestingly, the largest negative coefficient in the output above belongs to `Rooms: Game` (about -270), followed by `Households` and `Assoc`; a lot-size factor is not among the five most negative coefficients shown, so the earlier observation that rentals on large lots go for lower prices is not supported by this run. The model also zeroes out 86 of the 177 variables, leaving 91 with non-zero coefficients, which indicates that only about 90 or fewer of the columns were necessary to predict pricing at this level of accuracy.