In [None]:
# import dependencies and global settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# apply seaborn's 'whitegrid' style globally so later figures share one look
sns.set_style('whitegrid')

# scikit-learn: CV splitters/helpers, the stacking ensemble and the MAE metric
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate, cross_val_predict, cross_val_score
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# pickle is used to persist the cross-validation results to disk
import pickle

In [None]:
# Read the feature frame (X) and the target frame (y) from CSV.
# index_col=0 makes the first CSV column the row index of each frame.
X, y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data_frames/housing_X_features.csv',
        './data_frames/housing_y_target.csv',
    )
)

In [None]:
# dummy encode all categorical features
# drop_first=True drops the first level of every encoded feature, so a feature
# with k categories yields k-1 indicator columns.
# Note: X is rebound here, so the un-encoded frame is no longer available below.
X = pd.get_dummies(X, drop_first=True)

## Load models and train stacking regressor

In [None]:
# Load the three previously fitted model objects from their pickle files.
# These files are produced by this project; do not point this at untrusted pickles.
lr_lasso, rfr_bagging, rfr_boosting = (
    pd.read_pickle(model_path)
    for model_path in (
        r'./models/lr_lasso.pkl',
        r'./models/rfr_bagging.pkl',
        r'./models/rfr_boosting.pkl',
    )
)

In [None]:
# Level-0 (base) learners: the `best_estimator_` of each loaded model object,
# keyed by the name the stacking model will know it by.
level_0_estimators = {
    "lr_lasso": lr_lasso.best_estimator_,
    "rfr_bagging": rfr_bagging.best_estimator_,
    "rfr_boosting": rfr_boosting.best_estimator_,
}

# One "<name>_prediction" label per base learner, in insertion order.
level_0_columns = [f"{name}_prediction" for name in level_0_estimators]

# Level-1 (final) learner: a random forest with a fixed seed for reproducibility.
rand_state = 12
level_1_estimator = RandomForestRegressor(random_state=rand_state)

In [None]:
# Assemble the stacking regressor.
# The shuffled 5-fold splitter is handed to the stacker as its internal cv.
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=12)

# passthrough=True feeds the original features to the final estimator
# alongside the base learners' predictions.
stacking_clf = StackingRegressor(
    estimators=[(name, model) for name, model in level_0_estimators.items()],
    final_estimator=level_1_estimator,
    passthrough=True,
    cv=cv,
)

In [None]:
# Cross-validate the stacking model with a shuffled 5-fold split.
# Train scores and the fitted estimator of every fold are kept in the results.
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=12)
stacking_cv_results = cross_validate(
    stacking_clf,
    X,
    np.ravel(y),  # flatten the single-column target frame to 1-D
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)

# Report each fold's test score, then their mean.
fold_scores = stacking_cv_results['test_score']
for fold_score in fold_scores:
    print(fold_score)

mean_score = fold_scores.mean()
print('Mean Score (r^2)=' + str(mean_score))

In [None]:
# save cv_results to pickle file
# The context manager closes the file even if pickling raises part-way through,
# which the manual open()/close() pair did not guarantee.
filename = './models/stacking_cv_results.pkl'
with open(filename, 'wb') as outfile:
    pickle.dump(stacking_cv_results, outfile)

In [None]:
# Select the best model from cv: the fold estimator with the highest test score.
# np.argmax returns the position of the maximum score (first one on ties).
# Comparing the score array with itself would match every fold and therefore
# always pick fold 0, so the maximum is located explicitly.
best_fold = int(np.argmax(stacking_cv_results['test_score']))
stacking_best_model = stacking_cv_results['estimator'][best_fold]

# calculate MAE
# NOTE: X contains the rows this fold's estimator was fitted on, so this value
# is not a pure out-of-sample error.
y_pred = stacking_best_model.predict(X)
print('Mean Absolute Error:' + str(mean_absolute_error(y, y_pred)))

## Save scores and graph results

In [None]:
# Lasso: test score of the best parameter setting on each of the 5 CV splits
# (cv_results_ keys 'split0_test_score' ... 'split4_test_score').
lr_lasso_scores = [
    lr_lasso.cv_results_[f'split{split}_test_score'][lr_lasso.best_index_]
    for split in range(5)
]
lr_lasso_scores

In [None]:
# Bagging: test score of the best parameter setting on each of the 5 CV splits
# (cv_results_ keys 'split0_test_score' ... 'split4_test_score').
rfr_bagging_scores = [
    rfr_bagging.cv_results_[f'split{split}_test_score'][rfr_bagging.best_index_]
    for split in range(5)
]
rfr_bagging_scores

In [None]:
# Boosting: test score of the best parameter setting on each of the 5 CV splits
# (cv_results_ keys 'split0_test_score' ... 'split4_test_score').
rfr_boosting_scores = [
    rfr_boosting.cv_results_[f'split{split}_test_score'][rfr_boosting.best_index_]
    for split in range(5)
]
rfr_boosting_scores

In [None]:
# Reload the pickled stacking CV results and collect the per-fold test scores
# into a plain list.
stacking_cv_results = pd.read_pickle(r'./models/stacking_cv_results.pkl')
stacking_scores = [fold_score for fold_score in stacking_cv_results['test_score']]
stacking_scores

In [None]:
# Wide score table: one column per model type, one row per CV split.
df_scores = pd.DataFrame(
    dict(
        zip(
            ('Penalized', 'Random Forest', 'Boosting', 'Stacking'),
            (lr_lasso_scores, rfr_bagging_scores, rfr_boosting_scores, stacking_scores),
        )
    )
)
df_scores

In [None]:
# Reshape the wide table to long form with one (model_type, score) pair per row.
# stack() + reset_index() yields the columns level_0 (split row), level_1
# (original column name) and 0 (value); the split row is not needed afterwards.
df_model_scores = (
    df_scores
    .stack()
    .reset_index()
    .drop(columns='level_0')
    .rename(columns={'level_1': 'model_type', 0: 'score'})
)
df_model_scores

In [None]:
# Order the model types by their median CV score, best first.
median_by_model = df_model_scores.groupby('model_type')['score'].median()
ranks = median_by_model.sort_values(ascending=False).index

# Box plot of the per-split scores, one box per model type, in ranked order.
# NOTE(review): the title says "Mean", but the boxes show the distribution of
# individual split scores -- confirm the intended wording.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='model_type', y='score', data=df_model_scores, order=ranks, ax=ax)
ax.set_title('Mean Cross Validation Scores')
ax.set_xlabel("model type")
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("CV scores (r^2)")
plt.show()