# Ensemble 

In [35]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score,KFold,train_test_split
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from mlxtend.regressor import StackingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from xgboost import XGBRegressor

import warnings


warnings.filterwarnings("ignore")

# environment settings
data_path = 'Data/'

# Deserialize previously saved data from "preprocessing"
with open(data_path+'train_pp.obj', 'rb') as train_pp, \
open(data_path+'test_pp.obj','rb') as test_pp:
    train_df = pickle.load(train_pp)
    test_df = pickle.load(test_pp)
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])

In [36]:
X = train_df.loc[:,'Id':'SaleCondition_Partial']
y = train_df['SalePrice']
print("Shape of training set {}.\nShape of test set {}".format(X.shape,y.shape))

Shape of training set (1460, 303).
Shape of test set (1460,)


In [37]:
lasso = Lasso(alpha=0.00065)
ENet = ElasticNet(alpha=0.0008,l1_ratio=.55)
gBoost = GradientBoostingRegressor(n_estimators=4000, learning_rate=0.03,
                                   max_depth=2, max_features='sqrt', 
                                   loss='huber', random_state =5)

xgb= XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)


svm = SVR()
stregr = StackingRegressor(regressors=[lasso,ENet,xgb,gBoost], 
                           meta_regressor=svm)

kf = KFold(10, shuffle=True, random_state=42).get_n_splits(X)
rmse_cv_stack= np.sqrt(-cross_val_score(stregr, X, y, scoring="neg_mean_squared_error", cv = kf))
print("The 10-fold crossvalidation RMSE of Stack Regressor is {:.5f} +/- {:.3f}".format(rmse_cv_stack.mean(),
                                                                                          rmse_cv_stack.std()))

The 10-fold crossvalidation RMSE of Stack Regressor is 0.11573 +/- 0.019


Best is 0.11573 +/- 0.019 lasso,ENet,xgb,gBoost


In [17]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)


In [29]:
averaged = AveragingModels(models = (ENet, gBoost, lasso, xgb))
rmse_cv_avg= np.sqrt(-cross_val_score(averaged, X, y, scoring="neg_mean_squared_error", cv = kf))
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(rmse_cv_avg.mean(), rmse_cv_avg.std()))

 Averaged base models score: 0.1138 (0.0190)



Best is  Averaged base models score: 0.1138 (0.0190) <br>
ENet,gBoost,lasso,xgb

In [38]:
averaged_final = AveragingModels(models = (averaged, stregr))
rmse_cv_avg2= np.sqrt(-cross_val_score(averaged_final, X, y, scoring="neg_mean_squared_error", cv = kf))
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(rmse_cv_avg2.mean(), rmse_cv_avg2.std()))

 Averaged base models score: 0.1120 (0.0185)



In [39]:
#Prediction on real test set using Stack Regressor
averaged_final.fit(X,y)
pred_results_stack =averaged_final.predict(test_df)
pred_results_stack =np.expm1(pred_results_stack)
result_df = pd.DataFrame(data={'Id': test_df["Id"].values,
                               'SalePrice': pred_results_stack})
#Create output csv file
result_df.to_csv(data_path+"outputs/averaged_supreme_model", index=False)