# Ensemble 

In [51]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score,KFold,train_test_split
from sklearn.linear_model import ElasticNet, Lasso,Ridge
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from mlxtend.regressor import StackingRegressor,StackingCVRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler


import warnings


warnings.filterwarnings("ignore")

# environment settings
data_path = 'Data/'

# Deserialize previously saved data from "preprocessing"
with open(data_path+'train_pp.obj', 'rb') as train_pp, \
open(data_path+'test_pp.obj','rb') as test_pp:
    train_df = pickle.load(train_pp)
    test_df = pickle.load(test_pp)
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])

In [52]:
X = train_df.loc[:,'Id':'SaleCondition_Partial']
y = train_df['SalePrice']
print("Shape of training set {}.\nShape of test set {}".format(X.shape,y.shape))

Shape of training set (1456, 304).
Shape of test set (1456,)


In [53]:
scaler = RobustScaler()
X_scaled=scaler.fit(X).transform(X)
test_scaled=scaler.transform(test_df)

In [54]:
lasso = Lasso(alpha=0.00065)
ENet = ElasticNet(alpha=0.0008,l1_ratio=.55)
gBoost = GradientBoostingRegressor(n_estimators=4000, learning_rate=0.03,
                                   max_depth=2, max_features='sqrt', 
                                   loss='huber', random_state =5)

xgb= XGBRegressor(colsample_bytree=0.1, gamma=0.03, 
                             learning_rate=0.02, max_depth=3, 
                             n_estimators=3000,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

Best is 0.11484 +/- 0.017 lasso,ENet,xgb,gBoost


In [55]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

In [56]:
kf = KFold(10, shuffle=True, random_state=42).get_n_splits(X)

averaged = AveragingModels(models = (gBoost,xgb,ENet,lasso))
rmse_cv_avg= np.sqrt(-cross_val_score(averaged, X_scaled, y, scoring="neg_mean_squared_error", cv = kf))
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(rmse_cv_avg.mean(), rmse_cv_avg.std()))

 Averaged base models score: 0.1051 (0.0157)



Best is  Averaged base models score: 0.1053 (0.0161) <br>
ENet,gBoost,lasso,xgb

In [58]:
#Prediction on real test set using Stack Regressor
averaged.fit(X_scaled,y)
pred_results_stack =averaged.predict(test_scaled)
pred_results_stack =np.expm1(pred_results_stack)
result_df = pd.DataFrame(data={'Id': test_df["Id"].values,
                               'SalePrice': pred_results_stack})

#Create output csv file
result_df.to_csv(data_path+"outputs/averaged_model", index=False)