In my previous notebook I did a high-level EDA on all the features: I checked missing values and explored the numeric and categorical attributes that could be useful for training a regression model (if you are interested, you can find that notebook here: [Predicting House Prices - Data Processing and EDA](https://www.kaggle.com/biyuyang/predicting-house-prices-data-processing-and-eda)). In this notebook, I will explore which model performs best and which features contribute most to model performance. Below are the topics covered in this notebook:
* Write up a data pipeline to execute basic data transformation
* Batch train some baseline models and pick one for further training
* Update the data processing pipeline with model tuning and feature selection processes

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings("ignore")

In [85]:
# Load the competition train and test CSV files from the Kaggle input directory
houseTrainRaw = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
houseTestRaw = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [86]:
# Work on copies so that the frames loaded from disk are not modified
houseTrain = houseTrainRaw.copy()
houseTest = houseTestRaw.copy()

## Data processing
* Turn year columns (e.g. the year built) into ages relative to the current year
* Add an indicator showing whether a record has a missing value for each attribute (made optional so that it can be switched on or off in a grid search)
* Categorize numeric and categorical attributes and process them separately

In [4]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime as dt

In [90]:
# replace year columns with their age in years
class YearsToAges(BaseEstimator, TransformerMixin):
    """Replace year-valued columns with ages (current year minus the value).

    Parameters
    ----------
    yearCols : list of str
        Names of the columns that hold calendar years.
    """

    def __init__(self, yearCols):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.yearCols = yearCols

    @property
    def cols(self):
        """Backward-compatible alias for ``yearCols``."""
        return self.yearCols

    @cols.setter
    def cols(self, value):
        self.yearCols = value

    def fit(self, X, y = None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` whose year columns hold ages.

        The input frame is left untouched, and each converted column keeps
        its original name and position.
        """
        # Copy first: assigning into X directly would add columns to the
        # caller's DataFrame as a side effect.
        X = X.copy()
        currentYear = dt.now().year
        for col in self.yearCols:
            X[col] = currentYear - X[col]
        return X

In [91]:
# track missing columns before imputing if needed
class AddMissingIndicator(BaseEstimator, TransformerMixin):
    """Optionally append one boolean ``<column>_MissingInd`` flag per column.

    Parameters
    ----------
    include_missing_cols : bool, default False
        When false the data passes through unchanged; when true a missing
        value indicator is appended for every column of the input.
    """

    def __init__(self, include_missing_cols = False):
        self.include_missing_cols = include_missing_cols

    def fit(self, X, y = None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` itself, or a new frame with the indicators appended."""
        if not self.include_missing_cols:
            return X
        # Build the flags separately and concatenate so that the caller's
        # DataFrame never gains columns as a side effect.
        indicators = X.isna().add_suffix('_MissingInd')
        return pd.concat([X, indicators], axis = 1)

In [92]:
# pick out one group of attributes (numeric or categorical) by column name
class NumCatSelector(BaseEstimator, TransformerMixin):
    """Select the named columns, optionally with their missing indicators.

    Parameters
    ----------
    attribute_names : sequence of str
        Columns to keep.
    include_missing_cols : bool, default False
        When True, the matching ``<name>_MissingInd`` columns are kept too.
    """

    def __init__(self, attribute_names, include_missing_cols = False):
        self.attribute_names = attribute_names
        self.include_missing_cols = include_missing_cols

    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        # Plain selection unless the indicator columns were requested
        if self.include_missing_cols != True:
            return X[self.attribute_names]

        indicatorNames = [name + '_MissingInd' for name in self.attribute_names]
        pieces = [X[self.attribute_names], X[indicatorNames]]
        return pd.concat(pieces, axis = 1)

In [93]:
# process numeric attributes
class ProcessNumAttr(BaseEstimator, TransformerMixin):
    """Median-impute and standardise numeric attributes.

    The imputer and scaler are learned in ``fit`` and only applied in
    ``transform``, so new data is processed with the statistics of the data
    the transformer was fitted on.

    Parameters
    ----------
    include_missing_cols : bool, default False
        When true, columns whose name ends with ``_MissingInd`` are excluded
        from imputation and scaling and appended unchanged to the output.
        The input must then be a DataFrame.
    """

    def __init__(self, include_missing_cols = False):
        self.include_missing_cols = include_missing_cols

    @staticmethod
    def _indicator_columns(X):
        """Return the names of the missing-indicator columns of ``X``."""
        return [col for col in X.columns if col.endswith('_MissingInd')]

    def fit(self, X, y = None):
        """Learn the column medians and the scaling parameters from ``X``."""
        values = X
        if self.include_missing_cols:
            values = X.drop(columns = self._indicator_columns(X))
        self.imputer_ = SimpleImputer(strategy = 'median')
        self.scaler_ = StandardScaler()
        self.scaler_.fit(self.imputer_.fit_transform(values))
        return self

    def transform(self, X, y = None):
        """Impute and scale ``X`` with the fitted statistics.

        Returns a NumPy array; indicator columns, if any, come last.
        """
        # Local import: the notebook's import cell does not provide it.
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self)

        if self.include_missing_cols:
            missingCols = self._indicator_columns(X)
            values = X.drop(columns = missingCols)
            XScale = self.scaler_.transform(self.imputer_.transform(values))
            return np.c_[XScale, X[missingCols]]

        return self.scaler_.transform(self.imputer_.transform(X))

In [104]:
# process categorical features
class ProcessCatAttr(BaseEstimator, TransformerMixin):
    """Fill missing categories with 'No Feature' and one-hot encode.

    The encoder is learned in ``fit`` and only applied in ``transform``, so
    the output always has the columns seen at fit time; categories that were
    not seen are ignored (``handle_unknown = 'ignore'``).
    """

    def __init__(self):
        pass

    @staticmethod
    def _fill_missing(X):
        """Return an object-typed copy of ``X`` with NaN -> 'No Feature'."""
        # astype() returns a new frame, so the input is never written to.
        asObjects = X.astype('object')
        return asObjects.where(asObjects.notna(), 'No Feature')

    def fit(self, X, y = None):
        """Learn the categories of every column of ``X``."""
        self.encoder_ = OneHotEncoder(handle_unknown = 'ignore')
        self.encoder_.fit(self._fill_missing(X))
        return self

    def transform(self, X, y = None):
        """Return the one-hot encoding of ``X`` using the fitted categories."""
        # Local import: the notebook's import cell does not provide it.
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self)
        return self.encoder_.transform(self._fill_missing(X))

In [105]:
# put them all together
# categorize columns
IdCol = ['Id']
label = ['SalePrice']
num = [
    'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'LotArea', 'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath','BsmtHalfBath', 'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'OverallQual', 'OverallCond'
]
yrCols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
# Every column that is not the Id, the label or a numeric attribute is treated
# as categorical. The names come from the raw frame so that columns added to
# the working copy `houseTrain` after loading are never mistaken for
# categorical attributes when this cell is re-executed.
cat = houseTrainRaw.columns.drop(IdCol + label + num)

# Numeric attributes pipeline: years -> ages, select, impute + scale
num_pipeline = Pipeline([
    ('years_to_ages', YearsToAges(yrCols)),
    ('add_missing_ind', AddMissingIndicator(False)),
    ('select_num_attr', NumCatSelector(num, False)),
    ('process_num_attr', ProcessNumAttr(False))
])

# categorical attributes pipeline: select, fill missing + one-hot encode
cat_pipeline = Pipeline([
    ('add_missing_ind', AddMissingIndicator(False)),
    ('select_cat_attr', NumCatSelector(cat, False)),
    ('process_cat_attr', ProcessCatAttr())
])

# Both pipelines receive the same input frame; their outputs are stacked side by side
full_pipeline = FeatureUnion(
    transformer_list = [
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline)
    ]
)

In [11]:
# Fit the preprocessing pipelines on the training frame and transform it in one step
houseTrainClean = full_pipeline.fit_transform(houseTrain)

## Model Training
* Split data set into train and test
* Batch train several models and pick the best performer OR try stacking the regressors
* Grid search with CV on hyperparameter tuning - full data and model pipeline will be used

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score 
import time as t

In [13]:
def batch_fit_models(xT, yT, xV, yV, models):
    """Fit every model on the training split and score it on both splits.

    Parameters
    ----------
    xT, yT : training features and targets.
    xV, yV : validation features and targets.
    models : dict mapping a display name to an unfitted estimator.

    Returns
    -------
    pandas.DataFrame
        One row per model with the CPU time spent in ``fit`` and the
        RMSE, MAE, MSLE and R2 on the training and validation data.
        MSLE is NaN where scikit-learn cannot compute it.
    """

    def _msle_or_nan(yTrue, yPred):
        # mean_squared_log_error raises ValueError on negative values, which
        # an unconstrained regressor can predict; one such model must not
        # discard the results of the whole batch.
        try:
            return mean_squared_log_error(yTrue, yPred)
        except ValueError:
            return np.nan

    resultCols = [
        'Model', 'Train Time', 
        'Train RMSE', 'Validation RMSE',
        'Train MAE', 'Validation MAE',
        'Train MSLE', 'Validation MSLE',
        'Train R2', 'Validation R2'
    ]

    rows = []

    # batch train models
    for model_name, model in models.items():

        # train model and record the elapsed CPU time (time.process_time)
        trainStart = t.process_time()
        fitted = model.fit(xT, yT)
        trainEnd = t.process_time()

        # predictions on the data the model was trained on ...
        predTrain = fitted.predict(xT)

        # ... and on the held-out validation data
        predValid = fitted.predict(xV)

        rows.append({
            'Model': model_name,
            'Train Time': trainEnd - trainStart,
            'Train RMSE': np.sqrt(mean_squared_error(yT, predTrain)),
            'Validation RMSE': np.sqrt(mean_squared_error(yV, predValid)),
            'Train MAE': mean_absolute_error(yT, predTrain),
            'Validation MAE': mean_absolute_error(yV, predValid),
            'Train MSLE': _msle_or_nan(yT, predTrain),
            'Validation MSLE': _msle_or_nan(yV, predValid),
            'Train R2': r2_score(yT, predTrain),
            'Validation R2': r2_score(yV, predValid)
        })

    # passing the column list keeps the column order (and the columns
    # themselves when `models` is empty)
    dfResult = pd.DataFrame(rows, columns = resultCols)

    return dfResult

In [14]:
# Target values; `label` is a list, so this selects a one-column DataFrame rather than a Series
y = houseTrain[label]
# Hold out 20% of the transformed rows for validation, with a fixed seed for the split
xTrain, xValid, yTrain, yValid = train_test_split(houseTrainClean, y, test_size = 0.2, random_state = 1206)

In [15]:
# Baseline candidates, keyed by the name shown in the result table
modelsToFit = {
    # linear models
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha = 0.1, random_state = 777),
    'Lasso': Lasso(alpha = 0.1, random_state = 777),
    'Elastic Net': ElasticNet(alpha = 0.1, random_state = 777),
    # NOTE(review): LogisticRegression is a classifier, so it treats every
    # distinct sale price as its own class - confirm this entry is intended.
    'Logistic Regression': LogisticRegression(random_state = 777),
    # support vector regressors
    'SVR (linear kernel)': SVR(kernel = 'linear'),
    'Linear SVR': LinearSVR(random_state = 777),
    # tree ensembles
    'Random Forest': RandomForestRegressor(random_state = 777),
    'AdaBoost': AdaBoostRegressor(random_state = 777),
    'GBR': GradientBoostingRegressor(random_state = 777),
    # all regressors above (fresh instances) blended by an elastic net
    'Stacked Regressors': StackingRegressor(
        estimators = [
            ('linear_reg', LinearRegression()),
            ('ridge', Ridge(alpha = 0.1, random_state = 777)),
            ('lasso', Lasso(alpha = 0.1, random_state = 777)),
            ('linear_svr', LinearSVR(random_state = 777)),
            ('linear_kernel_svm', SVR(kernel = 'linear')),
            ('rf', RandomForestRegressor(random_state = 777)),
            ('adaboost', AdaBoostRegressor(random_state = 777)),
            ('gbr', GradientBoostingRegressor(random_state = 777))
        ],
        final_estimator = ElasticNet(alpha = 0.1, random_state = 777)
    )
}

In [16]:
# Fit every baseline model and list the results with the lowest validation RMSE first
baselineModel = batch_fit_models(xTrain, yTrain, xValid, yValid, modelsToFit)
baselineModel.sort_values(by = 'Validation RMSE')

Unnamed: 0,Model,Train Time,Train RMSE,Validation RMSE,Train MAE,Validation MAE,Train MSLE,Validation MSLE,Train R2,Validation R2
10,Stacked Regressors,78.717372,11422.722336,22684.25693,8059.293835,16034.704945,0.004415,0.018836,0.979138,0.920899
3,Elastic Net,0.423113,28245.630203,24444.901647,15829.38028,16372.713153,0.016046,0.01947,0.872441,0.908144
9,GBR,1.357595,13560.355612,24658.801811,10094.062083,17510.70587,0.006999,0.021499,0.9706,0.906529
0,Linear Regression,0.53787,19925.311008,25607.803601,12537.830185,18275.8156,0.010525,0.038442,0.936523,0.899196
2,Lasso,2.019495,19925.503195,25648.321302,12537.491673,18262.996777,0.010525,0.038395,0.936521,0.898877
1,Ridge,0.075468,20268.261732,25668.639738,12867.882644,18168.493706,0.010796,0.033844,0.934319,0.898717
7,Random Forest,11.372823,11839.034376,27304.956053,6687.065154,18756.376233,0.003467,0.026863,0.97759,0.885392
8,AdaBoost,0.819192,29066.972591,35524.322748,22828.97618,26215.700668,0.03607,0.052589,0.864915,0.806008
4,Logistic Regression,26.075781,1453.841746,50466.849538,98.886986,31016.476027,5.7e-05,0.060665,0.999662,0.608489
5,SVR (linear kernel),0.573986,75191.696486,77236.384356,49416.298172,51714.245622,0.127443,0.142946,0.096042,0.082987


The results show that the stacked regressors perform better than any individual regressor (although more regularization is probably needed - the gap between train and validation RMSE suggests overfitting). What about adding a missing indicator for each column?

In [17]:
# Same preprocessing as before, this time with the missing indicators switched on

# Numeric attributes pipeline
num_pipeline = Pipeline(steps = [
    ('years_to_ages', YearsToAges(yearCols = yrCols)),
    ('add_missing_ind', AddMissingIndicator(include_missing_cols = True)),
    ('select_num_attr', NumCatSelector(attribute_names = num, include_missing_cols = True)),
    ('process_num_attr', ProcessNumAttr(include_missing_cols = True)),
])

# categorical attributes pipeline
cat_pipeline = Pipeline(steps = [
    ('add_missing_ind', AddMissingIndicator(include_missing_cols = True)),
    ('select_cat_attr', NumCatSelector(attribute_names = cat, include_missing_cols = True)),
    ('process_cat_attr', ProcessCatAttr()),
])

# numeric and categorical outputs side by side
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [18]:
# Transform the training frame with the indicator-enabled pipeline, then split it
# with the same test size and seed as the first split
houseTrainMisInd = full_pipeline.fit_transform(houseTrain)
xTrain2, xValid2, yTrain2, yValid2 = train_test_split(houseTrainMisInd, y, test_size = 0.2, random_state = 1206)

In [19]:
# Refit the same set of models on the data that includes the missing indicators
baselineModelMisInd = batch_fit_models(xTrain2, yTrain2, xValid2, yValid2, modelsToFit)
baselineModelMisInd.sort_values(by = 'Validation RMSE')

Unnamed: 0,Model,Train Time,Train RMSE,Validation RMSE,Train MAE,Validation MAE,Train MSLE,Validation MSLE,Train R2,Validation R2
10,Stacked Regressors,89.843666,11989.019092,23051.310646,8525.750203,16369.628728,0.004995,0.019561,0.977019,0.918319
3,Elastic Net,0.60061,28226.539967,24448.015863,15825.225027,16385.800362,0.016052,0.019464,0.872613,0.90812
9,GBR,1.62089,13560.355612,24959.528907,10094.062083,17600.028495,0.006999,0.02166,0.9706,0.904236
0,Linear Regression,0.808592,19924.570196,25598.154796,12533.28074,18255.407216,0.010534,0.037967,0.936527,0.899272
2,Lasso,2.333504,19924.738758,25643.427479,12533.132205,18246.190332,0.010533,0.037953,0.936526,0.898916
1,Ridge,0.109584,20216.579867,25672.721649,12842.165625,18175.570583,0.010796,0.033027,0.934653,0.898685
7,Random Forest,12.698627,11983.363764,27266.411583,6715.997012,18817.167808,0.003509,0.026715,0.97704,0.885715
8,AdaBoost,0.962867,29578.087578,36285.20038,23197.570566,27326.032011,0.038603,0.054089,0.860122,0.797609
4,Logistic Regression,31.752303,1453.841746,50750.322894,98.886986,31276.921233,5.7e-05,0.062222,0.999662,0.604078
5,SVR (linear kernel),0.642787,75009.573017,77046.461788,49225.290266,51529.998767,0.126522,0.141973,0.100415,0.087491


Hmmm... The validation RMSE barely changes (and gets slightly worse for the stacked regressors), so it looks like we should forget about adding missing indicators.

## Feature selection
* Univariate feature selection
* Recursive feature elimination
* Based on model

In [20]:
from sklearn.feature_selection import GenericUnivariateSelect, RFECV, SelectFromModel, f_regression, mutual_info_regression

In [21]:
def feature_selection_strategy(xT, yT, xV, yV, strats):
    """Compare feature selection strategies using a stacked regressor.

    For each strategy the selector is fitted on the training split, both
    splits are reduced to the selected features, and the stacked regressor
    is refitted and scored.

    Parameters
    ----------
    xT, yT : training features and targets.
    xV, yV : validation features and targets.
    strats : dict mapping a display name to an unfitted feature selector.

    Returns
    -------
    pandas.DataFrame
        One row per strategy with the CPU time spent on selection plus model
        fitting and the RMSE, MAE, MSLE and R2 on both splits. MSLE is NaN
        where scikit-learn cannot compute it.
    """

    def _msle_or_nan(yTrue, yPred):
        # mean_squared_log_error raises ValueError on negative values; one
        # such prediction must not discard the results of every strategy.
        try:
            return mean_squared_log_error(yTrue, yPred)
        except ValueError:
            return np.nan

    resultCols = [
        'Strategy', 'Train Time', 
        'Train RMSE', 'Validation RMSE',
        'Train MAE', 'Validation MAE',
        'Train MSLE', 'Validation MSLE',
        'Train R2', 'Validation R2'
    ]

    # the stacked regression that is refitted for every strategy
    estimators = [
        ('linear_reg', LinearRegression()), 
        ('ridge', Ridge(alpha = 0.1, random_state = 777)), 
        ('lasso', Lasso(alpha = 0.1, random_state = 777)), 
        ('linear_svr', LinearSVR(random_state = 777)), 
        ('linear_kernel_svm', SVR(kernel = 'linear')), 
        ('rf', RandomForestRegressor(random_state = 777)), 
        ('adaboost', AdaBoostRegressor(random_state = 777)), 
        ('gbr', GradientBoostingRegressor(random_state = 777)) 
    ]

    stackedRegressor = StackingRegressor(estimators = estimators, final_estimator = ElasticNet(alpha = 0.1, random_state = 777))

    rows = []

    for strat_name, strat in strats.items():

        # select features, train the model and record the elapsed CPU time
        trainStart = t.process_time()
        selector = strat.fit(xT, yT)
        xTU = selector.transform(xT)
        xVU = selector.transform(xV)
        fitted = stackedRegressor.fit(xTU, yT)
        trainEnd = t.process_time()

        # predictions on the reduced training and validation data
        predTrain = fitted.predict(xTU)
        predValid = fitted.predict(xVU)

        rows.append({
            'Strategy': strat_name,
            'Train Time': trainEnd - trainStart,
            'Train RMSE': np.sqrt(mean_squared_error(yT, predTrain)),
            'Validation RMSE': np.sqrt(mean_squared_error(yV, predValid)),
            'Train MAE': mean_absolute_error(yT, predTrain),
            'Validation MAE': mean_absolute_error(yV, predValid),
            'Train MSLE': _msle_or_nan(yT, predTrain),
            'Validation MSLE': _msle_or_nan(yV, predValid),
            'Train R2': r2_score(yT, predTrain),
            'Validation R2': r2_score(yV, predValid)
        })

    # passing the column list keeps the column order (and the columns
    # themselves when `strats` is empty)
    dfResult = pd.DataFrame(rows, columns = resultCols)

    return dfResult

In [22]:
# Feature selection strategies to compare, keyed by the name shown in the results.
# `mode` and `param` are passed by keyword: they are keyword-only arguments in
# current scikit-learn releases.
featureSelectionStrats = {
    # keep the 20 features with the highest mutual information
    'K Best': GenericUnivariateSelect(mutual_info_regression, mode = 'k_best', param = 20),
    # keep the top 10 percent of features by mutual information
    'Percentile': GenericUnivariateSelect(mutual_info_regression, mode = 'percentile', param = 10),
    # recursive feature elimination with cross-validation
    'RFECV': RFECV(ElasticNet(alpha = 0.1, random_state = 777), scoring = 'neg_root_mean_squared_error'),
    # keep the features an elastic net considers important
    'From Model': SelectFromModel(ElasticNet(alpha = 0.1, random_state = 777))
}

In [23]:
# Evaluate every strategy on the split that has no missing indicators
featureSelectionResults = feature_selection_strategy(xTrain, yTrain, xValid, yValid, featureSelectionStrats)

In [24]:
# Lowest validation RMSE first
featureSelectionResults.sort_values(by = 'Validation RMSE')

Unnamed: 0,Strategy,Train Time,Train RMSE,Validation RMSE,Train MAE,Validation MAE,Train MSLE,Validation MSLE,Train R2,Validation R2
3,From Model,38.159638,13488.51601,22306.664073,9765.206278,15732.721621,0.006238,0.017694,0.97091,0.923511
2,RFECV,329.252956,15718.798764,23810.134495,11522.95646,16534.237215,0.009027,0.019851,0.960495,0.912852
1,Percentile,32.180553,13955.537533,24930.832585,10400.242533,17992.266918,0.007489,0.022507,0.968861,0.904456
0,K Best,23.637024,18765.340214,29607.551617,14103.259149,20297.998547,0.012625,0.028432,0.943698,0.865248


Looks like the best strategy is to select features based on a model of choice. The question then is which model works best for the selection. I will search for the best combination of everything.

## Hyperparameter tuning
* What is the best hyperparameter combination based on no missing indicators and model?
* Do we want to include missing indicators?
* Which model should be used for feature selection?

In [48]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [114]:
# Complete data/model pipeline
# put them all together
# categorize columns
IdCol = ['Id']
label = ['SalePrice']
num = [
    'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'LotArea', 'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath','BsmtHalfBath', 'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'OverallQual', 'OverallCond'
]
yrCols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
# Every column that is not the Id, the label or a numeric attribute is treated
# as categorical. The names come from the raw frame so that columns added to
# the working copy `houseTrain` by earlier cells are never mistaken for
# categorical attributes (they would also be absent from the test data).
cat = houseTrainRaw.columns.drop(IdCol + label + num)

# Numeric attributes pipeline: years -> ages, select, impute + scale
num_pipeline = Pipeline([
    ('years_to_ages', YearsToAges(yrCols)),
    ('add_missing_ind', AddMissingIndicator(False)),
    ('select_num_attr', NumCatSelector(num, False)),
    ('process_num_attr', ProcessNumAttr(False))
])

# categorical attributes pipeline: select, fill missing + one-hot encode
cat_pipeline = Pipeline([
    ('add_missing_ind', AddMissingIndicator(False)),
    ('select_cat_attr', NumCatSelector(cat, False)),
    ('process_cat_attr', ProcessCatAttr())
])

# Both pipelines receive the same input frame; their outputs are stacked side by side
data_transformation = FeatureUnion(
    transformer_list = [
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline)
    ]
)

# Preprocessing followed by model-based feature selection (no regressor yet)
full_pipeline = Pipeline([
    ('data_transformation', data_transformation),
    ('feature_selection', SelectFromModel(ElasticNet(alpha = 0.1, random_state = 777)))
])

In [36]:
# stacked regression model: base regressors blended by an elastic net
estimators = [
    ('linear_reg', LinearRegression()),
    ('ridge', Ridge(random_state = 777)),
    ('lasso', Lasso(random_state = 777)),
    ('linear_svr', LinearSVR(random_state = 777)),
    ('linear_kernel_svm', SVR(kernel = 'linear')),
    ('rf', RandomForestRegressor(random_state = 777)),
    ('adaboost', AdaBoostRegressor(random_state = 777)),
    ('gbr', GradientBoostingRegressor(random_state = 777))
]

stackedRegressor = StackingRegressor(estimators = estimators, final_estimator = ElasticNet(random_state = 777))

# Search space; keys are `<estimator name>__<parameter>` of the stacked regressor
modelParaGrid = {
    'ridge__alpha': [0.0001, 0.001, 0.01, 0.1, 1],
    'lasso__alpha': [0.0001, 0.001, 0.01, 0.1, 1],
    'linear_svr__C': [1, 10, 100, 1000],
    'linear_kernel_svm__C': [1, 10, 100, 1000],
    'rf__n_estimators': [100, 500, 1000],
    'rf__max_depth': [3, 5, 10],
    'adaboost__n_estimators': [50, 100, 500],
    'adaboost__learning_rate': [0.005, 0.01, 0.1, 1],
    'gbr__n_estimators': [100, 500, 1000],
    'gbr__learning_rate': [0.005, 0.01, 0.1, 1],
    'gbr__min_samples_leaf': [5, 10, 100],
    'final_estimator__alpha': [0.001, 0.01, 0.1, 1, 5]
}

# Only 5 random candidates are drawn from the space above; the seed makes the
# draw repeatable, in line with the seeded estimators used throughout.
randomSearchStackedReg = RandomizedSearchCV(
    stackedRegressor, modelParaGrid,
    cv = 5, scoring = 'neg_mean_squared_error', n_iter = 5,
    random_state = 777, verbose = 3, n_jobs = -1
)

In [37]:
# Preprocess and select features once on the full training frame, then run the
# randomized search on the result.
# NOTE(review): preprocessing and feature selection are fitted on all rows before
# the 5-fold cross-validation, so every fold shares those fitted statistics.
houseTrainFinal = full_pipeline.fit_transform(houseTrain, y)
randomSearchStackedReg.fit(houseTrainFinal, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 16.1min finished


RandomizedSearchCV(cv=5,
                   estimator=StackingRegressor(estimators=[('linear_reg',
                                                            LinearRegression()),
                                                           ('ridge',
                                                            Ridge(random_state=777)),
                                                           ('lasso',
                                                            Lasso(random_state=777)),
                                                           ('linear_svr',
                                                            LinearSVR(random_state=777)),
                                                           ('linear_kernel_svm',
                                                            SVR(kernel='linear')),
                                                           ('rf',
                                                            RandomForestRegressor(random_state=777)),
              

In [40]:
# Show the cross-validated RMSE of every sampled candidate next to its parameters
modelCVResults = randomSearchStackedReg.cv_results_
candidateScores = modelCVResults['mean_test_score']
candidateParams = modelCVResults['params']
for mean_score, params in zip(candidateScores, candidateParams):
    # the search scores with negative MSE, so flip the sign before the root
    rmse = np.sqrt(-mean_score)
    print(rmse, params)

28758.22893471727 {'ridge__alpha': 1, 'rf__n_estimators': 1000, 'rf__max_depth': 10, 'linear_svr__C': 10, 'linear_kernel_svm__C': 1000, 'lasso__alpha': 0.001, 'gbr__n_estimators': 1000, 'gbr__min_samples_leaf': 100, 'gbr__learning_rate': 0.1, 'final_estimator__alpha': 1, 'adaboost__n_estimators': 100, 'adaboost__learning_rate': 0.1}
27790.23359159506 {'ridge__alpha': 0.1, 'rf__n_estimators': 100, 'rf__max_depth': 5, 'linear_svr__C': 10, 'linear_kernel_svm__C': 10, 'lasso__alpha': 0.001, 'gbr__n_estimators': 1000, 'gbr__min_samples_leaf': 5, 'gbr__learning_rate': 0.1, 'final_estimator__alpha': 1, 'adaboost__n_estimators': 50, 'adaboost__learning_rate': 0.01}
31121.710497464373 {'ridge__alpha': 0.001, 'rf__n_estimators': 1000, 'rf__max_depth': 3, 'linear_svr__C': 10, 'linear_kernel_svm__C': 100, 'lasso__alpha': 0.1, 'gbr__n_estimators': 100, 'gbr__min_samples_leaf': 100, 'gbr__learning_rate': 0.01, 'final_estimator__alpha': 5, 'adaboost__n_estimators': 50, 'adaboost__learning_rate': 0.1}

In [41]:
# Display the stacked regressor with the best parameters found by the search
randomSearchStackedReg.best_estimator_

StackingRegressor(estimators=[('linear_reg', LinearRegression()),
                              ('ridge', Ridge(alpha=0.1, random_state=777)),
                              ('lasso', Lasso(alpha=0.001, random_state=777)),
                              ('linear_svr', LinearSVR(C=10, random_state=777)),
                              ('linear_kernel_svm', SVR(C=10, kernel='linear')),
                              ('rf',
                               RandomForestRegressor(max_depth=5,
                                                     random_state=777)),
                              ('adaboost',
                               AdaBoostRegressor(learning_rate=0.01,
                                                 random_state=777)),
                              ('gbr',
                               GradientBoostingRegressor(min_samples_leaf=5,
                                                         n_estimators=1000,
                                                         random_stat

In [118]:
# Final end-to-end pipeline: preprocessing, feature selection and the tuned stack
bestStackedRegressor = randomSearchStackedReg.best_estimator_

# Numeric attributes: years -> ages, select, median-impute, standardise
num_pipeline = Pipeline(steps = [
    ('years_to_ages', YearsToAges(yearCols = yrCols)),
    ('selector', NumCatSelector(attribute_names = num)),
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler()),
])

# Categorical attributes: select, fill missing with a constant, one-hot encode
cat_pipeline = Pipeline(steps = [
    ('selector', NumCatSelector(attribute_names = cat)),
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'No Feature')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore')),
])

# numeric and categorical outputs side by side
data_transformation = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

full_pipeline_updated = Pipeline(steps = [
    ('data_transformation', data_transformation),
    ('feature_selection', SelectFromModel(ElasticNet(alpha = 0.1, random_state = 777))),
    ('stack_regression', bestStackedRegressor),
])

## Submission!
* Fit the full pipeline on train data
* Apply it on test data
* Submit!

In [119]:
# Fit the complete pipeline (preprocessing, selection, regressor) on the training frame
model = full_pipeline_updated.fit(houseTrain, y)

In [122]:
# Keep the test Ids for the submission and predict with the fitted pipeline
testID = houseTest['Id']
testPred = model.predict(houseTest)

In [123]:
# Pair every test Id with its predicted price and write the submission file
submission = testID.to_frame().assign(SalePrice = testPred)
submission.to_csv('house_prices_submission_20200705.csv', index = False)

In [124]:
# Display the submission frame
submission

Unnamed: 0,Id,SalePrice
0,1461,122721.384134
1,1462,168987.608089
2,1463,185482.947471
3,1464,190947.822630
4,1465,181920.213896
...,...,...
1454,2915,78528.684406
1455,2916,81364.977248
1456,2917,166241.849165
1457,2918,107276.353183
