# Stacking Regression Models for Better Prediction

In [1]:
# import the necessary libraries
import os
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LinearRegression
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler

In [152]:
# navigate to project folder.
# Idempotent: the relative path only resolves from the starting directory, so
# skip the chdir when the kernel is already inside the project folder
# (re-running this cell would otherwise raise FileNotFoundError).
project_dir = os.path.join('ML_project', 'Stacked-Regression-Prediction')
if not os.path.normpath(os.getcwd()).endswith(project_dir):
    os.chdir(project_dir)

In [153]:
# read in functionality from the cleaning and feature engineering scripts.
# NOTE(review): star imports hide where names come from. Later cells call
# cleanmydata() and ftrengineer(), presumably defined in these two modules —
# confirm, then import them by name.
from cleaner import *
from feature_engineering import *

In [154]:
# load the cleaned train and test sets (cleanmydata returns them as a pair)
train, test = cleanmydata()

In [155]:
# run feature engineering on both sets; the results replace train and test
train, test = ftrengineer(train, test)

In [156]:
# features: everything except the sale price
X_train = train.drop(columns='SalePrice')
# target: natural log of the sale price
y_train = np.log(train['SalePrice'])

In [157]:
# numeric features: columns stored as int64 or float64
nums = [
    col for col, dtype in X_train.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]
# categorical features: columns stored as object
cats = [col for col, dtype in X_train.dtypes.items() if dtype == 'object']

## Create column transformers

In [158]:
# one hot encoder (linear models):
# one-hot encode the categorical columns, pass every other column through
# unchanged, and (sparse_threshold=0) always return a dense array
ohe_col_trans = make_column_transformer(
    (
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        cats,
    ),
    remainder='passthrough',
    sparse_threshold=0,
)

In [159]:
# ordinal encoder (tree models):
# integer-encode the categorical columns (categories unseen during fit are
# encoded as 999) and pass every other column through unchanged
ord_col_trans = make_column_transformer(
    (
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999),
        cats,
    ),
    remainder='passthrough',
)

## Models

#### Random Forest (RF)

In [185]:
# instantiate random forest regressor.
# random_state pins the bootstrap sampling and feature selection so the
# fitted forest (and every score derived from it) is reproducible across runs.
rfreg = RandomForestRegressor(
    n_estimators=800,
    max_depth=7,
    criterion='squared_error',
    random_state=42,
)

In [186]:
# pipeline: ordinal column transformer followed by the random forest
# (step names match the ones make_pipeline would generate)
rf_pipe = Pipeline(steps=[
    ('columntransformer', ord_col_trans),
    ('randomforestregressor', rfreg),
])

In [187]:
# sanity check: fit the pipeline, then score it on the same training data
# (this is an in-sample score, not a validation score)
rf_pipe.fit(X_train, y_train)
rf_pipe.score(X_train, y_train)

0.9476188495063259

#### Lasso

In [188]:
# lasso regression with regularisation strength alpha = 0.001
lasso = Lasso(alpha=0.001)

In [189]:
# pipeline: one hot column transformer -> standard scaling -> lasso
# (step names match the ones make_pipeline would generate)
lasso_pipe = Pipeline(steps=[
    ('columntransformer', ohe_col_trans),
    ('standardscaler', StandardScaler()),
    ('lasso', lasso),
])

In [190]:
# sanity check: fit the pipeline, then score it on the same training data
# (this is an in-sample score, not a validation score)
lasso_pipe.fit(X_train, y_train)
lasso_pipe.score(X_train, y_train)

0.9338799608805253

#### XGBoost (XGB)

In [194]:
# instantiate xgb regressor.
# The target is already log(SalePrice), so plain RMSE on it is the log-scale
# error of the price; 'rmsle' would apply a second logarithm to it.
# random_state is pinned for reproducibility.
xgbr = xgb.XGBRegressor(
    learning_rate=0.03,
    n_estimators=800,
    max_depth=3,
    eval_metric='rmse',
    random_state=42,
)

In [192]:
# pipeline: ordinal column transformer followed by the xgb regressor
# (step names match the ones make_pipeline would generate)
xgb_pipe = Pipeline(steps=[
    ('columntransformer', ord_col_trans),
    ('xgbregressor', xgbr),
])

In [193]:
# sanity check: fit the pipeline, then score it on the same training data
# (this is an in-sample score, not a validation score)
xgb_pipe.fit(X_train, y_train)
xgb_pipe.score(X_train, y_train)

0.9704818389948425

#### Stacking Regressor

In [195]:
# register each base-model pipeline under a short name
base_models = [
    ('lasso', lasso_pipe),
    ('xgb', xgb_pipe),
    ('rfr', rf_pipe),
]

# ridge regression is the meta model that combines the base predictions
meta_model = Ridge()

# stacked ensemble: base models feed the meta model, using 5-fold CV
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)

In [196]:
# fit the stacked ensemble on the full training set
stacking_model.fit(X=X_train, y=y_train)



StackingRegressor(cv=5,
                  estimators=[('lasso',
                               Pipeline(steps=[('columntransformer',
                                                ColumnTransformer(remainder='passthrough',
                                                                  sparse_threshold=0,
                                                                  transformers=[('onehotencoder',
                                                                                 OneHotEncoder(drop='first',
                                                                                               handle_unknown='ignore'),
                                                                                 ['MSSubClass',
                                                                                  'MSZoning',
                                                                                  'Street',
                                                                                

In [198]:
# score on the training data the model was fitted on (in-sample, not a validation score)
stacking_model.score(X=X_train, y=y_train)

0.9579442924155568

In [206]:
# predict on the test data; the model was trained on log(SalePrice), so
# exponentiate to get back to prices, and store them as a one-column frame
predictions = pd.DataFrame(
    np.exp(stacking_model.predict(test))[:, np.newaxis]
)



## Submission

In [200]:
# recreate Id column for submission: ids 1461..2919 (arange's stop is exclusive).
# Column labels are passed as a list: a set is unordered and is rejected as
# `columns` by recent pandas versions.
idnum_test = pd.DataFrame(
    np.arange(1461, 2920).reshape(-1, 1), columns=['Id'])

In [209]:
# place the Id column and the predictions side by side
submission = pd.concat(
    [idnum_test, predictions],
    axis='columns',
)

In [203]:
# the prediction column is still labelled 0; give it its submission name
submission = submission.rename(
    columns={0: "SalePrice"},
)

In [204]:
# write the submission to CSV without the DataFrame index
submission.to_csv(
    './my25_submission.csv',
    index=False,
)

In [205]:
# submit predictions directly to Kaggle via the kaggle command-line tool
# (presumably requires Kaggle API credentials to be configured — confirm).
# NOTE(review): "score: 957" in the message appears to be the training-set
# score printed earlier in the notebook, not a validation score — confirm.
! kaggle competitions submit -c\
house-prices-advanced-regression-techniques\
-f my25_submission.csv -m "new pipelines, cv=5, score: 957"

100%|██████████████████████████████████████| 33.7k/33.7k [00:00<00:00, 41.2kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques