# Imports

In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns',100)

In [3]:
from matplotlib import pyplot as plt
import seaborn as sns

In [4]:
from sklearn.model_selection import cross_validate, GridSearchCV

In [5]:
import statsmodels.api as sm

In [6]:
from sklearn.pipeline import make_pipeline

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
from sklearn.preprocessing import RobustScaler

In [31]:
from sklearn.model_selection import cross_validate

In [9]:
df_raw = pd.read_csv('train.csv')

Notes:
- Year Sold and Built not useful by itself, need to be changed into Age at Sale
- Year Sold minus Year Remodel gives time since last remodel
- MSSubClass will need some encoding (one-hot)
- MSZoning will need encoding
- Street can be one hot encoded
- Alley will be encoded
- LotShape encoded
- LandContour encoded
- Utilities encode

## Data Transforms

In [110]:
df = df_raw.copy()

In [111]:
df['AgeHouseAtSale'] = df['YrSold'] - df['YearBuilt']

df['YearsLastRemodelAtSale']  = df['YrSold'] - df['YearRemodAdd']

df['YearsGarageBuiltAtSale'] = df['YrSold'] - df['GarageYrBlt'] 

df.drop(columns=['Id'],inplace=True)

df['MSSubClass'] = df['MSSubClass'].astype(str)

In [112]:
df['YearsLastRemodelAtSale'] = np.maximum(df['YearsLastRemodelAtSale'],0)

In [114]:
for var in df.dtypes[df.dtypes == 'int64'].index:
    if df[var].isnull().sum()>0:
        df[var+'_isnull'] = df[var].isnull().astype(int)
        df[var]=df[var].fillna(df[var].mean())

for var in df.dtypes[df.dtypes == 'float64'].index:
    if df[var].isnull().sum()>0:
        df[var+'_isnull'] = df[var].isnull().astype(int)
        df[var]=df[var].fillna(df[var].mean())

df_dummies = pd.get_dummies(df,dummy_na=True,drop_first=False)

In [144]:
df_dummies.drop(columns=df_dummies.loc[:,df_dummies.nunique() == 1].columns, inplace=True)

In [146]:
X = df_dummies.drop(columns='SalePrice')

y = np.log(df_dummies['SalePrice'])

X['intercept'] = 1

## Looking at effects of Robust Scaler
Not a huge fan, less explainable and causes convergence issues for Lasso

In [40]:
standardscaler = RobustScaler()

In [41]:
standardscaler.fit(X)

RobustScaler()

In [42]:
test = standardscaler.transform(X)

In [43]:
test = pd.DataFrame(test,columns=X.columns)

In [47]:
test.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,AgeHouseAtSale,YearsLastRemodelAtSale,YearsGarageBuiltAtSale,LotFrontage_isnull,MasVnrArea_isnull,GarageYrBlt_isnull,YearsGarageBuiltAtSale_isnull,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,...,GarageFinish_Unf,GarageFinish_nan,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageQual_nan,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,GarageCond_nan,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PavedDrive_nan,PoolQC_Ex,PoolQC_Fa,PoolQC_Gd,PoolQC_nan,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_nan,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MiscFeature_nan,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan,intercept
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,-3.023076e-16,0.256504,0.049658,0.575342,-0.037656,-0.246872,0.631265,0.084436,46.549315,0.153402,0.131203,0.148506,0.476638,5.844521,0.079511,0.425342,0.057534,-0.434932,0.382877,-0.133562,0.046575,0.258904,-0.386986,2.405382e-15,-0.232877,-0.029068,0.560979,0.318533,21.95411,3.409589,15.060959,2.758904,43.489041,0.107306,-0.092123,0.033651,0.241892,9.923569e-18,0.177397,0.005479,0.055479,0.055479,0.059589,0.043151,0.006849,0.020548,0.367123,0.04726,0.00274,0.008219,...,0.414384,0.055479,0.002055,0.032877,0.009589,0.002055,-0.102055,0.055479,0.00137,0.023973,0.006164,0.004795,-0.091781,0.055479,0.061644,0.020548,-0.082192,0.0,0.00137,0.00137,0.002055,-0.004795,0.040411,0.036986,0.107534,0.007534,-0.192466,0.00137,0.00137,0.033562,0.000685,-0.036986,0.029452,0.00274,0.00137,0.006164,0.003425,0.003425,0.083562,0.002055,-0.132192,0.0,0.069178,0.00274,0.008219,0.013699,-0.179452,0.085616,0.0,0.0
std,1.159159,2.465728,0.691498,1.112799,0.656585,0.557984,1.099355,0.640362,161.319273,0.755328,0.873045,0.759132,0.599627,48.623081,0.811866,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,0.812697,0.644666,0.6152457,0.747315,0.88532,0.746064,0.974353,61.119149,29.317331,55.757415,40.177307,496.123024,0.901209,0.664048,0.657612,0.557855,0.6161716,0.382135,0.073846,0.228992,0.228992,0.236805,0.203266,0.082505,0.141914,0.482186,0.212268,0.052289,0.090317,...,0.492784,0.228992,0.045299,0.178375,0.097486,0.045299,0.302824,0.228992,0.036999,0.153016,0.078298,0.0691,0.288815,0.228992,0.24059,0.141914,0.274751,0.0,0.036999,0.036999,0.045299,0.0691,0.196989,0.188793,0.309897,0.086502,0.394372,0.036999,0.036999,0.18016,0.026171,0.188793,0.169128,0.052289,0.036999,0.078298,0.05844,0.05844,0.276824,0.045299,0.338815,0.0,0.253844,0.052289,0.090317,0.116277,0.383862,0.279893,0.0,0.0
min,-2.581577,-2.02038,-2.5,-4.0,-2.195652,-1.189189,0.0,-0.538435,0.0,-0.816239,-1.973134,-1.478645,0.0,0.0,-1.745848,0.0,0.0,-2.0,0.0,-3.0,-1.0,-2.0,-1.0,-2.012979,-2.0,-1.987578,0.0,-0.367647,0.0,0.0,0.0,0.0,0.0,-1.666667,-1.0,-0.76087,-0.405405,-0.7514364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
25%,-0.5289452,-0.475543,-0.5,0.0,-0.413043,-0.72973,0.0,-0.538435,0.0,-0.435043,-0.389552,-0.402553,0.0,0.0,-0.516802,0.0,0.0,-1.0,0.0,-1.0,0.0,-0.5,-1.0,-0.423235,-1.0,-0.602484,0.0,-0.367647,0.0,0.0,0.0,0.0,0.0,-0.333333,-0.5,-0.586957,-0.27027,-0.5719492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.4710548,0.524457,0.5,1.0,0.586957,0.27027,1.0,0.461565,0.0,0.564957,0.610448,0.597447,1.0,0.0,0.483198,1.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.576765,0.0,0.397516,1.0,0.632353,0.0,0.0,0.0,0.0,0.0,0.666667,0.5,0.413043,0.72973,0.4280508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.78684,50.831645,2.0,4.0,0.804348,0.432432,9.741248,7.385749,1474.0,3.176923,10.18607,7.079038,2.836538,572.0,6.455002,3.0,2.0,1.0,2.0,5.0,2.0,4.0,2.0,0.8075343,2.0,3.884058,5.10119,7.676471,552.0,508.0,480.0,738.0,15500.0,2.0,1.0,2.195652,1.243243,1.992153,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0


## Creating own scaler

Based on taking the log first (adding a 1 so that 0s become 1s), and then centering and dividing out the standard deviation

The log standard is incredibly promising for Elastic Net. In fact, it's almost the best performing model yet (only beaten by the stacking regression)

Bayesian Ridge also showed improvement

In [156]:
from sklearn.base import BaseEstimator, TransformerMixin

class LogStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        self.init = True
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        temp = np.log(X+1)
        X_scaled = (temp - temp.mean())/temp.std()
        return X_scaled.fillna(0)


In [157]:
test_scaler = LogStandardScaler()

In [158]:
test_scaler.fit(X)

LogStandardScaler()

In [159]:
test_scaler.transform(X).describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,AgeHouseAtSale,YearsLastRemodelAtSale,YearsGarageBuiltAtSale,LotFrontage_isnull,MasVnrArea_isnull,GarageYrBlt_isnull,YearsGarageBuiltAtSale_isnull,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,...,GarageType_nan,GarageFinish_Fin,GarageFinish_RFn,GarageFinish_Unf,GarageFinish_nan,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageQual_nan,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,GarageCond_nan,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PoolQC_Ex,PoolQC_Fa,PoolQC_Gd,PoolQC_nan,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_nan,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MiscFeature_nan,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,intercept
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,2.3809e-14,4.933401e-15,-6.560004e-14,-4.113521e-14,-9.143495e-14,2.242042e-14,-8.323631e-16,1.150678e-15,1.289684e-16,2.523058e-15,-4.532105e-15,-3.216035e-14,1.963422e-16,-1.847457e-16,-1.521705e-14,-1.687539e-15,1.558495e-16,-8.796312e-15,-1.612409e-15,9.896041e-14,2.716214e-13,-6.905834e-14,4.611836e-15,-7.393054e-13,-2.324077e-14,5.914067e-15,1.171513e-15,-1.29706e-15,-1.307554e-16,-2.865288e-16,4.4865180000000004e-18,9.509707e-16,3.201397e-16,-3.141897e-14,1.023335e-11,-3.168318e-15,-1.922207e-15,7.846843e-16,-9.236903e-16,-6.872357000000001e-17,-2.002584e-16,-2.002584e-16,-2.5e-05,-2.5e-05,-5e-06,3.1e-05,-0.00027,-5.1e-05,-3e-06,-5e-06,...,9.5e-05,0.000175,0.000292,-0.000108,9.5e-05,-2e-06,-1e-05,5.1e-05,-2e-06,0.000191,9.5e-05,-2e-06,4.5e-05,-3.9e-05,3.6e-05,-0.000802,9.5e-05,-8.3e-05,3.1e-05,0.000738,-2e-06,-2e-06,-2e-06,0.001954,3.1e-05,7.7e-05,-0.000117,2.7e-05,-0.000538,-2e-06,-2e-06,5e-06,-1e-06,-0.001942,3.9e-05,-3e-06,-2e-06,-3.9e-05,-3e-06,-3e-06,8.3e-05,-2e-06,0.000598,0.000126,-3e-06,-5e-06,-6e-06,0.000327,-0.000125,0.9996575
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.000039,0.99961,1.000273,0.999467,0.999567,0.999735,0.999997,1.000004,...,1.000519,0.99936,0.999892,1.000362,1.000519,1.000183,1.000168,1.000403,1.000183,1.000024,1.000519,0.999544,1.000152,1.000524,0.999969,1.000287,1.000519,1.000403,0.999467,1.000375,0.999544,0.999544,1.000183,0.999969,0.999906,0.999388,1.000123,1.000166,1.000015,0.999544,0.999544,1.0003,1.000184,1.00032,1.000152,0.999997,0.999544,1.000524,0.999822,0.999822,1.00023,1.000183,1.000244,0.999884,0.999997,1.000004,0.999926,1.000154,1.000497,5.997259e-15
min,-3.577797,-3.749892,-6.174595,-6.984065,-3.348087,-1.694032,-0.8156799,-1.413655,-0.3552207,-3.046557,-5.892019,-3.76246,-0.8698193,-0.1335566,-4.363541,-0.8329399,-0.24284,-4.110643,-0.7693754,-5.77599,-7.972346,-4.181158,-1.013214,-3.318435,-3.017973,-3.991535,-0.9463772,-1.072549,-0.4041057,-0.1284774,-0.2926691,-0.06936881,-0.1904162,-2.861496,-1.367517,-2.316576,-1.995653,-2.298676,-0.4642261,-0.07420154,-0.2422767,-0.2422767,-0.251673,-0.212229,-0.083044,-0.144684,-0.761314,-0.222637,-0.052399,-0.091009,...,-0.242308,-0.562911,-0.637034,-0.841315,-0.242308,-0.045372,-0.184353,-0.098351,-0.045372,-2.965116,-0.242308,-0.03701,-0.156646,-0.07881,-0.069347,-3.146341,-0.242308,-0.256406,-0.144684,-3.341026,-0.03701,-0.03701,-0.045372,-14.4,-0.205094,-0.195713,-0.347159,-0.087087,-2.048214,-0.03701,-0.03701,-0.186339,-0.026177,-5.104478,-0.174128,-0.052399,-0.03701,-0.07881,-0.058594,-0.058594,-0.301845,-0.045372,-2.561331,-0.272365,-0.052399,-0.091009,-0.117808,-2.137615,-0.306167,0.9996575
25%,-0.3374865,-0.3499788,-0.7354547,-0.4602502,-0.563763,-0.8617529,-0.8156799,-1.413655,-0.3552207,-0.1276856,-0.061114,-0.7092162,-0.8698193,-0.1335566,-0.7143553,-0.8329399,-0.24284,-1.006267,-0.7693754,-0.9974921,-0.2075362,-0.9458646,-1.013214,-0.6818393,-0.8671381,0.0051102,-0.9463772,-1.072549,-0.4041057,-0.1284774,-0.2926691,-0.06936881,-0.1904162,-0.2798051,-0.6140419,-0.6477366,-0.7474594,-0.6390515,-0.4642261,-0.07420154,-0.2422767,-0.2422767,-0.251673,-0.212229,-0.083044,-0.144684,-0.761314,-0.222637,-0.052399,-0.091009,...,-0.242308,-0.562911,-0.637034,-0.841315,-0.242308,-0.045372,-0.184353,-0.098351,-0.045372,0.337209,-0.242308,-0.03701,-0.156646,-0.07881,-0.069347,0.317073,-0.242308,-0.256406,-0.144684,0.3,-0.03701,-0.03701,-0.045372,0.071338,-0.205094,-0.195713,-0.347159,-0.087087,0.4875,-0.03701,-0.03701,-0.186339,-0.026177,0.19403,-0.174128,-0.052399,-0.03701,-0.07881,-0.058594,-0.058594,-0.301845,-0.045372,0.390852,-0.272365,-0.052399,-0.091009,-0.117808,0.46789,-0.306167,0.9996575
50%,0.1470817,0.08875776,0.02773275,-0.4602502,0.06470948,0.4454383,-0.8156799,0.5755955,-0.3552207,0.2816979,0.1306323,-0.05152768,-0.8698193,-0.1335566,0.06330204,-0.8329399,-0.24284,0.8096768,-0.7693754,0.2538029,-0.2075362,-0.2263599,0.743101,0.006064994,0.3910196,0.2527021,-0.9463772,0.441164,-0.4041057,-0.1284774,-0.2926691,-0.06936881,-0.1904162,0.08244225,0.1390583,0.4051836,0.1045654,0.4239562,-0.4642261,-0.07420154,-0.2422767,-0.2422767,-0.251673,-0.212229,-0.083044,-0.144684,-0.761314,-0.222637,-0.052399,-0.091009,...,-0.242308,-0.562911,-0.637034,-0.841315,-0.242308,-0.045372,-0.184353,-0.098351,-0.045372,0.337209,-0.242308,-0.03701,-0.156646,-0.07881,-0.069347,0.317073,-0.242308,-0.256406,-0.144684,0.3,-0.03701,-0.03701,-0.045372,0.071338,-0.205094,-0.195713,-0.347159,-0.087087,0.4875,-0.03701,-0.03701,-0.186339,-0.026177,0.19403,-0.174128,-0.052399,-0.03701,-0.07881,-0.058594,-0.058594,-0.301845,-0.045372,0.390852,-0.272365,-0.052399,-0.091009,-0.117808,0.46789,-0.306167,0.9996575
75%,0.5240474,0.4793668,0.6888358,0.4551323,0.9474735,0.9250985,1.125857,0.7821056,-0.3552207,0.5649459,0.3656961,0.7252719,1.131714,-0.1335566,0.6438339,1.148394,-0.24284,0.8096768,1.258149,0.2538029,-0.2075362,0.3969034,0.743101,0.934331,0.3910196,0.3777606,1.02937,0.8946186,-0.4041057,-0.1284774,-0.2926691,-0.06936881,-0.1904162,0.6730201,0.8917837,0.7270795,0.903083,0.7741682,-0.4642261,-0.07420154,-0.2422767,-0.2422767,-0.251673,-0.212229,-0.083044,-0.144684,1.311679,-0.222637,-0.052399,-0.091009,...,-0.242308,-0.562911,1.567935,1.188706,-0.242308,-0.045372,-0.184353,-0.098351,-0.045372,0.337209,-0.242308,-0.03701,-0.156646,-0.07881,-0.069347,0.317073,-0.242308,-0.256406,-0.144684,0.3,-0.03701,-0.03701,-0.045372,0.071338,-0.205094,-0.195713,-0.347159,-0.087087,0.4875,-0.03701,-0.03701,-0.186339,-0.026177,0.19403,-0.174128,-0.052399,-0.03701,-0.07881,-0.058594,-0.058594,-0.301845,-0.045372,0.390852,-0.272365,-0.052399,-0.091009,-0.117808,0.46789,-0.306167,0.9996575
max,4.868581,6.124392,2.265474,2.573151,1.271404,1.211748,1.989122,1.473502,3.599379,1.137121,1.717085,4.553349,1.448021,8.364276,4.109352,3.129729,6.513721,2.098109,2.444175,3.781005,7.557274,3.33096,2.499416,1.302821,1.976111,0.9961745,1.655115,1.857349,3.252061,9.217257,4.108624,14.98768,7.679426,1.537157,1.644135,1.420255,1.19252,1.438183,2.152647,13.46758,4.124686,4.124686,3.971375,4.705519,12.040755,6.898094,1.311679,4.487148,19.07218,10.981116,...,4.126923,1.772615,1.567935,1.188706,4.126923,22.034317,5.422754,10.163618,22.034317,0.337209,4.126923,26.978448,6.379603,12.699592,14.40199,0.317073,4.126923,3.90172,6.898094,0.3,26.978448,26.978448,22.034317,0.071338,4.870867,5.097856,2.880114,11.475254,0.4875,26.978448,26.978448,5.365958,38.190812,0.19403,5.739459,19.07218,26.978448,12.699592,17.04984,17.04984,3.311387,22.034317,0.390852,3.666609,19.07218,10.981116,8.481737,0.46789,3.268408,0.9996575


# Training

## SKLEARN

In [151]:
from sklearn.linear_model import LassoCV

In [176]:
lasso_cv = LassoCV(cv=5, random_state=0,max_iter=2000)

In [177]:
lasso_pipeline = make_pipeline(LogStandardScaler(),lasso_cv)

In [178]:
lasso_pipeline.fit(X=X,y=y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('lassocv', LassoCV(cv=5, max_iter=2000, random_state=0))])

In [155]:
lasso_cv.alpha_

0.003947179385645563

### Elastic Net
Looks like the optimal is just an L1 Lasso regression

In [160]:
from sklearn.linear_model import ElasticNetCV

In [180]:
# elastic_net_cv = ElasticNetCV(l1_ratio=[0,.5,1]
#                              ,alphas=[0.1,1,10,100]
#                              )
elastic_net_cv = ElasticNetCV(cv=5, random_state=0, l1_ratio=[0.5,0.7,0.8,0.9,1],max_iter=2000)

In [181]:
elastic_net_pipeline = make_pipeline(LogStandardScaler(),elastic_net_cv)

In [182]:
elastic_net_pipeline.fit(X=X,y=y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('elasticnetcv',
                 ElasticNetCV(cv=5, l1_ratio=[0.5, 0.7, 0.8, 0.9, 1],
                              max_iter=2000, random_state=0))])

In [183]:
elastic_net_cv.l1_ratio_

0.5

In [184]:
elastic_net_cv.alpha_

0.007894358771291127

In [185]:
test = pd.DataFrame(elastic_net_cv.coef_,index=X.columns)

### Decision Tree

In [27]:
params_dt={'splitter':['random', 'best']
        ,'min_samples_split':[2, 3, 4, 5, 6, 8, 10]
        ,'min_samples_leaf':[0.01, 0.02, 0.03, 0.04]
        ,'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2]
        ,'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None]
        ,'max_features':[ 0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
        ,'max_depth':[None, 2,4,6,8]
        ,'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]}
        

In [49]:
from sklearn.tree import DecisionTreeRegressor

In [50]:
from sklearn.model_selection import RandomizedSearchCV

In [51]:
from skopt import BayesSearchCV

In [31]:
decision_tree = DecisionTreeRegressor()

In [32]:
bayes_search_dt = BayesSearchCV(decision_tree,params_dt)

In [218]:
decision_tree = DecisionTreeRegressor(max_depth=6, max_features=0.85, max_leaf_nodes=40,
                      min_samples_leaf=0.02, min_samples_split=6,
                      min_weight_fraction_leaf=0.0075)

In [219]:
decision_tree_pipeline = make_pipeline(LogStandardScaler(),decision_tree)

In [220]:
decision_tree_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(max_depth=6, max_features=0.85,
                                       max_leaf_nodes=40, min_samples_leaf=0.02,
                                       min_samples_split=6,
                                       min_weight_fraction_leaf=0.0075))])

## XGBoost

In [221]:
import xgboost

In [222]:
from importlib import reload

In [223]:
reload(xgboost)

<module 'xgboost' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\xgboost\\__init__.py'>

In [224]:
xgboost.__version__

'1.6.2'

In [225]:
xg_boost = xgboost.XGBRegressor()

In [226]:
xg_boost_pipeline = make_pipeline(LogStandardScaler(),xg_boost)

In [238]:
xg_boost_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('xgbregressor',
                 XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              gamma=0, gpu_id=-1, grow_policy='depthwise',
                              importance_type=None, interaction_constraints='',
                              learning_rate=0.300000012, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=6, max_leaves=0, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=100, n_jobs=0, num_parallel_tree=1,
                              predictor='auto', random_state=0, reg_alpha=0,
       

## Random Forest

In [228]:
from sklearn.ensemble import RandomForestRegressor

In [229]:
random_forest_initial = RandomForestRegressor(n_estimators=200,oob_score=True)

In [230]:
random_forest_pipeline = make_pipeline(LogStandardScaler(),random_forest_initial)

In [231]:
random_forest_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(n_estimators=200, oob_score=True))])

In [44]:
random_forest = RandomForestRegressor(max_depth=20, max_features=0.55, min_samples_leaf=6,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.005,
                      n_estimators=200, oob_score=True)

NameError: name 'RandomForestRegressor' is not defined

## Adaboost

In [240]:
from sklearn.ensemble import AdaBoostRegressor

In [241]:
adaboost = AdaBoostRegressor()

In [242]:
adaboost_pipeline = make_pipeline(LogStandardScaler(),adaboost)

In [243]:
adaboost_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('adaboostregressor', AdaBoostRegressor())])

## Light GBM

In [247]:
from lightgbm import LGBMRegressor

In [248]:
light_gbm = LGBMRegressor()

light_gbm_pipeline = make_pipeline(LogStandardScaler(),light_gbm)

light_gbm_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('lgbmregressor', LGBMRegressor())])

## GradientBoostingRegressor

In [250]:
from sklearn.ensemble import GradientBoostingRegressor

In [251]:
gbr = GradientBoostingRegressor()

gbr_pipeline = make_pipeline(LogStandardScaler(),gbr)

gbr.fit(X,y)

GradientBoostingRegressor()

## Stochastic Gradient Descent Regression

In [255]:
from sklearn.linear_model import SGDRegressor

In [256]:
SGDR = SGDRegressor()

SGDR_pipeline = make_pipeline(LogStandardScaler(),SGDR)

SGDR_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('sgdregressor', SGDRegressor())])

## from sklearn.svm import SVR
Support Vector Machine

In [257]:
from sklearn.svm import SVR

In [258]:
svr = SVR()

svr_pipeline = make_pipeline(LogStandardScaler(),svr)

svr_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()), ('svr', SVR())])

## from sklearn.linear_model import BayesianRidge

In [259]:
from sklearn.linear_model import BayesianRidge

In [260]:
bayesianr = BayesianRidge()

bayesianr_pipeline = make_pipeline(LogStandardScaler(),bayesianr)

bayesianr_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('bayesianridge', BayesianRidge())])

## from sklearn.kernel_ridge import KernelRidge

In [261]:
from sklearn.kernel_ridge import KernelRidge

In [262]:
kernelr = KernelRidge()

kernelr_pipeline = make_pipeline(LogStandardScaler(),kernelr)

kernelr_pipeline.fit(X,y)

Pipeline(steps=[('logstandardscaler', LogStandardScaler()),
                ('kernelridge', KernelRidge())])

## Ensemble

In [264]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [265]:
voting_reg = VotingRegressor([('elastic_net',elastic_net_pipeline)
                              ,('rf',random_forest_pipeline)
                              ,('xg_boost',xg_boost_pipeline)
                             # ,('adaboost',adaboost_pipeline)
                                  ,('lightgbm',light_gbm_pipeline)
                                  ,('gbr',gbr_pipeline)
                                  ,('bayesianRidge',bayesianr_pipeline)
                                  # ,('kernelRidge',kernelr)
                             ])

In [266]:
voting_reg.fit(X,y)

VotingRegressor(estimators=[('elastic_net',
                             Pipeline(steps=[('logstandardscaler',
                                              LogStandardScaler()),
                                             ('elasticnetcv',
                                              ElasticNetCV(cv=5,
                                                           l1_ratio=[0.5, 0.7,
                                                                     0.8, 0.9,
                                                                     1],
                                                           max_iter=2000,
                                                           random_state=0))])),
                            ('rf',
                             Pipeline(steps=[('logstandardscaler',
                                              LogStandardScaler()),
                                             ('randomforestregressor',
                                              RandomForestRegressor(

In [274]:
stacking_reg = StackingRegressor([('elastic_net',elastic_net_pipeline)
                              ,('rf',random_forest_pipeline)
                              ,('xg_boost',xg_boost_pipeline)
                             # ,('adaboost',adaboost_pipeline)
                                  ,('lightgbm',light_gbm_pipeline)
                                  ,('gbr',gbr_pipeline)
                                  ,('bayesianRidge',bayesianr_pipeline)
                                  # ,('kernelRidge',kernelr)
                             ])

In [275]:
stacking_reg.fit(X,y)

StackingRegressor(estimators=[('elastic_net',
                               Pipeline(steps=[('logstandardscaler',
                                                LogStandardScaler()),
                                               ('elasticnetcv',
                                                ElasticNetCV(cv=5,
                                                             l1_ratio=[0.5, 0.7,
                                                                       0.8, 0.9,
                                                                       1],
                                                             max_iter=2000,
                                                             random_state=0))])),
                              ('rf',
                               Pipeline(steps=[('logstandardscaler',
                                                LogStandardScaler()),
                                               ('randomforestregressor',
                                        

## Cross Val Score

In [186]:
val_results = cross_validate(elastic_net_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.015875,0.003899,-0.020499,-0.018528,-0.015914,-0.013897,-0.010538


In [179]:
val_results = cross_validate(lasso_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.01589,0.003869,-0.020443,-0.018527,-0.015996,-0.013912,-0.010574


In [234]:
val_results = cross_validate(decision_tree_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.037435,0.00513,-0.042783,-0.040168,-0.039812,-0.034313,-0.0301


In [235]:
val_results = cross_validate(xg_boost_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020749,0.002079,-0.024063,-0.021163,-0.020198,-0.019794,-0.01853


In [244]:
val_results = cross_validate(random_forest_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.02076,0.002691,-0.024336,-0.022615,-0.020308,-0.018525,-0.018014


In [202]:
val_results = cross_validate(random_forest_initial,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020326,0.002785,-0.023262,-0.022594,-0.020544,-0.01874,-0.016492


In [245]:
val_results = cross_validate(adaboost_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.031226,0.003267,-0.034848,-0.033666,-0.030606,-0.030557,-0.026451


In [252]:
val_results = cross_validate(light_gbm_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.017907,0.002759,-0.022168,-0.01865,-0.017702,-0.015801,-0.015214


In [253]:
val_results = cross_validate(gbr_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.016872,0.002406,-0.020477,-0.016831,-0.016738,-0.016614,-0.013699


In [267]:
val_results = cross_validate(SGDR,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-1.904125e+33,1.03194e+33,-3.69856e+33,-1.823122e+33,-1.471204e+33,-1.377371e+33,-1.150367e+33


In [268]:
val_results = cross_validate(svr,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.041612,0.004728,-0.045493,-0.045119,-0.042029,-0.041672,-0.033746


In [269]:
val_results = cross_validate(bayesianr_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.017763,0.00435,-0.022834,-0.02048,-0.018978,-0.01382,-0.012706


In [271]:
val_results = cross_validate(kernelr_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-577.065057,0.720064,-577.531635,-577.427471,-577.374909,-577.195793,-575.795477


In [272]:
val_results = cross_validate(voting_reg,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.014984,0.002678,-0.018672,-0.016118,-0.015262,-0.013029,-0.011838


In [91]:
val_results = cross_validate(stacking_reg,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.015611,0.003662,-0.019202,-0.019197,-0.015881,-0.012115,-0.011662


Scaling seems to have really benefitted the linear regression Elastic Net, but did not affect the tree based formulas much. 

Somehow kernalr is much worse now. 

# Prediction

In [208]:
df_predict_raw = pd.read_csv('test.csv')

In [209]:
df_predict = df_predict_raw.copy()

In [210]:
df_predict['AgeHouseAtSale'] = df_predict['YrSold'] - df_predict['YearBuilt']

df_predict['YearsLastRemodelAtSale']  = df_predict['YrSold'] - df_predict['YearRemodAdd']

df_predict['YearsGarageBuiltAtSale'] = df_predict['YrSold'] - df_predict['GarageYrBlt'] 

df_predict.drop(columns=['Id'],inplace=True)

df_predict['MSSubClass'] = df_predict['MSSubClass'].astype(str)

In [211]:
df_predict['YearsLastRemodelAtSale'] = np.maximum(df_predict['YearsLastRemodelAtSale'],0)
df_predict['AgeHouseAtSale'] = np.maximum(df_predict['AgeHouseAtSale'],0)
df_predict['YearsGarageBuiltAtSale'] = np.maximum(df_predict['YearsGarageBuiltAtSale'],0)

In [212]:
for var in df_predict.dtypes[df_predict.dtypes == 'int64'].index:
    if df_predict[var].isnull().sum()>0:
        df_predict[var+'_isnull'] = df_predict[var].isnull().astype(int)
        df_predict[var]=df_predict[var].fillna(df_predict[var].mean())

for var in df_predict.dtypes[df_predict.dtypes == 'float64'].index:
    if df_predict[var].isnull().sum()>0:
        df_predict[var+'_isnull'] = df_predict[var].isnull().astype(int)
        df_predict[var]=df_predict[var].fillna(df_predict[var].mean())

df_predict_dummies = pd.get_dummies(df_predict,dummy_na=True,drop_first=False)

In [213]:
_, df_predict_dummies = X.align(df_predict_dummies, axis=1, fill_value=0)

In [214]:
df_predict_dummies.drop(columns=df_predict_dummies.columns[~df_predict_dummies.columns.isin(X.columns)],inplace=True)

In [215]:
df_predict_dummies['intercept']=1

In [216]:
df_predict_dummies.describe()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,AgeHouseAtSale,Alley_Grvl,Alley_Pave,Alley_nan,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_Po,BsmtCond_TA,BsmtCond_nan,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_nan,BsmtFinSF1,BsmtFinSF2,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType1_nan,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,BsmtFinType2_nan,BsmtFullBath,BsmtHalfBath,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtQual_nan,BsmtUnfSF,CentralAir_N,CentralAir_Y,Condition1_Artery,...,PavedDrive_Y,PoolArea,PoolQC_Ex,PoolQC_Fa,PoolQC_Gd,PoolQC_nan,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities_AllPub,Utilities_NoSeWa,WoodDeckSF,YearBuilt,YearRemodAdd,YearsGarageBuiltAtSale,YearsGarageBuiltAtSale_isnull,YearsLastRemodelAtSale,YrSold,intercept
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,1156.534613,325.967786,1.79438,36.412611,0.047978,0.02536,0.926662,2.85401,0.825908,0.021247,0.039068,0.036326,0.07745,0.040439,0.039068,0.002056,0.887594,0.030843,0.135024,0.097327,0.085675,0.651816,0.030158,439.203704,52.619342,0.143249,0.082934,0.295408,0.054832,0.106237,0.288554,0.028787,0.022618,0.023989,0.013708,0.028101,0.034955,0.847841,0.028787,0.434454,0.065202,0.0939,0.036326,0.405072,0.434544,0.030158,554.294925,0.069225,0.930775,0.030158,...,0.891707,1.744345,0.001371,0.0,0.000685,0.997944,0.0,0.988348,0.0,0.0,0.0,0.008225,0.002742,0.000685,0.004798,0.801234,0.007539,0.181631,0.002742,0.002056,0.061001,0.005483,0.008225,0.01782,0.825223,0.082248,0.030158,0.005483,0.002056,0.011652,0.002742,0.002056,0.080192,0.002742,0.862234,17.064428,0.004112,0.995888,6.385195,1046.11797,0.998629,0.0,93.174777,1971.357779,1983.662783,30.191166,0.053461,24.108979,2007.769705,1.0
std,398.16582,420.610226,20.207842,30.431416,0.213793,0.157269,0.26078,0.829788,0.379318,0.144258,0.193823,0.187165,0.267396,0.197053,0.193823,0.045314,0.315973,0.172952,0.341866,0.296504,0.27998,0.476558,0.17108,455.111888,176.693301,0.350447,0.275876,0.456382,0.22773,0.308247,0.453245,0.167264,0.148734,0.153067,0.116316,0.165319,0.18373,0.359298,0.167264,0.530283,0.252295,0.291789,0.187165,0.491074,0.495867,0.17108,437.110508,0.253924,0.253924,0.17108,...,0.310857,30.491646,0.037012,0.0,0.02618,0.045314,0.0,0.10735,0.0,0.0,0.0,0.090348,0.052306,0.02618,0.069124,0.399209,0.086531,0.385673,0.052306,0.045314,0.239414,0.073871,0.090348,0.132344,0.379907,0.274837,0.17108,0.073871,0.045314,0.10735,0.052306,0.045314,0.271683,0.052306,0.344772,56.609763,0.064018,0.064018,1.508895,442.746712,0.037012,0.0,127.744882,30.390071,21.130467,25.052613,0.225029,21.129454,1.30174,0.0
min,407.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1879.0,1950.0,0.0,0.0,0.0,2006.0,1.0
25%,873.5,0.0,0.0,7.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,219.5,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,784.0,1.0,0.0,0.0,1953.0,1963.0,7.0,0.0,4.0,2007.0,1.0
50%,1079.0,0.0,0.0,34.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,351.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,460.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6.0,988.0,1.0,0.0,0.0,1973.0,1992.0,30.191166,0.0,16.0,2008.0,1.0
75%,1382.5,676.0,0.0,55.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,752.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,797.5,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,7.0,1304.0,1.0,0.0,168.0,2001.0,2004.0,48.0,0.0,44.0,2009.0,1.0
max,5095.0,1862.0,360.0,129.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4010.0,1526.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,2140.0,1.0,1.0,1.0,...,1.0,800.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,576.0,1.0,1.0,15.0,5095.0,1.0,0.0,1424.0,2010.0,2010.0,114.0,1.0,60.0,2010.0,1.0


## Predictions

### Statsmodel low regularization

In [34]:
df_predict_raw['SalePrice'] = np.exp(results.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_initial.csv',index=False)

### Elastic Net

In [217]:
df_predict_raw['SalePrice'] =  np.exp(elastic_net_pipeline.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_elasticnet_log_standardized.csv',index=False)

### Decision Tree

In [236]:
df_predict_raw['SalePrice'] =  np.exp(decision_tree_pipeline.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_decisiontree_logstandard.csv',index=False)

### XG Boost

In [239]:
df_predict_raw['SalePrice'] =  np.exp(xg_boost_pipeline.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_xgboost_logstandard.csv',index=False)

### Random Forest

In [198]:
df_predict_raw['SalePrice'] =  np.exp(random_forest.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_rf_cv.csv',index=False)

In [201]:
df_predict_raw['SalePrice'] =  np.exp(random_forest_initial.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_rf_logstandard.csv',index=False)

### Light GBM

In [278]:
df_predict_raw['SalePrice'] =  np.exp(light_gbm.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_light_gbm.csv',index=False)

### Gradient Boosting Regression

In [282]:
df_predict_raw['SalePrice'] =  np.exp(gbr.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_gbr.csv',index=False)

### Voting Regression

In [273]:
df_predict_raw['SalePrice'] =  np.exp(voting_reg.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_voting_reg_standardized.csv',index=False)

### Stacking Regression

In [279]:
df_predict_raw['SalePrice'] =  np.exp(stacking_reg.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_stacking_reg.csv',index=False)