# Imports

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns',100)

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import cross_validate, GridSearchCV

In [4]:
import statsmodels.api as sm

In [5]:
from sklearn.pipeline import make_pipeline

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
df_raw = pd.read_csv('train.csv')

Notes:
- Year Sold and Built not useful by itself, need to be changed into Age at Sale
- Year Sold minus Year Remodel gives time since last remodel
- MSSubClass will need some encoding (one-hot)
- MSZoning will need encoding
- Street can be one hot encoded
- Alley will be encoded
- LotShape encoded
- LandContour encoded
- Utilities encode

## Data Transforms

In [44]:
df = df_raw.copy()

In [45]:
df['AgeHouseAtSale'] = df['YrSold'] - df['YearBuilt']

df['YearsLastRemodelAtSale']  = df['YrSold'] - df['YearRemodAdd']

df['YearsGarageBuiltAtSale'] = df['YrSold'] - df['GarageYrBlt'] 

df.drop(columns=['Id'],inplace=True)

df['MSSubClass'] = df['MSSubClass'].astype(str)

In [46]:
for var in df.dtypes[df.dtypes == 'int64'].index:
    if df[var].isnull().sum()>0:
        df[var+'_isnull'] = df[var].isnull().astype(int)
        df[var]=df[var].fillna(df[var].mean())

for var in df.dtypes[df.dtypes == 'float64'].index:
    if df[var].isnull().sum()>0:
        df[var+'_isnull'] = df[var].isnull().astype(int)
        df[var]=df[var].fillna(df[var].mean())

df_dummies = pd.get_dummies(df,dummy_na=True,drop_first=False)

In [47]:
X = df_dummies.drop(columns='SalePrice')

y = np.log(df_dummies['SalePrice'])

X['intercept'] = 1

In [15]:
standardscaler = StandardScaler()

In [16]:
standardscaler.fit(X)

StandardScaler()

In [49]:
test = standardscaler.transform(X)

In [50]:
test = pd.DataFrame(test,columns=X.columns)

In [51]:
test.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,AgeHouseAtSale,YearsLastRemodelAtSale,YearsGarageBuiltAtSale,LotFrontage_isnull,MasVnrArea_isnull,GarageYrBlt_isnull,YearsGarageBuiltAtSale_isnull,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,...,GarageFinish_Unf,GarageFinish_nan,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageQual_nan,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,GarageCond_nan,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PavedDrive_nan,PoolQC_Ex,PoolQC_Fa,PoolQC_Gd,PoolQC_nan,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_nan,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MiscFeature_nan,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan,intercept
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,3.410514e-16,-4.2027830000000004e-17,-7.680310000000001e-17,3.692632e-16,1.032983e-15,4.518912e-15,1.630355e-16,1.5056450000000003e-17,1.653168e-16,-7.376139000000001e-17,2.063038e-16,7.686013000000001e-17,-3.269835e-17,1.920268e-16,-1.446332e-16,4.106304e-17,9.999611e-18,2.770995e-16,-3.0112900000000006e-17,-1.108892e-16,2.237936e-16,4.387662e-17,2.701036e-16,3.937033e-15,2.5854510000000002e-17,-2.022735e-17,2.190029e-16,3.357284e-17,1.379034e-16,-4.005738e-16,1.199193e-16,-7.945889e-16,2.409602e-16,-6.395189e-17,3.566101e-14,6.52256e-17,-1.4143940000000003e-17,-1.352039e-16,-6.668942000000001e-17,-5.0036080000000005e-17,2.045548e-17,2.045548e-17,2.399907e-16,-2.522335e-16,-1.2166830000000001e-17,-6.163259e-17,4.2583900000000006e-17,-1.880155e-16,-4.7697770000000004e-17,1.258905e-15,...,1.003763e-17,2.045548e-17,5.714702e-16,-3.699856e-16,9.223976e-17,-7.814335000000001e-17,1.152046e-16,2.045548e-17,-2.562638e-17,-1.065358e-16,1.809816e-16,2.898557e-16,-1.964943e-16,2.045548e-17,-1.073723e-16,-1.667236e-16,-1.974068e-16,0.0,9.163142000000001e-17,-2.543627e-16,-7.384219e-16,-1.345955e-15,1.808675e-16,5.4864790000000006e-17,1.329986e-16,1.505645e-16,-1.863045e-16,1.427321e-16,-1.375612e-16,5.308159e-16,6.705538e-17,3.954219e-16,-1.968365e-16,-3.634269e-16,1.133036e-17,1.858483e-16,-1.156162e-15,-4.346694e-16,-3.406712e-16,-2.10353e-17,6.52218e-16,0.0,-4.2279730000000005e-17,-5.270708e-16,4.4047720000000006e-17,-1.320291e-16,-5.96935e-17,2.0987780000000002e-17,0.0,0.0
std,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,...,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,0.0,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,0.0,1.000343,1.000343,1.000343,1.000343,1.000343,1.000343,0.0,0.0
min,-2.227875,-0.9237292,-3.688413,-4.11297,-3.287824,-1.689368,-0.5744105,-0.9730182,-0.2886528,-1.284176,-2.411167,-2.144172,-0.7951632,-0.1202417,-2.24912,-0.8199644,-0.241061,-2.841822,-0.7616207,-3.514952,-4.751486,-2.780469,-0.9512265,-3.27295,-2.36544,-2.212963,-0.7521758,-0.7044833,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,-1.969111,-1.367655,-1.208604,-1.160729,-1.219942,-0.4643852,-0.07422696,-0.2423597,-0.2423597,-0.2517238,-0.2123597,-0.08304548,-0.1448414,-0.7616342,-0.2227209,-0.05241424,-0.09103457,...,-0.841191,-0.2423597,-0.04537649,-0.1843755,-0.09839655,-0.04537649,-2.966253,-0.2423597,-0.03703704,-0.1567208,-0.07875671,-0.06940907,-3.145715,-0.2423597,-0.2563073,-0.1448414,-3.341656,0.0,-0.03703704,-0.03703704,-0.04537649,-14.40734,-0.205214,-0.1959766,-0.3471184,-0.08712888,-2.048348,-0.03703704,-0.03703704,-0.1863522,-0.02618016,-5.10265,-0.1742005,-0.05241424,-0.03703704,-0.07875671,-0.05862104,-0.05862104,-0.3019617,-0.04537649,-2.56218,0.0,-0.2726158,-0.05241424,-0.09103457,-0.1178511,-2.138345,-0.305995,0.0,0.0
25%,-0.4564744,-0.2969908,-0.7951515,-0.5171998,-0.5719226,-0.8656586,-0.5744105,-0.9730182,-0.2886528,-0.7793259,-0.5966855,-0.7261556,-0.7951632,-0.1202417,-0.7347485,-0.8199644,-0.241061,-1.026041,-0.7616207,-1.062465,-0.2114536,-0.9341298,-0.9512265,-0.6881478,-1.026858,-0.647916,-0.7521758,-0.7044833,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,-0.4891101,-0.6144386,-0.9440523,-0.9184057,-0.9285484,-0.4643852,-0.07422696,-0.2423597,-0.2423597,-0.2517238,-0.2123597,-0.08304548,-0.1448414,-0.7616342,-0.2227209,-0.05241424,-0.09103457,...,-0.841191,-0.2423597,-0.04537649,-0.1843755,-0.09839655,-0.04537649,0.3371256,-0.2423597,-0.03703704,-0.1567208,-0.07875671,-0.06940907,0.3178928,-0.2423597,-0.2563073,-0.1448414,0.2992528,0.0,-0.03703704,-0.03703704,-0.04537649,0.06940907,-0.205214,-0.1959766,-0.3471184,-0.08712888,0.4881983,-0.03703704,-0.03703704,-0.1863522,-0.02618016,0.1959766,-0.1742005,-0.05241424,-0.03703704,-0.07875671,-0.05862104,-0.05862104,-0.3019617,-0.04537649,0.3902926,0.0,-0.2726158,-0.05241424,-0.09103457,-0.1178511,0.4676514,-0.305995,0.0,0.0
50%,6.454645e-16,-0.1040633,-0.07183611,-0.5171998,0.05737148,0.4425864,-0.5744105,-0.1319022,-0.2886528,-0.2031633,-0.1503334,-0.1956933,-0.7951632,-0.1202417,-0.09797004,-0.8199644,-0.241061,0.7897405,-0.7616207,0.1637791,-0.2114536,-0.3186833,0.6004949,0.0,0.3117246,0.03284429,-0.7521758,-0.3270298,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,-0.1191097,0.1387775,-0.05118902,-0.4337589,-1.478913e-16,-0.4643852,-0.07422696,-0.2423597,-0.2423597,-0.2517238,-0.2123597,-0.08304548,-0.1448414,-0.7616342,-0.2227209,-0.05241424,-0.09103457,...,-0.841191,-0.2423597,-0.04537649,-0.1843755,-0.09839655,-0.04537649,0.3371256,-0.2423597,-0.03703704,-0.1567208,-0.07875671,-0.06940907,0.3178928,-0.2423597,-0.2563073,-0.1448414,0.2992528,0.0,-0.03703704,-0.03703704,-0.04537649,0.06940907,-0.205214,-0.1959766,-0.3471184,-0.08712888,0.4881983,-0.03703704,-0.03703704,-0.1863522,-0.02618016,0.1959766,-0.1742005,-0.05241424,-0.03703704,-0.07875671,-0.05862104,-0.05862104,-0.3019617,-0.04537649,0.3902926,0.0,-0.2726158,-0.05241424,-0.09103457,-0.1178511,0.4676514,-0.305995,0.0,0.0
75%,0.4065156,0.108708,0.6514792,0.3817427,0.9516316,0.9271216,0.3355252,0.5891327,-0.2886528,0.5450557,0.5491227,0.5915905,0.8731117,-0.1202417,0.4974036,1.10781,-0.241061,0.7897405,1.227585,0.1637791,-0.2114536,0.2967633,0.6004949,0.937776,0.3117246,0.4820057,0.5886506,0.3221901,-0.3593249,-0.1163393,-0.2702084,-0.06869175,-0.08768781,0.620891,0.8919936,0.5771222,0.8747875,0.6949322,-0.4643852,-0.07422696,-0.2423597,-0.2423597,-0.2517238,-0.2123597,-0.08304548,-0.1448414,1.312966,-0.2227209,-0.05241424,-0.09103457,...,1.188791,-0.2423597,-0.04537649,-0.1843755,-0.09839655,-0.04537649,0.3371256,-0.2423597,-0.03703704,-0.1567208,-0.07875671,-0.06940907,0.3178928,-0.2423597,-0.2563073,-0.1448414,0.2992528,0.0,-0.03703704,-0.03703704,-0.04537649,0.06940907,-0.205214,-0.1959766,-0.3471184,-0.08712888,0.4881983,-0.03703704,-0.03703704,-0.1863522,-0.02618016,0.1959766,-0.1742005,-0.05241424,-0.03703704,-0.07875671,-0.05862104,-0.05862104,-0.3019617,-0.04537649,0.3902926,0.0,-0.2726158,-0.05241424,-0.09103457,-0.1178511,0.4676514,-0.305995,0.0,0.0
max,11.03492,20.51827,2.821425,3.07857,1.282839,1.217843,8.289499,11.40575,8.851638,4.004295,11.52095,9.132681,3.936963,11.64775,7.855574,4.963359,8.13868,2.605522,3.216791,6.294997,8.868612,4.604889,3.703938,1.312989,2.988889,4.421526,6.087635,7.554198,8.675309,17.21723,8.341462,18.30618,31.16527,2.100892,1.64521,3.288781,1.795616,3.234222,2.153385,13.47219,4.126099,4.126099,3.972607,4.708992,12.04159,6.904105,1.312966,4.489924,19.07878,10.98484,...,1.188791,4.126099,22.03785,5.423713,10.16296,22.03785,0.3371256,4.126099,27.0,6.380775,12.69733,14.40734,0.3178928,4.126099,3.901567,6.904105,0.2992528,0.0,27.0,27.0,22.03785,0.06940907,4.872962,5.10265,2.880862,11.47725,0.4881983,27.0,27.0,5.366183,38.19686,0.1959766,5.740513,19.07878,27.0,12.69733,17.05872,17.05872,3.311678,22.03785,0.3902926,0.0,3.668167,19.07878,10.98484,8.485281,0.4676514,3.268027,0.0,0.0


# Outliers

Capping did not seem to make much of a difference. Does not seem to be egregious outliers

In [22]:
df.describe([0.01,.99])

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,AgeHouseAtSale,YearsLastRemodelAtSale,YearsGarageBuiltAtSale,LotFrontage_isnull,MasVnrArea_isnull,GarageYrBlt_isnull,YearsGarageBuiltAtSale_isnull
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589,36.547945,22.95,29.306019,0.177397,0.005479,0.055479,0.055479
std,22.024023,9981.264932,1.382997,1.112799,30.202904,20.645407,180.569112,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,23.994583,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883,30.250152,20.640653,24.030693,0.382135,0.073846,0.228992,0.228992
min,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
1%,21.0,1680.0,3.0,3.0,1899.18,1950.0,0.0,0.0,0.0,0.0,0.0,520.0,0.0,0.0,692.18,0.0,0.0,1.0,0.0,1.0,1.0,3.0,0.0,1917.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,61815.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,70.049958,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1978.506164,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0,35.0,14.0,29.306019,0.0,0.0,0.0,0.0
99%,137.41,37567.64,10.0,9.0,2009.0,2009.0,791.28,1572.41,830.38,1797.05,2155.05,2219.46,1418.92,360.0,3123.48,2.0,1.0,3.0,1.0,5.0,2.0,11.0,2.0,2009.0,3.0,1002.79,505.46,285.82,261.05,168.0,268.05,0.0,700.0,12.0,2010.0,442567.01,110.41,60.0,90.0,1.0,0.0,1.0,1.0
max,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0,136.0,60.0,107.0,1.0,1.0,1.0,1.0


# Training

## SKLEARN

In [28]:
from sklearn.linear_model import LassoCV

In [29]:
lasso_cv = LassoCV(cv=5, random_state=0)

In [30]:
lasso_pipeline = make_pipeline(StandardScaler(),lasso_cv)

In [31]:
lasso_pipeline.fit(X=X,y=y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lassocv', LassoCV(cv=5, random_state=0))])

In [32]:
lasso_cv.alpha_

0.004313679483801087

### Elastic Net
Looks like the optimal is just an L1 Lasso regression

In [36]:
from sklearn.linear_model import ElasticNetCV

In [37]:
# elastic_net_cv = ElasticNetCV(l1_ratio=[0,.5,1]
#                              ,alphas=[0.1,1,10,100]
#                              )
elastic_net_cv = ElasticNetCV(cv=5, random_state=0, l1_ratio=[0.5,0.7,0.8,0.9,1])

In [38]:
elastic_net_pipeline = make_pipeline(StandardScaler(),elastic_net_cv)

In [39]:
elastic_net_pipeline.fit(X=X,y=y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('elasticnetcv',
                 ElasticNetCV(cv=5, l1_ratio=[0.5, 0.7, 0.8, 0.9, 1],
                              random_state=0))])

In [40]:
elastic_net_cv.l1_ratio_

0.5

In [41]:
elastic_net_cv.alpha_

0.009250834233476633

In [42]:
test = pd.DataFrame(elastic_net_cv.coef_,index=X.columns)

### Decision Tree

In [27]:
params_dt={'splitter':['random', 'best']
        ,'min_samples_split':[2, 3, 4, 5, 6, 8, 10]
        ,'min_samples_leaf':[0.01, 0.02, 0.03, 0.04]
        ,'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2]
        ,'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None]
        ,'max_features':[ 0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
        ,'max_depth':[None, 2,4,6,8]
        ,'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]}
        

In [28]:
from sklearn.tree import DecisionTreeRegressor

In [29]:
from sklearn.model_selection import RandomizedSearchCV

In [30]:
from skopt import BayesSearchCV

In [31]:
decision_tree = DecisionTreeRegressor()

In [32]:
bayes_search_dt = BayesSearchCV(decision_tree,params_dt)

In [33]:
decision_tree = DecisionTreeRegressor(max_depth=6, max_features=0.85, max_leaf_nodes=40,
                      min_samples_leaf=0.02, min_samples_split=6,
                      min_weight_fraction_leaf=0.0075)

In [34]:
decision_tree_pipeline = make_pipeline(StandardScaler(),decision_tree)

In [35]:
decision_tree_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(max_depth=6, max_features=0.85,
                                       max_leaf_nodes=40, min_samples_leaf=0.02,
                                       min_samples_split=6,
                                       min_weight_fraction_leaf=0.0075))])

## XGBoost

In [36]:
import xgboost

In [37]:
from importlib import reload

In [38]:
reload(xgboost)

<module 'xgboost' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\xgboost\\__init__.py'>

In [39]:
xgboost.__version__

'1.6.2'

In [40]:
xg_boost = xgboost.XGBRegressor()

In [41]:
xg_boost_pipeline = make_pipeline(StandardScaler(),xg_boost)

In [42]:
xg_boost.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

## Random Forest

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
random_forest_initial = RandomForestRegressor(n_estimators=200,oob_score=True)

In [49]:
random_forest_pipeline = make_pipeline(StandardScaler(),random_forest_initial)

In [50]:
random_forest_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(n_estimators=200, oob_score=True))])

In [44]:
random_forest = RandomForestRegressor(max_depth=20, max_features=0.55, min_samples_leaf=6,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.005,
                      n_estimators=200, oob_score=True)

NameError: name 'RandomForestRegressor' is not defined

## Adaboost

In [57]:
from sklearn.ensemble import AdaBoostRegressor

In [58]:
adaboost = AdaBoostRegressor()

In [59]:
adaboost_pipeline = make_pipeline(StandardScaler(),adaboost)

In [60]:
adaboost_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('adaboostregressor', AdaBoostRegressor())])

## Light GBM

In [62]:
from lightgbm import LGBMRegressor

In [63]:
light_gbm = LGBMRegressor()

light_gbm_pipeline = make_pipeline(StandardScaler(),light_gbm)

light_gbm_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lgbmregressor', LGBMRegressor())])

## GradientBoostingRegressor

In [65]:
from sklearn.ensemble import GradientBoostingRegressor

In [66]:
gbr = GradientBoostingRegressor()

gbr_pipeline = make_pipeline(StandardScaler(),gbr)

gbr.fit(X,y)

GradientBoostingRegressor()

## Stochastic Gradient Descent Regression

In [69]:
from sklearn.linear_model import SGDRegressor

In [70]:
SGDR = SGDRegressor()

SGDR_pipeline = make_pipeline(StandardScaler(),SGDR)

SGDR_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

## from sklearn.svm import SVR
Support Vector Machine

In [72]:
from sklearn.svm import SVR

In [73]:
svr = SVR()

svr_pipeline = make_pipeline(StandardScaler(),svr)

svr_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()), ('svr', SVR())])

## from sklearn.linear_model import BayesianRidge

In [76]:
from sklearn.linear_model import BayesianRidge

In [77]:
bayesianr = BayesianRidge()

bayesianr_pipeline = make_pipeline(StandardScaler(),bayesianr)

bayesianr_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('bayesianridge', BayesianRidge())])

## from sklearn.kernel_ridge import KernelRidge

In [80]:
from sklearn.kernel_ridge import KernelRidge

In [81]:
kernelr = KernelRidge()

kernelr_pipeline = make_pipeline(StandardScaler(),kernelr)

kernelr_pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kernelridge', KernelRidge())])

## Ensemble

In [85]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [87]:
voting_reg = VotingRegressor([('elastic_net',elastic_net_pipeline)
                              ,('rf',random_forest_pipeline)
                              ,('xg_boost',xg_boost_pipeline)
                             # ,('adaboost',adaboost_pipeline)
                                  ,('lightgbm',light_gbm_pipeline)
                                  ,('gbr',gbr_pipeline)
                                  ,('bayesianRidge',bayesianr_pipeline)
                                  # ,('kernelRidge',kernelr)
                             ])

In [92]:
voting_reg.fit(X,y)

VotingRegressor(estimators=[('elastic_net',
                             Pipeline(steps=[('standardscaler',
                                              StandardScaler()),
                                             ('elasticnetcv',
                                              ElasticNetCV(cv=5,
                                                           l1_ratio=[0.5, 0.7,
                                                                     0.8, 0.9,
                                                                     1],
                                                           random_state=0))])),
                            ('rf',
                             Pipeline(steps=[('standardscaler',
                                              StandardScaler()),
                                             ('randomforestregressor',
                                              RandomForestRegressor(n_estimators=200,
                                                                    

In [89]:
stacking_reg = StackingRegressor([('elastic_net',elastic_net_pipeline)
                              ,('rf',random_forest_pipeline)
                              ,('xg_boost',xg_boost_pipeline)
                             # ,('adaboost',adaboost_pipeline)
                                  ,('lightgbm',light_gbm_pipeline)
                                  ,('gbr',gbr_pipeline)
                                  ,('bayesianRidge',bayesianr_pipeline)
                                  # ,('kernelRidge',kernelr)
                             ])

In [90]:
stacking_reg.fit(X,y)

StackingRegressor(estimators=[('elastic_net',
                               Pipeline(steps=[('standardscaler',
                                                StandardScaler()),
                                               ('elasticnetcv',
                                                ElasticNetCV(cv=5,
                                                             l1_ratio=[0.5, 0.7,
                                                                       0.8, 0.9,
                                                                       1],
                                                             random_state=0))])),
                              ('rf',
                               Pipeline(steps=[('standardscaler',
                                                StandardScaler()),
                                               ('randomforestregressor',
                                                RandomForestRegressor(n_estimators=200,
                                        

## Cross Val Score

In [34]:
from sklearn.model_selection import cross_validate

In [43]:
val_results = cross_validate(elastic_net_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020617,0.010032,-0.035587,-0.022522,-0.022276,-0.011766,-0.010933


In [25]:
val_results = cross_validate(lasso_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020734,0.010145,-0.035885,-0.022604,-0.022441,-0.011772,-0.010967


In [47]:
val_results = cross_validate(decision_tree_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.037794,0.004469,-0.043628,-0.040288,-0.037976,-0.034838,-0.03224


In [51]:
val_results = cross_validate(xg_boost_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020725,0.002873,-0.024823,-0.021896,-0.020797,-0.018601,-0.01751


In [52]:
val_results = cross_validate(random_forest_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020422,0.002488,-0.023458,-0.021937,-0.020676,-0.01893,-0.017112


In [202]:
val_results = cross_validate(random_forest_initial,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020326,0.002785,-0.023262,-0.022594,-0.020544,-0.01874,-0.016492


In [61]:
val_results = cross_validate(adaboost_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.029985,0.002666,-0.034171,-0.030452,-0.029798,-0.028341,-0.027164


In [64]:
val_results = cross_validate(light_gbm_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.017546,0.002645,-0.021153,-0.01853,-0.017916,-0.015946,-0.014183


In [67]:
val_results = cross_validate(gbr_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.016221,0.002595,-0.01974,-0.017143,-0.016892,-0.013975,-0.013354


In [71]:
val_results = cross_validate(SGDR,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-9.99299e+33,9.050041e+33,-2.328386e+34,-1.334854e+34,-9.722485e+33,-3.470509e+33,-1.395562e+32


In [74]:
val_results = cross_validate(svr,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.041615,0.004721,-0.045491,-0.045126,-0.042035,-0.041659,-0.033765


In [78]:
val_results = cross_validate(bayesianr_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.021717,0.00871,-0.033839,-0.024515,-0.023699,-0.014247,-0.012285


In [83]:
val_results = cross_validate(kernelr_pipeline,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-144.572268,0.294254,-144.978352,-144.724658,-144.570174,-144.335483,-144.252671


In [88]:
val_results = cross_validate(voting_reg,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.015504,0.00349,-0.01929,-0.018351,-0.016073,-0.012014,-0.011793


In [91]:
val_results = cross_validate(stacking_reg,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.015611,0.003662,-0.019202,-0.019197,-0.015881,-0.012115,-0.011662


Scaling seems to have really benefitted the linear regression Elastic Net, but did not affect the tree based formulas much. 

Somehow kernalr is much worse now. 

# Prediction

In [94]:
df_predict_raw = pd.read_csv('test.csv')

In [95]:
df_predict = df_predict_raw.copy()

In [96]:
df_predict['AgeHouseAtSale'] = df_predict['YrSold'] - df_predict['YearBuilt']

df_predict['YearsLastRemodelAtSale']  = df_predict['YrSold'] - df_predict['YearRemodAdd']

df_predict['YearsGarageBuiltAtSale'] = df_predict['YrSold'] - df_predict['GarageYrBlt'] 

df_predict.drop(columns=['Id'],inplace=True)

df_predict['MSSubClass'] = df_predict['MSSubClass'].astype(str)

In [97]:
for var in df_predict.dtypes[df_predict.dtypes == 'int64'].index:
    if df_predict[var].isnull().sum()>0:
        df_predict[var+'_isnull'] = df_predict[var].isnull().astype(int)
        df_predict[var]=df_predict[var].fillna(df_predict[var].mean())

for var in df_predict.dtypes[df_predict.dtypes == 'float64'].index:
    if df_predict[var].isnull().sum()>0:
        df_predict[var+'_isnull'] = df_predict[var].isnull().astype(int)
        df_predict[var]=df_predict[var].fillna(df_predict[var].mean())

df_predict_dummies = pd.get_dummies(df_predict,dummy_na=True,drop_first=False)

In [98]:
_, df_predict_dummies = X.align(df_predict_dummies, axis=1, fill_value=0)

In [99]:
df_predict_dummies.drop(columns=df_predict_dummies.columns[~df_predict_dummies.columns.isin(X.columns)],inplace=True)

In [100]:
df_predict_dummies['intercept']=1

## Predictions

### Statsmodel low regularization

In [34]:
df_predict_raw['SalePrice'] = np.exp(results.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_initial.csv',index=False)

### Elastic Net

In [58]:
df_predict_raw['SalePrice'] =  np.exp(elastic_net_pipeline.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_elasticnet_standardized.csv',index=False)

In [57]:
np.exp(elastic_net_pipeline.predict(df_predict_dummies[X.columns]))

array([123044.34002264, 152817.94245416, 175576.08772772, ...,
       167835.60169374, 118403.03839643, 227853.80061158])

### Decision Tree

In [160]:
df_predict_raw['SalePrice'] =  np.exp(decision_tree.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_decisiontree.csv',index=False)

### XG Boost

In [68]:
df_predict_raw['SalePrice'] =  np.exp(xg_boost.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_xgboost.csv',index=False)

### Random Forest

In [198]:
df_predict_raw['SalePrice'] =  np.exp(random_forest.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_rf_cv.csv',index=False)

In [201]:
df_predict_raw['SalePrice'] =  np.exp(random_forest_initial.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_rf.csv',index=False)

### Light GBM

In [278]:
df_predict_raw['SalePrice'] =  np.exp(light_gbm.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_light_gbm.csv',index=False)

### Gradient Boosting Regression

In [282]:
df_predict_raw['SalePrice'] =  np.exp(gbr.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_gbr.csv',index=False)

### Voting Regression

In [103]:
df_predict_raw['SalePrice'] =  np.exp(voting_reg.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_voting_reg_standardized.csv',index=False)

### Stacking Regression

In [279]:
df_predict_raw['SalePrice'] =  np.exp(stacking_reg.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_stacking_reg.csv',index=False)