# Imports

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns',100)

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns

In [58]:
from sklearn.model_selection import cross_validate, GridSearchCV

In [3]:
import statsmodels.api as sm

In [4]:
df_raw = pd.read_csv('train.csv')

Notes:
- Year Sold and Built not useful by itself, need to be changed into Age at Sale
- Year Sold minus Year Remodel gives time since last remodel
- MSSubClass will need some encoding (one-hot)
- MSZoning will need encoding
- Street can be one hot encoded
- Alley will be encoded
- LotShape encoded
- LandContour encoded
- Utilities encode

## Data Transforms

In [46]:
df = df_raw.copy()

In [47]:
df['AgeHouseAtSale'] = df['YrSold'] - df['YearBuilt']

df['YearsLastRemodelAtSale']  = df['YrSold'] - df['YearRemodAdd']

df['YearsGarageBuiltAtSale'] = df['YrSold'] - df['GarageYrBlt'] 

df.drop(columns=['Id'],inplace=True)

df['MSSubClass'] = df['MSSubClass'].astype(str)

In [48]:
# for var in df.dtypes[df.dtypes=='object'].index:
#     # df_raw.groupby(var,dropna=False).agg({'SalePrice':'mean'}).plot.bar()
#     df[var]=df[var].fillna('NULL')

In [49]:
df[df.isnull().mean().sort_values(ascending=False)[:4].index].describe()

Unnamed: 0,PoolQC,MiscFeature,Alley,Fence
count,7,54,91,281
unique,3,4,2,4
top,Gd,Shed,Grvl,MnPrv
freq,3,49,50,157


In [50]:
for var in df.dtypes[df.dtypes == 'int64'].index:
    if df[var].isnull().sum()>0:
        df[var+'_isnull'] = df[var].isnull().astype(int)
        df[var]=df[var].fillna(df[var].mean())

for var in df.dtypes[df.dtypes == 'float64'].index:
    if df[var].isnull().sum()>0:
        df[var+'_isnull'] = df[var].isnull().astype(int)
        df[var]=df[var].fillna(df[var].mean())

df_dummies = pd.get_dummies(df,dummy_na=True,drop_first=False)

In [51]:
X = df_dummies.drop(columns='SalePrice')

In [52]:
y = np.log(df_dummies['SalePrice'])

In [53]:
X['intercept'] = 1

In [54]:
model = sm.OLS(y,X)

In [55]:
results = model.fit_regularized(alpha=10,L1_wt=1)

# Training

## SKLEARN

In [72]:
from sklearn.linear_model import LassoCV

In [73]:
lasso_cv = LassoCV(cv=5, random_state=0)

In [74]:
lasso_cv.fit(X=X,y=y)

LassoCV(cv=5, random_state=0)

In [75]:
lasso_cv.alpha_

1.025240696711317

In [76]:
test = pd.DataFrame(elastic_net_cv.coef_,index=X.columns)
test.loc[test[0]>0]

Unnamed: 0,0
LotArea,2e-06
YearBuilt,0.00034
MasVnrArea,3.3e-05
BsmtFinSF1,4.7e-05
TotalBsmtSF,0.000162
2ndFlrSF,2.2e-05
GrLivArea,0.000308
GarageArea,0.000383
WoodDeckSF,0.00016
ScreenPorch,0.000123


### Elastic Net
Looks like the optimal is just an L1 Lasso regression

In [40]:
from sklearn.linear_model import ElasticNetCV

In [105]:
# elastic_net_cv = ElasticNetCV(l1_ratio=[0,.5,1]
#                              ,alphas=[0.1,1,10,100]
#                              )
elastic_net_cv = ElasticNetCV(cv=5, random_state=0, l1_ratio=[0.5,0.7,0.8,0.9,1])

In [106]:
elastic_net_cv.fit(X=X,y=y)

ElasticNetCV(cv=5, l1_ratio=[0.5, 0.7, 0.8, 0.9, 1], random_state=0)

In [107]:
elastic_net_cv.l1_ratio_

0.9

In [108]:
elastic_net_cv.alpha_

1.1391563296792413

In [109]:
test = pd.DataFrame(elastic_net_cv.coef_,index=X.columns)

In [110]:
test.loc[test[0]>0]

Unnamed: 0,0
LotArea,2e-06
MasVnrArea,3.3e-05
BsmtFinSF1,4.7e-05
TotalBsmtSF,0.000162
2ndFlrSF,2.1e-05
GrLivArea,0.000308
GarageArea,0.000383
WoodDeckSF,0.00016
ScreenPorch,0.000123


In [235]:
from sklearn.linear_model import ElasticNet

In [236]:
elastic_net = ElasticNet(l1_ratio=0.9,alpha=1.1391563296792413)

### Decision Tree

In [None]:
params_dt={'splitter':['random', 'best']
        ,'min_samples_split':[2, 3, 4, 5, 6, 8, 10]
        ,'min_samples_leaf':[0.01, 0.02, 0.03, 0.04]
        ,'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2]
        ,'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None]
        ,'max_features':[ 0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
        ,'max_depth':[None, 2,4,6,8]
        ,'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]}
        

In [120]:
from sklearn.tree import DecisionTreeRegressor

In [129]:
from sklearn.model_selection import RandomizedSearchCV

In [138]:
from skopt import BayesSearchCV

In [121]:
decision_tree = DecisionTreeRegressor()

In [152]:
bayes_search_dt = BayesSearchCV(decision_tree,params_dt)

In [153]:
bayes_search_dt.fit(X,y)

BayesSearchCV(estimator=DecisionTreeRegressor(),
              search_spaces={'max_depth': [None, 2, 4, 6, 8],
                             'max_features': [0.95, 0.9, 0.85, 0.8, 0.75, 0.7],
                             'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45,
                                                50, None],
                             'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05,
                                                       0.1, 0.15, 0.2],
                             'min_samples_leaf': [0.01, 0.02, 0.03, 0.04],
                             'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
                             'min_weight_fraction_leaf': [0.0, 0.0025, 0.005,
                                                          0.0075, 0.01, 0.05],
                             'splitter': ['random', 'best']})

In [154]:
bayes_search_dt.best_estimator_

DecisionTreeRegressor(max_depth=6, max_features=0.85, max_leaf_nodes=40,
                      min_samples_leaf=0.02, min_samples_split=6,
                      min_weight_fraction_leaf=0.0075)

In [155]:
decision_tree = DecisionTreeRegressor(max_depth=6, max_features=0.85, max_leaf_nodes=40,
                      min_samples_leaf=0.02, min_samples_split=6,
                      min_weight_fraction_leaf=0.0075)

In [157]:
decision_tree.fit(X,y)

DecisionTreeRegressor(max_depth=6, max_features=0.85, max_leaf_nodes=40,
                      min_samples_leaf=0.02, min_samples_split=6,
                      min_weight_fraction_leaf=0.0075)

## XGBoost

In [57]:
import xgboost

In [59]:
xg_boost = xgboost.XGBRegressor()

In [60]:
xg_boost.fit(X,y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=20,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

## Random Forest

In [114]:
from sklearn.ensemble import RandomForestRegressor

In [199]:
random_forest_initial = RandomForestRegressor(n_estimators=200,oob_score=True)

In [200]:
random_forest_initial.fit(X,y)

RandomForestRegressor(n_estimators=200, oob_score=True)

In [170]:
random_forest.oob_score_

0.8672808132118064

In [177]:
oob_scores = {}
rf = RandomForestRegressor(n_estimators=50, warm_start=True, oob_score=True, n_jobs=-1, random_state=2)
rf.fit(X, y)
oob_scores[50] = rf.oob_score_
est = 50
estimators=[est]
for i in range(9):
    est += 50
    estimators.append(est)
    rf.set_params(n_estimators=est)
    rf.fit(X, y)
    oob_scores[est] = rf.oob_score_

In [178]:
oob_scores

{50: 0.863228572941851,
 100: 0.8692002499863029,
 150: 0.8710950810754902,
 200: 0.8720762808249324,
 250: 0.8726630493147655,
 300: 0.8737357478017185,
 350: 0.8743760948860517,
 400: 0.8741297599307519,
 450: 0.8748115174693367,
 500: 0.8747478596427851}

In [210]:
params_rf={'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]
           ,'min_samples_split': [0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.1]
           ,'min_samples_leaf':[1,2,4,6,8,10,20,30]
           ,'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2]
           ,'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None]
           ,'max_features':[0.8, 0.7, 0.6, 0.5, 0.4]
           ,'max_depth':[None,2,4,6,8,10,20]}
        

In [211]:
bayes_search_rf = BayesSearchCV(random_forest,params_rf,n_jobs=-1)

In [212]:
bayes_search_rf.fit(X,y)

BayesSearchCV(estimator=RandomForestRegressor(max_depth=8, max_features=0.7,
                                              min_samples_leaf=0.01,
                                              min_samples_split=4,
                                              min_weight_fraction_leaf=0.0075,
                                              n_estimators=200,
                                              oob_score=True),
              n_jobs=-1,
              search_spaces={'max_depth': [None, 2, 4, 6, 8, 10, 20],
                             'max_features': [0.8, 0.7, 0.6, 0.5, 0.4],
                             'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45,
                                                50, None],
                             'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1,
                                                       0.15, 0.2],
                             'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
                             'min_samples_split': [0.

In [213]:
bayes_search_rf.best_estimator_

RandomForestRegressor(max_depth=20, max_features=0.6, min_samples_leaf=8,
                      min_samples_split=0.01, min_weight_fraction_leaf=0.0075,
                      n_estimators=200, oob_score=True)

In [218]:
params_rf={'min_weight_fraction_leaf': [0.005, 0.0075, 0.01]
           ,'min_samples_split': [1, 0.001 ,0.01, 0.02]
           ,'min_samples_leaf':[6,7,8,9,10]
           # ,'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2]
           # ,'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None]
           ,'max_features':[0.55, 0.6, 0.65]
           ,'max_depth':[10,20,50,100]}
bayes_search_rf = BayesSearchCV(random_forest,params_rf,n_jobs=-1)

In [219]:
bayes_search_rf.fit(X,y)
bayes_search_rf.best_estimator_



RandomForestRegressor(max_depth=20, max_features=0.55, min_samples_leaf=6,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.005,
                      n_estimators=200, oob_score=True)

In [222]:
random_forest = RandomForestRegressor(max_depth=20, max_features=0.55, min_samples_leaf=6,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.005,
                      n_estimators=200, oob_score=True)

In [223]:
random_forest.fit(X,y)

RandomForestRegressor(max_depth=20, max_features=0.55, min_samples_leaf=6,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.005,
                      n_estimators=200, oob_score=True)

## Adaboost

In [226]:
from sklearn.ensemble import AdaBoostRegressor

In [227]:
adaboost = AdaBoostRegressor()

In [229]:
adaboost.fit(X,y)

AdaBoostRegressor()

## Light GBM

In [242]:
from lightgbm import LGBMRegressor

In [None]:
light_gbm = LGBMRegressor()

light_gbm.fit(X,y)

## GradientBoostingRegressor

In [248]:
from sklearn.ensemble import GradientBoostingRegressor

In [249]:
gbr = GradientBoostingRegressor()

gbr.fit(X,y)

GradientBoostingRegressor()

## Stochastic Gradient Descent Regression

In [251]:
from sklearn.linear_model import SGDRegressor

In [252]:
SGDR = SGDRegressor()

SGDR.fit(X,y)

SGDRegressor()

## from sklearn.svm import SVR

In [256]:
from sklearn.svm import SVR

In [257]:
svr = SVR()

svr.fit(X,y)

SVR()

## from sklearn.linear_model import BayesianRidge

In [260]:
from sklearn.linear_model import BayesianRidge

In [261]:
bayesianr = BayesianRidge()

bayesianr.fit(X,y)

BayesianRidge()

## from sklearn.kernel_ridge import KernelRidge

In [263]:
from sklearn.kernel_ridge import KernelRidge

In [264]:
kernelr = KernelRidge()

kernelr.fit(X,y)

KernelRidge()

## Ensemble

In [266]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [280]:
voting_reg = VotingRegressor([('elastic_net',elastic_net_cv)
                              ,('rf',random_forest_initial)
                              ,('xg_boost',xg_boost)
                             ,('adaboost',adaboost)
                                  ,('lightgbm',light_gbm)
                                  ,('gbr',gbr)
                                  ,('bayesianRidge',bayesianr)
                                  ,('kernelRidge',kernelr)
                             ])

In [273]:
stacking_reg = StackingRegressor([('elastic_net',elastic_net_cv)
                              ,('rf',random_forest_initial)
                              ,('xg_boost',xg_boost)
                             ,('adaboost',adaboost)
                                  ,('lightgbm',light_gbm)
                                  ,('gbr',gbr)
                                  ,('bayesianRidge',bayesianr)
                                  ,('kernelRidge',kernelr)
                             ])

In [276]:
stacking_reg.fit(X,y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


StackingRegressor(estimators=[('elastic_net',
                               ElasticNetCV(cv=5,
                                            l1_ratio=[0.5, 0.7, 0.8, 0.9, 1],
                                            random_state=0)),
                              ('rf',
                               RandomForestRegressor(n_estimators=200,
                                                     oob_score=True)),
                              ('xg_boost',
                               XGBRegressor(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1,
                                            enable_categorical=False, gamma=0,
                                            gpu_id=-1, importance_type=None,...
                                            n_estimators=100, n_jobs=20,
                                       

## Cross Val Score

In [35]:
from sklearn.model_selection import cross_validate

In [111]:
val_results = cross_validate(elastic_net_cv,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.040608,0.017898,-0.071136,-0.04084,-0.03399,-0.031193,-0.025879


In [112]:
val_results = cross_validate(lasso_cv,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.040611,0.017896,-0.071136,-0.04084,-0.03399,-0.031212,-0.025879


In [156]:
val_results = cross_validate(decision_tree,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.038799,0.002561,-0.042011,-0.040984,-0.037969,-0.03665,-0.036382


In [113]:
val_results = cross_validate(xg_boost,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020724,0.002884,-0.024847,-0.021879,-0.020797,-0.018598,-0.017497


In [224]:
val_results = cross_validate(random_forest,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020906,0.002835,-0.023648,-0.022773,-0.022421,-0.018082,-0.017609


In [202]:
val_results = cross_validate(random_forest_initial,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020326,0.002785,-0.023262,-0.022594,-0.020544,-0.01874,-0.016492


In [230]:
val_results = cross_validate(adaboost,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.0314,0.003302,-0.036019,-0.032751,-0.031797,-0.028304,-0.028126


In [281]:
val_results = cross_validate(voting_reg,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.016966,0.003705,-0.020941,-0.020408,-0.016929,-0.01335,-0.013201


In [245]:
val_results = cross_validate(light_gbm,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.017956,0.00312,-0.022391,-0.019007,-0.018423,-0.015312,-0.014644


In [250]:
val_results = cross_validate(gbr,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.016141,0.002402,-0.019283,-0.017522,-0.016296,-0.014208,-0.013396


In [255]:
val_results = cross_validate(SGDR,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-1.644795e+34,2.1124899999999998e+34,-5.186109e+34,-1.53271e+34,-1.4597969999999998e+34,-3.3579180000000004e+32,-1.177889e+32


In [258]:
val_results = cross_validate(svr,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.041615,0.004721,-0.045491,-0.045126,-0.042035,-0.041659,-0.033765


In [262]:
val_results = cross_validate(bayesianr,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.020079,0.008465,-0.034062,-0.021738,-0.016976,-0.013992,-0.013627


In [265]:
val_results = cross_validate(kernelr,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.021636,0.008508,-0.035214,-0.023329,-0.020549,-0.015052,-0.014035


In [274]:
val_results = cross_validate(stacking_reg,X,y, scoring=['neg_mean_squared_error','r2'])
pd.DataFrame(val_results['test_neg_mean_squared_error']).describe().transpose()

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,5.0,-0.016183,0.003881,-0.020452,-0.019416,-0.016602,-0.012251,-0.012193


# Prediction

In [61]:
df_predict_raw = pd.read_csv('test.csv')

In [62]:
df_predict = df_predict_raw.copy()

In [63]:
df_predict['AgeHouseAtSale'] = df_predict['YrSold'] - df_predict['YearBuilt']

df_predict['YearsLastRemodelAtSale']  = df_predict['YrSold'] - df_predict['YearRemodAdd']

df_predict['YearsGarageBuiltAtSale'] = df_predict['YrSold'] - df_predict['GarageYrBlt'] 

df_predict.drop(columns=['Id'],inplace=True)

df_predict['MSSubClass'] = df_predict['MSSubClass'].astype(str)

In [64]:
for var in df_predict.dtypes[df_predict.dtypes == 'int64'].index:
    if df_predict[var].isnull().sum()>0:
        df_predict[var+'_isnull'] = df_predict[var].isnull().astype(int)
        df_predict[var]=df_predict[var].fillna(df_predict[var].mean())

for var in df_predict.dtypes[df_predict.dtypes == 'float64'].index:
    if df_predict[var].isnull().sum()>0:
        df_predict[var+'_isnull'] = df_predict[var].isnull().astype(int)
        df_predict[var]=df_predict[var].fillna(df_predict[var].mean())

df_predict_dummies = pd.get_dummies(df_predict,dummy_na=True,drop_first=False)

In [65]:
_, df_predict_dummies = X.align(df_predict_dummies, axis=1, fill_value=0)

In [66]:
df_predict_dummies.drop(columns=df_predict_dummies.columns[~df_predict_dummies.columns.isin(X.columns)],inplace=True)

In [67]:
df_predict_dummies['intercept']=1

## Predictions

### Statsmodel low regularization

In [34]:
df_predict_raw['SalePrice'] = np.exp(results.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_initial.csv',index=False)

### Elastic Net

In [159]:
df_predict_raw['SalePrice'] =  np.exp(elastic_net_cv.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_elasticnet.csv',index=False)

### Decision Tree

In [160]:
df_predict_raw['SalePrice'] =  np.exp(decision_tree.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_decisiontree.csv',index=False)

### XG Boost

In [68]:
df_predict_raw['SalePrice'] =  np.exp(xg_boost.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_xgboost.csv',index=False)

### Random Forest

In [198]:
df_predict_raw['SalePrice'] =  np.exp(random_forest.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_rf_cv.csv',index=False)

In [201]:
df_predict_raw['SalePrice'] =  np.exp(random_forest_initial.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_rf.csv',index=False)

### Light GBM

In [278]:
df_predict_raw['SalePrice'] =  np.exp(light_gbm.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_light_gbm.csv',index=False)

## Gradient Boosting Regression

In [282]:
df_predict_raw['SalePrice'] =  np.exp(gbr.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_gbr.csv',index=False)

### Stacking Regression

In [279]:
df_predict_raw['SalePrice'] =  np.exp(stacking_reg.predict(df_predict_dummies[X.columns]))

df_predict_raw[['Id','SalePrice']].to_csv('predictions_stacking_reg.csv',index=False)