In [2]:
# libraries
import numpy as np
import pandas as pd
import pandas_exploration_util.viz.explore as pe
from datetime import date
import math
from scipy.stats import skew
import time
from multiprocessing import Pool
import itertools

pd.set_option('display.max_columns', 500) # So wide dataset don't get cut off
pd.set_option('display.max_rows', 300) # So wide dataset don't get cut off

# regressor modules
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_log_error, mean_squared_error
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline



  return f(*args, **kwds)


ModuleNotFoundError: No module named 'scipy'

In [61]:
# add a few visualization functions to make things a bit easier
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

def xy(colx, agg, colz, hover_display, source, mode = 'markers'):
    # colx : x axis, str
    # colz : y axis, str
    # coly : aggregation function to apply to colz, str {'unaggregated' 'count', 'sum', 'mean', 'std', 'max', 'min'}
    # hover_display: column to display for hover, only for unaggregated, str
    # source: dataframe
    # for a point and click version of this check out: https://github.com/yifeihuang/pandas_exploration_util
    print(
        '{}: {:0.1%} null ({:d} out of {:d})'\
            .format(
                colx
                , source[colx].isnull().sum() / source.shape[0]
                , source[colx].isnull().sum()
                , source.shape[0]
            )
        )
    print(
        '{}: {:0.1%} null ({:d} out of {:d})'\
            .format(
                colz
                , source[colz].isnull().sum() / source.shape[0]
                , source[colz].isnull().sum()
                , source.shape[0]
            )
        )
    
    data = []
    for i in range(len(agg)):
        temp = source
        if(agg[i] == 'unaggregated'):
            grouped = temp.loc[:, [colx, colz]].set_index(colx)
            grouped.columns = pd.MultiIndex.from_product([[colz],[agg[i]]])
        if(agg[i] in ['count', 'sum', 'mean', 'std', 'max', 'min', 'median']):
            grouped = temp.groupby(colx).agg(
                {
                    colz : [agg[i]]
                }
            )
        elif(agg[i] == 'uniques'):
            grouped = temp.groupby(colx).apply(
                lambda g: pd.Series(g[colz].unique().size, index = pd.MultiIndex.from_product([[colz],[agg[i]]]))
            )
        # print(grouped.head())

        if(agg[i] == 'unaggregated'):
            trace = go.Scattergl(
                x = grouped.index,
                y = grouped[colz][agg[i]],
                name = agg[i] + ' of ' + colz + ' vs ' + colx,
                mode = mode[i],
                text = source[hover_display],
                hoverinfo = 'text'
            )

        else:
            trace = go.Scattergl(
                x = grouped.index,
                y = grouped[colz][agg[i]],
                name = agg[i] + ' of ' + colz + ' vs ' + colx,
                mode = mode[i]
            )
        data.append(trace)
        
    layout = go.Layout(
        title=(', ').join(agg) + ' of ' + colz + ' vs ' + colx,
        yaxis=dict(
            title=colz
        ),
        xaxis=dict(
            title=colx
        )
    )
    
    fig = go.Figure(data=data, layout=layout)
    plot_url = py.iplot(fig)
    
def distribution(colx, source):
    print(
        '{}: {:0.1%} null ({:d} out of {:d})'\
            .format(
                colx
                , source[colx].isnull().sum() / source.shape[0]
                , source[colx].isnull().sum()
                , source.shape[0]
            )
        )
    temp = source

    trace = go.Histogram(x=temp[colx],
                    name=colx,
                    marker=dict(
                        color='rgb(49,130,189)')
                )
    layout = go.Layout(
        title='distribution',
        yaxis=dict(
            title='count'
        ),
        xaxis=dict(
            title=colx
        )
    )

    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    plot_url = py.iplot(fig)

# Data Imports

In [652]:
#data imports
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

all_features = pd.concat((train,test), sort=True)
all_features.reset_index()
ntrain = train.shape[0]
#verifying the union worked appropriately
train.shape, test.shape, all_features.shape, all_features[ntrain:].SalePrice.isnull().sum()


((1460, 81), (1459, 80), (2919, 81), 1459)

# Data Exploration

Start with a correlation against the response variable

In [6]:
correl = train.corr()
correl.loc[:, 'SalePrice'].sort_values(ascending = False)


SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

## Visualize the relationship between the response variable `SalesPrice` vs the top most correlated numerical variables

### `OverallQual`
1. Clearly ordinal, going to leave as a numerical value
2. relationship appears to exponential - should explore taking log of both to see if it imporves correlation

In [670]:
xy('OverallQual', ['unaggregated', 'median', 'mean'] ,'SalePrice', hover_display = 'Id', source = all_features, mode = ['markers', 'lines', 'lines'])


OverallQual: 0.0% null (0 out of 2919)
SalePrice: 50.0% null (1459 out of 2919)


Correlation indeed improves with logarithm, from 0.79 to 0.81 by taking the log of SalePrice  
TO DO: take a the log of the `SalePrice` before modeling

In [164]:
# exploring log
all_features['logSalePrice'] = np.log1p(all_features.SalePrice)
all_features['logOverallQual'] = np.log1p(all_features.OverallQual)
disp = all_features[['logSalePrice', 'SalePrice', 'logOverallQual', 'OverallQual']].corr()

all_features.drop(['logOverallQual'], axis = 1, inplace = True)

disp

Unnamed: 0,logSalePrice,SalePrice,logOverallQual,OverallQual
logSalePrice,1.0,0.948374,0.802067,0.817185
SalePrice,0.948374,1.0,0.747192,0.790982
logOverallQual,0.802067,0.747192,1.0,0.985143
OverallQual,0.817185,0.790982,0.985143,1.0


### `GrLivArea`
1. two outliers that really seem to buck the expected trend Id = [524, 1299]
2. TO DO: remove these two training points before modeling

In [139]:
xy('GrLivArea', ['unaggregated'] ,'SalePrice', hover_display = 'Id', source = all_features, mode = ['markers', 'lines', 'lines'])


GrLivArea: 0.0% null (0 out of 2919)
SalePrice: 50.0% null (1459 out of 2919)


Again we see log improving the correlation - this time with log of both

In [87]:
# exploring log
all_features['logGrLivArea'] = np.log1p(all_features.GrLivArea)
disp = all_features[['logSalePrice', 'SalePrice', 'logGrLivArea', 'GrLivArea']].corr()

all_features.drop(['logGrLivArea'], axis = 1, inplace = True)

disp

Unnamed: 0,logSalePrice,SalePrice,logGrLivArea,GrLivArea
logSalePrice,1.0,0.948374,0.730254,0.700927
SalePrice,0.948374,1.0,0.695147,0.708624
logGrLivArea,0.730254,0.695147,1.0,0.969619
GrLivArea,0.700927,0.708624,0.969619,1.0


### `GarageCars`
1. relationship seems to change from positive to negative at the 3 cars boundry. it's possible that other factors that impact price are draggin it down for these example, but because there are so few observations it might be better to lump these into 3+
2. will explore regrouping these into an ordinal variable, such that it's 3+ instead of discrete counts
3. will also explore logs

In [160]:
xy('GarageCars', ['unaggregated', 'mean', 'median'] ,'SalePrice', hover_display = 'Id', source = all_features, mode = ['markers', 'lines', 'lines'])


GarageCars: 0.0% null (1 out of 2919)
SalePrice: 50.0% null (1459 out of 2919)


max correlation comes from ordinal grouped garage cars with log of price

In [88]:
all_features['logGarageCars'] = np.log1p(all_features.GarageCars)
all_features['ordGarageCars'] = all_features.GarageCars.apply(lambda x : x if x<=3 else 3)
all_features['logOrdGarageCars'] = np.log1p(all_features.ordGarageCars)
disp = all_features[['logSalePrice', 'SalePrice', 'logGarageCars', 'ordGarageCars', 'logOrdGarageCars']].corr()

all_features.drop(['logGarageCars', 'ordGarageCars', 'logOrdGarageCars'], axis = 1, inplace = True)

disp

Unnamed: 0,logSalePrice,SalePrice,logGarageCars,ordGarageCars,logOrdGarageCars
logSalePrice,1.0,0.948374,0.641561,0.686674,0.643384
SalePrice,0.948374,1.0,0.576617,0.646652,0.57851
logGarageCars,0.641561,0.576617,1.0,0.974521,0.99845
ordGarageCars,0.686674,0.646652,0.974521,1.0,0.974351
logOrdGarageCars,0.643384,0.57851,0.99845,0.974351,1.0


### `GarageArea`
1. We again see some outliers, but if we take a closer look at the data points
    1. 1062, 1191 have relatively small living area
    2. 1299 is an outliers that we previously identified and are going to remove
    3. 582 was a partial assessment

In [95]:
xy('GarageArea', ['unaggregated'] ,'SalePrice', hover_display = 'Id', source = all_features, mode = ['markers', 'lines', 'lines'])


GarageArea: 0.0% null (1 out of 2919)
SalePrice: 50.0% null (1459 out of 2919)


In [94]:
all_features[all_features.GarageArea >= 1248]

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,Condition1,Condition2,Electrical,EnclosedPorch,ExterCond,ExterQual,Exterior1st,Exterior2nd,Fence,FireplaceQu,Fireplaces,Foundation,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageType,GarageYrBlt,GrLivArea,HalfBath,Heating,HeatingQC,HouseStyle,Id,KitchenAbvGr,KitchenQual,LandContour,LandSlope,LotArea,LotConfig,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MSZoning,MasVnrArea,MasVnrType,MiscFeature,MiscVal,MoSold,Neighborhood,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,RoofMatl,RoofStyle,SaleCondition,SalePrice,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,logSalePrice
581,2042,0,0,,3,1Fam,TA,No,0.0,0.0,Unf,Unf,0.0,0.0,Ex,2042.0,Y,Norm,Norm,SBrkr,0,TA,Ex,VinylSd,VinylSd,,Gd,1,PConc,2,Typ,1390.0,3.0,TA,RFn,TA,Attchd,2009.0,2042,1,GasA,Ex,1Story,582,1,Ex,Lvl,Gtl,12704,Inside,98.0,Reg,0,20,RL,306.0,BrkFace,,0,8,NridgHt,90,5,8,Y,0,,CompShg,Hip,Partial,253293.0,New,0,Pave,8,2042.0,AllPub,0,2008,2009,2009,12.442306
1061,894,0,0,,2,1Fam,TA,No,0.0,0.0,Unf,Unf,0.0,0.0,TA,894.0,Y,Norm,Norm,SBrkr,0,TA,Fa,MetalSd,MetalSd,,,0,CBlock,1,Typ,1248.0,3.0,TA,RFn,TA,Detchd,1994.0,894,0,GasA,TA,1Story,1062,1,TA,Low,Gtl,18000,Inside,120.0,Reg,0,30,C (all),0.0,,Shed,560,8,IDOTRR,20,4,3,Y,0,,CompShg,Gable,Normal,81000.0,ConLD,0,Grvl,6,894.0,AllPub,0,1935,1950,2008,11.302217
1190,1622,0,0,,3,2fmCon,TA,Av,1159.0,0.0,BLQ,Unf,1.0,0.0,TA,90.0,Y,Norm,Norm,SBrkr,0,Gd,TA,MetalSd,MetalSd,,TA,1,CBlock,1,Typ,1356.0,4.0,TA,Fin,TA,2Types,1975.0,1622,0,GasA,Ex,1Story,1191,1,TA,Low,Mod,32463,Inside,,Reg,0,190,RL,149.0,Stone,,0,3,Mitchel,0,4,4,Y,0,,CompShg,Gable,Normal,168000.0,WD,0,Pave,7,1249.0,AllPub,439,1961,1975,2007,12.031725
1298,4692,950,0,,3,1Fam,TA,Gd,5644.0,0.0,GLQ,Unf,2.0,0.0,Ex,466.0,Y,Feedr,Norm,SBrkr,0,TA,Ex,Stucco,Stucco,,Gd,3,PConc,2,Typ,1418.0,2.0,TA,Fin,TA,Attchd,2008.0,5642,1,GasA,Ex,2Story,1299,1,Ex,Bnk,Gtl,63887,Corner,313.0,IR3,0,60,RL,796.0,Stone,,0,1,Edwards,292,5,10,Y,480,Gd,ClyTile,Hip,Partial,160000.0,New,0,Pave,12,6110.0,AllPub,214,2008,2008,2008,11.982935
495,1420,1420,0,,4,1Fam,TA,No,988.0,0.0,ALQ,Unf,0.0,1.0,Gd,432.0,Y,Norm,Norm,SBrkr,0,TA,Gd,Plywood,Plywood,MnPrv,Gd,2,PConc,2,Min2,1314.0,4.0,Gd,Fin,TA,Attchd,1978.0,2840,1,GasA,Ex,2Story,1956,1,Gd,Lvl,Gtl,12511,Corner,,IR1,0,60,RL,168.0,BrkFace,,0,12,NWAmes,16,7,7,Y,0,,WdShake,Mansard,Normal,,WD,208,Pave,8,1420.0,AllPub,0,1978,1978,2008,
611,1254,182,0,Grvl,3,1Fam,TA,No,0.0,0.0,Unf,Unf,0.0,1.0,TA,892.0,Y,Norm,Norm,SBrkr,100,TA,TA,Wd Sdng,Wd Sdng,GdWo,Gd,1,BrkTil,1,Typ,1488.0,4.0,TA,Unf,Fa,Detchd,1968.0,1436,0,GasA,Ex,1.5Fin,2072,1,TA,Lvl,Gtl,10836,Inside,60.0,Reg,0,50,RL,0.0,,,0,6,OldTown,0,5,5,N,0,,CompShg,Gable,Normal,,WD,0,Pave,7,892.0,AllPub,0,1922,1950,2008,
701,2276,0,0,,3,1Fam,TA,Gd,2085.0,0.0,GLQ,Unf,1.0,0.0,Gd,186.0,Y,Norm,Norm,SBrkr,70,TA,Gd,VinylSd,VinylSd,,Gd,2,PConc,2,Typ,1348.0,3.0,TA,RFn,Gd,Attchd,2008.0,2276,0,GasA,Ex,1Story,2162,1,Ex,Lvl,Gtl,11778,Inside,91.0,Reg,0,20,RL,554.0,Stone,,0,6,CollgCr,0,5,9,Y,0,,CompShg,Hip,Abnorml,,WD,255,Pave,7,2271.0,AllPub,0,2008,2008,2008,


Correlation is best between log of price and untransformed GarageArea

In [102]:
all_features['logGarageArea'] = np.log1p(all_features.GarageArea)
disp = all_features[['logSalePrice', 'SalePrice', 'logGarageArea', 'GarageArea']].corr()

all_features.drop(['logGarageArea'], axis = 1, inplace = True)

disp

Unnamed: 0,logSalePrice,SalePrice,logGarageArea,GarageArea
logSalePrice,1.0,0.948374,0.454898,0.650888
SalePrice,0.948374,1.0,0.371597,0.623431
logGarageArea,0.454898,0.371597,1.0,0.719518
GarageArea,0.650888,0.623431,0.719518,1.0


### `TotalBsmtSF`
1. again we see 1299 being an outlier

In [159]:
xy('TotalBsmtSF', ['unaggregated'] ,'SalePrice', hover_display = 'Id', source = all_features, mode = ['markers', 'lines', 'lines'])


TotalBsmtSF: 0.0% null (1 out of 2919)
SalePrice: 50.0% null (1459 out of 2919)


transforming TotalBsmtSF doesn't really yield much gain

In [106]:
all_features['logTotalBsmtSF'] = np.log1p(all_features.TotalBsmtSF)
disp = all_features[['logSalePrice', 'SalePrice', 'logTotalBsmtSF', 'TotalBsmtSF']].corr()

all_features.drop(['logTotalBsmtSF'], axis = 1, inplace = True)

disp

Unnamed: 0,logSalePrice,SalePrice,logTotalBsmtSF,TotalBsmtSF
logSalePrice,1.0,0.948374,0.372838,0.612134
SalePrice,0.948374,1.0,0.325883,0.613581
logTotalBsmtSF,0.372838,0.325883,1.0,0.649423
TotalBsmtSF,0.612134,0.613581,0.649423,1.0


### Additional exploration that I will spare the details of
1. 1stFlrSF - very similar to GrLivArea, consider combining into a totalSF and then calculating a % for the partial #s
2. FullBath - ditto as above
3. TotRmsAbvGrd - trend starts to reverse at 11, for any greater than 11 bucketing 11, effectively changing it into an ordinal variable
4. YearBuilt
5. YearRemodAdd
6. GarageYrBlt - there is one data point in the test set that has a clear error, 2593
7. MasVnrArea - there are nulls that we need to impute
8. Fireplaces - log improves correlation. Test set has samples with 4 fireplaces, but we have no training observations with that value, probably better to bucket ordinally into 3+
9. BsmtFinSF1 - has null value that we need to impute
10. LotFrontage - log improves correlation, need to impute nulls
11. WoodDeckSF
12. 2ndFlrSF
13. OpenPorchSF - log improves correlation
14. BedroomAbvGr - very little correlation, turn it into % bedrooms

In [162]:
xy('Fireplaces', ['unaggregated', 'mean', 'median'] ,'SalePrice', hover_display = 'Id', source = all_features, mode = ['markers', 'lines', 'lines'])


Fireplaces: 0.0% null (0 out of 2919)
SalePrice: 50.0% null (1459 out of 2919)


In [166]:
name = 'WoodDeckSF'
logname = 'log' + name

all_features[logname] = np.log1p(all_features[name])
disp = all_features[['logSalePrice', 'SalePrice', logname, name]].corr()

all_features.drop([logname], axis = 1, inplace = True)

disp

Unnamed: 0,logSalePrice,SalePrice,logWoodDeckSF,WoodDeckSF
logSalePrice,1.0,0.948374,0.343039,0.334135
SalePrice,0.948374,1.0,0.32145,0.324413
logWoodDeckSF,0.343039,0.32145,1.0,0.853057
WoodDeckSF,0.334135,0.324413,0.853057,1.0


# General clean up of nulls

In [None]:
# find all the null columns
def check_df_null(df):
    all_nulls = df.isnull().sum() / df.shape[0] * 100.0
    all_nulls = all_nulls.loc[all_nulls != 0].sort_values(ascending = False)
    for i,v in all_nulls.iteritems():
        print('\'' + str(i) + '\' : \'' +  str(v) + '\'')
#         print('\'{:s} : {:0.2%}'.format(str(i), v))
        
check_df_null(all_features)

In [655]:
all_features[(all_features.Id==524) | (all_features.Id==1299)]

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,Condition1,Condition2,Electrical,EnclosedPorch,ExterCond,ExterQual,Exterior1st,Exterior2nd,Fence,FireplaceQu,Fireplaces,Foundation,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageType,GarageYrBlt,GrLivArea,HalfBath,Heating,HeatingQC,HouseStyle,Id,KitchenAbvGr,KitchenQual,LandContour,LandSlope,LotArea,LotConfig,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MSZoning,MasVnrArea,MasVnrType,MiscFeature,MiscVal,MoSold,Neighborhood,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,RoofMatl,RoofStyle,SaleCondition,SalePrice,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
523,3138,1538,0,,3,1Fam,TA,Gd,2260.0,0.0,GLQ,Unf,1.0,0.0,Ex,878.0,Y,PosN,PosN,SBrkr,0,TA,Ex,CemntBd,CmentBd,,Gd,1,PConc,3,Typ,884.0,3.0,TA,Fin,TA,BuiltIn,2007.0,4676,1,GasA,Ex,2Story,524,1,Ex,Bnk,Gtl,40094,Inside,130.0,IR1,0,60,RL,762.0,Stone,,0,10,Edwards,406,5,10,Y,0,,CompShg,Hip,Partial,184750.0,New,0,Pave,11,3138.0,AllPub,208,2007,2008,2007
1298,4692,950,0,,3,1Fam,TA,Gd,5644.0,0.0,GLQ,Unf,2.0,0.0,Ex,466.0,Y,Feedr,Norm,SBrkr,0,TA,Ex,Stucco,Stucco,,Gd,3,PConc,2,Typ,1418.0,2.0,TA,Fin,TA,Attchd,2008.0,5642,1,GasA,Ex,2Story,1299,1,Ex,Bnk,Gtl,63887,Corner,313.0,IR3,0,60,RL,796.0,Stone,,0,1,Edwards,292,5,10,Y,480,Gd,ClyTile,Hip,Partial,160000.0,New,0,Pave,12,6110.0,AllPub,214,2008,2008,2008


In [656]:
# # remove training outlier [524, 1299]
train = train.drop(train[(train.Id==524) | (train.Id==1299)].index)

feature_fill_ = pd.concat((train,test), sort=True)

# # fix weird data quality issues

#2593 has GarageYrBlt = 2207, where as the year build is 2006, update value
feature_fill_.loc[feature_fill_.Id == 2593, 'GarageYrBlt'] = feature_fill_.loc[feature_fill_.Id == 2593, 'YearBuilt']

#2577 has GarageType = Detched, but every other Garage attribute is null, so it's likely not a real garage
feature_fill_.loc[feature_fill_.Id == 2577, 'GarageType'] = np.nan

# fill the nulls

null_map = {
    'PoolQC' : 'NA' # No pool
    , 'MiscFeature' : 'NA'
    , 'Alley' : 'NA'
    , 'Fence' : 'NA'
    , 'FireplaceQu' : 'NA'
    , 'GarageQual' : 'NA'
    , 'GarageCond' : 'NA'
    , 'GarageFinish' : 'NA'
#     , 'GarageYrBlt' : ? probably want to fill this with the year built
    , 'GarageType' : 'NA'
    , 'BsmtExposure' : 'NA'
    , 'BsmtCond' : 'NA'
    , 'BsmtQual' : 'NA'
    , 'BsmtFinType2' : 'NA'
    , 'BsmtFinType1' : 'NA'
    , 'MasVnrType' : 'None'
    , 'MasVnrArea' : 0
    , 'BsmtFullBath' : 0
    , 'BsmtHalfBath' : 0
#     , 'Utilities' : all_features.Utilities.mode()[0] # can probably drop this
#     , 'Functional' : all_features.Functional.mode()[0] # most common val
#     , 'Electrical' : all_features.Electrical.mode()[0]
    , 'BsmtUnfSF' : 0 # no basement, only 1 sample in the test dataset
    , 'TotalBsmtSF' : 0
    , 'GarageCars' : 0
    , 'BsmtFinSF2' : 0
    , 'BsmtFinSF1' : 0
#     , 'KitchenQual' : all_features.KitchenQual.mode()[0]
    , 'GarageArea' : 0
}

feature_fill_ = feature_fill_.fillna(value = null_map)

# Special treatment

# 'LotFrontage' : '16.649537512846866'
# fill based on the neighborhood median
feature_fill_['LotFrontage'] = feature_fill_.groupby("Neighborhood")["LotFrontage"] \
    .transform(lambda x: x.fillna(x.median()))

#'GarageYrBlt' fill with year built
feature_fill_['GarageYrBlt'] = feature_fill_['GarageYrBlt'].fillna(feature_fill_['YearBuilt'])

# MSZoning, 'Exterior1st', 'Exterior2nd', 'SaleType'
# these all come from 1 or 2 sample in the test set
# fill based on neighborhood mode
for c in ['MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Utilities', 'Functional', 'Electrical', 'KitchenQual']:
    feature_fill_[c] = feature_fill_.groupby("Neighborhood")[c] \
        .transform(lambda x: x.fillna(x.mode()[0]))
    
    
# check again for null
check_df_null(feature_fill_)
feature_fill_.shape, all_features.shape

'SalePrice' : '50.017140898183065'


((2917, 81), (2919, 81))

# Feature engineering

In [657]:
# update ordinal variables with numerical values

# update lot shape into basically regular vs not
map1 = {
    'Reg' : 3
    ,'IR1' : 2
    ,'IR2' : 1
    ,'IR3' : 0
}

for c in ['LotShape']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map1[x])
    
map2 = {
    'Ex': 5
    ,'Gd': 4
    ,'TA': 3
    ,'Fa': 2
    ,'Po': 1
    ,'NA': 0
}

for c in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond'
     , 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond'
     , 'PoolQC', 
    ]:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map2[x])


map3 = {
    'Gd': 4
    ,'Av': 3
    ,'Mn': 2
    ,'No': 1
    ,'NA': 0
}

for c in ['BsmtExposure']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map3[x])

map3 = {
    'GLQ': 6
    ,'ALQ': 5
    ,'BLQ': 4
    ,'Rec': 3
    ,'LwQ': 2
    ,'Unf': 1
    ,'NA': 0
}

for c in ['BsmtFinType1', 'BsmtFinType2']:
# consider adding a feature that is does it have multiple finish types
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map3[x])

map4 = {
    'Typ': 7
    ,'Min1': 6
    ,'Min2': 5
    ,'Mod': 4
    ,'Maj1': 3
    ,'Maj2': 2
    ,'Sev': 1
    ,'Sal': 0
}

for c in ['Functional']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map4[x])

map5 = {
    'Fin' : 3
    ,'RFn' : 2
    ,'Unf' : 1
    ,'NA' : 0
}

for c in ['GarageFinish']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map5[x])


map6 = {
    'Y' : 2
    ,'P' : 1
    ,'N' : 0
}

for c in ['PavedDrive']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map6[x])

map7 = {
    'Pave' : 2
    , 'Grvl' : 1
    , 'NA' : 0
}

for c in ['Alley']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map7[x])

    
map71 = {
    'Pave' : 1
    , 'Grvl' : 0
}

for c in ['Street']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map71[x])
    
map8 = {
    'Gtl' : 2
    ,'Mod' : 1
    ,'Sev' : 0
}

for c in ['LandSlope']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map8[x])
    
map9 = {
    'Y' : 1
    ,'N' : 0
}

for c in ['CentralAir']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map9[x])
    
#Fence
map10 = {
    'GdPrv': 4
    ,'MnPrv': 3
    ,'GdWo': 2
    ,'MnWw': 1
    ,'NA': 0
}

for c in ['Fence']:
    feature_fill_[c] = feature_fill_[c].apply(lambda x : map10[x])

#derive new variables
feature_fill_['TotalSF'] = feature_fill_['TotalBsmtSF'] + feature_fill_['GrLivArea']
feature_fill_['DolPerSF'] = feature_fill_['SalePrice'] / feature_fill_['TotalSF']
feature_fill_['TotalBsmtFinSF'] = feature_fill_['BsmtFinSF1'] + feature_fill_['BsmtFinSF2']
feature_fill_['AbvGrBath'] = feature_fill_['FullBath'] + 0.5 * feature_fill_['HalfBath']
feature_fill_['TotalBath'] = (feature_fill_['FullBath'] + feature_fill_['BsmtFullBath']) \
                            + 0.5 * (feature_fill_['HalfBath'] + feature_fill_['BsmtHalfBath'])

#SF
feature_fill_['pctSFGRLiv'] = feature_fill_['GrLivArea'] / feature_fill_['TotalSF'] * 100.0
feature_fill_['pctSFBsmt'] = feature_fill_['TotalBsmtSF'] / feature_fill_['TotalSF'] * 100.0
feature_fill_['pctSFLowQualFin'] = feature_fill_['LowQualFinSF'] / feature_fill_['TotalSF'] * 100.0
feature_fill_['pctSF1stFlr'] = feature_fill_['1stFlrSF'] / feature_fill_['TotalSF'] * 100.0
feature_fill_['pctSF2ndFlr'] = feature_fill_['2ndFlrSF'] / feature_fill_['TotalSF'] * 100.0

#bathrooms and bedrooms
feature_fill_['pctAbvGrBath'] = feature_fill_['AbvGrBath'] / feature_fill_['TotalBath'] * 100.0
feature_fill_['pctFullBath'] = (feature_fill_['FullBath'] + feature_fill_['BsmtFullBath']) \
                                / feature_fill_['TotalBath']
feature_fill_['pctAbvGrBed'] = feature_fill_['BedroomAbvGr'] / feature_fill_['TotRmsAbvGrd'] * 100.0


#basement
feature_fill_['pctSFBsmtFin'] = feature_fill_.apply(
    lambda x : \
    0 if x['TotalBsmtSF'] == 0 else x['TotalBsmtFinSF'] / x['TotalBsmtSF'] * 100.0 \
    , axis = 1
)
feature_fill_['WgtBsmtFinType'] = feature_fill_.apply(
    lambda x : \
    0 if x['TotalBsmtFinSF'] == 0 \
    else ((x['BsmtFinSF1'] * x['BsmtFinType1'] + x['BsmtFinSF2'] * x['BsmtFinType2']) \
                                / x['TotalBsmtFinSF']) \
    , axis = 1
)


#porch
feature_fill_['totalPorchSF'] = feature_fill_['OpenPorchSF'] + feature_fill_['EnclosedPorch'] \
                                + feature_fill_['3SsnPorch'] + feature_fill_['ScreenPorch']

feature_fill_['pctSFOpenPorch'] = feature_fill_.apply(
    lambda x : \
    0 if x['totalPorchSF'] == 0 else x['OpenPorchSF'] / x['totalPorchSF'] * 100.0 \
    , axis = 1
)
feature_fill_['pctSFEnclosedPorch'] = feature_fill_.apply(
    lambda x : \
    0 if x['totalPorchSF'] == 0 else x['EnclosedPorch'] / x['totalPorchSF'] * 100.0 \
    , axis = 1
)
feature_fill_['pctSF3SsnPorch'] = feature_fill_.apply(
    lambda x : \
    0 if x['totalPorchSF'] == 0 else x['3SsnPorch'] / x['totalPorchSF'] * 100.0 \
    , axis = 1
)
feature_fill_['pctSFScreenPorch'] = feature_fill_.apply(
    lambda x : \
    0 if x['totalPorchSF'] == 0 else x['ScreenPorch'] / x['totalPorchSF'] * 100.0 \
    , axis = 1
)


# group reversing trends into ordinal variables

feature_fill_['ordGarageCars'] = feature_fill_['GarageCars'].apply(lambda x : x if x<=3 else 3)
feature_fill_['ordTotRmsAbvGrd'] = feature_fill_['TotRmsAbvGrd'].apply(lambda x : x if x<=11 else 11)
feature_fill_['ordFireplaces'] = feature_fill_['Fireplaces'].apply(lambda x : x if x<=3 else 3)


# convert neighborhood to numerical using median SalePrice per square ft
neighborhood_map = feature_fill_.groupby('Neighborhood')["DolPerSF"].apply(lambda x: x.median()).to_dict()
feature_fill_['Neighborhood'] = feature_fill_['Neighborhood'].apply(lambda x : neighborhood_map[x])

# pool variables (QC and area) are so sparsely populated, just deriving pool indicator variable might be better
feature_fill_['Pool'] = feature_fill_.apply(lambda x : 0 if (x['PoolArea'] == 0) else 1, axis = 1)

# # take log of things that improved correlation
# for c in ['SalePrice'
#             , 'TotalSF'
#             , 'GrLivArea'
#             , 'ordFireplaces'
#             , 'WoodDeckSF'
#             , 'totalPorchSF'
#             , 'LotFrontage'
#          ]:
#     feature_fill_[c] = np.log1p(feature_fill_[c])

def check_df_skew(df):
    cols = feature_fill_.dtypes
    num_cols = cols[(cols == 'int64') | (cols == 'float64')].index.values.tolist()
    num_cols
    skews = feature_fill_.apply(lambda x : x.skew() if x.name in num_cols else None )\
        .sort_values(ascending = False)
#     for i,v in skews.iteritems():
#         print('\'' + str(i) + '\' : \'' +  str(v) + '\'')
    return skews

# col_skew = check_df_skew(feature_fill_)

# for c in col_skew[abs(col_skew)>=0].index.values:
#     if c in ['Id', 'MoSold', 'YrSold', 'MSSubClass']:
#         pass
#     else:
#         feature_fill_[c] = np.log1p(feature_fill_[c])

for c in [
    'DolPerSF'
    , 'GarageArea'
    , 'BsmtFinSF1'
    , 'MasVnrArea'
    , 'TotalBsmtFinSF'
    , 'WoodDeckSF'
    , 'GrLivArea'
    , 'OpenPorchSF'
    , 'TotalSF'
    , 'LotFrontage'
    , 'LotArea'
    , '1stFlrSF'
    , '2ndFlrSF'
    , 'TotalBsmtSF'
    , 'ScreenPorch'
    , 'PoolArea'
    , '3SsnPorch'
    , 'totalPorchSF'
    , 'MiscVal'
    , 'BsmtFinSF2'
    , 'LowQualFinSF'
    , 'BsmtUnfSF'
]:
    feature_fill_[c] = np.log1p(feature_fill_[c])

# check again for null in case mapping did not work as planned
check_df_null(feature_fill_)

'DolPerSF' : '50.017140898183065'
'SalePrice' : '50.017140898183065'


In [658]:
# correl = feature_fill_.corr()
# correl.loc[:, 'DolPerSF'].sort_values(ascending = False)

feature_fill_.shape

(2917, 105)

In [447]:
name = 'KitchenAbvGr'

xy(name, ['unaggregated', 'mean', 'median'] ,'DolPerSF', hover_display = 'Id', source = feature_fill_, mode = ['markers', 'lines', 'lines'])

feature_fill_['logDolPerSF'] = np.log1p(feature_fill_['DolPerSF'])


logname = 'log' + name

feature_fill_[logname] = np.log1p(feature_fill_[name])
disp = feature_fill_[['logDolPerSF', 'DolPerSF', logname, name]].corr()

feature_fill_.drop([logname, 'logDolPerSF'], axis = 1, inplace = True)

disp

KitchenAbvGr: 0.0% null (0 out of 2915)
DolPerSF: 50.0% null (1457 out of 2915)


Unnamed: 0,logDolPerSF,DolPerSF,logKitchenAbvGr,KitchenAbvGr
logDolPerSF,1.0,0.999176,-0.269986,-0.276254
DolPerSF,0.999176,1.0,-0.264607,-0.270807
logKitchenAbvGr,-0.269986,-0.264607,1.0,0.994572
KitchenAbvGr,-0.276254,-0.270807,0.994572,1.0


In [659]:
# code categorical the features
# feature_fill_ = feature_fill_[feature_col]
# # month and year, MSSubClass are not ordinal variables in this instance
feature_fill_ = pd.get_dummies(feature_fill_, prefix = ['MoSold', 'YrSold', 'MSSubClass'], columns = ['MoSold', 'YrSold', 'MSSubClass'])
# all other object types are converted to dummies
feature_fill_ = pd.get_dummies(feature_fill_)


In [567]:
feature_fill_

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,EnclosedPorch,ExterCond,ExterQual,Fence,FireplaceQu,Fireplaces,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageYrBlt,GrLivArea,HalfBath,HeatingQC,Id,KitchenAbvGr,KitchenQual,LandSlope,LotArea,LotFrontage,LotShape,LowQualFinSF,MasVnrArea,MiscVal,Neighborhood,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,SalePrice,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,TotalSF,DolPerSF,TotalBsmtFinSF,AbvGrBath,TotalBath,pctSFGRLiv,pctSFBsmt,pctSFLowQualFin,pctSF1stFlr,pctSF2ndFlr,pctAbvGrBath,pctFullBath,pctAbvGrBed,pctSFBsmtFin,WgtBsmtFinType,totalPorchSF,pctSFOpenPorch,pctSFEnclosedPorch,pctSF3SsnPorch,pctSFScreenPorch,ordGarageCars,ordTotRmsAbvGrd,ordFireplaces,Pool,MoSold_1,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NA,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,MiscFeature_Gar2,MiscFeature_NA,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Utilities_AllPub,Utilities_NoSeWa
0,6.753438,6.751101,0.000000,0,3,3,1,6.561031,0.000000,6,1,1.0,0.0,4,5.017280,1,0,3,4,0,0,0,2,7,6.308098,2.0,3,2,3,2003.0,7.444833,1,5,1,1,4,2,9.042040,4.189655,3,0.000000,5.283204,0.000000,74.795762,4.127134,5,7,2,0.0,0,208500.0,0.000000,1,8,6.753438,0.000000,2003,2003,7.850493,81.254871,6.561031,2.5,3.5,66.640686,33.359314,0.000000,33.359314,33.281372,71.428571,0.857143,37.500000,82.476636,6.000000,4.127134,100.000000,0.000000,0.000000,0.000000,2.0,8,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,7.141245,0.000000,0.000000,0,3,3,4,6.886532,0.000000,5,1,0.0,1.0,4,5.652489,1,0,3,3,0,3,1,2,7,6.133398,2.0,3,2,3,1976.0,7.141245,0,5,2,1,3,2,9.169623,4.394449,3,0.000000,0.000000,0.000000,78.472514,0.000000,8,6,2,0.0,0,181500.0,0.000000,1,6,7.141245,5.700444,1976,1976,7.833996,71.909667,6.886532,2.0,2.5,50.000000,50.000000,0.000000,50.000000,0.000000,80.000000,0.800000,50.000000,77.496038,5.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.0,6,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
2,6.825460,6.765039,0.000000,0,3,3,2,6.188264,0.000000,6,1,1.0,0.0,4,6.075346,1,0,3,4,0,3,1,2,7,6.411818,2.0,3,2,3,2001.0,7.488294,1,5,3,1,4,2,9.328212,4.234107,2,0.000000,5.093750,0.000000,74.795762,3.761200,5,7,2,0.0,0,223500.0,0.000000,1,6,6.825460,0.000000,2001,2002,7.903596,82.594235,6.188264,2.5,3.5,66.001478,33.998522,0.000000,33.998522,32.002956,71.428571,0.857143,50.000000,52.826087,6.000000,3.761200,100.000000,0.000000,0.000000,0.000000,2.0,6,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
3,6.869014,6.629363,0.000000,0,3,4,1,5.379897,0.000000,5,1,1.0,0.0,3,6.293419,1,272,3,3,0,4,1,1,7,6.466145,3.0,3,1,3,1998.0,7.448916,0,4,4,1,4,2,9.164401,4.110874,2,0.000000,0.000000,0.000000,76.317297,3.583519,5,7,2,0.0,0,140000.0,0.000000,1,7,6.629363,0.000000,1915,1970,7.813592,56.611403,5.379897,1.0,2.0,69.429842,30.570158,0.000000,38.859685,30.570158,50.000000,1.000000,42.857143,28.571429,5.000000,5.730100,11.400651,88.599349,0.000000,0.000000,3.0,7,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
4,7.044033,6.960348,0.000000,0,4,3,3,6.486161,0.000000,6,1,1.0,0.0,4,6.196444,1,0,3,4,0,3,1,2,7,6.729824,3.0,3,2,3,2000.0,7.695758,1,5,5,1,4,2,9.565284,4.442651,2,0.000000,5.860786,0.000000,80.352514,4.442651,5,8,2,0.0,0,250000.0,0.000000,1,9,7.044033,5.262690,2000,2000,8.114923,74.783129,6.486161,2.5,3.5,65.749327,34.250673,0.000000,34.250673,31.498654,71.428571,0.857143,44.444444,57.205240,6.000000,4.442651,100.000000,0.000000,0.000000,0.000000,3.0,9,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
5,6.680855,6.340359,5.771441,0,1,3,1,6.597146,0.000000,6,1,1.0,0.0,4,4.174387,1,0,3,3,3,0,0,1,7,6.175867,2.0,3,1,3,1993.0,7.217443,1,5,6,1,3,2,9.555064,4.454347,2,0.000000,0.000000,6.552508,67.275748,3.433987,5,5,2,0.0,0,143000.0,0.000000,1,5,6.680855,3.713572,1993,1995,7.677400,66.265060,6.597146,1.5,2.5,63.113994,36.886006,0.000000,36.886006,26.227989,60.000000,0.800000,20.000000,91.959799,6.000000,5.860786,8.571429,0.000000,91.428571,0.000000,2.0,5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
6,7.435438,0.000000,0.000000,0,3,3,3,7.222566,0.000000,6,1,1.0,0.0,5,5.762051,1,0,3,4,0,4,1,2,7,6.456770,2.0,3,2,3,2004.0,7.435438,0,5,7,1,4,2,9.218804,4.330733,3,0.000000,5.231109,0.000000,82.566554,4.060443,5,8,2,0.0,0,307000.0,0.000000,1,7,7.430707,5.545177,2004,2005,8.125927,90.828402,7.222566,2.0,3.0,50.118343,49.881657,0.000000,50.118343,0.000000,66.666667,1.000000,42.857143,81.198102,6.000000,4.060443,100.000000,0.000000,0.000000,0.000000,2.0,7,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
7,7.010312,6.891626,0.000000,0,3,3,2,6.756932,3.496508,5,4,1.0,0.0,4,5.379897,1,228,3,3,0,3,2,2,7,6.184149,2.0,3,2,3,1973.0,7.645398,1,5,8,1,3,2,9.247925,4.394449,2,0.000000,5.484797,5.860786,66.618653,5.323010,6,7,2,0.0,0,200000.0,0.000000,1,7,7.010312,5.463832,1973,1973,8.070281,62.558649,6.793466,2.5,3.5,65.373788,34.626212,0.000000,34.626212,30.747576,71.428571,0.857143,42.857143,80.487805,4.964085,6.070738,47.222222,52.777778,0.000000,0.000000,2.0,7,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
8,6.930495,6.624065,0.000000,0,2,3,1,0.000000,0.000000,1,1,0.0,0.0,3,6.859615,1,205,3,3,0,3,2,2,6,6.150603,2.0,3,1,2,1931.0,7.481556,0,4,9,2,3,2,8.719481,3.951244,3,0.000000,0.000000,0.000000,57.485030,0.000000,5,7,2,0.0,0,129900.0,0.000000,1,8,6.859615,4.510860,1931,1950,7.910957,47.652238,0.000000,2.0,2.0,65.077036,34.922964,0.000000,37.490829,27.586207,100.000000,1.000000,25.000000,0.000000,0.000000,5.327876,0.000000,100.000000,0.000000,0.000000,2.0,8,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
9,6.982863,0.000000,0.000000,0,2,3,1,6.747587,0.000000,6,1,1.0,0.0,3,4.948760,1,0,3,3,0,3,2,1,7,5.327876,1.0,3,2,4,1939.0,6.982863,0,5,10,2,3,2,8.912069,3.931826,3,0.000000,0.000000,0.000000,63.839338,1.609438,6,5,2,0.0,0,118000.0,0.000000,1,5,6.899723,0.000000,1939,1950,7.634821,57.059961,6.747587,1.0,2.0,52.079304,47.920696,0.000000,52.079304,0.000000,50.000000,1.000000,40.000000,85.872856,6.000000,1.609438,100.000000,0.000000,0.000000,0.000000,1.0,5,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0


In [660]:
seed = 13

feature_fill_t = feature_fill_[feature_fill_.SalePrice >= 0]

target_col = 'DolPerSF'

feature_col = feature_fill_.columns.values.tolist()

feature_col.remove(target_col)
feature_col.remove('Id')
feature_col.remove('SalePrice')

feature_fill_t.shape, feature_fill_.shape, feature_col

((1458, 253),
 (2917, 253),
 ['1stFlrSF',
  '2ndFlrSF',
  '3SsnPorch',
  'Alley',
  'BedroomAbvGr',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinSF1',
  'BsmtFinSF2',
  'BsmtFinType1',
  'BsmtFinType2',
  'BsmtFullBath',
  'BsmtHalfBath',
  'BsmtQual',
  'BsmtUnfSF',
  'CentralAir',
  'EnclosedPorch',
  'ExterCond',
  'ExterQual',
  'Fence',
  'FireplaceQu',
  'Fireplaces',
  'FullBath',
  'Functional',
  'GarageArea',
  'GarageCars',
  'GarageCond',
  'GarageFinish',
  'GarageQual',
  'GarageYrBlt',
  'GrLivArea',
  'HalfBath',
  'HeatingQC',
  'KitchenAbvGr',
  'KitchenQual',
  'LandSlope',
  'LotArea',
  'LotFrontage',
  'LotShape',
  'LowQualFinSF',
  'MasVnrArea',
  'MiscVal',
  'Neighborhood',
  'OpenPorchSF',
  'OverallCond',
  'OverallQual',
  'PavedDrive',
  'PoolArea',
  'PoolQC',
  'ScreenPorch',
  'Street',
  'TotRmsAbvGrd',
  'TotalBsmtSF',
  'WoodDeckSF',
  'YearBuilt',
  'YearRemodAdd',
  'TotalSF',
  'TotalBsmtFinSF',
  'AbvGrBath',
  'TotalBath',
  'pctSFGRLiv',
  'pctSFB

In [661]:
check_df_null(feature_fill_t)

In [584]:
# add a few utility functions to help with baseline model evaluation
# we will be using the random grid search to optimize the hyper paramters

cv_strat = KFold(n_splits=5, shuffle=True, random_state=seed)

#grid search with default parameters set
def rand_search(est, grid, n_iter = 50, seed = seed):
    return RandomizedSearchCV(
            estimator = est
            , param_distributions = grid
            , n_iter = n_iter
            , cv = cv_strat
            , verbose=1
            , random_state=seed
            , return_train_score=True
#             , scoring = 'neg_mean_squared_log_error' 
            , scoring = 'neg_mean_squared_error'
            #using neg_mean_squared_error, because already log transformed sale price
            , n_jobs = -1)

# the best results from the search 
def print_search_result(search, feature_col):
    print(search.estimator)
    print(
        'best params:'
        , search.cv_results_['params'][search.best_index_] \
        , '\ntraining score:'
        , math.sqrt(-search.cv_results_['mean_train_score'][search.best_index_]) \
        , '\ncv score:'
        , math.sqrt(-search.cv_results_['mean_test_score'][search.best_index_]) \
#         , '\nout of sample score:'
    )

#     pred = search.best_estimator_.predict(oos_df[feature_col])
#     print('rmsle score:', math.sqrt(mean_squared_error(oos_df[target_col], pred)))
     
#     a = zip(feature_col, search.best_estimator_.feature_importances_)
#     b = sorted(list(a), key=lambda x: x[1], reverse = True)
#     print('feature importance') 
#     for f in b:
#         print(f)


In [605]:
# Lasso

lasso = Lasso(random_state=seed, max_iter = 10000)

lasso_rand_grid = {
#         'alpha' : [0.00035,0.0004, 0.00045]
        'alpha' : np.linspace(0.0001, 0.01, 200)
#         'alpha' : np.geomspace(.0001, 0.001, 200)
        , 'selection' : ['random', 'cyclic'] # 
    }

# lasso = Pipeline([
#         ('scale', RobustScaler())
#         , ('regress', Lasso(random_state=seed))
#     ])

# lasso_rand_grid = {
#         'regress__alpha' : [0.103, 0.105, 0.107]
#         , 'regress__selection' : ['random', 'cyclic'] # 
#     }



# print(dt_rand_grid)
#default scoring function here is R^2 - coefficient of determination
lasso_rand = rand_search(est = lasso, grid = lasso_rand_grid, n_iter = 20, seed = seed)
lasso_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = lasso_rand, feature_col = feature_col)
# training score: 0.09918830530562099 
# cv score: 0.11048932814760003

# with scaler
# training score: 0.11235407710835162 
# cv score: 0.12058709784354842

# best params: {'selection': 'random', 'alpha': 0.00035} 
# training score: 0.09992252602383392 
# cv score: 0.11306021959945402

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.3s finished


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=13,
   selection='cyclic', tol=0.0001, warm_start=False)
best params: {'selection': 'random', 'alpha': 0.0003984924623115578} 
training score: 0.0975143600957221 
cv score: 0.11020997514792925


In [608]:
# Ridge

ridge = Ridge(random_state=seed, max_iter = 10000)
ridge_rand_grid = {
#     'alpha' : [5.4,5.5,5.6]
    'alpha' : np.linspace(1, 20, 200)
#     'alpha' : np.geomspace(.1, 100, 200)
}

# ridge = Pipeline([
#         ('scale', RobustScaler())
#         , ('regress', Ridge(random_state=seed))
#     ])

# ridge_rand_grid = {
#     'regress__alpha' : [68,69,70]
# }

# print(dt_rand_grid)
#default scoring function here is R^2 - coefficient of determination
ridge_rand = rand_search(est = ridge, grid = ridge_rand_grid, n_iter =20, seed = seed)
ridge_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = ridge_rand, feature_col = feature_col)
# training score: 0.09759436599440784 
# cv score: 0.11339557514543169

# with scaler
# training score: 0.10978567325647763 
# cv score: 0.12285425900662197

# best params: {'alpha': 5.5} 
# training score: 0.0984853917365448 
# cv score: 0.11610196406483882

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, random_state=13, solver='auto', tol=0.001)
best params: {'alpha': 11.50251256281407} 
training score: 0.09683342964634055 
cv score: 0.11181996505533895


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [610]:
# kernel ridge

kr = KernelRidge()
kr_rand_grid = {
#     'alpha' : [6.85,6.9,6.95]
    'alpha' : np.linspace(5, 15, 100)
#     , 'kernel' : ['linear', 'polynomial']
    , 'kernel' : ['polynomial']
#     , 'gamma' : [0.005, 0.01, 0.0, 0.05]
#     , 'degree' : [1.4,1.5,1.6]
    , 'degree' : np.linspace(1, 2, 10)
#     , 'coef0' : [10,20,30]
    , 'coef0' : np.linspace(30, 50, 10)
}

# kr = Pipeline([
#         ('scale', RobustScaler())
#         , ('regress', KernelRidge())
#     ])

# kr_rand_grid = {
#     'regress__alpha' : [2.5,3,3.5]
#     , 'regress__kernel' : ['linear', 'polynomial']
# #     , 'regress__gamma' : [6,7,8]
# #     , 'regress__degree' : [2,3,4]
# #     , 'regress__coef0' : [1,2,3]
# }

# print(dt_rand_grid)
#default scoring function here is R^2 - coefficient of determination
kr_rand = rand_search(est = kr, grid = kr_rand_grid, n_iter = 50, seed = seed)
kr_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = kr_rand, feature_col = feature_col)
# training score: 0.09544319945150963 
# cv score: 0.11475016941482404

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   12.6s finished


KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)
best params: {'kernel': 'polynomial', 'degree': 1.4444444444444444, 'coef0': 41.111111111111114, 'alpha': 10.656565656565657} 
training score: 0.09531853416059298 
cv score: 0.11219545875779753


In [611]:
# decision tree

dt = DecisionTreeRegressor(random_state=seed)
dt_rand_grid = {
    'max_depth' : np.arange(4,9)
    , 'max_features' : ['auto', 'sqrt', 'log2']
    , 'min_samples_split' : np.arange(2,20)
    , 'min_samples_leaf' : np.arange(1,10)
}

dt_rand = rand_search(est = dt, grid = dt_rand_grid, n_iter = 400, seed = seed)
dt_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = dt_rand, feature_col = feature_col)

# training score: 0.11563502843627028 
# cv score: 0.17263120201111165

# best params: {'min_samples_split': 11, 'min_samples_leaf': 8, 'max_features': 'auto', 'max_depth': 6} 
# training score: 0.12715027358959455 
# cv score: 0.15479876159714254

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    0.7s


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=13, splitter='best')
best params: {'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 'auto', 'max_depth': 6} 
training score: 0.12715027358959455 
cv score: 0.15479876159714254


[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:    4.9s finished


In [631]:
# ada boosted feature fill

ada = AdaBoostRegressor(base_estimator = dt_rand.best_estimator_, random_state=seed)
ada_rand_grid = {
    'learning_rate' : np.linspace(0.1, 0.2, 20)
    , 'loss' : ['square']
    , 'n_estimators' : [1000,2000,3000]
}

#default scoring function here is R^2 - coefficient of determination
ada_rand = rand_search(est = ada, grid = ada_rand_grid, n_iter = 30, seed = seed)

ada_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = ada_rand, feature_col = feature_col)
# training score: 0.06766563347586665 
# cv score: 0.12865505233233854

# best params: {'n_estimators': 650, 'loss': 'square', 'learning_rate': 1.3} 
# training score: 0.08352147996040646 
# cv score: 0.1296019276925901

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 19.5min finished


AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=6, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=8,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           presort=False, random_state=13, splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=50,
         random_state=13)
best params: {'n_estimators': 2000, 'loss': 'square', 'learning_rate': 0.17368421052631577} 
training score: 0.07997425834641354 
cv score: 0.1310440312566406


In [632]:
# gradient boosted tree
# gradient boosted tree is a technique that builds an additive model in a forward stage-wise fashion
# In each stage a regression tree is fit on the negative gradient of the given loss function.

gbt = GradientBoostingRegressor(random_state=seed)
gbt_rand_grid = {
    'max_depth' : np.arange(1,6)
    , 'max_features' : ['sqrt']
#     , 'max_features' : ['sqrt', 'auto', 'log2']
    , 'min_samples_split' : np.arange(9,14)
    , 'min_samples_leaf' : np.arange(1,6)
    , 'n_estimators' : [800, 1000]
    , 'learning_rate' : np.linspace(0.06, 0.1, 20)
    , 'loss' : ['huber']
    , 'alpha' : [0.95]
}

gbt_rand = rand_search(est = gbt, grid = gbt_rand_grid, n_iter = 200, seed = seed)

gbt_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = gbt_rand, feature_col = feature_col)

# training score: 0.06693443372154344 
# cv score: 0.11094374231036692

# best params: {'n_estimators': 600, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 5, 'loss': 'huber', 'learning_rate': 0.02, 'alpha': 0.95} 
# training score: 0.06074964493547424 
# cv score: 0.11525645038771415

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  7.1min finished


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=13,
             subsample=1.0, verbose=0, warm_start=False)
best params: {'n_estimators': 800, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 3, 'loss': 'huber', 'learning_rate': 0.06, 'alpha': 0.95} 
training score: 0.058614300350258254 
cv score: 0.110450633856671


In [633]:
# xgboost

xgb = XGBRegressor(silent=1, booster='gbtree', n_jobs=-1, random_state=seed)

xgb_rand_grid = {
    'max_depth' : np.arange(4,9)
    , 'learning_rate' : np.linspace(0.015, 0.019, 10)
    , 'colsample_bytree' : np.linspace(0.75, 0.95, 10)
    , 'gamma' : np.linspace(0.009, 0.013, 10)
    , 'reg_alpha' : np.linspace(0.45, 0.85, 10)
    , 'reg_lambda' : np.linspace(0, 0.4, 10)
    , 'n_estimators' : [800,1000]
}

xgb_rand = rand_search(est = xgb, grid = xgb_rand_grid, n_iter = 200, seed = seed)

xgb_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = xgb_rand, feature_col = feature_col)
# training score: 0.047694021229689305 
# cv score: 0.11453948576542947

# best params: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.0075, 'gamma': 0.0175, 'colsample_bytree': 0.5} 
# training score: 0.041084841144068324 
# cv score: 0.11928538351073462

# best params: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0.025, 'colsample_bytree': 0.5} 
# training score: 0.049743030590073505 
# cv score: 0.11585700401428557

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 45.4min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 58.1min finished


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=13,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
       subsample=1)
best params: {'reg_lambda': 0.26666666666666666, 'reg_alpha': 0.49444444444444446, 'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.016777777777777777, 'gamma': 0.011222222222222222, 'colsample_bytree': 0.8166666666666667} 
training score: 0.05873139498124729 
cv score: 0.1136810597777444


In [634]:
# random forest

rf = RandomForestRegressor(random_state=seed)
rf_rand_grid = {
#     'max_depth' : [10,24,26,28]
#     , 'min_samples_split' : [2, 4, 6]
#     , 'min_samples_leaf' : [1, 2, 3]
#     , 'n_estimators' : [200, 300, 400]
#     , 'max_features' : ['auto', 'sqrt', 'log2']
    
    'max_depth' : np.arange(20,29)
    , 'max_features' : ['sqrt']
#     , 'max_features' : ['sqrt', 'auto', 'log2']
    , 'min_samples_split' : np.arange(5,10)
    , 'min_samples_leaf' : np.arange(1,4)
    , 'n_estimators' : [800,1000]
}

# print(dt_rand_grid)
#default scoring function here is R^2 - coefficient of determination
rf_rand = rand_search(est = rf, grid = rf_rand_grid, n_iter = 200, seed = seed)

rf_rand.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
print_search_result(search = rf_rand, feature_col = feature_col)

# training score: 0.059829848792990054 
# cv score: 0.12768469464789087

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 10.6min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=13, verbose=0, warm_start=False)
best params: {'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 26} 
training score: 0.06396811178993972 
cv score: 0.13353026065389176


In [635]:
# testing out a stacking model

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=seed)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X.iloc[train_index], y.iloc[train_index])
                y_pred = instance.predict(X.iloc[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

class StackingAveragedModels_error(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.error_models_ = list()
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=seed)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        out_of_fold_error = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X.iloc[train_index], y.iloc[train_index])
                y_pred = instance.predict(X.iloc[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
            instance2 = clone(model)
            self.error_models_.append(instance2)
            instance2.fit(X, (out_of_fold_predictions[:, i] - y.values))
            error_pred = instance2.predict(X)
            out_of_fold_error[:, i] = error_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(np.column_stack([out_of_fold_predictions, out_of_fold_error]), y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_preds = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        meta_error = np.column_stack([model.predict(X) for model in self.error_models_ ])
        return self.meta_model_.predict(np.column_stack([meta_preds, meta_error]))    
    
class StackingAveragedModels_parallel(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    def generate_out_of_fold(self, params):
        t_index = params[0][0]
        oof_index = params[0][1]
        model_num = params[1][0]
        model_ins = params[1][1]
        instance = clone(model_ins)
        instance.fit(self.X.iloc[t_index], self.y.iloc[t_index])
        y_pred = instance.predict(self.X.iloc[oof_index])
        
        d = {
#             'train_index' : t_index
            'oof_index' : oof_index
            , 'model_num' : model_num
            , 'model_ins' : instance
            , 'y_pred' : y_pred
        }
        
        return d
            
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.X = X
        self.y = y
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=seed)
        pool = Pool(8)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        combos = itertools.product(kfold.split(self.X, self.y), enumerate(base_models))
        list_of_data = pool.map(self.generate_out_of_fold, combos)
        for d in list_of_data:
            self.base_models_[d['model_num']].append(d['model_ins'])
            out_of_fold_predictions[d['oof_index'], d['model_num']] = d['y_pred']
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [668]:

best_gbt = gbt_rand.best_estimator_

best_ada = ada_rand.best_estimator_

best_rf = rf_rand.best_estimator_

best_lasso = lasso_rand.best_estimator_
best_ridge = ridge_rand.best_estimator_
best_kr = kr_rand.best_estimator_
best_xgb = xgb_rand.best_estimator_


t0 = time.time()

stack = StackingAveragedModels(
                    base_models = (
                        best_gbt
                        , best_ada
                        , best_rf
                        , best_lasso
                        , best_ridge
                        , best_kr
                        , best_xgb
                    ),
                    meta_model = best_lasso
)

score = np.sqrt(-cross_val_score(stack
                               , feature_fill_t[feature_col]
                               , feature_fill_t[target_col]
                               , scoring="neg_mean_squared_error"
                               , cv = cv_strat
                                , n_jobs = -1
                              )
             )

print("Stacked Model CV score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

t1 = time.time()

print("Took {:.1f}s".format(t1-t0))

# Stacked Model CV score: 0.1079 (0.0078)

# Took 41.1s

# implemented a parallel class, but it was more efficient to just use the parallelization of the cross_val_score method

Stacked Model CV score: 0.1067 (0.0099)

Took 326.4s


In [669]:
# final training and generating submission
# feature_fill_[:ntrain][target_col].isnull().sum()
stack.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
test_features = feature_fill_[feature_fill_.SalePrice.isnull()]
test_features['log_pred'] = stack.predict(test_features[feature_col])
test_features['SalePrice'] = test_features.log_pred.apply(lambda x : np.expm1(x)) \
    * test_features.TotalSF.apply(lambda x : np.expm1(x))
test_features[['Id', 'SalePrice']].to_csv('submission5.csv',index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [667]:
# final training and generating submission
# feature_fill_[:ntrain][target_col].isnull().sum()
best_gbt.fit(feature_fill_t[feature_col], feature_fill_t[target_col])
test_features = feature_fill_[feature_fill_.SalePrice.isnull()]
test_features['log_pred'] = best_gbt.predict(test_features[feature_col])
test_features['SalePrice'] = test_features.log_pred.apply(lambda x : np.expm1(x)) \
    * test_features.TotalSF.apply(lambda x : np.expm1(x))
test_features[['Id', 'SalePrice']].to_csv('submission4.csv',index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [639]:
feature_fill_.shape

(2915, 253)