# House Prices: Advanced Regression Techniques

### Training Goal
* With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this challenges predict the final price of each home.
* 아이오와 주 에임스 시에 있는 주거용 주택의 (거의 모든) 측면을 설명하는 79 가지 설명 변수로 각 주택의 최종 가격을 예측한다.

### 성능 측정 지표
* #### RMSE (Root Mean Squared Error)
$$ \text{RMSE} = \sqrt{\frac{1}{|\hat{R}|} \sum_{\hat{r}_{ui} \in \hat{R}}(r_{ui} - \hat{r}_{ui})^2} $$

In [35]:
import csv
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from statsmodels.graphics import utils
from sklearn.model_selection import KFold
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib
from joblib import dump, load
from collections import defaultdict
from sklearn.model_selection import train_test_split

sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()
%matplotlib inline

pd.options.display.max_columns = 400
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 600
pd.options.display.precision = 10

# Preprocessing

In [55]:
def cleansing_data(train, test):
        
    #outlier 제거
    train = train.drop([524, 1299])
    
    for df in [train, test]:
        for i in df.index:
            if df['MSZoning'][i] == 'C (all)':
                df['MSZoning'][i] = 'C'        

    # 종속변수와 종속변수의 왜도로 인해 log를 취한 종속변수 정의
    sale_price = train.pop('SalePrice')
    sale_price_log = np.log(sale_price)
    
    # 전체 면적을 나타내는 새로운 변수 생성
    for df in [train, test]:
        df['TotalArea'] = df['TotalBsmtSF'].fillna(0) + df['GrLivArea'].fillna(0)

    # 변수의 속성에 따라 구분
    not_num = ['MSSubClass', 'OverallQual', 'OverallCond', 'MoSold']

    num_cols = [num for num in train.keys() if (train[num].dtype == 'int64' or train[num].dtype == 'float64') 
                and (num not in not_num)]
    cat_cols = [cat for cat in train.keys() if cat not in num_cols]

    numcat_data = {
        'LandSlope' : ['Sev', 'Mod', 'Gtl'],
        'ExterQual' : ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'ExterCond' : ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtQual' : ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtCond' : ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtExposure' : ['NA', 'No', 'Mn', 'Av', 'Gd'],
        'BsmtFinType1' : ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
        'BsmtFinType2' : ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
        'HeatingQC' : ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'KitchenQual' : ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'Functional' : ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
        'FireplaceQu' : ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'GarageFinish' : ['NA', 'Unf', 'RFn', 'Fin'],
        'GarageQual' : ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'GarageCond' : ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'PoolQC' : ['NA', 'Fa', 'TA', 'Gd', 'Ex'],
        'OverallQual' : list(range(1, 11)), 
        'OverallCond' : list(range(1, 11)),
        }

    numcat_cols = list(numcat_data.keys())
    dum_cols = [cat for cat in cat_cols if cat not in numcat_cols]
    
    ## NUMERIC FEATURE
    # numeric feature 의 missing value replace 및 missing value 여부를 나타내는 column 생성 (False / True)
    zero = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces', 
        'GarageCars', 'GarageArea', 'PoolArea', 'MiscVal', 'GarageYrBlt', 'MasVnrArea']
    med = [num for num in num_cols if num not in zero]
    
    for df in [train, test]:
        for num in med:
            df['{0}_na'.format(num)] = df[num].isnull()
            df[num] = df[num].fillna(df[num].median())

    for df in [train, test]:
        for num in zero:
            df['{0}_na'.format(num)] = df[num].isnull()
            df[num] = df[num].fillna(0)
            
    # train data 기준으로 상관계수가 0.1 미만인 columns 삭제
    corr_num = pd.concat([train[num_cols], sale_price_log], axis=1).corr()
    under_cols = [i for i in corr_num if abs(corr_num.SalePrice[i]) < 0.1]
    drop_cols = under_cols + [f'{i}_na' for i in under_cols]
    for df in [train, test]:
        df.drop(drop_cols, axis=1, inplace=True)
        
        
    ## ORDINAL FEATURE
    # feature의 처리 과정을 편리하기 위해 type 변경 (category type)
    for df in [train, test]:
        for cat_list in [cat_cols, numcat_cols]:
            for cat in cat_list:
                df[cat] = df[cat].astype('category').cat.as_ordered()
    
    # value 의 범주와 순서 설정
    for df in [train, test]:
        for cat in numcat_data.keys():
            df[cat].cat.set_categories(numcat_data[cat], ordered=True, inplace=True)
    
    # 일부 feature 의 missing value 대체
    for df in [train, test]:
        for i in numcat_cols:
            if 'NA' in numcat_data[i]:
                df[i].fillna('NA')
            elif 'None' in numcat_data[i]:
                df[i].filna('None')

    # 설정한 범주와 순서에 맞게 numeric value 대체
    for df in [train, test]:
        for col in numcat_cols:
            df[col] = df[col].cat.codes + 1 # NaN 값을 0 으로 만들기 위해 +!
            
    # train data 기준으로 상관계수가 0.1 미만인 columns 삭제
    corr_numcat = pd.concat([train[numcat_cols], sale_price_log], axis=1).corr()
    under_cols_numcat = [i for i in corr_numcat if abs(corr_numcat.SalePrice[i]) < 0.1]
    drop_cols_numcat = under_cols_numcat + ['GarageCond']
    for df in [train, test]:
        df.drop(drop_cols_numcat, axis=1, inplace=True)        
            
    ## CATEGORICAL FEATURE (one-hot)       
    # Utilities feature 의 대부분의 class 하나이기 때문에 제외
    for df in [train, test]:
        df.drop('Utilities', axis=1, inplace=True)
    dum_cols.remove('Utilities')
    
    # categorical feature의 범주 정
    cat_classes = {
        'MSSubClass': [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190],
        'LotShape' : ['Reg', 'IR1', 'IR2', 'IR3'],
        'LandContour' : ['Lvl', 'Bnk', 'HLS', 'Low'],
        # 'Utilities' : ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
        'CentralAir': ['N', 'Y'],
        'Electrical' : ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'],
        'MoSold' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'MSZoning' : ['A', 'C', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
        'Street' : ['Grvl', 'Pave'],
        'Alley' : ['Grvl', 'Pave', 'NA'],
        'LotConfig' : ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
        'Neighborhood' : ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NoRidge', 'NPkVill', 'NridgHt', 'NWAmes', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
        'Condition1' : ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'],
        'Condition2' : ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'],
        'BldgType' : ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
        'HouseStyle' : ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],
        'RoofStyle' : ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'],
        'RoofMatl' : ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv', 'WdShake', 'WdShngl'],
        'Exterior1st' : ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing'],
        'Exterior2nd' : ['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng'],
        'MasVnrType' : ['BrkCmn', 'BrkFace', 'CBlock', 'None', 'Stone'],
        'Foundation' : ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'],
        'Heating' : ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
        'GarageType' : ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'NA'],
        'PavedDrive' : ['Y', 'P', 'N'],
        'Fence' : ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA'],
        'MiscFeature' : ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'NA'],
        'SaleType' : ['WD', 'CWD', 'VWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth'],
        'SaleCondition' : ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial']}
        
    #'NA' 혹은 'None' class 가 있는 경우 해당 값으로 대체, 없는 경우 최빈값으로 대체
    for df in [train, test]:
        for i in dum_cols:
            df[i].cat.set_categories(cat_classes[i], ordered=True, inplace=True)

        for i in dum_cols:
            if 'NA' in cat_classes[i]:
                df[i] = df[i].fillna('NA')
            elif 'None'in cat_classes[i]:
                df[i] = df[i].fillna('None')
            else:
                df[i] = df[i].fillna(df[i].mode()[0])
                
    # 분산분석을 사용하여 categorical feature 영향력 파악 및 삭제
    for cat in dum_cols:
        train[cat] = train[cat].astype('object')
        
    anova_cat = pd.concat([train[dum_cols], sale_price], axis=1)
    cats_rs = pd.DataFrame(data=[sm.OLS.from_formula("SalePrice ~ C(" + cat + ")" , anova_cat).fit().rsquared for cat in dum_cols],
                 index=list(dum_cols), columns=['r_squared'])

    train = train.drop(cats_rs[cats_rs.r_squared < 0.1].index, axis=1)
    test = test.drop(cats_rs[cats_rs.r_squared < 0.1].index, axis=1)
    
    for i in cats_rs[cats_rs.r_squared < 0.1].index:
        dum_cols.remove(i)
                
    # pandas의 get_dummies 를 사용하여 one-hot-encoding
    # 위의 단계에서 set_categories를 통해 value의 범주를 지정하였므로 누락 값에 대한 column 또한 생성
    for i in dum_cols:
        train[i] = train[i].astype('category').cat.as_ordered()
        train[i].cat.set_categories(cat_classes[i], ordered=True, inplace=True)
    
    train_dummies = pd.get_dummies(train[dum_cols])
    test_dummies = pd.get_dummies(test[dum_cols])
    
    train.drop(dum_cols, axis=1, inplace=True)
    test.drop(dum_cols, axis=1, inplace=True)
    
    train = pd.concat([train, train_dummies], axis=1)
    train = pd.concat([train, sale_price], axis=1)
    test = pd.concat([test, test_dummies], axis=1)
    
    return train, test

In [56]:
df_train = pd.read_csv('./data/train.csv', index_col='Id')
df_test = pd.read_csv('./data/test.csv', index_col='Id')

In [57]:
train, test = cleansing_data(df_train, df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [58]:
train.head()

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,TotalArea,LotFrontage_na,LotArea_na,YearBuilt_na,YearRemodAdd_na,1stFlrSF_na,2ndFlrSF_na,GrLivArea_na,FullBath_na,HalfBath_na,BedroomAbvGr_na,KitchenAbvGr_na,TotRmsAbvGrd_na,WoodDeckSF_na,OpenPorchSF_na,EnclosedPorch_na,ScreenPorch_na,TotalArea_na,BsmtFinSF1_na,BsmtUnfSF_na,TotalBsmtSF_na,BsmtFullBath_na,Fireplaces_na,GarageCars_na,GarageArea_na,GarageYrBlt_na,MasVnrArea_na,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_A,MSZoning_C,MSZoning_FV,MSZoning_I,MSZoning_RH,MSZoning_RL,MSZoning_RP,MSZoning_RM,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_NPkVill,Neighborhood_NridgHt,Neighborhood_NWAmes,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Other,Exterior1st_Plywood,Exterior1st_PreCast,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_PreCast,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_CBlock,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NA,SaleType_WD,SaleType_CWD,SaleType_VWD,SaleType_New,SaleType_COD,SaleType_Con,SaleType_ConLw,SaleType_ConLI,SaleType_ConLD,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1
1,65.0,8450,7,2003,2003,196.0,4,5,4,2,7,706,150,856,5,856,854,1710,1,2,1,3,1,4,8,8,0,0,2003.0,3,2,548,4,0,61,0,0,2566,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,208500
2,80.0,9600,6,1976,1976,0.0,3,5,4,5,6,978,284,1262,5,1262,0,1262,0,2,0,3,1,3,6,8,1,4,1976.0,3,2,460,4,298,0,0,0,2524,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,181500
3,68.0,11250,7,2001,2002,162.0,4,5,4,3,7,486,434,920,5,920,866,1786,1,2,1,3,1,4,6,8,1,4,2001.0,3,2,608,4,0,42,0,0,2706,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,223500
4,60.0,9550,7,1915,1970,0.0,3,4,5,2,6,216,540,756,4,961,756,1717,1,1,0,3,1,4,7,8,1,5,1998.0,2,3,642,4,0,35,272,0,2473,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,140000
5,84.0,14260,8,2000,2000,350.0,4,5,4,4,7,655,490,1145,5,1145,1053,2198,1,2,1,4,1,4,9,8,1,4,2000.0,3,3,836,4,192,84,0,0,3343,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,250000


In [59]:
dfy = train.pop('SalePrice')
dfX = train.copy()

In [60]:
test.head()

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,TotalArea,LotFrontage_na,LotArea_na,YearBuilt_na,YearRemodAdd_na,1stFlrSF_na,2ndFlrSF_na,GrLivArea_na,FullBath_na,HalfBath_na,BedroomAbvGr_na,KitchenAbvGr_na,TotRmsAbvGrd_na,WoodDeckSF_na,OpenPorchSF_na,EnclosedPorch_na,ScreenPorch_na,TotalArea_na,BsmtFinSF1_na,BsmtUnfSF_na,TotalBsmtSF_na,BsmtFullBath_na,Fireplaces_na,GarageCars_na,GarageArea_na,GarageYrBlt_na,MasVnrArea_na,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_A,MSZoning_C,MSZoning_FV,MSZoning_I,MSZoning_RH,MSZoning_RL,MSZoning_RP,MSZoning_RM,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_NPkVill,Neighborhood_NridgHt,Neighborhood_NWAmes,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Other,Exterior1st_Plywood,Exterior1st_PreCast,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_PreCast,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_CBlock,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NA,SaleType_WD,SaleType_CWD,SaleType_VWD,SaleType_New,SaleType_COD,SaleType_Con,SaleType_ConLw,SaleType_ConLI,SaleType_ConLD,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1
1461,80.0,11622,5,1961,1961,0.0,3,4,4,2,4,468.0,270.0,882.0,3,896,0,896,0.0,1,0,2,1,3,5,8,0,0,1961.0,2,1.0,730.0,4,140,0,0,120,1778.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1462,81.0,14267,6,1958,1958,108.0,3,4,4,2,6,923.0,406.0,1329.0,3,1329,0,1329,0.0,1,1,3,1,4,6,8,0,0,1958.0,2,1.0,312.0,4,393,36,0,0,2658.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1463,74.0,13830,5,1997,1998,0.0,3,5,4,2,7,791.0,137.0,928.0,4,928,701,1629,0.0,2,1,3,1,3,6,8,1,4,1997.0,4,2.0,482.0,4,212,34,0,0,2557.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1464,78.0,9978,6,1998,1998,20.0,3,4,4,2,7,602.0,324.0,926.0,5,926,678,1604,0.0,2,1,3,1,4,7,8,1,5,1998.0,4,2.0,470.0,4,360,36,0,0,2530.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1465,43.0,5005,8,1992,1992,0.0,4,5,4,2,6,263.0,1017.0,1280.0,5,1280,0,1280,0.0,2,0,2,1,4,5,8,0,0,1992.0,3,2.0,506.0,4,0,82,0,144,2560.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [61]:
cat_classes = joblib.load('cat_classes.pkl')

# Modeling

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
cv = KFold(10)

In [22]:
train.shape

(1460, 172)

## 1. Linear Regression

In [116]:
lin_reg = LinearRegression()

In [117]:
lin_reg.fit(dfX, dfy)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [118]:
test_y_hat = lin_reg.predict(test)

In [119]:
id_cols = pd.DataFrame(data=list(df_test.index), columns=['id'])
test_lin_reg = pd.concat([id_cols, pd.DataFrame(data=test_y_hat, columns=['SalePrice'])], axis=1)
test_lin_reg.to_csv('./test_lin_reg.csv', index=False)

* submission score : 0.18857

## 2. XGBRegressor

In [62]:
from xgboost import XGBRegressor

* GridSearchCV 를 통해 best parameter 탐색

In [63]:
xgb_lig = XGBRegressor()

In [64]:
param_grid = {'max_depth' : [3, 6, 9, 12, 15], 'learning_rate' : [0.0001, 0.001, 0.01, 0.1], 
               'n_estimators': [50, 100, 150, 200, 250]}

In [65]:
xgb_gs = GridSearchCV(estimator=xgb_lig, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

In [66]:
%%time
xgb_gs.fit(dfX, dfy)

Wall time: 43min 30s


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 6, 9, 12, 15], 'learning_rate': [0.0001, 0.001, 0.01, 0.1], 'n_estimators': [50, 100, 150, 200, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [67]:
np.sqrt(-(xgb_gs.best_score_))

22879.454709993606

In [96]:
xgb_gs.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 250}

* 확인된 best parameter 를 통해 modeling

In [69]:
%%time
gs_xgb_lig = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=250)
gs_xgb_lig.fit(dfX, dfy)

Wall time: 1.65 s


In [70]:
xgb_pred = gs_xgb_lig.predict(test)

In [71]:
id_cols = pd.DataFrame(data=list(df_test.index), columns=['id'])
test_xgb_reg = pd.concat([id_cols, pd.DataFrame(data=xgb_pred, columns=['SalePrice'])], axis=1)
test_xgb_reg.to_csv('./test_xgb_reg.csv', index=False)

* submission score : 0.13863

## 3. DecisionTreeRegressor

In [72]:
from sklearn.tree import DecisionTreeRegressor

In [73]:
dtree = DecisionTreeRegressor()

In [74]:
param_grid = {'max_depth' : [None, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30], 'max_leaf_nodes' : [None, 5, 50, 500, 5000]}
dtree_gs = GridSearchCV(estimator=dtree, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

In [75]:
%%time
dtree_gs.fit(dfX, dfy)

Wall time: 43.6 s


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30], 'max_leaf_nodes': [None, 5, 50, 500, 5000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [76]:
np.sqrt(-(dtree_gs.best_score_))

33368.53819279646

In [97]:
dtree_gs.best_params_

{'max_depth': 6, 'max_leaf_nodes': None}

In [78]:
%%time
gs_dtree = DecisionTreeRegressor(max_depth=6, max_leaf_nodes=None)
gs_dtree.fit(dfX, dfy)

Wall time: 52 ms


In [79]:
dtree_pred = gs_dtree.predict(test)

In [80]:
test_dtree = pd.concat([id_cols, pd.DataFrame(data=dtree_pred, columns=['SalePrice'])], axis=1)
test_dtree.to_csv('./test_dtree.csv', index=False)

* submission score : 0.19444

## 4. Ridfge & Lasso

In [81]:
from sklearn.linear_model import Ridge, Lasso

In [82]:
las_reg = Lasso()
rid_reg = Ridge()

In [83]:
param_grid = {'alpha' : [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]}
las_gs = GridSearchCV(estimator=las_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=1)
rid_gs = GridSearchCV(estimator=rid_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

In [84]:
%%time
las_gs.fit(dfX, dfy)
rid_gs.fit(dfX, dfy)









Wall time: 31.7 s


In [85]:
print(np.sqrt(-las_gs.best_score_))
print(np.sqrt(-rid_gs.best_score_))
print('='*20)
print(las_gs.best_params_)
print(rid_gs.best_params_)

27116.04273253262
26722.82337588816
{'alpha': 1.0}
{'alpha': 1.0}


In [86]:
%%time
gs_lasso = Lasso()
gs_ridge = Ridge()
gs_lasso.fit(dfX, dfy)
gs_ridge.fit(dfX, dfy)

Wall time: 316 ms




In [87]:
lasso_pred = gs_lasso.predict(test)
ridge_pred = gs_ridge.predict(test)

In [88]:
test_lasso = pd.concat([id_cols, pd.DataFrame(data=lasso_pred, columns=['SalePrice'])], axis=1)
test_ridge = pd.concat([id_cols, pd.DataFrame(data=ridge_pred, columns=['SalePrice'])], axis=1)
test_lasso.to_csv('./test_lasso.csv', index=False)
test_ridge.to_csv('./test_ridge.csv', index=False)

* submission scores
  - Lasso : 0.18283
  - Ridge : 0.17829

## 5. RandomFroestRegressor

In [89]:
from sklearn.ensemble import RandomForestRegressor

In [90]:
RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [91]:
rf_reg = RandomForestRegressor()

In [92]:
param_grid = {'max_depth' : [None, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30], 'max_leaf_nodes' : [None, 5, 50, 500, 5000], 
              'n_estimators': [10, 50, 100, 150, 200, 250]}
rf_reg_gs = GridSearchCV(estimator=rf_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

In [93]:
%%time
rf_reg_gs.fit(dfX, dfy)

Wall time: 2h 57min 37s


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30], 'max_leaf_nodes': [None, 5, 50, 500, 5000], 'n_estimators': [10, 50, 100, 150, 200, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [94]:
np.sqrt(-(rf_reg_gs.best_score_))

25312.590355561842

In [98]:
rf_reg_gs.best_params_

{'max_depth': 30, 'max_leaf_nodes': 500, 'n_estimators': 250}

In [105]:
%%time
gs_rf_reg = RandomForestRegressor(max_depth=30, max_leaf_nodes=500, n_estimators=250)
gs_rf_reg.fit(dfX, dfy)

Wall time: 11.3 s


In [106]:
rf_reg_pred = gs_rf_reg.predict(test)

In [107]:
id_cols = pd.DataFrame(data=list(df_test.index), columns=['id'])
test_rf_reg = pd.concat([id_cols, pd.DataFrame(data=rf_reg_pred, columns=['SalePrice'])], axis=1)
test_rf_reg.to_csv('./test_rf_reg.csv', index=False)

* submission score : 0.14813