In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
pd.options.display.max_columns = None

In [2]:
DATA_PATH = os.path.join('data')

In [3]:
def load_training_data(training_path=DATA_PATH):
    cvs_path = os.path.join(training_path, 'train.csv')
    return pd.read_csv(cvs_path)

In [4]:
def load_predicting_data(predicting_path=DATA_PATH):
    cvs_path = os.path.join(predicting_path, 'test.csv')
    return pd.read_csv(cvs_path)

In [8]:
#https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [68]:
training = load_training_data()

In [97]:
training.Condition1.value_counts()

Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: Condition1, dtype: int64

# Feature Imputation: handling missing values

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from scipy.stats import skew

In [118]:
class FeatureImputer(BaseEstimator, TransformerMixin):
    """
    This class fills in missing values.
    """
    def __init__(self, fill_missvals=True):
        self.fill_missvals = fill_missvals
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.fill_missvals:
            X['PoolQC'] = X['PoolQC'].fillna('None')
            X['Alley'] = X['Alley'].fillna('None')
            X['Fence'] = X['Fence'].fillna('None')
            X['FireplaceQu'] = X['FireplaceQu'].fillna('None')
            X['LotFrontage'] = X.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
            X['MSZoning'] = X['MSZoning'].fillna(X['MSZoning'].mode()[0])            
            X["Functional"] = X["Functional"].fillna("Typ")
            X['Electrical'] = X['Electrical'].fillna(X['Electrical'].mode()[0])
            X['KitchenQual'] = X['KitchenQual'].fillna(X['KitchenQual'].mode()[0])
            X['Exterior1st'] = X['Exterior1st'].fillna(X['Exterior1st'].mode()[0])
            X['Exterior2nd'] = X['Exterior2nd'].fillna(X['Exterior2nd'].mode()[0])
            X['SaleType'] = X['SaleType'].fillna(X['SaleType'].mode()[0])
            X['MSSubClass'] = X['MSSubClass'].fillna("None")
            X['Utilities'] = X['Utilities'].fillna(0)
            for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 
                        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','MasVnrType'):
                X[col] = X[col].fillna('None')
            for col in ('GarageArea', 'GarageCars', 'MasVnrArea', 'BsmtFinSF1', 
                        'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
                X[col] = X[col].fillna(0)
            X['MSSubClass'] = X['MSSubClass'].apply(str) 
            X['OverallCond'] = X['OverallCond'].astype(str)       
        return X

In [119]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, additional=1):
        self.additional = additional
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.additional == 1:
            X['HouseAge'] = X['YrSold'] - X['YearBuilt']
            X['BathperRoom'] = (X['FullBath'] + X['HalfBath']*2) / X['TotRmsAbvGrd']
            X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
            X['TotalArea'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'] + X['GarageArea']
            X['TotalPorch'] = X['OpenPorchSF'] + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']
        else:
            X['HouseAge'] = X['YrSold'] - X['YearBuilt']
            X['BathperRoom'] = (X['FullBath'] + X['HalfBath']*2) / X['TotRmsAbvGrd']
            X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
            X['TotalArea'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'] + X['GarageArea']
            X['TotalPorch'] = X['OpenPorchSF'] + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']
        
        return X     

# Categorical transformer

In [120]:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        labels = LabelEncoder()
        
        categorical_attribs = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
                               'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
                               'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                               'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond')
        
        for cat in categorical_attribs:
            X[cat] = labels.fit_transform(X[cat])
            
        return X

# Numerical transformer

In [143]:
class NumericTransformer(BaseEstimator, TransformerMixin):
    """
    This takes care of the skewness.
    """
    def __init__(self, skew=0.75):
        self.skew = skew
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        numerical_attribs = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
                             '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'TotalPorch', 'HouseAge',
                             'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'TotalSF', 'TotalArea']
        X_numeric = X[numerical_attribs]
        skewness = X_numeric.apply(lambda x: skew(x))
        skewness_features = skewness[abs(skewness) >= self.skew].index
        X[skewness_features] = np.log1p(X[skewness_features])
        X = pd.get_dummies(X)
        
        return X

In [137]:
class drop_cols(BaseEstimator, TransformerMixin):
    def __init__(self, remove_cols=True):
        self.remove_cols = remove_cols
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.remove_cols:
            del X['MoSold']
            del X['MiscFeature']
            del X['MiscVal']
            del X['GarageYrBlt']
            del X['Utilities']     
            del X['YearBuilt']
            del X['YearRemodAdd']
            del X['YrSold']
            del X['RoofMatl']
            del X['LandContour']
            del X['Street']
            del X['Condition1']
            del X['Condition2']
            del X['LotConfig']
            del X['PoolArea']
            del X['MSSubClass']
            del X['3SsnPorch']
            del X['PoolQC']
            del X['FireplaceQu']
            del X['LowQualFinSF']
            del X['OverallCond']
            del X['BsmtHalfBath']
            del X['BsmtFinType2']
            del X['LandSlope']
        return X

In [144]:
proprocessing = Pipeline([
    ('missing_val', FeatureImputer()),
    ('feat_eng', FeatureEngineer()),
    ('cat_trans', CategoricalTransformer()),
    ('drop_cols', drop_cols()),
    ('num_trans', NumericTransformer(skew=0.5)),
])

In [145]:
training = load_training_data()

In [146]:
trainWprice = pd.DataFrame(training)
trainNoPriceID = trainWprice.drop(['SalePrice', 'Id'], axis=1)

In [147]:
train_y = trainWprice.SalePrice
train_y_final = np.log(train_y)

## Correlation matrix

In [148]:
df = proprocessing.fit_transform(trainNoPriceID)
df.head()

Unnamed: 0,LotFrontage,LotArea,Alley,LotShape,LandSlope,OverallQual,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,Fence,HouseAge,BathperRoom,TotalSF,TotalArea,TotalPorch,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.189655,9.04204,1,3,0,7,5.283204,2,4,2,4,3,2,6.561031,0.0,5.01728,6.753438,0,1,6.753438,6.751101,7.444833,1,2,1,3,1,2,8,6,0,2,2,548,5,5,2,0.0,4.127134,0.0,0.0,4,1.791759,0.5,7.850493,8.043984,4.127134,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,4.394449,9.169623,1,3,0,6,0.0,3,4,2,4,1,0,6.886532,0.0,5.652489,7.141245,0,1,7.141245,0.0,7.141245,0,2,0,3,1,3,6,6,1,2,2,460,5,5,2,5.700444,0.0,0.0,0.0,4,3.465736,0.333333,7.833996,8.001355,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,4.234107,9.328212,1,0,0,7,5.09375,2,4,2,4,2,2,6.188264,0.0,6.075346,6.82546,0,1,6.82546,6.765039,7.488294,1,2,1,3,1,2,6,6,1,2,2,608,5,5,2,0.0,3.7612,0.0,0.0,4,2.079442,0.666667,7.903596,8.106213,3.7612,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,4.110874,9.164401,1,0,0,7,0.0,3,4,4,1,3,0,5.379897,0.0,6.293419,6.629363,2,1,6.869014,6.629363,7.448916,1,1,0,3,1,2,7,6,1,3,3,642,5,5,2,0.0,3.583519,5.609472,0.0,4,4.521789,0.142857,7.813592,8.044305,5.7301,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,4.442651,9.565284,1,0,0,8,5.860786,2,4,2,4,0,2,6.486161,0.0,6.196444,7.044033,0,1,7.044033,6.960348,7.695758,1,2,1,4,1,2,9,6,1,2,3,836,5,5,2,5.26269,4.442651,0.0,0.0,4,2.197225,0.444444,8.114923,8.338067,4.442651,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [161]:
df.Alley.value_counts()

1    1369
0      50
2      41
Name: Alley, dtype: int64

In [150]:
forcorr = pd.concat([df, train_y_final], axis=1)
corr_matrix = forcorr.corr()
pd.options.display.max_rows = 2000
with open('corr.txt', 'w') as f:
    print(corr_matrix['SalePrice'].sort_values(ascending=False), 'corr.txt', file=f)

In [151]:
scaler = RobustScaler()

In [165]:
dfFinal = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)

In [166]:
dfFinal.head()

Unnamed: 0,LotFrontage,LotArea,Alley,LotShape,LandSlope,OverallQual,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,Fence,HouseAge,BathperRoom,TotalSF,TotalArea,TotalPorch,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.257516,-0.26766,0.0,0.0,0.0,0.5,1.03441,-1.0,0.0,0.0,0.0,0.0,0.0,0.09271,0.0,-0.898157,-0.300181,0.0,0.0,-0.524119,1.024186,0.342891,1.0,0.0,1.0,0.0,0.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.281573,0.0,0.0,0.0,0.0,0.205247,0.0,0.0,0.0,-0.989863,0.555556,0.090814,0.147997,0.047828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.464671,0.029682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-0.4,0.142255,0.0,-0.403505,0.492878,0.0,0.0,0.327547,0.0,-0.327743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.082816,0.0,0.0,0.0,1.11122,-0.769489,0.0,0.0,0.0,-0.06507,0.0,0.049766,0.042002,-0.791023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.100761,0.399288,0.0,-1.0,0.0,0.5,0.997317,-1.0,0.0,0.0,0.0,-1.0,0.0,0.035971,0.0,-0.074216,-0.152897,0.0,0.0,-0.36595,1.026301,0.438896,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.530021,0.0,0.0,0.0,0.0,0.118822,0.0,0.0,0.0,-0.830932,1.111111,0.222946,0.302725,-0.026549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.535329,0.017513,0.0,-1.0,0.0,0.5,0.0,0.0,0.0,1.0,-3.0,0.0,-0.4,-0.087072,0.0,0.095602,-0.553912,0.5,0.0,-0.2703,1.005718,0.35191,1.0,-1.0,0.0,0.0,0.0,-1.0,0.5,0.0,0.0,0.5,1.0,0.670807,0.0,0.0,0.0,0.0,0.076857,5.609472,0.0,0.0,0.51835,-0.634921,-0.001005,0.148795,0.373635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0
4,0.634651,0.951802,0.0,-1.0,0.0,1.0,1.147496,-1.0,0.0,0.0,0.0,-3.0,0.0,0.081314,0.0,0.020085,0.294081,0.0,0.0,0.114059,1.05593,0.89719,1.0,0.0,1.0,1.0,0.0,-1.0,1.5,0.0,0.0,0.0,1.0,1.47412,0.0,0.0,0.0,1.025886,0.279765,0.0,0.0,0.0,-0.765863,0.37037,0.748773,0.879216,0.111958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Models

In [197]:
# define cross validation 
from sklearn.model_selection import cross_val_score
def rmse_cv(model,X,y):
    scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=10)
    #the MSE scores returned is always negative
    rmse = np.sqrt(-scores) 
    return rmse

In [183]:
# Lin reg ALL 14 models HYPERPARAMS NOT optimized
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR

models = [LinearRegression(),
          Ridge(),
          Lasso(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          SVR(),
          LinearSVR(),
          ElasticNet(),
          SGDRegressor(),
          BayesianRidge(),
          KernelRidge(),
          ExtraTreesRegressor()]
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra"]

In [196]:
# Run the models and compare
from operator import itemgetter
ModScores = {}

for name, model in zip(names, models):
    score = rmse_cv(model, dfFinal, train_y_final)
    ModScores[name] = score.mean()
    print("{}: {:.6f}".format(name,score.mean()))

print("trainFinal ", dfFinal.shape)
print("_"*80)
for key, value in sorted(ModScores.items(), key = itemgetter(1), reverse = False):
    print(key, value)

LR: 2656660677.158792
Ridge: 0.135435
Lasso: 0.399228
RF: 0.141533
GBR: 0.131055
SVR: 0.146807




LinSVR: 0.148575
Ela: 0.399228
SGD: 0.364908
Bay: 0.133272
Ker: 0.292826
Extra: 0.135437
trainFinal  (1460, 170)
________________________________________________________________________________
GBR 0.13105468668281942
Bay 0.13327198427710604
Ridge 0.13543543732192193
Extra 0.13543681483505474
RF 0.14153341195073474
SVR 0.14680691551130456
LinSVR 0.14857523804276218
Ker 0.2928258559168698
SGD 0.36490833248101795
Lasso 0.3992282792085989
Ela 0.3992282792085989
LR 2656660677.158792


In [14]:
class grid():
    def __init__(self, model):
        self.model = model
        
    def grid_get(self, X, y, param_grid):
        grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X, y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
        
        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
        print(pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'std_test_score']])

In [15]:
def PlotLearningCurve(estimator, title, X, y, ylim=None, cv=None,
                     n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlable('Training examples')
    plt.ylabel('Error')
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = 1-np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = 1-np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    
    plt.fill_between(train_sizes, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross Validation score')
    
    plt.legend(loc='best')
    return plt

In [18]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        for model in self.models_:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

# Pipeline

In [None]:
pipe = Pipeline([
    ('feature_engineer', feature_engineer()),
    ('add_feature', add_feature(additional=2)),
    ('lab_enc', labelenc()),
    ('drop_cols', drop_cols()),
    ('skew_dummies', skew_dummies(skew=1))
])

## Feature importance

In [189]:
training.loc[:,'HouseAge'] = training.YrSold - training.YearBuilt
training.loc[:,'RemodelAge'] = training.YrSold - training.YearRemodAdd
training.loc[:,'TotalPorchSF'] = training.OpenPorchSF + training.EnclosedPorch + training.ScreenPorch + training['3SsnPorch']
training.loc[:,'TotalBathAbGr'] = training.FullBath + training.HalfBath / 2
#training.loc[:,'BsmtFinPct'] = (training['BsmtFinSF1'] + training['BsmtFinSF2']) *100 / training.TotalBsmtSF

training = training.drop(['SaleCondition', 'SaleType', 'MoSold', 'MiscFeature','MiscVal', 'GarageYrBlt',
                         'YearBuilt', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch', 
                          'YearRemodAdd', 'LowQualFinSF', 'YrSold', 'PoolArea', 'PoolQC', 
                          'FullBath', 'HalfBath', 'BsmtUnfSF'], axis=1)

In [181]:
#pred = load_predicting_data()
from sklearn.model_selection import train_test_split

In [183]:
training.shape

(1460, 65)

In [190]:
num_attribs = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
               'TotalBsmtSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 
               'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 
               'HouseAge', 'RemodelAge', 'TotalPorchSF', 'TotalBathAbGr']
cat_attribs = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
              'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
              'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
              'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 
              'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
              'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
              'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
              'GarageCond', 'PavedDrive', 'Fence']

In [185]:
len(num_attribs)

21

In [186]:
len(cat_attribs)

40

In [35]:



from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin

In [None]:
class MyRegressor(BaseEstimator, RegressorMixin):
    def fit(self, X, y):
        return self
    def predict(self, X):
        return np.mean(X, axis=1)

In [None]:
class MyImputer(BaseEstimator, TransformerMixin):
    def __init__(self, imputer, strategy):
        self.imputer = imputer
        self.strategy = strategy
        
    def fit(self, X, y=None):
        self.imputer = self.imputer(strategy=self.strategy)
        self.imputer.fit(X, y)
        return self
    
    def transform(self, X, *_):
        return self.imputer.transform(X)

In [None]:
imputation = ColumnTransformer(
        [('categorical_imputer', Imputer(), cat_attribs),
         ('numeric_imputer', Imputer(SimpleImputer, strategy='constant', fill_value=None), num_attribs)
        ])

In [None]:
df = pd.DataFrame(imputation.fit_transform(df), columns=df.columns, index=df.index)

In [None]:
pipe = make_pipeline(Imputer(),
                    LogisticRegression())
pipe.fit(X,y)
pipe.predict(X)

In [234]:
num_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value=None),
                    StandardScaler()
)

cat_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='missing'),
                    OneHotEncoder(handle_unknown='ignore')
)

col_transformer = make_column_transformer(
    (cat_transformer, cat_attribs),
    (num_transformer, num_attribs)
)

In [192]:
train_set, val_set = train_test_split(training, test_size=0.3, random_state=42)

In [193]:
train_set.shape

(1022, 66)

# Baseline models with most features

## 1. Linear Regression

In [194]:
y_train = train_set.SalePrice
y_val = val_set.SalePrice

X_train = train_set.drop(['Id', 'SalePrice'], axis=1)
X_val = val_set.drop(['Id', 'SalePrice'], axis=1)

In [195]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,Fence,HouseAge,RemodelAge,TotalPorchSF,TotalBathAbGr
135,20,RL,80.0,10400,Pave,,Reg,Lvl,AllPub,Inside,...,530,TA,TA,Y,98,MnPrv,38,38,0,2.0
1452,180,RM,35.0,3675,Pave,,Reg,Lvl,AllPub,Inside,...,525,TA,TA,Y,0,,1,1,28,1.0
762,60,FV,72.0,8640,Pave,,Reg,Lvl,AllPub,Inside,...,614,TA,TA,Y,169,,1,1,45,2.5
932,20,RL,84.0,11670,Pave,,IR1,Lvl,AllPub,Corner,...,788,TA,TA,Y,0,,1,1,191,2.0
435,60,RL,43.0,10667,Pave,,IR2,Lvl,AllPub,CulDSac,...,550,TA,TA,Y,158,,13,13,61,2.5


In [236]:
df = pd.DataFrame(col_transformer.fit_transform(X_val))

In [237]:
df.head()

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 18)\t1.0\n (0, 21)\t1.0\..."
1,"(0, 5)\t1.0\n (0, 18)\t1.0\n (0, 21)\t1.0\..."
2,"(0, 1)\t1.0\n (0, 19)\t1.0\n (0, 21)\t1.0\..."
3,"(0, 4)\t1.0\n (0, 19)\t1.0\n (0, 21)\t1.0\..."
4,"(0, 0)\t1.0\n (0, 18)\t1.0\n (0, 21)\t1.0\..."


In [103]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [196]:
linReg = LinearRegression()
pipe_linReg = make_pipeline(col_transformer, linReg)
pipe_linReg.fit(X_train, y_train);

In [205]:
predictions = pipe_linReg.predict(X_val)
lin_mse = mean_squared_error(y_val, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

28225.0935633959

In [206]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': predictions})
df.head()

Unnamed: 0,Actual,Predicted
892,154500,156078.808777
1105,325000,364953.319459
413,115000,78383.259069
522,159000,189315.897922
1036,315500,331817.287999


## 2. Decision tree

In [199]:
from sklearn.tree import DecisionTreeRegressor

In [200]:
reg = DecisionTreeRegressor()
pipe_dt = make_pipeline(col_transformer,reg)

In [201]:
pipe_dt.fit(X_train, y_train);

In [202]:
predictions = pipe_dt.predict(X_val)

In [203]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': predictions})
df.head()

Unnamed: 0,Actual,Predicted
892,154500,139400.0
1105,325000,430000.0
413,115000,109900.0
522,159000,210000.0
1036,315500,277500.0


In [204]:
dt_mse = mean_squared_error(y_val, predictions)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

34959.27204090693

## 3. Random Forests

In [207]:
from sklearn.ensemble import RandomForestClassifier

In [208]:
rf = RandomForestClassifier()
pipe_rf = make_pipeline(col_transformer, rf)

In [209]:
pipe_rf.fit(X_train, y_train);
predictions = pipe_rf.predict(X_val)

In [210]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': predictions})
df.head()

Unnamed: 0,Actual,Predicted
892,154500,132500
1105,325000,271000
413,115000,125500
522,159000,131000
1036,315500,265900


In [211]:
dt_mse = mean_squared_error(y_val, predictions)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

39676.31616577201

# Model selection

In [212]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [214]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    #NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]
for classifier in classifiers:
    pipe = make_pipeline(col_transformer, classifier)
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_val, y_val))

KNeighborsClassifier(n_neighbors=3)
model score: 0.007
SVC(C=0.025, probability=True)
model score: 0.014
DecisionTreeClassifier()
model score: 0.000
RandomForestClassifier()
model score: 0.009
AdaBoostClassifier()
model score: 0.014
GradientBoostingClassifier()
model score: 0.014


1. MSSubClass
2. [leave as is] MSZoning
3. [leave as is] Street
4. [leave as is] Alley
6. [leave as is] LandContour
8. [leave as is] LotConfig
9. [leave as is] LandSlope
10. [leave as is] Neighborhood
11. [drop] Condition1, Condition2
12. [leave as is] BldgType
13. HouseStyle
14. YearBuilt
15. YearRemodAdd: convert to binary, yes or no.
16. [leave as is] RoofStyle
17. [leave as is] RoofMatl
18. Exterior1st, Exterior2nd
19. [leave as is] MasVnrType
20. ExterQual
21. ExterCond
22. [leave as is] Foundation
23. BsmtQual
24. BsmtCond
25. BsmtExposure
26. BsmtFinType1, BsmtFinType2
27. Heating
28. HeatingQC
29. [leave as is] CentralAir
30. Electrical
31. KitchenQual
32. Functional
33. FireplaceQu
34. GarageType
35. GarageYrBlt
36. GarageFinish
37. GarageQual
38. GarageCond
39. PavedDrive
40. PoolQC
41. Fence 
44. YrSold
45. SaleType
46. SaleCondition

In [19]:
#class FeatureSelector(BaseEstimator, TransformerMixin):
#    def __init__(self, feature_names):
#        self._feature_names = feature_names
    
#    def fit(self, X, y=None):
#        return self

    #returns a pandas df with only the selected columns
#    def transform(self, X, y=None):
#        return X[self._feature_names]

In [162]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components = 0.999)
#dfFinal = pca.fit_transform(dfFinal)

In [None]:
#import matplotlib.pyplot as plt
#%matplotlib inline
#import seaborn as sns