In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
pd.options.display.max_columns = None

In [2]:
DATA_PATH = os.path.join('data')

In [3]:
def load_training_data(training_path=DATA_PATH):
    cvs_path = os.path.join(training_path, 'train.csv')
    return pd.read_csv(cvs_path)

In [4]:
def load_predicting_data(predicting_path=DATA_PATH):
    cvs_path = os.path.join(predicting_path, 'test.csv')
    return pd.read_csv(cvs_path)

In [8]:
#https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [87]:
missing_vals = df.isnull().sum().sort_values(ascending = False)
percent = ( df.isnull().sum()/df.isnull().count() ).sort_values(ascending = False)
missing_df = pd.concat([missing_vals, percent], axis = 1, keys = ["Total", "Percent"])
total_missing = missing_df[missing_df['Total'] > 0]
total_missing

Unnamed: 0,Total,Percent


# Feature Imputation: handling missing values

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from scipy.stats import skew

In [7]:
class FeatureImputer(BaseEstimator, TransformerMixin):
    """
    This class fills in missing values.
    """
    def __init__(self, fill_missvals=True):
        self.fill_missvals = fill_missvals
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.fill_missvals:
            X['PoolQC'] = X['PoolQC'].fillna('None')
            X['Alley'] = X['Alley'].fillna('None')
            X['Fence'] = X['Fence'].fillna('None')
            X['FireplaceQu'] = X['FireplaceQu'].fillna('None')
            X['LotFrontage'] = X.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
            X['MSZoning'] = X['MSZoning'].fillna(X['MSZoning'].mode()[0])            
            X["Functional"] = X["Functional"].fillna("Typ")
            X['Electrical'] = X['Electrical'].fillna(X['Electrical'].mode()[0])
            X['KitchenQual'] = X['KitchenQual'].fillna(X['KitchenQual'].mode()[0])
            X['Exterior1st'] = X['Exterior1st'].fillna(X['Exterior1st'].mode()[0])
            X['Exterior2nd'] = X['Exterior2nd'].fillna(X['Exterior2nd'].mode()[0])
            X['SaleType'] = X['SaleType'].fillna(X['SaleType'].mode()[0])
            X['MSSubClass'] = X['MSSubClass'].fillna("None")
            X['Utilities'] = X['Utilities'].fillna(0)
            for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 
                        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','MasVnrType'):
                X[col] = X[col].fillna('None')
            for col in ('GarageArea', 'GarageCars', 'MasVnrArea', 'BsmtFinSF1', 
                        'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
                X[col] = X[col].fillna(0)
            X['MSSubClass'] = X['MSSubClass'].apply(str) 
            X['OverallCond'] = X['OverallCond'].astype(str)       
        return X

In [8]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, additional=1):
        self.additional = additional
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.additional == 1:
            X['HouseAge'] = X['YrSold'] - X['YearBuilt']
            X['BathperRoom'] = (X['FullBath'] + X['HalfBath']*2) / X['TotRmsAbvGrd']
            X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
            X['TotalArea'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'] + X['GarageArea']
            X['TotalPorch'] = X['OpenPorchSF'] + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']
        else:
            X['HouseAge'] = X['YrSold'] - X['YearBuilt']
            X['BathperRoom'] = (X['FullBath'] + X['HalfBath']*2) / X['TotRmsAbvGrd']
            X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
            X['TotalArea'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'] + X['GarageArea']
            X['TotalPorch'] = X['OpenPorchSF'] + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']
        
        return X     

# Categorical transformer

In [46]:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        labels = LabelEncoder()
        
        categorical_attribs = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
                               'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
                               'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                               'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond')
        
        for cat in categorical_attribs:
            X[cat] = labels.fit_transform(X[cat])
            
        return X

# Numerical transformer

In [32]:
class NumericTransformer(BaseEstimator, TransformerMixin):
    """
    This takes care of the skewness.
    """
    def __init__(self, skew=0.75):
        self.skew = skew
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        numerical_attribs = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
                             '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'TotalPorch', 'HouseAge',
                             'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'TotalSF', 'TotalArea']
        X_numeric = X[numerical_attribs]
        skewness = X_numeric.apply(lambda x: skew(x))
        skewness_features = skewness[abs(skewness) >= self.skew].index
        X[skewness_features] = np.log1p(X[skewness_features])
        X = pd.get_dummies(X)
        
        return X

In [33]:
class drop_cols(BaseEstimator, TransformerMixin):
    def __init__(self, remove_cols=True):
        self.remove_cols = remove_cols
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.remove_cols:
            del X['MoSold']
            del X['MiscFeature']
            del X['MiscVal']
            del X['GarageYrBlt']
            del X['Utilities']     
            del X['YearBuilt']
            del X['YearRemodAdd']
            del X['YrSold']
        return X

In [49]:
proprocessing = Pipeline([
    ('missing_val', FeatureImputer()),
    ('feat_eng', FeatureEngineer()),
    ('cat_trans', CategoricalTransformer()),
    ('drop_cols', drop_cols()),
    ('num_trans', NumericTransformer(skew=0.5)),
])

In [50]:
training = load_training_data()

In [51]:
trainWprice = pd.DataFrame(training)
trainNoPriceID = trainWprice.drop(['SalePrice', 'Id'], axis=1)

In [52]:
df = proprocessing.fit_transform(trainNoPriceID)
df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,HouseAge,BathperRoom,TotalSF,TotalArea,TotalPorch,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,9,4.189655,9.042040,1,1,3,0,7,4,5.283204,2,4,2,4,3,2,6.561031,6,0.000000,5.017280,6.753438,0,1,6.753438,6.751101,0.0,7.444833,1,0,2,1,3,1,2,8,6,0,3,2,2,548,5,5,2,0.000000,4.127134,0.000000,0.0,0.0,0,3,4,1.791759,0.500000,7.850493,8.043984,4.127134,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,4,4.394449,9.169623,1,1,3,0,6,7,0.000000,3,4,2,4,1,0,6.886532,6,0.000000,5.652489,7.141245,0,1,7.141245,0.000000,0.0,7.141245,0,1,2,0,3,1,3,6,6,1,5,2,2,460,5,5,2,5.700444,0.000000,0.000000,0.0,0.0,0,3,4,3.465736,0.333333,7.833996,8.001355,0.000000,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,9,4.234107,9.328212,1,1,0,0,7,4,5.093750,2,4,2,4,2,2,6.188264,6,0.000000,6.075346,6.825460,0,1,6.825460,6.765039,0.0,7.488294,1,0,2,1,3,1,2,6,6,1,5,2,2,608,5,5,2,0.000000,3.761200,0.000000,0.0,0.0,0,3,4,2.079442,0.666667,7.903596,8.106213,3.761200,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,10,4.110874,9.164401,1,1,0,0,7,4,0.000000,3,4,4,1,3,0,5.379897,6,0.000000,6.293419,6.629363,2,1,6.869014,6.629363,0.0,7.448916,1,0,1,0,3,1,2,7,6,1,2,3,3,642,5,5,2,0.000000,3.583519,5.609472,0.0,0.0,0,3,4,4.521789,0.142857,7.813592,8.044305,5.730100,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,9,4.442651,9.565284,1,1,0,0,8,4,5.860786,2,4,2,4,0,2,6.486161,6,0.000000,6.196444,7.044033,0,1,7.044033,6.960348,0.0,7.695758,1,0,2,1,4,1,2,9,6,1,5,2,3,836,5,5,2,5.262690,4.442651,0.000000,0.0,0.0,0,3,4,2.197225,0.444444,8.114923,8.338067,4.442651,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,9,4.143135,8.976894,1,1,3,0,6,4,0.000000,3,4,2,4,3,6,0.000000,6,0.000000,6.860664,6.860664,0,1,6.860664,6.543912,0.0,7.407318,0,0,2,1,3,1,3,7,6,1,5,2,2,460,5,5,2,0.000000,3.713572,0.000000,0.0,0.0,0,3,4,2.197225,0.571429,7.863651,8.026497,3.713572,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1456,4,4.454347,9.486152,1,1,3,0,6,5,4.787492,3,4,2,4,3,0,6.673298,5,5.099866,6.380123,7.341484,4,1,7.637234,0.000000,0.0,7.637234,1,0,2,0,3,1,3,7,2,2,5,3,2,500,5,5,2,5.857933,0.000000,0.000000,0.0,0.0,0,3,2,3.496508,0.285714,8.193124,8.322637,0.000000,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1457,10,4.204693,9.109746,1,1,3,0,7,8,0.000000,0,2,4,1,3,2,5.620401,6,0.000000,6.777647,7.050123,0,1,7.080868,7.050123,0.0,7.758333,0,0,2,0,4,1,2,9,6,2,2,2,1,252,5,5,2,0.000000,4.110874,0.000000,0.0,0.0,0,3,0,4.248495,0.222222,8.158516,8.228177,4.110874,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1458,4,4.234107,9.181735,1,1,3,0,5,5,0.000000,3,4,4,4,2,2,3.912023,5,6.937314,0.000000,6.983790,2,1,6.983790,0.000000,0.0,6.983790,1,0,1,0,2,1,2,5,6,0,3,3,1,240,5,5,2,5.905362,0.000000,4.727388,0.0,0.0,0,3,4,4.110874,0.200000,7.676474,7.781973,4.727388,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [43]:
scaler = RobustScaler()

In [44]:
dfFinal = scaler.fit_transform(df)
dfFinal

array([[ 0.8       ,  0.        , -0.25751586, ...,  0.        ,
         0.        ,  0.        ],
       [-0.2       ,  0.        ,  0.46467123, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.8       ,  0.        , -0.10076115, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  0.        , -0.20448628, ...,  0.        ,
         0.        ,  0.        ],
       [-0.2       ,  0.        , -0.10076115, ...,  0.        ,
         0.        ,  0.        ],
       [-0.2       ,  0.        ,  0.23998376, ...,  0.        ,
         0.        ,  0.        ]])

In [45]:
dfFinal.shape

(1460, 122)

In [127]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.999)

In [128]:
dfFinal = pca.fit_transform(dfFinal)

In [129]:
dfFinal

array([[-2.76060722e+00, -9.26461768e-01, -1.09933686e+00, ...,
        -3.83791536e-03, -3.54537204e-01,  3.53821923e-02],
       [-2.76208340e+00, -8.47457471e-01, -3.91026804e-02, ...,
        -2.57117715e-01,  4.64085436e-01, -4.35142375e-01],
       [-2.75716942e+00, -1.64418452e+00, -9.30299631e-01, ...,
         4.56333588e-01, -2.10026495e-01, -1.50967039e-01],
       ...,
       [-2.73238829e+00, -1.30819147e+00, -3.86058221e-01, ...,
        -4.18348951e-01, -2.45836639e-01, -7.36484015e-01],
       [-2.74650760e+00,  3.16047667e+00,  6.62092243e+00, ...,
         2.08764129e-02, -7.11958369e-01,  3.28266549e-01],
       [-2.74388979e+00,  6.53364031e-01,  5.45341756e+00, ...,
        -6.88941048e-01, -5.89447691e-01, -2.29675144e-02]])

In [100]:
training.SaleCondition.value_counts()

Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: SaleCondition, dtype: int64

In [14]:
class grid():
    def __init__(self, model):
        self.model = model
        
    def grid_get(self, X, y, param_grid):
        grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X, y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
        
        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
        print(pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'std_test_score']])

In [15]:
def PlotLearningCurve(estimator, title, X, y, ylim=None, cv=None,
                     n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlable('Training examples')
    plt.ylabel('Error')
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = 1-np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = 1-np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    
    plt.fill_between(train_sizes, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross Validation score')
    
    plt.legend(loc='best')
    return plt

In [18]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        for model in self.models_:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

# Pipeline

In [None]:
pipe = Pipeline([
    ('feature_engineer', feature_engineer()),
    ('add_feature', add_feature(additional=2)),
    ('lab_enc', labelenc()),
    ('drop_cols', drop_cols()),
    ('skew_dummies', skew_dummies(skew=1))
])

## Feature importance

In [189]:
training.loc[:,'HouseAge'] = training.YrSold - training.YearBuilt
training.loc[:,'RemodelAge'] = training.YrSold - training.YearRemodAdd
training.loc[:,'TotalPorchSF'] = training.OpenPorchSF + training.EnclosedPorch + training.ScreenPorch + training['3SsnPorch']
training.loc[:,'TotalBathAbGr'] = training.FullBath + training.HalfBath / 2
#training.loc[:,'BsmtFinPct'] = (training['BsmtFinSF1'] + training['BsmtFinSF2']) *100 / training.TotalBsmtSF

training = training.drop(['SaleCondition', 'SaleType', 'MoSold', 'MiscFeature','MiscVal', 'GarageYrBlt',
                         'YearBuilt', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch', 
                          'YearRemodAdd', 'LowQualFinSF', 'YrSold', 'PoolArea', 'PoolQC', 
                          'FullBath', 'HalfBath', 'BsmtUnfSF'], axis=1)

In [181]:
#pred = load_predicting_data()
from sklearn.model_selection import train_test_split

In [183]:
training.shape

(1460, 65)

In [190]:
num_attribs = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
               'TotalBsmtSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 
               'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 
               'HouseAge', 'RemodelAge', 'TotalPorchSF', 'TotalBathAbGr']
cat_attribs = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
              'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
              'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
              'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 
              'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
              'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
              'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
              'GarageCond', 'PavedDrive', 'Fence']

In [185]:
len(num_attribs)

21

In [186]:
len(cat_attribs)

40

In [35]:



from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin

In [None]:
class MyRegressor(BaseEstimator, RegressorMixin):
    def fit(self, X, y):
        return self
    def predict(self, X):
        return np.mean(X, axis=1)

In [None]:
class MyImputer(BaseEstimator, TransformerMixin):
    def __init__(self, imputer, strategy):
        self.imputer = imputer
        self.strategy = strategy
        
    def fit(self, X, y=None):
        self.imputer = self.imputer(strategy=self.strategy)
        self.imputer.fit(X, y)
        return self
    
    def transform(self, X, *_):
        return self.imputer.transform(X)

In [None]:
imputation = ColumnTransformer(
        [('categorical_imputer', Imputer(), cat_attribs),
         ('numeric_imputer', Imputer(SimpleImputer, strategy='constant', fill_value=None), num_attribs)
        ])

In [None]:
df = pd.DataFrame(imputation.fit_transform(df), columns=df.columns, index=df.index)

In [None]:
pipe = make_pipeline(Imputer(),
                    LogisticRegression())
pipe.fit(X,y)
pipe.predict(X)

In [234]:
num_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value=None),
                    StandardScaler()
)

cat_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='missing'),
                    OneHotEncoder(handle_unknown='ignore')
)

col_transformer = make_column_transformer(
    (cat_transformer, cat_attribs),
    (num_transformer, num_attribs)
)

In [192]:
train_set, val_set = train_test_split(training, test_size=0.3, random_state=42)

In [193]:
train_set.shape

(1022, 66)

# Baseline models with most features

## 1. Linear Regression

In [194]:
y_train = train_set.SalePrice
y_val = val_set.SalePrice

X_train = train_set.drop(['Id', 'SalePrice'], axis=1)
X_val = val_set.drop(['Id', 'SalePrice'], axis=1)

In [195]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,Fence,HouseAge,RemodelAge,TotalPorchSF,TotalBathAbGr
135,20,RL,80.0,10400,Pave,,Reg,Lvl,AllPub,Inside,...,530,TA,TA,Y,98,MnPrv,38,38,0,2.0
1452,180,RM,35.0,3675,Pave,,Reg,Lvl,AllPub,Inside,...,525,TA,TA,Y,0,,1,1,28,1.0
762,60,FV,72.0,8640,Pave,,Reg,Lvl,AllPub,Inside,...,614,TA,TA,Y,169,,1,1,45,2.5
932,20,RL,84.0,11670,Pave,,IR1,Lvl,AllPub,Corner,...,788,TA,TA,Y,0,,1,1,191,2.0
435,60,RL,43.0,10667,Pave,,IR2,Lvl,AllPub,CulDSac,...,550,TA,TA,Y,158,,13,13,61,2.5


In [236]:
df = pd.DataFrame(col_transformer.fit_transform(X_val))

In [237]:
df.head()

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 18)\t1.0\n (0, 21)\t1.0\..."
1,"(0, 5)\t1.0\n (0, 18)\t1.0\n (0, 21)\t1.0\..."
2,"(0, 1)\t1.0\n (0, 19)\t1.0\n (0, 21)\t1.0\..."
3,"(0, 4)\t1.0\n (0, 19)\t1.0\n (0, 21)\t1.0\..."
4,"(0, 0)\t1.0\n (0, 18)\t1.0\n (0, 21)\t1.0\..."


In [103]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [196]:
linReg = LinearRegression()
pipe_linReg = make_pipeline(col_transformer, linReg)
pipe_linReg.fit(X_train, y_train);

In [205]:
predictions = pipe_linReg.predict(X_val)
lin_mse = mean_squared_error(y_val, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

28225.0935633959

In [206]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': predictions})
df.head()

Unnamed: 0,Actual,Predicted
892,154500,156078.808777
1105,325000,364953.319459
413,115000,78383.259069
522,159000,189315.897922
1036,315500,331817.287999


## 2. Decision tree

In [199]:
from sklearn.tree import DecisionTreeRegressor

In [200]:
reg = DecisionTreeRegressor()
pipe_dt = make_pipeline(col_transformer,reg)

In [201]:
pipe_dt.fit(X_train, y_train);

In [202]:
predictions = pipe_dt.predict(X_val)

In [203]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': predictions})
df.head()

Unnamed: 0,Actual,Predicted
892,154500,139400.0
1105,325000,430000.0
413,115000,109900.0
522,159000,210000.0
1036,315500,277500.0


In [204]:
dt_mse = mean_squared_error(y_val, predictions)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

34959.27204090693

## 3. Random Forests

In [207]:
from sklearn.ensemble import RandomForestClassifier

In [208]:
rf = RandomForestClassifier()
pipe_rf = make_pipeline(col_transformer, rf)

In [209]:
pipe_rf.fit(X_train, y_train);
predictions = pipe_rf.predict(X_val)

In [210]:
df = pd.DataFrame({'Actual': y_val, 'Predicted': predictions})
df.head()

Unnamed: 0,Actual,Predicted
892,154500,132500
1105,325000,271000
413,115000,125500
522,159000,131000
1036,315500,265900


In [211]:
dt_mse = mean_squared_error(y_val, predictions)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

39676.31616577201

# Model selection

In [212]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [214]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    #NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]
for classifier in classifiers:
    pipe = make_pipeline(col_transformer, classifier)
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_val, y_val))

KNeighborsClassifier(n_neighbors=3)
model score: 0.007
SVC(C=0.025, probability=True)
model score: 0.014
DecisionTreeClassifier()
model score: 0.000
RandomForestClassifier()
model score: 0.009
AdaBoostClassifier()
model score: 0.014
GradientBoostingClassifier()
model score: 0.014


1. MSSubClass
2. [leave as is] MSZoning
3. [leave as is] Street
4. [leave as is] Alley
6. [leave as is] LandContour
8. [leave as is] LotConfig
9. [leave as is] LandSlope
10. [leave as is] Neighborhood
11. [drop] Condition1, Condition2
12. [leave as is] BldgType
13. HouseStyle
14. YearBuilt
15. YearRemodAdd: convert to binary, yes or no.
16. [leave as is] RoofStyle
17. [leave as is] RoofMatl
18. Exterior1st, Exterior2nd
19. [leave as is] MasVnrType
20. ExterQual
21. ExterCond
22. [leave as is] Foundation
23. BsmtQual
24. BsmtCond
25. BsmtExposure
26. BsmtFinType1, BsmtFinType2
27. Heating
28. HeatingQC
29. [leave as is] CentralAir
30. Electrical
31. KitchenQual
32. Functional
33. FireplaceQu
34. GarageType
35. GarageYrBlt
36. GarageFinish
37. GarageQual
38. GarageCond
39. PavedDrive
40. PoolQC
41. Fence 
44. YrSold
45. SaleType
46. SaleCondition

In [19]:
#class FeatureSelector(BaseEstimator, TransformerMixin):
#    def __init__(self, feature_names):
#        self._feature_names = feature_names
    
#    def fit(self, X, y=None):
#        return self

    #returns a pandas df with only the selected columns
#    def transform(self, X, y=None):
#        return X[self._feature_names]

In [None]:
#import matplotlib.pyplot as plt
#%matplotlib inline
#import seaborn as sns