In [29]:
# Set up code checking
# Set up filepaths
import os
os.chdir(os.path.join(os.path.expanduser('~'), 'kaggle'))

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## Load data into dataframes

In [31]:
train_path = os.path.join('data', 'house-price-data', 'train.csv')
train_data = pd.read_csv(train_path)

test_path = os.path.join('data', 'house-price-data', 'test.csv')
test_data = pd.read_csv(test_path)

## Select numerical and categorical variables, and separate target from features

In [32]:
y = train_data['SalePrice']
train_data.drop(columns=['SalePrice'], inplace=True)

numerical_columns = train_data.select_dtypes(include=np.number).columns.tolist()

numerical_columns.remove('MSSubClass')

categorical_columns = [c for c in train_data.columns if c not in numerical_columns]

print(numerical_columns)
print(categorical_columns)

['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'Gar

## Add missing data col for every column with missing data

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin

# adds additional column which indicates whether data was missing for given feature
class AddMissingIndicator(BaseEstimator, TransformerMixin):
    def __init__(self, numeric_columns):
        self.numeric_columns = numeric_columns
        self.num_cols_with_na = [] 

    def fit(self, X, y=None):
        self.num_cols_with_na = [c for c in self.numeric_columns if X[c].isna().any()]
        return self
    
    def transform(self, X, y=None):
        for c in self.num_cols_with_na:
            missing_col_name = f"{c}_is_missing"
            X[missing_col_name] = np.where(X[c].isna(), 1, 0)
        
        return X 


cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')), 
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

num_pipeline = Pipeline(steps=[
    ('missing_indicator', AddMissingIndicator(numerical_columns)),
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler())
])


preprocessing = ColumnTransformer(transformers=[
    ('numerical', num_pipeline, numerical_columns),
    ('categorical', cat_pipeline, categorical_columns)
])


train_data_transformed = preprocessing.fit_transform(train_data)
print(train_data_transformed.shape)

test_data_transformed = preprocessing.transform(test_data)
print(test_data_transformed.shape)



['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
(1460, 322)
(1459, 322)
