In [5]:
# Set up code checking
# Set up filepaths
import os
os.chdir(os.path.join(os.path.expanduser('~'), 'kaggle'))

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## Load data into dataframes

In [7]:
train_path = os.path.join('data', 'house-price-data', 'train.csv')
train_data = pd.read_csv(train_path)

test_path = os.path.join('data', 'house-price-data', 'test.csv')
test_data = pd.read_csv(test_path)

## Select numerical and categorical variables, and separate target from features

In [8]:
y = train_data['SalePrice']
train_data.drop(columns=['SalePrice'], inplace=True)

numerical_columns = train_data.select_dtypes(include=np.number).columns.tolist()

numerical_columns.remove('MSSubClass')

categorical_columns = [c for c in train_data.columns if c not in numerical_columns]

print(numerical_columns)
print(categorical_columns)

['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'Gar

## Add missing data col for every column with missing data

In [10]:

cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='Missing')), 
                                ('OHE'), OneHotEncoder(handle_unknown='ignore')])

num_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# add num_col_name_missing column for every numeric column with missing data
numeric_cols_with_na = [c for c in numerical_columns if train_data[c].isna().any()]
for c in numeric_cols_with_na:
    missing_col_name = f"{c}_missing"
    train_data[missing_col_name] = np.where(train_data[c].isna(), 1, 0)
    test_data[missing_col_name] = np.where(test_data[c].isna(), 1, 0) 


col_transformer = ColumnTransformer([('numerical_columns', num_pipeline, numerical_columns), 
                                    ('categorical_columns', cat_pipeline, categorical_columns)])

train_data_transformed = col_transformer.fit_transform(train_data)

test_data_transformed = col_transformer.transform(test_data)
print(train_data_transformed.shape, test_data_transformed.shape)

TypeError: 'OneHotEncoder' object is not iterable