In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# these are the objects we need to impute missing data
# with sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# to split the datasets
from sklearn.model_selection import train_test_split

In [2]:
# two categorical features plus the target
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'SalePrice',
]

# load only those columns from the csv file
data = pd.read_csv('houseprice.csv', usecols=cols_to_use)
data.head()

Unnamed: 0,BsmtQual,FireplaceQu,SalePrice
0,Gd,,208500
1,Gd,TA,181500
2,Gd,TA,223500
3,TA,Gd,140000
4,Gd,TA,250000


In [3]:
# fraction of missing values in each column
data.isna().mean()

BsmtQual       0.025342
FireplaceQu    0.472603
SalePrice      0.000000
dtype: float64

In [4]:
# let's separate into training and testing set

# first let's drop the target from the feature list.
# We rebuild the list instead of calling .remove(), which raises a
# ValueError if this cell is run a second time
cols_to_use = [col for col in cols_to_use if col != 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    data[cols_to_use],  # just the features
    data['SalePrice'],  # the target
    test_size=0.3,  # the percentage of obs in the test set
    random_state=0)  # for reproducibility

X_train.shape, X_test.shape

((1022, 2), (438, 2))

In [5]:
# fraction of missing values per feature in the train set
X_train.isna().mean()

BsmtQual       0.023483
FireplaceQu    0.467710
dtype: float64

In [6]:
# distinct categories of BsmtQual in the train set (NA included)
X_train.loc[:, 'BsmtQual'].unique()

array(['Gd', 'TA', 'Fa', nan, 'Ex'], dtype=object)

In [7]:
# distinct categories of FireplaceQu in the train set (NA included)
X_train.loc[:, 'FireplaceQu'].unique()

array([nan, 'Gd', 'TA', 'Fa', 'Po', 'Ex'], dtype=object)

In [8]:
# Now we impute the missing values with SimpleImputer

# strategy='constant' replaces every NA with the value passed
# in fill_value, here the string 'Missing'
imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# we fit the imputer to the train set
# with a constant strategy nothing is estimated from the data:
# the imputer simply stores the fill value for each column
imputer.fit(X_train)

SimpleImputer(fill_value='Missing', strategy='constant')

In [9]:
# we can look at the value that will be used to fill each column
# (with strategy='constant' this is simply the fill_value):
imputer.statistics_

array(['Missing', 'Missing'], dtype=object)

In [10]:
# and now we impute the train and test set

# NOTE: the data is returned as a numpy array!!!
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

X_train

array([['Gd', 'Missing'],
       ['Gd', 'Gd'],
       ['TA', 'Missing'],
       ...,
       ['Missing', 'Missing'],
       ['Gd', 'TA'],
       ['Gd', 'Missing']], dtype=object)

In [11]:
# convert the train set back to a dataframe
# (rows are relabelled 0..n-1: the numpy array does not carry
# the original index)

X_train = pd.DataFrame(data=X_train, columns=cols_to_use)
X_train.head()

Unnamed: 0,BsmtQual,FireplaceQu
0,Gd,Missing
1,Gd,Gd
2,TA,Missing
3,TA,Missing
4,TA,Missing


In [12]:
# NA should now appear as the additional category 'Missing'
X_train.loc[:, 'BsmtQual'].unique()

array(['Gd', 'TA', 'Fa', 'Missing', 'Ex'], dtype=object)

In [13]:
# after imputation there should be no missing values left
X_train.isna().mean()

BsmtQual       0.0
FireplaceQu    0.0
dtype: float64

### A MASSIVE NOTE OF CAUTION:

Note that when using SimpleImputer and setting the parameters to:

* strategy='constant'
* fill_value = 'Missing'

If your dataframe contains both numerical and categorical variables, NA in all of them will be replaced by 'Missing'. This converts your numerical variables into categorical ones, which is probably not what you are after.

Most datasets contain both numerical and categorical variables, so you will very likely have to use a column transformer to apply a different imputer to each group of variables.

In [15]:
# let's load the dataset with both numerical and categorical variables

cols_to_use = ['BsmtQual', 'FireplaceQu',                   # categorical
               'LotFrontage', 'MasVnrArea', 'GarageYrBlt',  # numerical
               'SalePrice']                                 # target

# load only those columns from the csv file
data = pd.read_csv('houseprice.csv', usecols=cols_to_use)
data.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [16]:
# let's separate into training and testing set

# first drop the target from the feature list.
# We rebuild the list instead of calling .remove(), which raises a
# ValueError if this cell is run a second time
cols_to_use = [col for col in cols_to_use if col != 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],
                                                    data['SalePrice'],
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [17]:
# fraction of missing values per feature in the train set
X_train.isna().mean()

BsmtQual       0.023483
FireplaceQu    0.467710
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64

In [18]:
# split the features by type: each group gets its own imputation method
features_numeric = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
features_categoric = ['BsmtQual', 'FireplaceQu']

# the column transformer applies each imputer only to the columns
# listed next to it:
# - numeric features: NA replaced by the mean of the column
# - categorical features: NA replaced by the string 'Missing'
preprocessor = ColumnTransformer(
    transformers=[
        (
            'imputer_numeric',
            SimpleImputer(strategy='mean'),
            features_numeric,
        ),
        (
            'imputer_categoric',
            SimpleImputer(strategy='constant', fill_value='Missing'),
            features_categoric,
        ),
    ],
)

In [19]:
# now we fit the preprocessor
# it is fitted on the train set only, so the imputation values
# are learnt without looking at the test set
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('imputer_numeric', SimpleImputer(),
                                 ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),
                                ('imputer_categoric',
                                 SimpleImputer(fill_value='Missing',
                                               strategy='constant'),
                                 ['BsmtQual', 'FireplaceQu'])])

In [20]:
# the (name, imputer, columns) triplets we passed to the column transformer
preprocessor.transformers

[('imputer_numeric',
  SimpleImputer(),
  ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),
 ('imputer_categoric',
  SimpleImputer(fill_value='Missing', strategy='constant'),
  ['BsmtQual', 'FireplaceQu'])]

In [21]:
# values used to fill NA in the numeric features (strategy='mean'),
# one per column, in the order of features_numeric
preprocessor.named_transformers_['imputer_numeric'].statistics_

array([  69.66866747,  103.55358899, 1978.01239669])

In [22]:
# values used to fill NA in the categorical features:
# with strategy='constant' this is just the fill_value
preprocessor.named_transformers_['imputer_categoric'].statistics_

array(['Missing', 'Missing'], dtype=object)

In [23]:
# and now we can impute the data
# remember it returns a numpy array

X_train, X_test = preprocessor.transform(X_train), preprocessor.transform(X_test)

In [24]:
# preview the result as a dataframe
# the output columns follow the order of the transformers:
# numeric features first, then the categorical ones
pd.DataFrame(
    data=X_train,
    columns=[*features_numeric, *features_categoric],
).head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,BsmtQual,FireplaceQu
0,69.6687,573,1998.0,Gd,Missing
1,69.6687,0,1996.0,Gd,Gd
2,50.0,0,1978.01,TA,Missing
3,60.0,0,1939.0,TA,Missing
4,60.0,0,1930.0,TA,Missing


In [25]:
# now we convert the result into a dataframe
# and explore the missing values
# there should be none

# NOTE: the column transformer returns the numeric and the categorical
# features stacked in a single object array, so every column of the new
# dataframe would be of dtype object. We cast the numeric features back
# to float so that they can be used as numbers afterwards.
X_train = pd.DataFrame(
    X_train,
    columns=features_numeric + features_categoric,
).astype({feature: float for feature in features_numeric})

X_train.isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
BsmtQual       0.0
FireplaceQu    0.0
dtype: float64