### Missing value imputation using scikit-learn

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
# Directory that holds train.csv and test.csv. Keeping it in one constant means
# the notebook can be pointed at another copy of the data with a single edit.
DATA_DIR = Path(r"D:\Projects")

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")


In [3]:
# Dimensions of the training frame as (rows, columns).
train.shape

(1460, 81)

In [4]:
# Dimensions of the test frame as (rows, columns).
test.shape

(1459, 80)

In [5]:
# Split the training frame into the target (SalePrice) and the predictors.
# The test frame is copied so later steps work on X_test, not on `test` itself.
y_train = train["SalePrice"]
X_train = train.drop(columns=["SalePrice"])
X_test = test.copy()

In [7]:
# Report the dimensions of each split; every label names the variable it prints.
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (1460, 80)
Shape of y_test: (1460,)
Shape of X_test: (1459, 80)


#### Missing value imputation

In [8]:
# Number of missing entries in each training predictor column.
null_sum = X_train.isna().sum()
null_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [10]:
# find numerical variables which have missing values
num_var = X_train.select_dtypes(include=['int64', 'float64'])
num_var_miss = [var for var in num_var if null_sum[var] >0 ]
num_var_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [11]:
# Object-dtype (categorical) predictors with at least one missing value.
cat_var = X_train.select_dtypes(include=["object"])
cat_var_miss = [col for col in cat_var.columns if null_sum[col] > 0]
cat_var_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [22]:
# Column groups, one per imputation strategy applied further down.
num_var_mean = ["LotFrontage"]                   # filled with the column mean
num_var_median = ["MasVnrArea", "GarageYrBlt"]   # filled with the column median

# Categorical columns filled with their most frequent value.
cat_var_mode = [
    "Alley", "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "Electrical", "FireplaceQu",
]

# Categorical columns filled with the constant label 'missing'.
cat_var_missing = [
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature",
]

In [23]:
# One single-step pipeline per imputation strategy. Every step is named
# "imputer" so the fitted SimpleImputer can be looked up under that name.
num_var_mean_imputer = Pipeline([("imputer", SimpleImputer(strategy="mean"))])
num_var_median_imputer = Pipeline([("imputer", SimpleImputer(strategy="median"))])
cat_var_mode_imputer = Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))])
cat_var_missing_imputer = Pipeline(
    [("imputer", SimpleImputer(strategy="constant", fill_value="missing"))]
)

In [24]:
# Route each column group to its imputer. Columns that appear in no group are
# dropped from the output; remainder="drop" is the default, spelled out here
# so the behaviour is visible to the reader.
preprocessor = ColumnTransformer(
    transformers=[
        ("mean_imputer", num_var_mean_imputer, num_var_mean),
        ("median_imputer", num_var_median_imputer, num_var_median),
        ("mode_imputer", cat_var_mode_imputer, cat_var_mode),
        ("missing_imputer", cat_var_missing_imputer, cat_var_missing),
    ],
    remainder="drop",
)

In [25]:
# Fit every imputer on the training predictors only.
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_imputer',
                                 Pipe

In [27]:
# NOTE(review): this expression only displays the bound method object; nothing
# is transformed here because `transform` is not called.
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_im

In [29]:
# Fill value learned by the mean imputer (one entry per column it handles).
preprocessor.named_transformers_['mean_imputer'].named_steps['imputer'].statistics_

array([70.04995837])

In [30]:
# Cross-check: the mean computed directly with pandas should match the
# statistic stored by the mean imputer.
train['LotFrontage'].mean()

70.04995836802665

In [31]:
# Most frequent value learned for each column handled by the mode imputer,
# in the same order as cat_var_mode.
preprocessor.named_transformers_['mode_imputer'].named_steps['imputer'].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [32]:
# Apply the imputers fitted on X_train to both splits, so the test set is
# filled with values learned from the training data.
X_train_clean = preprocessor.transform(X_train)
X_test_clean = preprocessor.transform(X_test)

In [33]:
# The transformer output is a NumPy object array without column names;
# it is wrapped in a DataFrame in a later cell.
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'missing', 'missing', 'missing'],
       [80.0, 0.0, 1976.0, ..., 'missing', 'missing', 'missing'],
       [68.0, 162.0, 2001.0, ..., 'missing', 'missing', 'missing'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'missing', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'missing', 'missing', 'missing'],
       [75.0, 0.0, 1965.0, ..., 'missing', 'missing', 'missing']],
      dtype=object)

In [37]:
# Fitted transformers as (name, transformer, columns) tuples, including the
# 'remainder' entry that lists the columns left out of the output.
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   

In [39]:
# Wrap the imputed array in a DataFrame. The column names are concatenated in
# the same order the groups were passed to the ColumnTransformer.
X_train_clean_miss_var = pd.DataFrame(
    X_train_clean,
    columns=[*num_var_mean, *num_var_median, *cat_var_mode, *cat_var_missing],
)

In [40]:
# Total number of missing cells left after imputation (expected: 0).
X_train_clean_miss_var.isnull().sum().sum()

0

In [43]:
# Category counts for Alley before imputation.
train["Alley"].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [46]:
# Category counts for Alley after mode imputation: the previously missing rows
# now carry the most frequent value.
X_train_clean_miss_var["Alley"].value_counts()

Grvl    1419
Pave      41
Name: Alley, dtype: int64

In [47]:
# Category counts for MiscFeature after constant imputation: missing rows now
# appear under the explicit 'missing' label.
X_train_clean_miss_var["MiscFeature"].value_counts()

missing    1406
Shed         49
Othr          2
Gar2          2
TenC          1
Name: MiscFeature, dtype: int64

### Categorical variables


#### One-hot encoding / dummy variables

In [49]:
import seaborn as sns

In [50]:
# Load seaborn's example "tips" dataset.
# NOTE(review): load_dataset presumably needs network access (or a local cache) — confirm.
tips_df = sns.load_dataset("tips")

In [51]:
# Preview the first five rows of the tips data.
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
