In [1]:
# Load the training CSV into a DataFrame and preview its first five rows
import pandas as pd

train_df = pd.read_csv(filepath_or_buffer='train_u6lujuX_CVtuZ9i.csv')
train_df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [11]:
# (number of rows, number of columns) of the loaded frame
train_df.shape

(614, 13)

In [8]:
# Count the missing entries in every column, shown as a one-column table
train_df.isna().sum().to_frame()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [9]:
# Grand total of missing cells across the whole frame
print(train_df.isna().sum().sum())

149


In [None]:
# Alternative to imputing: drop every row that contains at least one missing value.
# df = train_df.dropna(axis=0)
# df.isnull().sum()

In [None]:
# Alternative to imputing: drop an entire column (here 'Dependents').
# df = train_df.drop(['Dependents'],axis=1)
# df.isnull().sum()

#### Imputing the Missing Value
There are different ways of replacing the missing values. You can use the Python libraries pandas and scikit-learn as follows:

##### Replacing With Arbitrary Value
If you can make an educated guess about the missing value then you can replace it with some arbitrary value using the following code.

Ex. In the following code, we are replacing the missing values of the ‘Dependents’ column with ‘0’.

In [None]:
# Replace the missing values of 'Dependents' with '0' using the `fillna` method.
# The fill value is the string '0', not the integer 0: the frame preview shows
# this column without a decimal point even though it contains missing values,
# so it appears to hold text labels, and an integer fill would leave the
# column with mixed types.
train_df['Dependents'] = train_df['Dependents'].fillna('0')
# Confirm that no missing values remain in the column (expected: 0)
train_df['Dependents'].isnull().sum()

#### Replacing With Mean
This is the most common method of imputing missing values of numeric columns. If there are outliers then the mean will not be appropriate. In such cases, outliers need to be treated first.
You can use the ‘fillna’ method for imputing the columns ‘LoanAmount’ and ‘Credit_History’ with the mean of the respective column values.

In [12]:
# Mean loan amount; this is the value used below to fill the gaps in 'LoanAmount'
train_df['LoanAmount'].mean()

146.41216216216216

In [13]:
# Mean of 'Credit_History'; this is the value used below to fill its gaps
train_df['Credit_History'].mean()

0.8421985815602837

In [None]:
# Replace the missing values of the numerical columns with the column mean
train_df['LoanAmount'] = train_df['LoanAmount'].fillna(train_df['LoanAmount'].mean())
# NOTE(review): the Credit_History mean shown earlier (~0.84) differs from the
# values visible in the data preview (1.0) — the column looks like a flag, so
# confirm that mean imputation is really intended here.
train_df['Credit_History'] = train_df['Credit_History'].fillna(train_df['Credit_History'].mean())

#### Replacing With Mode
Mode is the most frequently occurring value. It is used in the case of categorical features.
You can use the ‘fillna’ method for imputing the categorical columns ‘Gender’, ‘Married’, and ‘Self_Employed’.

In [16]:
# Show the most frequent category of each column that will be mode-imputed
for column in ('Gender', 'Married', 'Self_Employed'):
    print(train_df[column].mode().iloc[0])

Male
Yes
No


In [None]:
# Fill the gaps of each categorical column with that column's mode
for column in ['Gender', 'Married', 'Self_Employed']:
    most_common = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(most_common)

# Remaining missing-value count per column
train_df.isnull().sum()

#### Replacing With Median

Median is the middlemost value. It’s better to use the median value for imputation in the case of outliers.

You can use ‘fillna’ method for imputing the column ‘Loan_Amount_Term’ with the median value.

In [17]:
# Median loan term; this is the value used below to fill the gaps in 'Loan_Amount_Term'
train_df['Loan_Amount_Term'].median()

360.0

In [None]:
# Impute the loan term with the median of its observed values
term_median = train_df['Loan_Amount_Term'].median()
train_df['Loan_Amount_Term'] = train_df['Loan_Amount_Term'].fillna(term_median)

#### Replacing with previous value – Forward fill

In some cases, imputing the values with the previous value instead of mean, mode or median is more appropriate. This is called forward fill. It is mostly used in time series data.

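You can use the ‘ffill’ method for this. (Older code passes the parameter ‘method = ffill’ to the ‘fillna’ function, but that parameter is deprecated in recent pandas versions.)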

In [None]:
import pandas as pd
import numpy as np

# Small demo series 0..5 with the entries labelled 2, 3 and 4 blanked out.
# It is created as float64 up front: writing NaN into an integer Series relies
# on an implicit upcast that recent pandas versions deprecate.
test = pd.Series(range(6), dtype='float64')
test.loc[2:4] = np.nan  # .loc slices are label-based and include both endpoints
test

In [None]:
# Forward-Fill: each missing entry takes the last valid value that precedes it.
# Series.ffill() is the supported spelling of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
test.ffill()

#### Replacing with next value – Backward fill

In backward fill, the missing value is imputed using the next value.

In [None]:
# Backward-Fill: each missing entry takes the next valid value that follows it.
# Series.bfill() is the supported spelling of fillna(method='bfill'), whose
# `method` argument is deprecated in recent pandas releases.
test.bfill()

#### Interpolation

Missing values can also be imputed using interpolation. Pandas interpolate method can be used to replace the missing values with different interpolation methods like ‘polynomial’, ‘linear’, ‘quadratic’. Default method is ‘linear’.

In [None]:
# Fill the gaps by interpolating between the surrounding valid values (default method: linear)
test.interpolate()

### Imputing Missing Values For Categorical Features

There are two ways to impute missing values for categorical features as follows:

#### Impute the Most Frequent Value

We will make use of ‘SimpleImputer’ in this case. Because this is a non-numeric column, we can’t use the mean or median, but we can use the most frequent value or a constant.

In [19]:
import pandas as pd
import numpy as np

# One categorical column whose final entry is missing
X = pd.DataFrame(dict(Shape=['square', 'square', 'oval', 'circle', np.nan]))
X['Shape']

0    square
1    square
2      oval
3    circle
4       NaN
Name: Shape, dtype: object

In [20]:
from sklearn.impute import SimpleImputer

# Replace each missing entry with the column's most frequent value
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X).transform(X)

array([['square'],
       ['square'],
       ['oval'],
       ['circle'],
       ['square']], dtype=object)

#### Impute the Value “missing”, which treats it as a Separate Category

In [21]:
import pandas as pd
import numpy as np

# Rebuild the toy categorical column (last entry missing) for this example
X = pd.DataFrame(dict(Shape=['square', 'square', 'oval', 'circle', np.nan]))
X['Shape']

0    square
1    square
2      oval
3    circle
4       NaN
Name: Shape, dtype: object

In [22]:
# Fill gaps with the constant label 'missing' so they form their own category
imputer = SimpleImputer(fill_value='missing', strategy='constant')
imputer.fit(X).transform(X)

array([['square'],
       ['square'],
       ['oval'],
       ['circle'],
       ['missing']], dtype=object)

### Imputation of Missing Values Using the scikit-learn Library

#### Univariate Approach

In a Univariate approach, only a single feature is taken into consideration. You can use the class SimpleImputer and replace the missing values with mean, mode, median or some constant value.

Let’s see an example:

In [24]:
import numpy as np
from sklearn.impute import SimpleImputer

# Fit a mean imputer on a small training matrix in which NaN marks a gap
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit([
    [1, 2],
    [np.nan, 3],
    [7, 6],
])

SimpleImputer()

In [28]:
# New rows containing gaps, displayed as a table before imputation
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
pd.DataFrame(data=X)

Unnamed: 0,0,1
0,,2.0
1,6.0,
2,7.0,6.0


In [30]:
# Fill the gaps in X using the imputer fitted earlier and show the result as a table
pd.DataFrame(imp.transform(X))

Unnamed: 0,0,1
0,4.0,2.0
1,6.0,3.666667
2,7.0,6.0


#### Multivariate Approach
In a multivariate approach, more than one feature is taken into consideration. There are two ways to impute missing values with the multivariate approach: using the KNNImputer class or the IterativeImputer class.

Let’s take an example of a titanic dataset.

Suppose the feature ‘age’ is well correlated with the feature ‘Fare’ such that people with lower fares are also younger and people with higher fares are also older.

In that case, it would make sense to impute a low age for low fare values and a high age for high fare values. So here we are taking multiple features into account by following a multivariate approach.

In [31]:
import pandas as pd

# Read only the first six rows of the remote CSV, then keep three numeric features
df = pd.read_csv('http://bit.ly/kaggletrain', nrows=6)
cols = ['SibSp', 'Fare', 'Age']
X = df.loc[:, cols]
X

Unnamed: 0,SibSp,Fare,Age
0,1,7.25,22.0
1,1,71.2833,38.0
2,0,7.925,26.0
3,1,53.1,35.0
4,0,8.05,35.0
5,0,8.4583,


In [33]:
# IterativeImputer is experimental; this import enables the one on the next line
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fill the missing value using the other columns, with default settings
impute_it = IterativeImputer()
pd.DataFrame(data=impute_it.fit_transform(X))

Unnamed: 0,0,1,2
0,1.0,7.25,22.0
1,1.0,71.2833,38.0
2,0.0,7.925,26.0
3,1.0,53.1,35.0
4,0.0,8.05,35.0
5,0.0,8.4583,28.506395


#### Nearest Neighbors Imputations (KNNImputer)

Missing values are imputed using the k-Nearest Neighbors approach where a Euclidean distance is used to find the nearest neighbors.

Let’s take the above example of the titanic dataset to see how it works.

In [35]:
from sklearn.impute import KNNImputer

# Fill each gap from the two nearest neighbouring rows
impute_knn = KNNImputer(n_neighbors=2)
pd.DataFrame(data=impute_knn.fit_transform(X))

Unnamed: 0,0,1,2
0,1.0,7.25,22.0
1,1.0,71.2833,38.0
2,0.0,7.925,26.0
3,1.0,53.1,35.0
4,0.0,8.05,35.0
5,0.0,8.4583,30.5


###  Adding missing indicator to encode “missingness” as a feature


In some cases, while imputing missing values, you can preserve information about which values were missing and use that as a feature.
Because sometimes there may be a relationship between the reason for missing values (also called the “missingness”) and the target variable you are trying to predict.
Why do we need to do this?

Suppose you are predicting the presence of a disease, and assume that records are missing for people in poverty. In that scenario the age values are not missing at random: they are missing for people in poverty, and poverty is a good predictor of disease. Thus, a missing age, or “missingness”, is itself a good predictor of disease.

In [36]:
import pandas as pd
import numpy as np

# Single numeric feature with one missing entry
X = pd.DataFrame(dict(Age=[20, 30, 10, np.nan, 10]))
X

Unnamed: 0,Age
0,20.0
1,30.0
2,10.0
3,
4,10.0


In [39]:
from sklearn.impute import SimpleImputer

# With no arguments, SimpleImputer fills gaps with the column mean
imputer = SimpleImputer()
pd.DataFrame(data=imputer.fit_transform(X))

Unnamed: 0,0
0,20.0
1,30.0
2,10.0
3,17.5
4,10.0


In [40]:
# add_indicator=True appends a 0/1 column flagging the rows that were missing
imputer = SimpleImputer(add_indicator=True)
imputer.fit(X).transform(X)

array([[20. ,  0. ],
       [30. ,  0. ],
       [10. ,  0. ],
       [17.5,  1. ],
       [10. ,  0. ]])