In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the toy Titanic data and discard the "Family" column.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
df = (
    pd.read_csv("/home/yash/Downloads/titanic_toy.csv")
    .drop("Family", axis="columns")
)
df.head()

Unnamed: 0,Age,Fare,Survived
0,22.0,7.25,0
1,38.0,71.2833,1
2,26.0,7.925,1
3,35.0,53.1,1
4,35.0,8.05,0


In [3]:
# Hold out 20% of the rows for testing; "Survived" is the target, everything else is a feature.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop("Survived", axis="columns"),
    df["Survived"],
    test_size=0.2,
    random_state=0,
)
x_train.head()

Unnamed: 0,Age,Fare
140,,15.2458
439,31.0,10.5
817,31.0,37.0042
378,20.0,
491,21.0,7.25


## Use SimpleImputer 

In [4]:
# SimpleImputer's default strategy is 'mean'.
# Note: the transformed result is a NumPy array, not a DataFrame.
imputer = SimpleImputer()
imputer.fit(x_train)  # statistics are learned from the training split only
x_train_trf = imputer.transform(x_train)
x_test_trf = imputer.transform(x_test)

In [5]:
# Preview the imputed training array (wrapped in a DataFrame for rich display).
pd.DataFrame(data=x_train_trf).head(5)

Unnamed: 0,0,1
0,29.745184,15.2458
1,31.0,10.5
2,31.0,37.0042
3,20.0,31.885314
4,21.0,7.25


In [6]:
# Train a baseline logistic-regression classifier on the imputed training features.
# (LogisticRegression is imported in the notebook's import cell.)
lr = LogisticRegression()
lr.fit(x_train_trf, y_train)

In [7]:
# Predict on the imputed test features.
y_pred_trf = lr.predict(x_test_trf)

# Accuracy against the held-out labels, expressed as a percentage.
# (accuracy_score is imported in the notebook's import cell.)
accuracy_score(y_test, y_pred_trf) * 100

69.27374301675978

## Use MissingIndicator

In [8]:
# Fit the indicator on the training split, then flag missing entries in both splits.
# Like SimpleImputer, it converts the DataFrame into a NumPy array.
mi = MissingIndicator().fit(x_train)
x_train_mi = mi.transform(x_train)
x_test_mi = mi.transform(x_test)

In [9]:
# Preview the boolean missing-value flags for the training split.
pd.DataFrame(data=x_train_mi).head(5)

Unnamed: 0,0,1
0,True,False
1,False,False
2,False,False
3,False,True
4,False,False


In [10]:
# Convert the indicator arrays back into DataFrames.
# Column names are derived from the features the fitted indicator actually
# reports (``mi.features_`` holds their positional indices) instead of being
# hard-coded, so the labels cannot drift out of sync with the indicator output.
missing_cols = [f"{col}_missing" for col in x_train.columns[mi.features_]]
x_train_mi = pd.DataFrame(x_train_mi, columns=missing_cols)
x_test_mi = pd.DataFrame(x_test_mi, columns=missing_cols)

In [11]:
# Peek at the training features.
x_train.head(5)

Unnamed: 0,Age,Fare
140,,15.2458
439,31.0,10.5
817,31.0,37.0042
378,20.0,
491,21.0,7.25


In [12]:
# Give every frame a fresh 0..n-1 index: the feature frames and the indicator
# frames must share row labels, otherwise combining them column-wise fails.
x_train, x_test = x_train.reset_index(drop=True), x_test.reset_index(drop=True)

x_train_mi, x_test_mi = (
    x_train_mi.reset_index(drop=True),
    x_test_mi.reset_index(drop=True),
)

In [13]:
# Confirm the training features now carry the reset index.
x_train.head(5)

Unnamed: 0,Age,Fare
0,,15.2458
1,31.0,10.5
2,31.0,37.0042
3,20.0,
4,21.0,7.25


In [14]:
# Place the indicator columns next to the features: x_train with x_train_mi,
# and x_test with x_test_mi.
a = pd.concat((x_train, x_train_mi), axis="columns")
b = pd.concat((x_test, x_test_mi), axis="columns")

In [15]:
# Preview the training features together with their missing-value flags.
a.head(5)

Unnamed: 0,Age,Fare,Age_missing,Fare_missing
0,,15.2458,True,False
1,31.0,10.5,False,False
2,31.0,37.0042,False,False
3,20.0,,False,True
4,21.0,7.25,False,False


In [16]:
# Impute the indicator-augmented frames with a fresh default SimpleImputer,
# fitted on the training frame only.
imputer = SimpleImputer().fit(a)
x_train_trf2 = imputer.transform(a)
x_test_trf2 = imputer.transform(b)

In [17]:
# Refit the same classifier, this time on the features plus indicator columns.
lr.fit(X=x_train_trf2, y=y_train)

In [18]:
# Predict on the indicator-augmented, imputed test features.
y_pred_trf2 = lr.predict(x_test_trf2)

# Accuracy against the held-out labels, expressed as a percentage.
# (accuracy_score is imported in the notebook's import cell; no re-import needed.)
accuracy_score(y_test, y_pred_trf2) * 100

67.59776536312849

## Use the missing indicator directly via `SimpleImputer(add_indicator=True)`

In [19]:
# With add_indicator=True a single SimpleImputer does both jobs:
# the imputation and the MissingIndicator columns.
imputer = SimpleImputer(add_indicator=True)
imputer.fit(x_train)

x_train_trf3 = imputer.transform(x_train)
x_test_trf3 = imputer.transform(x_test)

In [20]:
# Check the shape of the transformed training array.
x_train_trf3.shape

(712, 4)

In [21]:
# Train on the add_indicator output, predict, and score as a percentage.
# The accuracy matches the manual MissingIndicator approach above.
y_pred_trf3 = lr.fit(x_train_trf3, y_train).predict(x_test_trf3)
accuracy_score(y_test, y_pred_trf3) * 100

67.59776536312849