### Import Libraries and Packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import MissingIndicator,SimpleImputer

In [2]:
# Read only the three columns this demo needs: the numeric features
# Age and Fare, plus the Survived target.
df = pd.read_csv(
    '/content/train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
# Peek at five randomly chosen rows.
df.sample(5)

Unnamed: 0,Survived,Age,Fare
277,0,,0.0
33,0,66.0,10.5
200,0,28.0,9.5
566,0,19.0,7.8958
873,0,47.0,9.0


In [3]:
# The target is the 'Survived' column; every other column is a feature.
y = df['Survived']
X = df.drop('Survived', axis=1)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.sample(5)

Unnamed: 0,Age,Fare
837,,8.05
856,45.0,164.8667
12,20.0,8.05
884,25.0,7.05
54,65.0,61.9792


#### Using `SimpleImputer` from scikit-learn

In [4]:
# Learn the fill values from the training split only, then apply that same
# fitted imputer to both splits so the test split never influences them.
si = SimpleImputer()
si.fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [5]:
# Show the imputed training array (columns: Age, Fare); rows that had no Age
# now carry the same filled-in value.
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

#### Train and test the model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression on the imputed Age/Fare features, with no
# column telling the model which ages were filled in.
clf = LogisticRegression()
clf.fit(X_train_trf, y_train)

# Accuracy on the held-out split.
y_pred = clf.predict(X_test_trf)
accuracy_score(y_test, y_pred)

0.6145251396648045

#### Using `MissingIndicator` from scikit-learn

In [7]:
# Fit the indicator on the training split so it records which feature
# columns contain missing values there.
mi = MissingIndicator()

mi.fit(X_train)

MissingIndicator()

In [8]:
# Indices of the columns the fitted indicator will flag; the output [0]
# is the first feature column, Age.
mi.features_

array([0])

In [9]:
# Boolean mask for the training rows: True where Age is missing.
X_train_missing = mi.transform(X_train)
# First ten rows of the mask.
X_train_missing[:10]

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True]])

In [10]:
# Same mask for the test rows, produced by the indicator fitted on the
# training split.
X_test_missing = mi.transform(X_test)
# First ten rows of the mask.
X_test_missing[:10]

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False]])

In [11]:
# Attach the missing-Age flag to the training features as a new 'Age_NA'
# column (this modifies X_train in place).
X_train['Age_NA'] = X_train_missing

# NOTE(review): this displays X_test, which has no 'Age_NA' column at this
# point -- confirm whether X_train was meant to be shown here instead.
X_test[0:10]

Unnamed: 0,Age,Fare
707,42.0,26.2875
37,21.0,8.05
615,24.0,65.0
169,28.0,56.4958
68,17.0,7.925
606,30.0,7.8958
630,80.0,30.0
785,25.0,7.25
660,50.0,133.65
728,25.0,26.0


In [12]:
# Attach the missing-Age flag to the test features as a new 'Age_NA' column
# (this modifies X_test in place).
X_test['Age_NA'] = X_test_missing

# NOTE(review): this displays X_train (already carrying 'Age_NA'), not the
# X_test frame that was just modified -- confirm intent.
X_train[0:10]

Unnamed: 0,Age,Fare,Age_NA
30,40.0,27.7208,False
10,4.0,16.7,False
873,47.0,9.0,False
182,9.0,31.3875,False
876,20.0,9.8458,False
213,30.0,13.0,False
157,30.0,8.05,False
780,13.0,7.2292,False
572,36.0,26.3875,False
77,,8.05,True


In [13]:
# Fresh imputer for the frames that now carry the 'Age_NA' flag alongside
# Age and Fare; as before it is fitted on the training split only.
si = SimpleImputer()
si.fit(X_train)
X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Same model as the baseline, now trained on the imputed features plus the
# 'Age_NA' missing-value flag.
clf = LogisticRegression()
clf.fit(X_train_trf2, y_train)

# Accuracy on the held-out split.
y_pred = clf.predict(X_test_trf2)
accuracy_score(y_test, y_pred)

0.6312849162011173

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# SimpleImputer(add_indicator=True) imputes the missing values and appends
# the missing-value indicator columns in a single step, replacing the manual
# MissingIndicator + SimpleImputer combination used earlier.
si = SimpleImputer(add_indicator=True)

# Start from the raw features. X_train / X_test already carry the manually
# added 'Age_NA' flag by this point; drop it so the imputer's own indicator
# is not a duplicate of it. errors='ignore' keeps this working when the
# column is absent.
X_train_raw = X_train.drop(columns=['Age_NA'], errors='ignore')
X_test_raw = X_test.drop(columns=['Age_NA'], errors='ignore')

# Fit on the training split only and keep the results under new names, so
# the X_train / X_test DataFrames are not overwritten with arrays.
X_train_trf3 = si.fit_transform(X_train_raw)
X_test_trf3 = si.transform(X_test_raw)

# Train and score on the arrays produced by THIS imputer. Previously the
# model was fitted on X_train_trf2 / X_test_trf2, so the add_indicator
# output was never used and the score simply repeated the previous cell.
clf = LogisticRegression()
clf.fit(X_train_trf3, y_train)

y_pred = clf.predict(X_test_trf3)
accuracy_score(y_test, y_pred)


0.6312849162011173