**Imputation = handling missing data by replacing each missing value with a substitute value (e.g. the column's mean or median)**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Load the dataset from 'titanic_toy.csv' (path is relative to the working directory).
df = pd.read_csv('titanic_toy.csv')

In [None]:
# Preview the first few rows of the loaded data.
df.head()

## Using Pandas

In [None]:
# Fraction of missing values per column: the mean of the boolean null mask.
df.isnull().mean()

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Separate the target ('Survived') from the feature columns.
y = df['Survived']
X = df.drop('Survived', axis=1)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
# Check the (rows, columns) of each feature split.
X_train.shape, X_test.shape

In [None]:
# Fraction of missing values per column in the training split.
X_train.isnull().mean()

In [None]:
# Fill values for Age, computed from the training split only.
age_mean, age_median = X_train['Age'].mean(), X_train['Age'].median()

In [None]:
# Add imputed versions of Age and Fare next to the original columns so the
# effect of mean vs. median imputation can be compared in the cells below.

# Work on an explicit copy: the split result is derived from X, and pandas may
# emit SettingWithCopyWarning when new columns are added to it directly.
X_train = X_train.copy()

# Fare needs its own fill values, computed (like Age's) from the training split.
fare_mean = X_train['Fare'].mean()
fare_median = X_train['Fare'].median()

X_train['Age_mean'] = X_train['Age'].fillna(age_mean)
X_train['Age_median'] = X_train['Age'].fillna(age_median)

# Fill Fare with Fare's statistics -- not Age's.
X_train['Fare_mean'] = X_train['Fare'].fillna(fare_mean)
X_train['Fare_median'] = X_train['Fare'].fillna(fare_median)

In [None]:
# Look at 5 randomly chosen rows to compare original and imputed columns.
X_train.sample(5)

In [None]:
# Report the variance of each variable before and after imputation.
for label, column in (
    ('Original Age Variance :', 'Age'),
    ('Age Variance after mean imputation :', 'Age_mean'),
    ('Age Variance after median imputation :', 'Age_median'),
):
    print(label, X_train[column].var())

print("-" * 50)

for label, column in (
    ('Original Fare variable variance: ', 'Fare'),
    ('Fare Variance after mean imputation :', 'Fare_mean'),
    ('Fare Variance after median imputation :', 'Fare_median'),
):
    print(label, X_train[column].var())

In [None]:
# Overlay the density of Age before and after imputation on a single axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Original distribution in the default colour, imputed variants in red/green.
X_train['Age'].plot.kde(ax=ax)
X_train['Age_mean'].plot.kde(ax=ax, color='red')
X_train['Age_median'].plot.kde(ax=ax, color='green')

# Build the legend from the plotted series.
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

In [None]:
# Overlay the density of Fare before and after imputation on a single axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Original distribution in the default colour, imputed variants in red/green.
X_train['Fare'].plot.kde(ax=ax)
X_train['Fare_mean'].plot.kde(ax=ax, color='red')
X_train['Fare_median'].plot.kde(ax=ax, color='green')

# Build the legend from the plotted series.
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

In [None]:
# Pairwise covariance of the columns, to see how imputation changed relationships with other variables.
X_train.cov()

In [None]:
# Pairwise correlation of the columns, original vs. imputed.
X_train.corr()

In [None]:
# Box plots of Age and its two imputed versions side by side.
X_train[['Age','Age_mean','Age_median']].boxplot()

In [None]:
# Box plots of Fare and its two imputed versions side by side.
X_train[['Fare','Fare_mean','Fare_median']].boxplot()

## Using sklearn

In [None]:
# Split again from the untouched X and y (same test_size and random_state as before)
# so X_train no longer carries the helper columns added in the pandas section.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
# imputer1 fills missing values with the column median, imputer2 with the column mean.
imputer1 = SimpleImputer(strategy='median')
imputer2 = SimpleImputer(strategy='mean')

In [None]:
# Median-impute Age, mean-impute Fare, and pass every other column through unchanged.
trf = ColumnTransformer(
    transformers=[
        ('imputer1', imputer1, ['Age']),
        ('imputer2', imputer2, ['Fare']),
    ],
    remainder='passthrough',
)

In [None]:
# Learn the fill values from the training split only.
trf.fit(X_train)

In [None]:
# Apply the fitted transformer to both splits; the test split is filled with the
# statistics learned from the training split.
# NOTE: X_train / X_test are rebound to the transformer output. With sklearn's default
# output settings this is an array whose columns are ordered Age, Fare, then the
# passthrough columns -- confirm if set_output has been configured.
X_train = trf.transform(X_train)
X_test = trf.transform(X_test)

In [None]:
# Inspect the transformed training data.
X_train

In [None]:
# Fill value learned by the fitted 'imputer1' step (median strategy, applied to Age).
trf.named_transformers_['imputer1'].statistics_   # median of Age from the training split

In [None]:
# Fill value learned by the fitted 'imputer2' step (mean strategy, applied to Fare).
trf.named_transformers_['imputer2'].statistics_   # mean of Fare from the training split