In [None]:
# Reference blog post:
# https://towardsdatascience.com/imputing-missing-values-using-the-simpleimputer-class-in-sklearn

In [1]:
import numpy as np
import pandas as pd
# Load the sample dataset; columns B, C and D contain missing values.
df = pd.read_csv(filepath_or_buffer='./NaNDataset.csv')
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,,6.0,'Good'
2,7,,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [2]:
# Pure-pandas mean imputation: fill the gaps in column B with the mean of
# its non-missing values.
df['B'] = df['B'].fillna(
    value=df['B'].mean()
)
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [3]:
# Pure-pandas mode imputation: value_counts() sorts labels by descending
# frequency, so its first index entry is the most common category in D.
df['D'] = df['D'].fillna(
    value=df['D'].value_counts().index[0]
)
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,'Excellent'
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


# Sklearn SimpleImputer class

In [4]:
from sklearn.impute import SimpleImputer
# Same mean imputation of column B, this time with scikit-learn.
# Start again from the raw file so B still has its missing values.
df = pd.read_csv('NaNDataset.csv')
# df[['B']] (double brackets) is a one-column DataFrame, i.e. the 2-D input
# the imputer works on; fit() learns the column mean and returns the imputer.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df[['B']])
df['B'] = imputer.transform(df[['B']])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [5]:
# Create an (unfitted) imputer that replaces NaN entries with the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [6]:
# Learn the fill value (the mean of column B) from the data.
imputer = imputer.fit(X=df[['B']])

In [7]:
# Replace the missing entries of B with the learned mean and store the result.
df['B'] = imputer.transform(X=df[['B']])

In [8]:
# The previous cell already imputed column B, so transforming it a second
# time would be a no-op. Verify instead that no missing values remain.
assert df['B'].notna().all(), "column B still contains missing values"

In [9]:
# Impute several columns in one go: B and C are each filled with their own
# column mean.
df = pd.read_csv('NaNDataset.csv')
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform() learns the per-column means and applies them in one call.
df[['B','C']] = imputer.fit_transform(df[['B','C']])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,11.428571,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [10]:
# Median imputation: fill the gaps in B and C with each column's median.
df = pd.read_csv('NaNDataset.csv')
imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(df[['B','C']])
df[['B','C']] = imputer.transform(df[['B','C']])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.5,6.0,'Good'
2,7,11.5,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,12.0,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [11]:
# Most-frequent imputation: fill the gaps in the categorical column D with
# its most common value.
df = pd.read_csv('NaNDataset.csv')
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[['D']] = imputer.fit_transform(df[['D']])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,,6.0,'Good'
2,7,,9.0,'Excellent'
3,10,11.0,12.0,'Excellent'
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [12]:
# Constant imputation: replace every missing value in B and C with 0.
df = pd.read_csv('NaNDataset.csv')
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer = imputer.fit(X=df[['B','C']])
df[['B','C']] = imputer.transform(X=df[['B','C']])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,0.0,6.0,'Good'
2,7,0.0,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,0.0,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [13]:
# Apply the SimpleImputer to the entire DataFrame. 'most_frequent' is used
# because it can fill numeric and string columns alike.
df = pd.read_csv('NaNDataset.csv')
imputer = SimpleImputer(strategy='most_frequent',
                        missing_values=np.nan)
imputer = imputer.fit(df)
# For a frame with mixed column types, transform() returns one object-dtype
# array. Writing that through df.iloc[:, :] relies on implicit upcasting of
# the numeric columns, which newer pandas versions deprecate, so assign by
# column label instead and then restore the dtypes the columns had on load.
original_dtypes = df.dtypes
df[df.columns] = imputer.transform(df)
df = df.astype(original_dtypes)
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.0,6.0,'Good'
2,7,11.0,9.0,'Excellent'
3,10,11.0,12.0,'Excellent'
4,13,14.0,15.0,'Excellent'
5,16,17.0,12.0,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [14]:
# Another technique is to create a new DataFrame from the array returned by
# the transform() function.
# Reload the raw data first: df was already imputed in the previous cell, so
# transforming it again would not fill anything. The imputer fitted in the
# previous cell is reused here.
df = pd.read_csv('NaNDataset.csv')
# transform() returns a plain array without labels or per-column dtypes, so
# pass the original column names and index and cast back to the original dtypes.
df = pd.DataFrame(imputer.transform(df),
                  columns=df.columns,
                  index=df.index).astype(df.dtypes)
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.0,6.0,'Good'
2,7,11.0,9.0,'Excellent'
3,10,11.0,12.0,'Excellent'
4,13,14.0,15.0,'Excellent'
5,16,17.0,12.0,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'
