In [106]:
import numpy as np
import pandas as pd
from numpy import nan

from sklearn.impute import SimpleImputer

In [2]:
# Reference frame: ten complete records, no missing values.

data = {
    'name': ['abc' + str(i) for i in range(1, 11)],
    'age': [10, 15, 20, 25, 30, 35, 40, 45, 50, 50],
    'sex': ['male', 'female'] * 5,
    'sal': list(range(100, 1001, 100)),
}

maindf = pd.DataFrame(data)
maindf

Unnamed: 0,name,age,sex,sal
0,abc1,10,male,100
1,abc2,15,female,200
2,abc3,20,male,300
3,abc4,25,female,400
4,abc5,30,male,500
5,abc6,35,female,600
6,abc7,40,male,700
7,abc8,45,female,800
8,abc9,50,male,900
9,abc10,50,female,1000


In [3]:
# Summarise the reference frame: per-column non-null counts and dtypes.
maindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    10 non-null     object
 1   age     10 non-null     int64 
 2   sex     10 non-null     object
 3   sal     10 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 448.0+ bytes


# Mean Imputation

In [54]:
# Training frame for mean imputation: three of the ten ages are missing (None).
data = {
    'name': ['abc' + str(i) for i in range(1, 11)],
    'age': [10, 15, None, 25, 30, None, 40, 45, None, 50],
    'sex': ['male', 'female'] * 5,
    'sal': list(range(100, 1001, 100)),
}

traindf = pd.DataFrame(data)
traindf

Unnamed: 0,name,age,sex,sal
0,abc1,10.0,male,100
1,abc2,15.0,female,200
2,abc3,,male,300
3,abc4,25.0,female,400
4,abc5,30.0,male,500
5,abc6,,female,600
6,abc7,40.0,male,700
7,abc8,45.0,female,800
8,abc9,,male,900
9,abc10,50.0,female,1000


In [55]:
# Count the missing values per column of the training frame built in the previous cell.
# (testdf does not exist yet at this point in the notebook; the recorded counts
# below - age 3, sal 0 - are those of traindf.)
traindf.isnull().sum()

name    0
age     3
sex     0
sal     0
dtype: int64

In [77]:
# Test frame: the same three row positions are missing in both age and sal.
data = {
    'name': ['xyz' + str(i) for i in range(1, 11)],
    'age': [11, 22, None, 44, 55, None, 77, 88, None, 100],
    'sex': ['male', 'female'] * 5,
    'sal': [100, 200, None, 400, 500, None, 700, 800, None, 1000],
}

testdf = pd.DataFrame(data)
testdf

Unnamed: 0,name,age,sex,sal
0,xyz1,11.0,male,100.0
1,xyz2,22.0,female,200.0
2,xyz3,,male,
3,xyz4,44.0,female,400.0
4,xyz5,55.0,male,500.0
5,xyz6,,female,
6,xyz7,77.0,male,700.0
7,xyz8,88.0,female,800.0
8,xyz9,,male,
9,xyz10,100.0,female,1000.0


In [64]:
# Missing values per column of the test frame.
testdf.isna().sum()

name    0
age     3
sex     0
sal     3
dtype: int64

In [58]:
# Build the imputer; 'mean' and 'median' are the numeric strategies used in this notebook.
# 'mean' is also what SimpleImputer uses when no strategy is given.

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer

In [59]:
# Learn the fill values from the training frame, using every row of the age and sal columns.
# Only numerical columns should be passed; to restrict the imputer to age alone,
# pass just that one column.

imputer.fit(traindf[['age', 'sal']])

SimpleImputer()

In [62]:
# Fill the gaps in the training frame with the values learned by the imputer.

traindf.loc[:, ['age', 'sal']] = imputer.transform(traindf[['age', 'sal']])
traindf


# Below, the mean value has replaced every null in the age column.
# The sal column had no nulls, so nothing was imputed there.

Unnamed: 0,name,age,sex,sal
0,abc1,10.0,male,100.0
1,abc2,15.0,female,200.0
2,abc3,30.714286,male,300.0
3,abc4,25.0,female,400.0
4,abc5,30.0,male,500.0
5,abc6,30.714286,female,600.0
6,abc7,40.0,male,700.0
7,abc8,45.0,female,800.0
8,abc9,30.714286,male,900.0
9,abc10,50.0,female,1000.0


In [66]:
traindf.isna().sum()   # every column now reports zero nulls

name    0
age     0
sex     0
sal     0
dtype: int64

In [68]:
# Mean of the (now filled) training age column.
traindf['age'].mean()

30.714285714285715

In [65]:
# Apply the imputer that was fitted on traindf to the test frame.
testdf.loc[:, ['age', 'sal']] = imputer.transform(testdf[['age', 'sal']])
testdf

Unnamed: 0,name,age,sex,sal
0,xyz1,11.0,male,100.0
1,xyz2,22.0,female,200.0
2,xyz3,30.714286,male,550.0
3,xyz4,44.0,female,400.0
4,xyz5,55.0,male,500.0
5,xyz6,30.714286,female,550.0
6,xyz7,77.0,male,700.0
7,xyz8,88.0,female,800.0
8,xyz9,30.714286,male,550.0
9,xyz10,100.0,female,1000.0


In [67]:
testdf.isna().sum()  # every column now reports zero nulls

name    0
age     0
sex     0
sal     0
dtype: int64

In [70]:
testdf['age'].mean()

# Note that the value filled in above is 30.71, i.e. the mean of traindf's age column, not a
# statistic of testdf. The imputer was fitted on traindf, so the value learned there is what
# transform() applies to testdf. Keep this in mind.

48.91428571428572

In [75]:
# An imputer can also be fitted on the test frame and applied to it.
# A fresh SimpleImputer is built for this: fit() returns the estimator it was called on,
# so `imputer.fit(...)` would refit the training-fitted `imputer` and leave `imputer2`
# as just a second name for that same object.
# NOTE: when the notebook is run top to bottom, testdf has already been filled with the
# training means at this point, so the statistics learned here come from that filled frame.

imputer2 = SimpleImputer(missing_values=np.nan, strategy='mean').fit(testdf.loc[:, ['age', 'sal']])

imputer2

SimpleImputer()

In [79]:
# Rebuild testdf from the raw values so that the NaNs filled in by the earlier
# transform are present again before imputer2 is applied.

data = {
    'name': ['xyz1', 'xyz2', 'xyz3', 'xyz4', 'xyz5', 'xyz6', 'xyz7', 'xyz8', 'xyz9', 'xyz10'],
    'age': [11, 22, None, 44, 55, None, 77, 88, None, 100],
    'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
    'sal': [100, 200, None, 400, 500, None, 700, 800, None, 1000],
}

testdf = pd.DataFrame(data)
testdf

Unnamed: 0,name,age,sex,sal
0,xyz1,11.0,male,100.0
1,xyz2,22.0,female,200.0
2,xyz3,,male,
3,xyz4,44.0,female,400.0
4,xyz5,55.0,male,500.0
5,xyz6,,female,
6,xyz7,77.0,male,700.0
7,xyz8,88.0,female,800.0
8,xyz9,,male,
9,xyz10,100.0,female,1000.0


In [80]:
# Fill the gaps in testdf with the statistics held by imputer2.
testdf.loc[:, ['age', 'sal']] = imputer2.transform(testdf[['age', 'sal']])
testdf

# The recorded fill value for age is 48.91. That equals testdf.age.mean() as computed after
# the frame had been filled with the training mean; the mean of the seven raw, non-missing
# test ages is 397/7 = 56.71, so in the recorded run imputer2 was fitted on the already-filled frame.
# NOTE(review): in a modelling workflow the imputer is normally fitted on the training data only
# and then reused on the test data, so that test-set statistics do not leak into preprocessing;
# fitting on the test frame is shown here for illustration - confirm before reusing this pattern.
# The steps are the same for traindf and testdf, so the later sections cover traindf only
# to save time.

Unnamed: 0,name,age,sex,sal
0,xyz1,11.0,male,100.0
1,xyz2,22.0,female,200.0
2,xyz3,48.914286,male,535.0
3,xyz4,44.0,female,400.0
4,xyz5,55.0,male,500.0
5,xyz6,48.914286,female,535.0
6,xyz7,77.0,male,700.0
7,xyz8,88.0,female,800.0
8,xyz9,48.914286,male,535.0
9,xyz10,100.0,female,1000.0


# Median Imputation

In [82]:
# Training frame for median imputation: three of the ten ages are missing (None).
data = {
    'name': ['abc' + str(i) for i in range(1, 11)],
    'age': [10, 15, None, 25, 30, None, 40, 45, None, 50],
    'sex': ['male', 'female'] * 5,
    'sal': list(range(100, 1001, 100)),
}

traindf = pd.DataFrame(data)
traindf

Unnamed: 0,name,age,sex,sal
0,abc1,10.0,male,100
1,abc2,15.0,female,200
2,abc3,,male,300
3,abc4,25.0,female,400
4,abc5,30.0,male,500
5,abc6,,female,600
6,abc7,40.0,male,700
7,abc8,45.0,female,800
8,abc9,,male,900
9,abc10,50.0,female,1000


In [83]:
# The median strategy is selected through the `strategy` argument.

imputer = SimpleImputer(strategy='median', missing_values=np.nan)

In [84]:
# Learn the fill values for age and sal from the training frame.
# The result of fit() is bound back to the same name `imputer`.
imputer = imputer.fit(traindf[['age', 'sal']])

imputer

SimpleImputer(strategy='median')

In [86]:
# Fill the gaps in the training frame using the median imputer.
traindf.loc[:, ['age', 'sal']] = imputer.transform(traindf[['age', 'sal']])
traindf

# The missing ages are filled with the median value, 30.

Unnamed: 0,name,age,sex,sal
0,abc1,10.0,male,100.0
1,abc2,15.0,female,200.0
2,abc3,30.0,male,300.0
3,abc4,25.0,female,400.0
4,abc5,30.0,male,500.0
5,abc6,30.0,female,600.0
6,abc7,40.0,male,700.0
7,abc8,45.0,female,800.0
8,abc9,30.0,male,900.0
9,abc10,50.0,female,1000.0


In [87]:
# Confirm that no nulls remain in the training frame.
traindf.isna().sum()

name    0
age     0
sex     0
sal     0
dtype: int64

# Arbitrary Value Imputation

In [114]:
# Frame with gaps in a numeric column (age) and in a text column (sex).
data = {
    'name': ['abc' + str(i) for i in range(1, 11)],
    'age': [10, 15, 20, np.nan, 30, 35, np.nan, 45, 50, np.nan],
    'sex': [np.nan, np.nan] + ['male', 'female'] * 4,
    'sal': list(range(100, 1001, 100)),
}

df = pd.DataFrame(data)
df

# To mark a value as NULL it is better to use np.nan than None or any other placeholder.

Unnamed: 0,name,age,sex,sal
0,abc1,10.0,,100
1,abc2,15.0,,200
2,abc3,20.0,male,300
3,abc4,,female,400
4,abc5,30.0,male,500
5,abc6,35.0,female,600
6,abc7,,male,700
7,abc8,45.0,female,800
8,abc9,50.0,male,900
9,abc10,,female,1000


In [115]:
# Missing values per column: both age and sex have gaps.
df.isna().sum()

name    0
age     3
sex     2
sal     0
dtype: int64

In [116]:
# Constant strategy: every missing entry is replaced by the fixed value 999.
imputer = SimpleImputer(strategy='constant', fill_value=999, missing_values=np.nan)

In [117]:
# Fit on the whole frame, i.e. all four columns (text and numeric) are handed to the imputer.
imputer.fit(df)

SimpleImputer(fill_value=999, strategy='constant')

In [118]:
# Replace the missing entries in every column with the constant.
# Note: transform() hands back a NumPy array (see the array output of the next cell),
# so after this line `df` is no longer a DataFrame and the column labels are gone.
df = imputer.transform(df)

In [119]:
df   

# This strategy works on both numerical and categorical columns. If the idea was to replace values only in
# the age column, then that needs to be specified explicitly.

array([['abc1', 10.0, 999, 100],
       ['abc2', 15.0, 999, 200],
       ['abc3', 20.0, 'male', 300],
       ['abc4', 999, 'female', 400],
       ['abc5', 30.0, 'male', 500],
       ['abc6', 35.0, 'female', 600],
       ['abc7', 999, 'male', 700],
       ['abc8', 45.0, 'female', 800],
       ['abc9', 50.0, 'male', 900],
       ['abc10', 999, 'female', 1000]], dtype=object)

In [120]:
# Redefine the data set (the previous cells replaced df with a plain array).

data = {
    'name': ['abc' + str(i) for i in range(1, 11)],
    'age': [10, 15, 20, np.nan, 30, 35, np.nan, 45, 50, np.nan],
    'sex': [np.nan, np.nan] + ['male', 'female'] * 4,
    'sal': list(range(100, 1001, 100)),
}

df = pd.DataFrame(data)
df


Unnamed: 0,name,age,sex,sal
0,abc1,10.0,,100
1,abc2,15.0,,200
2,abc3,20.0,male,300
3,abc4,,female,400
4,abc5,30.0,male,500
5,abc6,35.0,female,600
6,abc7,,male,700
7,abc8,45.0,female,800
8,abc9,50.0,male,900
9,abc10,,female,1000


In [121]:
# Missing values per column of the rebuilt frame.
df.isna().sum()

name    0
age     3
sex     2
sal     0
dtype: int64

In [124]:
# Fit on the age column only; the double brackets keep it a one-column frame.
imputer.fit(df[['age']])

SimpleImputer(fill_value=999, strategy='constant')

In [125]:
# Write the filled age values back into the frame; no other column is passed to the imputer.
df['age'] = imputer.transform(df[['age']])

In [126]:
df  # the sex column is unchanged here: only age was passed to the imputer.

Unnamed: 0,name,age,sex,sal
0,abc1,10.0,,100
1,abc2,15.0,,200
2,abc3,20.0,male,300
3,abc4,999.0,female,400
4,abc5,30.0,male,500
5,abc6,35.0,female,600
6,abc7,999.0,male,700
7,abc8,45.0,female,800
8,abc9,50.0,male,900
9,abc10,999.0,female,1000


In [133]:
# To keep age as an integer column, cast the imputer's output before assigning it back.

df['age'] = imputer.transform(df[['age']]).astype(int)  # note the astype(int) cast here

In [134]:
# Check the dtypes after the cast: the recorded output lists age as int64,
# and sex still has two nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    10 non-null     object
 1   age     10 non-null     int64 
 2   sex     8 non-null      object
 3   sal     10 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 448.0+ bytes


In [135]:
# Display the frame: age is shown as whole numbers, with 999 in the formerly missing rows.
df

Unnamed: 0,name,age,sex,sal
0,abc1,10,,100
1,abc2,15,,200
2,abc3,20,male,300
3,abc4,999,female,400
4,abc5,30,male,500
5,abc6,35,female,600
6,abc7,999,male,700
7,abc8,45,female,800
8,abc9,50,male,900
9,abc10,999,female,1000
