In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Build a small frame in which every column has exactly one missing entry:
# None marks the gap in the text columns, np.nan marks it in the numeric one.

dict1 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, np.nan, 32],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)
df1 = pd.DataFrame(data=dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [3]:
# Show the dtype pandas inferred for each column of df1.
df1.dtypes

Names     object
Age      float64
City      object
dtype: object

In [4]:
# Save df1 to a CSV file (relative path), leaving the row index out of the file.
df1.to_csv('missing_val_df1.csv',index=False)

In [5]:
# Same data as before, but the missing Age is written as None instead of np.nan.
dict2 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, None, 32],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)
df2 = pd.DataFrame(data=dict2)
df2

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [6]:
# Show the dtype pandas inferred for each column of df2.
df2.dtypes

Names     object
Age      float64
City      object
dtype: object

In [7]:
# Age now mixes integers, a None gap and the string '32'.
dict3 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, None, '32'],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)
df3 = pd.DataFrame(data=dict3)
df3

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [8]:
# Show the dtype pandas inferred for each column of df3.
df3.dtypes

Names    object
Age      object
City     object
dtype: object

In [9]:
# Age has no gap here, but one of its values is the string '32'.
dict4 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, 33, '32'],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)
df4 = pd.DataFrame(data=dict4)
df4

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,33,Pune
3,,32,Blr


In [10]:
# Cast Age to integers. This works here because the column has no missing
# value; the string '32' is converted to the integer 32.
df4['Age']=df4['Age'].astype(int)
df4['Age']

0    30
1    31
2    33
3    32
Name: Age, dtype: int32

In [11]:
# Confirm the column dtypes of df4 after the cast of Age.
df4.dtypes

Names    object
Age       int32
City     object
dtype: object

- To change the data type of a column, use `astype`.

- If the column contains a missing value, `astype(int)` will fail.

In [12]:
# Kept commented out on purpose: running it stops the notebook with an error.
# dict5={'Names':['Tarun','Ashish','Suri',None],
#        'Age':[30,31,np.nan,'32'],
#        'City':[None,'Hyd','Pune','Blr']
#       }
# df5=pd.DataFrame(dict5)
# df5['Age']=df5['Age'].astype(int)
# df5['Age']
# This raises an error: np.nan is a float that marks a missing value, and it cannot be converted to int.

In [13]:
# Same experiment with the gap written as None instead of np.nan.
# Kept commented out; presumably astype(int) fails here as well because the
# column still holds a missing value.
# dict5={'Names':['Tarun','Ashish','Suri',None],
#        'Age':[30,31,None,'32'],
#        'City':[None,'Hyd','Pune','Blr']
#       }
# df5=pd.DataFrame(dict5)
# df5['Age']=df5['Age'].astype(int)
# df5['Age']

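- `np.nan` is a float, so it marks a missing value in numeric data; a numeric column that contains it is stored as float64

- `None` is a generic Python object, so it can mark a missing value in data of any type (strings, bytes, floats, ints, etc.)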

**Method-1**

In [14]:
# Rebuild df1 for the fillna examples: one gap per column, and Age mixes
# numbers with the string '32'. pandas is already imported as pd at the top of
# the notebook, so it is not imported again here.
dict1={'Names':['Tarun','Ashish','Suri',None],
       'Age':[30,31,np.nan,'32'],
       'City':[None,'Hyd','Pune','Blr']
      }
df1=pd.DataFrame(dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [15]:
# Boolean mask of df1: True wherever a value is missing.
df1.isna()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,False,True,False
3,True,False,False


In [16]:
# isnull() gives the same mask as isna() (compare the two outputs).
df1.isnull()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,False,True,False
3,True,False,False


In [17]:
# Count the missing values in each column (True counts as 1 when summed).
df1.isnull().sum()

Names    1
Age      1
City     1
dtype: int64

In [18]:
# Percentage of missing values per column.
# Example: the first column has 4 values and 1 of them is missing,
# so the result is 1 * 100 / 4 = 25%.
df1.isnull().sum().mul(100).div(len(df1))

Names    25.0
Age      25.0
City     25.0
dtype: float64

In [19]:
# Replace every missing value, in every column, with the constant 40.
# The result is a new frame; df1 itself is not modified.
df1.fillna(40)

Unnamed: 0,Names,Age,City
0,Tarun,30,40
1,Ashish,31,Hyd
2,Suri,40,Pune
3,40,32,Blr


In [20]:
# Fill the gap in the Names column only; the double brackets select a
# one-column DataFrame rather than a Series.
df1[['Names']].fillna('suresh')

Unnamed: 0,Names
0,Tarun
1,Ashish
2,Suri
3,suresh


In [21]:
# Fill the gap in the Age column only, using 30 as the replacement.
df1[['Age']].fillna(30)

Unnamed: 0,Age
0,30
1,31
2,30
3,32


In [22]:
# One replacement value per column, keyed by column name.
replace_dict = dict(City='hyd', Age=30, Names='suresh')

In [23]:
# Fill each column's gaps with that column's value from replace_dict.
# df1 is rebound to the filled frame instead of using inplace=True, which
# keeps the step an ordinary assignment and avoids mutating the frame that
# earlier cells displayed.
df1 = df1.fillna(replace_dict)

In [24]:
# Display df1 after the per-column fill.
df1

Unnamed: 0,Names,Age,City
0,Tarun,30,hyd
1,Ashish,31,Hyd
2,Suri,30,Pune
3,suresh,32,Blr


**Method-2**

- we have some methods

    - bfill
    
    - ffill
    
    - backfill
    
    - pad

In [25]:
# Rebuild df1 with its gaps restored, because the previous section filled
# them. pandas is already imported as pd at the top of the notebook, so it is
# not imported again here.
dict1={'Names':['Tarun','Ashish','Suri',None],
       'Age':[30,31,np.nan,'32'],
       'City':[None,'Hyd','Pune','Blr']
      }
df1=pd.DataFrame(dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


**bfill**

- bfill means backward fill

- each missing value is filled with the next valid value (the one below it in the column)

In [43]:
# Backward fill: each gap takes the next valid value below it in its column.
# bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated.
# NOTE(review): this cell was last run out of order (In [43], after the KNN
# cell overwrote Age), so its saved output may not match df1 as built just
# above. Re-run the notebook top to bottom to refresh it.
df1.bfill()

Unnamed: 0,Names,Age,City
0,Tarun,30.0,Hyd
1,Ashish,31.0,Hyd
2,Suri,31.0,Pune
3,,32.0,Blr


In [27]:
import warnings
warnings.filterwarnings('ignore')
# to ignore the warnings 

In [28]:
# Display df1 again before the forward-fill example.
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


**ffill**

- ffill means forward fill

- each missing value is filled with the previous valid value (the one above it in the column)

In [29]:
# Forward fill: each gap takes the last valid value above it in its column.
# A gap in the first row stays missing because nothing precedes it.
# ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
df1.ffill()

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,31,Pune
3,Suri,32,Blr


**backfill**

In [30]:
# 'backfill' is the older alias of 'bfill'. The `method` keyword of fillna is
# deprecated, so the backward fill is requested through bfill() directly.
df1.bfill()

Unnamed: 0,Names,Age,City
0,Tarun,30,Hyd
1,Ashish,31,Hyd
2,Suri,32,Pune
3,,32,Blr


**pad**

In [31]:
# 'pad' is the older alias of 'ffill'. The `method` keyword of fillna is
# deprecated, so the forward fill is requested through ffill() directly.
df1.ffill()

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,31,Pune
3,Suri,32,Blr


- backfill = bfill

- pad = ffill

- there is one more argument: axis

- by default it is 0 (fill along each column); axis=1 fills along each row instead

In [32]:
# Backward fill along each row: a gap takes the next valid value to its right.
# bfill(axis=1) replaces the deprecated fillna(method='bfill', axis=1).
df1.bfill(axis=1)

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,Pune,Pune
3,32,32,Blr


In [33]:
# Forward fill along each row: a gap takes the last valid value to its left.
# ffill(axis=1) replaces the deprecated fillna(method='ffill', axis=1).
df1.ffill(axis=1)

Unnamed: 0,Names,Age,City
0,Tarun,30,30
1,Ashish,31,Hyd
2,Suri,Suri,Pune
3,,32,Blr


In [34]:
# 'backfill' is the older alias of 'bfill'; the row-wise backward fill is
# requested through bfill(axis=1) because fillna's `method` keyword is deprecated.
df1.bfill(axis=1)

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,Pune,Pune
3,32,32,Blr


In [35]:
# 'pad' is the older alias of 'ffill'; the row-wise forward fill is requested
# through ffill(axis=1) because fillna's `method` keyword is deprecated.
df1.ffill(axis=1)

Unnamed: 0,Names,Age,City
0,Tarun,30,30
1,Ashish,31,Hyd
2,Suri,Suri,Pune
3,,32,Blr


**Method-3**

- so far we filled the missing values with arbitrary values

- we can instead fill them in a specific way, using a statistic of the column

    - mean
    
    - median
    
    - mode

In [36]:
# Rebuild df1 with a purely numeric Age column (one np.nan gap) so that a
# column statistic can be computed from it.
dict1 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, np.nan, 32],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)
df1 = pd.DataFrame(data=dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [37]:
# Average of the non-missing Age values (the NaN is skipped); used as the fill value.
mean=df1['Age'].mean()

In [38]:
# Fill the gap in Age with the column mean. This returns a new Series and
# does not modify df1.
df1['Age'].fillna(mean)

0    30.0
1    31.0
2    31.0
3    32.0
Name: Age, dtype: float64

**KNN imputer**

- KNN means k nearest neighbours

- where k is a number

- instead of taking the average of all the numbers

- it considers only k neighbours

- how can we know which observations are the neighbours?

- based on a distance metric, we calculate the distance between observations

- the user decides how many neighbours to choose

- k = hyperparameter

- if k=5 it will take the top 5 nearest neighbours, which means

- those 5 observations have the smallest distance

- we calculate the average of those observations' values

- package name : sklearn

    - module name : preprocessing
    
        - class names : ss, mms, pt (presumably StandardScaler, MinMaxScaler, PowerTransformer)

    - module name : impute
 
        - class name : KNNImputer

In [39]:
# Display df1 before imputing Age; the gap is still present at this point.
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [42]:
from sklearn.impute import KNNImputer

# The imputer is given Age as a single-column 2-D matrix (4 rows x 1 column)
# and the imputed values are written back into df1['Age'].
# NOTE(review): with Age as the only feature there is presumably nothing to
# measure neighbour distance on, which would explain why the result equals the
# column mean (31.0) — confirm against the KNNImputer documentation.
knn = KNNImputer()
age_matrix = df1[['Age']].to_numpy()
df1['Age'] = knn.fit_transform(age_matrix)
df1['Age']

0    30.0
1    31.0
2    31.0
3    32.0
Name: Age, dtype: float64