## Working with Missing Data in Pandas

In [38]:
import numpy as np
import pandas as pd

from pandas import DataFrame

### Filling missing values using fillna(), replace() and interpolate()

In [39]:
# Build a small demo table, then blank out some entries so the cells below
# have missing values (NaN) to detect, fill, interpolate and drop.
data = {
    'name': ['steve', 'john', 'tim', 'tom', 'jolin', 'mathew', 'micheal'],
    'age': [20, 22, 21, 20, 24, 21, 23],
    'gender': ['male', 'male', 'male', 'male', 'female', 'male', 'male'],
    'rank': [2, 1, 4, 5, 6, 7, 6],
}
# NaN is a float, so cast the numeric columns to float64 before assigning it;
# writing NaN into an int64 column depends on silent upcasting, which newer
# pandas versions deprecate.
ranking_df = DataFrame(data).astype({'age': 'float64', 'rank': 'float64'})
# Label slices with .loc are inclusive, so 2:4 covers rows 2, 3 and 4.
ranking_df.loc[2:4, 'age'] = np.nan
ranking_df.loc[3:5, 'rank'] = np.nan
ranking_df.loc[3, :] = np.nan  # row 3 becomes entirely missing
ranking_df

Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,tim,,male,4.0
3,,,,
4,jolin,,female,
5,mathew,21.0,male,
6,micheal,23.0,male,6.0


In [40]:
ranking_df.isna()  # boolean frame: True wherever the value is missing (NaN)

Unnamed: 0,name,age,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True
4,False,True,False,True
5,False,False,False,True
6,False,False,False,False


In [41]:
ranking_df.notna()  # boolean frame: True wherever a value is present

Unnamed: 0,name,age,gender,rank
0,True,True,True,True
1,True,True,True,True
2,True,False,True,True
3,False,False,False,False
4,True,False,True,False
5,True,True,True,False
6,True,True,True,True


In [42]:
# Boolean Series that flags the rows whose age is missing (NaN).
mask = ranking_df['age'].isna()
ranking_df.loc[mask]  # keep only the rows where age is null

Unnamed: 0,name,age,gender,rank
2,tim,,male,4.0
3,,,,
4,jolin,,female,


In [43]:
ranking_df.fillna(value=0)  # replace every missing entry, in every column, with 0

Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,tim,0.0,male,4.0
3,0,0.0,0,0.0
4,jolin,0.0,female,0.0
5,mathew,21.0,male,0.0
6,micheal,23.0,male,6.0


In [44]:
# Forward fill: each missing value is replaced by the last valid value above it
# in the same column. ffill() replaces the deprecated fillna(method='pad').
ranking_df.ffill()

  ranking_df.fillna(method='pad') # missing value alternated by last row value


Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,tim,22.0,male,4.0
3,tim,22.0,male,4.0
4,jolin,22.0,female,4.0
5,mathew,21.0,male,4.0
6,micheal,23.0,male,6.0


In [45]:
# Backward fill: each missing value is replaced by the next valid value below it
# in the same column. bfill() replaces the deprecated fillna(method='bfill').
ranking_df.bfill()

  ranking_df.fillna(method='bfill') # back fill gon find the next not null value


Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,tim,21.0,male,4.0
3,jolin,21.0,female,6.0
4,jolin,21.0,female,6.0
5,mathew,21.0,male,6.0
6,micheal,23.0,male,6.0


In [46]:
# Linear interpolation is only defined for numeric data, so interpolate the
# numeric columns and leave the text columns as they are. Calling
# interpolate() on a frame with object-dtype columns is deprecated in pandas.
numeric_cols = ranking_df.select_dtypes(include='number').columns
interpolated_df = ranking_df.copy()
interpolated_df[numeric_cols] = ranking_df[numeric_cols].interpolate(method='linear')
interpolated_df



  ranking_df.interpolate(method='linear')  # Perform interpolation


Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,tim,21.75,male,4.0
3,,21.5,,4.5
4,jolin,21.25,female,5.0
5,mathew,21.0,male,5.5
6,micheal,23.0,male,6.0


In [47]:
ranking_df.dropna(how='any')  # drop every row that has at least one missing value

Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
6,micheal,23.0,male,6.0


In [None]:
# A row is dropped only when all of its values are NaN (here: row 3).
ranking_df.dropna(axis=0, how='all')

Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,tim,,male,4.0
4,jolin,,female,
5,mathew,21.0,male,
6,micheal,23.0,male,6.0


In [None]:
# Drop every column that contains at least one NaN.
ranking_df.dropna(axis='columns')

0
1
2
3
4
5
6


In [50]:
# Drop every row that contains at least one NaN.
ranking_df.dropna(axis='index')

Unnamed: 0,name,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
6,micheal,23.0,male,6.0
