## Missing Value

In [1]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [2]:
# Build a small Series of strings in which one entry is missing (NaN)
data = Series(data=['one', 'four', np.nan, 'two'])

In [3]:
# Display the Series; the missing entry at index 2 is shown as NaN
data

0     one
1    four
2     NaN
3     two
dtype: object

In [4]:
# Boolean mask of the same length as data: True where the value is missing
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Remove the missing entry; the remaining values keep their original index labels (0, 1, 3)
data.dropna()

0     one
1    four
3     two
dtype: object

In [6]:
# Missing values in a DataFrame.
# Start with a 4x3 frame: rows 1 and 2 each contain one NaN, row 3 is entirely NaN.

df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [np.nan, 5, 6],
        [7, np.nan, 9],
        [np.nan, np.nan, np.nan],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [7]:
# how='any' drops every row that contains at least one NaN, so only row 0 survives.
# (how='all' would instead drop only the rows in which every value is NaN.)
df.dropna(how = 'any')               # drop a row if ANY of its values is NaN

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [8]:
# df still has all four rows: dropna returned a new DataFrame and left df untouched
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [9]:
# Next: dropping rows based on a threshold of non-null values.
# Create a second DataFrame, df2, whose rows contain 1, 1, 2 and 3 NaNs respectively.

df2 = DataFrame(
    data=[
        [1, 2, 3, np.nan],
        [2, np.nan, 5, 6],
        [np.nan, 7, np.nan, 8],
        [9, np.nan, np.nan, np.nan],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,8.0
3,9.0,,,


In [10]:
# thresh=1 keeps rows that have at least ONE non-null value.
# Every row of df2 has at least one, so nothing is dropped here.
df2.dropna(thresh = 1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,8.0
3,9.0,,,


In [11]:
# thresh=3 keeps only rows that have at least three non-null values (rows 0 and 1)
df2.dropna(thresh = 3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0


In [12]:
# df2 still contains all four rows: the dropna calls above did not modify it
df2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,8.0
3,9.0,,,


In [13]:
# Replace every NaN with the constant 1; the result is a new DataFrame, df2 is not changed
df2.fillna(1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,8.0
3,9.0,1.0,1.0,1.0


In [14]:
# Fill each column with its own replacement value by passing a dict keyed by column name
df2.fillna(
    value={
        'A': df2['A'].mean(),  # mean of the non-missing values in column A
        'B': 'b',
        'C': 'c',
        'D': 'd',
    }
)

Unnamed: 0,A,B,C,D
0,1.0,2,3,d
1,2.0,b,5,6
2,4.0,7,c,8
3,9.0,b,c,d


#### Note: Although we dropped and filled missing values above, those calls returned new DataFrames and did not permanently change the original DataFrame. To keep a modification, either assign the result back to a variable or pass the `inplace=True` parameter.

In [15]:
# Option 1: assign the filled result back to the name
# df2 = df2.fillna(0)

# Option 2: pass inplace=True so that df2 itself is modified
df2.fillna(0, inplace = True)
df2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,0.0
1,2.0,0.0,5.0,6.0
2,0.0,7.0,0.0,8.0
3,9.0,0.0,0.0,0.0
