In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a one-column demo frame indexed by six consecutive business days
# starting 2013-08-01. The values come from NumPy's global RNG; no seed is set
# in the visible cells, so the numbers differ between runs.
business_days = pd.date_range('2013-08-01', periods=6, freq='B')
random_values = np.random.randn(6, 1)
df = pd.DataFrame(random_values, index=business_days, columns=['A'])
df

Unnamed: 0,A
2013-08-01,-1.212555
2013-08-02,0.160446
2013-08-05,0.364149
2013-08-06,1.925536
2013-08-07,-0.385254
2013-08-08,0.392789


In [3]:
# Blank out column 'A' in the fourth row (position 3) to create a missing value.
fourth_label = df.index[3]
df.at[fourth_label, 'A'] = np.nan
df

Unnamed: 0,A
2013-08-01,-1.212555
2013-08-02,0.160446
2013-08-05,0.364149
2013-08-06,
2013-08-07,-0.385254
2013-08-08,0.392789


#### 将索引倒序后, 用上一行的数据填充缺失值 (ffill)

In [4]:
# Reverse the row order first, then forward-fill: each NaN takes the value of
# the row displayed directly above it in the reversed frame.
df.iloc[::-1].ffill()

Unnamed: 0,A
2013-08-08,0.392789
2013-08-07,-0.385254
2013-08-06,-0.385254
2013-08-05,0.364149
2013-08-02,0.160446
2013-08-01,-1.212555


### 1. 缺失值判断

In [5]:
# A Series of strings that contains both kinds of "missing": np.nan and None.
s = pd.Series(data=['a', 'b', np.nan, 'c', None])
s

0       a
1       b
2     NaN
3       c
4    None
dtype: object

In [6]:
# Boolean mask: True wherever the element is missing (NaN or None).
s.isna()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [11]:
# Keep only the missing entries by selecting with the boolean mask.
s.loc[s.isna()]

2     NaN
4    None
dtype: object

In [8]:
# Two-row frame with one missing cell per row (np.nan in row 0, None in row 1).
data = [
    [1, np.nan, 2],
    [3, 4, None],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1,,2.0
1,3,4.0,


In [9]:
# Cell-by-cell boolean mask: True where the value is missing.
df.isna()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,True


In [10]:
# Masking a frame with its own null-mask keeps only the cells that are already
# missing and blanks out everything else, so every cell of the result is NaN.
df.where(df.isna())

Unnamed: 0,0,1,2
0,,,
1,,,


### 2. 过滤缺失数据

#### 删除含有缺失值的行和列

In [14]:
# Four-row sample: rows 0-2 each contain at least one missing value,
# row 3 is complete.
data = [
    [1, np.nan, 2],
    [9, None, np.nan],
    [3, 4, None],
    [5, 6, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1,,2.0
1,9,,
2,3,4.0,
3,5,6.0,7.0


In [18]:
# dropna() removes every row that contains a missing value (rows are the
# default axis). It returns a new frame; df itself is not changed.
df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
3,5,6.0,7.0


In [20]:
# Drop every column that contains a missing value.
df.dropna(axis='columns')

Unnamed: 0,0
0,1
1,9
2,3
3,5


#### 删除全为NaN的行和列

In [23]:
# Sample in which row 1 and column 1 consist entirely of missing values.
data = [
    [1, np.nan, 2],
    [np.nan, None, np.nan],
    [3, None, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1.0,,2.0
1,,,
2,3.0,,
3,5.0,,7.0


In [24]:
# Drop only the rows in which every value is missing.
df.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,,2.0
2,3.0,,
3,5.0,,7.0


In [25]:
# Drop only the columns in which every value is missing.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,2
0,1.0,2.0
1,,
2,3.0,
3,5.0,7.0


#### inplace 参数 (设置为True:没有返回值,df发生改变; 默认False:df不发生变化,有返回值)

In [26]:
# Rebuild the sample frame (row 1 and column 1 are entirely missing) so the
# in-place example below starts from fresh data.
data = [
    [1, np.nan, 2],
    [np.nan, None, np.nan],
    [3, None, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1.0,,2.0
1,,,
2,3.0,,
3,5.0,,7.0


In [31]:
# inplace=True modifies df directly instead of returning a new frame, so df is
# displayed on its own line afterwards.
df.dropna(axis='columns', how='all', inplace=True)
df

Unnamed: 0,0,2
0,1.0,2.0
1,,
2,3.0,
3,5.0,7.0


#### 通过 how 和 thresh 参数控制删除条件, 并观察删除后的数据

In [33]:
# Rebuild the sample frame: the rows hold 2, 0, 1 and 2 non-missing values
# respectively.
data = [
    [1, np.nan, 2],
    [np.nan, None, np.nan],
    [3, None, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1.0,,2.0
1,,,
2,3.0,,
3,5.0,,7.0


In [36]:
# how='all' drops a row only when every value in it is NaN; the default
# how='any' drops a row as soon as it contains any missing value.
df.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,,2.0
2,3.0,,
3,5.0,,7.0


In [42]:
# thresh is the minimum number of non-missing values a row must contain to be
# kept; here rows with fewer than 2 non-NaN values are dropped. It works the
# same way for columns when axis=1 is given.
# how and thresh are mutually exclusive (recent pandas raises TypeError when
# both are passed), so only thresh is supplied.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
0,1.0,,2.0
3,5.0,,7.0


### 3. 填充缺失值

#### 指定特殊值填充缺失值

In [43]:
# Sample frame for the fill examples: column 0 is complete, column 1 is
# missing in rows 1 and 3, column 2 is missing in row 2.
data = [
    [1, 2, 2],
    [3, None, 6],
    [3, 7, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,,6.0
2,3,7.0,
3,5,,7.0


In [44]:
# Replace every missing value with 0.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,0.0,6.0
2,3,7.0,0.0
3,5,0.0,7.0


#### 不同列使用不同的填充值

In [45]:
# Rebuild the sample frame (missing values in columns 1 and 2) for the
# per-column fill examples.
data = [
    [1, 2, 2],
    [3, None, 6],
    [3, 7, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,,6.0
2,3,7.0,
3,5,,7.0


In [46]:
# Per-column fill values keyed by column label: NaNs in column 1 become 1,
# NaNs in column 2 become 2.
df.fillna(value={1: 1, 2: 2})

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,1.0,6.0
2,3,7.0,2.0
3,5,1.0,7.0


In [48]:
# Relabel the three columns as 'A', 'B', 'C' (this modifies df itself).
df.columns = list('ABC')

In [50]:
# Per-column fill values keyed by column name: NaNs in 'B' become 3,
# NaNs in 'C' become 5.
df.fillna(value=dict(B=3, C=5))

Unnamed: 0,A,B,C
0,1,2.0,2.0
1,3,3.0,6.0
2,3,7.0,5.0
3,5,3.0,7.0


#### 前向填充和后向填充

In [51]:
# Rebuild the sample frame for the forward/backward fill examples; note that
# column 1 is missing in the last row.
data = [
    [1, 2, 2],
    [3, None, 6],
    [3, 7, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,,6.0
2,3,7.0,
3,5,,7.0


In [53]:
# Forward fill: by default each NaN takes the last valid value above it in the
# same column; pass axis=1 to fill along each row from the left instead.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,2.0,6.0
2,3,7.0,6.0
3,5,7.0,7.0


In [54]:
# Backward fill: each NaN takes the next valid value below it in the same
# column; a NaN with no valid value after it (such as one in the last row)
# stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,7.0,6.0
2,3,7.0,7.0
3,5,,7.0


#### 使用列的平均值进行填充

In [55]:
# Rebuild the sample frame for the mean-fill example: the non-missing values
# are 2 and 7 in column 1, and 2, 6 and 7 in column 2.
data = [
    [1, 2, 2],
    [3, None, 6],
    [3, 7, None],
    [5, None, 7],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,,6.0
2,3,7.0,
3,5,,7.0


In [57]:
# Fill each column's NaNs with that column's mean (missing values are skipped
# when the mean is computed).
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,0,1,2
0,1,2.0,2.0
1,3,4.5,6.0
2,3,7.0,5.0
3,5,4.5,7.0
