In [1]:
import pandas as pd
import numpy as np

In [83]:
df = pd.DataFrame({
      'A':['a1','a1','a2','a3'],
      'B':['b1',None,'b2','b3'],
      'C':[1,2,3,4],
      'D':[5,None,9,10]})

In [84]:
df

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
1,a1,,2,
2,a2,b2,3,9.0
3,a3,b3,4,10.0


## 二、缺失值判断

### 1、对整个dataframe判断缺失

In [85]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,True,False,True
2,False,False,False,False
3,False,False,False,False


### 2、对某个列判断缺失

In [86]:
df['C'].isnull()

0    False
1    False
2    False
3    False
Name: C, dtype: bool

In [87]:
df.notna()

Unnamed: 0,A,B,C,D
0,True,True,True,True
1,True,False,True,False
2,True,True,True,True
3,True,True,True,True


## 三、缺失值统计

### 1、列缺失

In [89]:
## 列缺失统计
df.isnull().sum(axis=0)

A    0
B    1
C    0
D    1
dtype: int64

### 2、行缺失

In [90]:
df.isnull().sum(axis=1)

0    0
1    2
2    0
3    0
dtype: int64

### 3、缺失率

In [91]:
## 缺失率
df.isnull().sum(axis=0)/df.shape[0]

A    0.00
B    0.25
C    0.00
D    0.25
dtype: float64

In [94]:
## 缺失率（一步到位）
df.isnull().mean()

A    0.00
B    0.25
C    0.00
D    0.25
dtype: float64

## 四、缺失值筛选

In [95]:
# 筛选有缺失值的行
df.loc[df.isnull().any(1)]

Unnamed: 0,A,B,C,D
1,a1,,2,


In [96]:
# 筛选有缺失值的列
df.loc[:,df.isnull().any()]

Unnamed: 0,B,D
0,b1,5.0
1,,
2,b2,9.0
3,b3,10.0


In [98]:
# 取反
df.loc[~(df.isnull().any(1))]

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
2,a2,b2,3,9.0
3,a3,b3,4,10.0


## 五、缺失值填充

In [99]:
# 将dataframe所有缺失值填充为0
df.fillna(0)

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
1,a1,0,2,0.0
2,a2,b2,3,9.0
3,a3,b3,4,10.0


In [100]:
# 将D列缺失值填充为-999
df.D.fillna('-999')

0       5
1    -999
2       9
3      10
Name: D, dtype: object

In [101]:
# 向前填充
df.ffill()

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
1,a1,b1,2,5.0
2,a2,b2,3,9.0
3,a3,b3,4,10.0


In [102]:
# 平均值填充
df.D.fillna(df.D.mean())

0     5.0
1     8.0
2     9.0
3    10.0
Name: D, dtype: float64


## 六、缺失值删除

### 1、全部直接删除

In [103]:
df.dropna()

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
2,a2,b2,3,9.0
3,a3,b3,4,10.0


### 2、行缺失删除

In [104]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
2,a2,b2,3,9.0
3,a3,b3,4,10.0


### 3、列缺失删除

In [105]:
df.dropna(axis=1)

Unnamed: 0,A,C
0,a1,1
1,a1,2
2,a2,3
3,a3,4


In [106]:
# 删除指定列范围内的缺失,因为C列无缺失，所以最后没有变化
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
1,a1,,2,
2,a2,b2,3,9.0
3,a3,b3,4,10.0


### 4、按缺失率删除

In [107]:
df.loc[:,df.isnull().mean(axis=0) < 0.1]

Unnamed: 0,A,C
0,a1,1
1,a1,2
2,a2,3
3,a3,4


In [108]:
# 删除行缺失大于0.1的
df.loc[df.isnull().mean(axis=1) < 0.1]

Unnamed: 0,A,B,C,D
0,a1,b1,1,5.0
2,a2,b2,3,9.0
3,a3,b3,4,10.0


## 七、缺失值参与计算

In [109]:
df.sum()

A    a1a1a2a3
C          10
D          24
dtype: object

In [110]:
# 加法
df.D.cumsum()

0     5.0
1     NaN
2    14.0
3    24.0
Name: D, dtype: float64

In [111]:
# 累加
df.D.cumsum(skipna=False)

0    5.0
1    NaN
2    NaN
3    NaN
Name: D, dtype: float64

In [112]:
# 对列计数
df.count()

A    4
B    3
C    4
D    3
dtype: int64

In [113]:
# 聚合分组
df.groupby('B').sum()

Unnamed: 0_level_0,C,D
B,Unnamed: 1_level_1,Unnamed: 2_level_1
b1,1,5.0
b2,3,9.0
b3,4,10.0


In [114]:
df.groupby('B',dropna=False).sum()

Unnamed: 0_level_0,C,D
B,Unnamed: 1_level_1,Unnamed: 2_level_1
b1,1,5.0
b2,3,9.0
b3,4,10.0
,2,0.0
