## 处理缺失值

In [15]:
from io import StringIO
import pandas as pd
from numpy import NaN

In [28]:
# Build a small CSV-formatted sample as text.
# Row 2 leaves column C empty and row 3 stops after three fields, so column d
# has no value there; these are the two missing entries used in this notebook.
csv_data = (
    'A,B,C,d\n'
    '          1.0,2.0,3.0,4.0\n'
    '          5.0,6.0,,8.0\n'
    '          0.0,11.0,12.0'
)

In [46]:
# Parse the CSV text into a DataFrame by wrapping the string in an in-memory
# file-like buffer, then display the frame as the cell output.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))
df

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [47]:
# Count the missing values in each column: flag every NaN cell, then sum the
# flags down the rows.
df.isna().sum(axis=0)

A    0
B    0
C    1
d    1
dtype: int64

In [48]:
# Get the data of the DataFrame as a NumPy array.
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute; for this all-float frame both return the same array.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [ 0., 11., 12., nan]])

## 去除空值的常用方法

In [49]:
# Drop every row that contains at least one missing value
# (axis=0 and how='any' are the defaults, spelled out here for clarity).
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0


In [50]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [51]:
# Drop only the rows in which every value is NaN
# (no such row exists in this frame, so all rows are kept).
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [52]:
# Drop the rows that have fewer than 4 non-missing values
# (thresh is the minimum number of non-NaN entries a row needs to be kept).
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0


In [53]:
# Drop the rows where NaN appears in specific columns (here only column 'C');
# missing values in other columns do not cause a row to be dropped.
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


### 填充缺失值(插补法)

In [54]:
# Replace missing values with an estimate; mean imputation is the most common choice.
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in
# 0.22, so its replacement sklearn.impute.SimpleImputer is used instead.
from sklearn.impute import SimpleImputer

# SimpleImputer treats NaN as the missing-value marker by default and always
# imputes column by column (what the old Imputer selected with axis=0), so
# neither missing_values nor axis is passed.
imr = SimpleImputer(strategy='mean')
# Fit on the plain NumPy array, the same kind of input that is later passed to
# transform(); fitting on the DataFrame would record column names and make
# transform() warn when it receives an array without them.
imr = imr.fit(df.values)

In [57]:
# Apply the fitted imputer to the raw array: each NaN is replaced using the
# per-column statistic (strategy='mean') computed during fit.
imputed_data = imr.transform(X=df.values)

In [58]:
# Show the original array and the imputed array, one after the other,
# separated by a dashed rule for a before/after comparison.
print(df.values, '---------------------', imputed_data, sep='\n')

[[ 1.  2.  3.  4.]
 [ 5.  6. nan  8.]
 [ 0. 11. 12. nan]]
---------------------
[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [ 0.  11.  12.   6. ]]


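先计算每一列的平均值，然后用相应列的平均值来替换NaN。旧版的 sklearn.preprocessing.Imputer 提供 axis 参数：axis=0 表示按列计算，如果改为 axis=1，则会按行计算，即用每个样本的所有特征的平均值来填充；自 scikit-learn 0.20 引入的 SimpleImputer 取消了该参数，只支持按列填充。参数strategy的其他取值包括median(中位数)和most_frequent(众数)。most_frequent对于处理分类数据类型的缺失值很有用。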