In [1]:
# Handling missing data
import numpy as np
import pandas as pd
from numpy import nan as NA
from pandas import DataFrame
from pandas import Series

In [2]:
# Object-dtype Series holding three strings and one missing entry (NaN).
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
# Print the Series, then its boolean mask (True where the value is missing).
print(string_data, string_data.isnull(), sep='\n')

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
0    False
1    False
2     True
3    False
dtype: bool


In [3]:
# None is treated as missing as well, just like NaN.
string_data[0] = None
print(pd.isnull(string_data))

0     True
1    False
2     True
3    False
dtype: bool


In [4]:
# Methods for handling NA
# dropna: filters axis labels according to whether their values contain missing data; a threshold adjusts how much missing data is tolerated.
# fillna: fills missing data with a specified value or with a fill method (ffill or bfill)
# isnull: returns an object of booleans marking which values are missing/NaN; the result has the same type as the source object.
# notnull: the negation of isnull

In [5]:
# --------------------
# Filtering out missing data
# --------------------

In [6]:
# Float Series with two missing entries.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
# dropna() returns a new Series containing only the non-missing values.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
 data[pd.notnull(data)]  # boolean-mask selection; keeps the same entries as data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# 4x3 frame: one complete row, two partially missing rows, one all-NA row.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
# how='any' (the default) discards every row that contains at least one NA.
data.dropna(how='any')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
data.dropna(axis=0, how='all')  # drop a row only when every value in it is NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
# Append a column labelled 4 whose values are all NA, then display the frame.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [11]:
data.dropna(axis='columns', how='all')  # drop a column only when all of its values are NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# Draw the demo values from a seeded generator so that Restart & Run All
# reproduces the same frame; the unseeded np.random.randn call produced
# different numbers on every run.
df = DataFrame(np.random.default_rng(0).standard_normal((7, 3)))
df.iloc[:3, 1] = NA  # first 3 rows of column 1 (the 2nd column)
df.iloc[:2, 2] = NA  # first 2 rows of column 2 (the 3rd column)
df

Unnamed: 0,0,1,2
0,0.997564,,
1,0.956394,,
2,0.751743,,1.355589
3,-0.573066,1.657399,0.006064
4,-0.336803,0.342318,-1.133819
5,0.96713,-0.94445,0.985064
6,-0.241471,1.348159,-0.379127


In [13]:
df.dropna(axis=0, thresh=2)  # keep only the rows that have at least 2 non-NA values

Unnamed: 0,0,1,2
2,0.751743,,1.355589
3,-0.573066,1.657399,0.006064
4,-0.336803,0.342318,-1.133819
5,0.96713,-0.94445,0.985064
6,-0.241471,1.348159,-0.379127


In [14]:
# Filling in missing data

In [15]:
df.fillna(value=0)  # replace every NA with 0; returns a new frame

Unnamed: 0,0,1,2
0,0.997564,0.0,0.0
1,0.956394,0.0,0.0
2,0.751743,0.0,1.355589
3,-0.573066,1.657399,0.006064
4,-0.336803,0.342318,-1.133819
5,0.96713,-0.94445,0.985064
6,-0.241471,1.348159,-0.379127


In [16]:
# Dict keys are column labels: NA in column 1 becomes 0.5, NA in column 2 becomes -1.
df.fillna(value={1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,0.997564,0.5,-1.0
1,0.956394,0.5,-1.0
2,0.751743,0.5,1.355589
3,-0.573066,1.657399,0.006064
4,-0.336803,0.342318,-1.133819
5,0.96713,-0.94445,0.985064
6,-0.241471,1.348159,-0.379127


In [17]:
# Fill in place: df itself is modified and no new frame is returned.
df.fillna(value=66.66, inplace=True)
df

Unnamed: 0,0,1,2
0,0.997564,66.66,66.66
1,0.956394,66.66,66.66
2,0.751743,66.66,1.355589
3,-0.573066,1.657399,0.006064
4,-0.336803,0.342318,-1.133819
5,0.96713,-0.94445,0.985064
6,-0.241471,1.348159,-0.379127


In [18]:
# Draw the demo values from a seeded generator so that Restart & Run All
# reproduces the same frame; the unseeded np.random.randn call produced
# different numbers on every run.
df = DataFrame(np.random.default_rng(1).standard_normal((6, 3)))
df.iloc[2:, 1] = NA  # rows at position 2 and beyond, column 1
df.iloc[4:, 2] = NA  # rows at position 4 and beyond, column 2
df

Unnamed: 0,0,1,2
0,-1.188927,-0.986986,0.052379
1,0.25632,0.499657,-0.25631
2,0.116282,,-0.968075
3,0.153187,,0.586539
4,0.290128,,
5,-0.9236,,


In [19]:
# Forward fill: each NA takes the most recent non-NA value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
df.ffill()

Unnamed: 0,0,1,2
0,-1.188927,-0.986986,0.052379
1,0.25632,0.499657,-0.25631
2,0.116282,0.499657,-0.968075
3,0.153187,0.499657,0.586539
4,0.290128,0.499657,0.586539
5,-0.9236,0.499657,0.586539


In [20]:
# Forward fill at most 2 consecutive NA values per gap; any further NA values stay missing.
# DataFrame.ffill() replaces fillna(method='ffill', ...), whose `method` argument is deprecated.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.188927,-0.986986,0.052379
1,0.25632,0.499657,-0.25631
2,0.116282,0.499657,-0.968075
3,0.153187,0.499657,0.586539
4,0.290128,,0.586539
5,-0.9236,,0.586539


In [21]:
# Float Series with two missing entries.
data = pd.Series([1.0, np.nan, 3.5, np.nan, 7])
# Replace each NA with the mean of the non-missing values (mean() skips NA).
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [22]:
# Parameters of fillna
# value: scalar or dict-like object used to fill the missing values
# method: fill method ('ffill' or 'bfill'); defaults to None (no method-based fill), not ffill.
#         Deprecated in recent pandas releases; prefer the ffill()/bfill() methods.
# axis: axis along which to fill; behaves as axis=0 when omitted
# inplace: modify the calling object instead of producing a copy
# limit: maximum number of consecutive missing values to fill