## Pandas缺失值处理

导包

In [1]:
import numpy as np
import pandas as pd

有两种丢失数据(空值)：
- None
- np.nan

### 1. None
- None是Python自带的，是Python中的空对象。None不能参与到任何计算中。
- object类型的运算要比int类型的运算慢得多  

In [2]:
# Time the sum of a one-million-element array stored with dtype=object
# (the recorded run below took about 78 ms per loop).
%timeit np.arange(1e6, dtype=object).sum()

78 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
# Time the same sum on an int32 array for comparison
# (the recorded run below took about 2.3 ms per loop).
%timeit np.arange(1e6, dtype=np.int32).sum()

2.29 ms ± 86.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### 2. np.nan

- np.nan是浮点类型，能参与到计算中。但计算的结果总是NaN。

In [4]:
# np.nan is an ordinary Python float.
type(np.nan)

float

- 但可以使用 `np.nan*()` 系列函数（例如 `np.nansum`、`np.nanmean`）进行计算，这类函数会自动忽略 NaN。

In [5]:
# Six-element sample with one missing entry; the values are stored as floats.
n = np.array([1, 2, 3, np.nan, 5, 6], dtype=float)
n

array([ 1.,  2.,  3., nan,  5.,  6.])

In [7]:
# A plain sum is poisoned by the NaN and evaluates to nan (this result is not displayed).
n.sum()
# nansum skips the NaN and adds up the remaining values; this is what the cell shows.
np.nansum(n)

17.0

In [8]:
# Arithmetic involving NaN propagates it: the result is NaN.
np.nan + 10

nan

### 3. Pandas中的None与NaN

#### 1) Pandas中None与np.nan都视作np.nan

- 创建DataFrame

In [9]:
# Random integers drawn from [0, 100), arranged as 5 rows x 5 columns.
data = np.random.randint(low=0, high=100, size=(5, 5))
# Label the columns A..E; the row index keeps its default 0..4.
df = pd.DataFrame(data, columns=['A', 'B', 'C', 'D', 'E'])
df

Unnamed: 0,A,B,C,D,E
0,60,7,49,87,46
1,45,80,69,34,48
2,8,27,93,50,65
3,55,50,81,2,75
4,54,98,62,50,11


- 使用DataFrame行索引与列索引修改DataFrame数据

In [10]:
# Integer columns cannot hold NaN. Relying on pandas to upcast the column
# silently during item assignment is deprecated in recent releases, so
# convert B and C to float explicitly before writing missing values.
df = df.astype({'B': 'float64', 'C': 'float64'})

# Both np.nan and None end up stored as NaN in a float column.
df.loc[2, 'B'] = np.nan
df.loc[3, 'C'] = None
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [11]:
# Both cells read back as NaN, although one of them was assigned None.
df.loc[2, 'B'], df.loc[3, 'C']

(nan, nan)

#### 2) pandas中None与np.nan的操作

- isnull()
- notnull()
- all()
- any()
- dropna():  过滤丢失数据
- fillna():  填充丢失数据


(1)判断函数

- isnull()
- notnull()


In [12]:
# Reference view: df has NaN at (2, 'B') and (3, 'C').
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [13]:
# Element-wise mask: True wherever a value is missing.
df.isnull()

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,False
1,False,False,False,False,False
2,False,True,False,False,False
3,False,False,True,False,False
4,False,False,False,False,False


In [14]:
# Element-wise mask: True wherever a value is present (the inverse of isnull()).
df.notnull()

Unnamed: 0,A,B,C,D,E
0,True,True,True,True,True
1,True,True,True,True,True
2,True,False,True,True,True
3,True,True,False,True,True
4,True,True,True,True,True


In [21]:
# all(): True only if every value is True, like a logical AND.
# any(): True if at least one value is True, like a logical OR.

# Columns holding at least one missing value. This is the usual check;
# swapping any() for all() would flag only columns that are entirely missing.
df.isnull().any(axis=0)

# Columns without any missing value (the result this cell displays).
# all() is the useful reduction here; any() would be satisfied by a single
# non-missing value.
df.notnull().all(axis=0)

A     True
B    False
C    False
D     True
E     True
dtype: bool

In [23]:
# Rows holding at least one missing value (this result is not displayed).
df.isnull().any(axis='columns')

# Rows without any missing value (the result this cell displays).
df.notnull().all(axis='columns')

0     True
1     True
2    False
3    False
4     True
dtype: bool

- 使用bool值索引过滤数据

In [24]:
# Frame to be filtered: rows 2 and 3 each contain one NaN.
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [31]:
# Row filtering, approach 1: flag the rows that contain a missing value,
# then negate the mask with ~ so that only complete rows are selected.
cond = df.isnull().any(axis='columns')
df.loc[~cond]

# Approach 2: flag the complete rows directly and select with that mask.
cond = df.notnull().all(axis='columns')
df.loc[cond]

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
4,54,98.0,62.0,50,11


In [37]:
# Column filtering, approach 1: flag the columns that contain a missing value,
# then keep the columns whose flag is False.
cond = df.isnull().any(axis=0)
df.loc[:, ~cond]

# Approach 2: flag the complete columns directly and keep exactly those.
cond = df.notnull().all(axis=0)
df.loc[:, cond]

Unnamed: 0,A,D,E
0,60,87,46
1,45,34,48
2,8,50,65
3,55,2,75
4,54,50,11


(2) 过滤函数
- dropna()

可以选择过滤的是行还是列（默认为行）

In [38]:
# Reference view before dropna: df has NaN at (2, 'B') and (3, 'C').
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [40]:
# Row-wise (the default axis): remove every row that contains a missing value.
df.dropna(axis=0)

# Column-wise: remove every column that contains a missing value
# (the result this cell displays).
df.dropna(axis='columns')

Unnamed: 0,A,D,E
0,60,87,46
1,45,34,48
2,8,50,65
3,55,2,75
4,54,50,11


也可以通过 `how` 参数选择过滤方式：`how='any'`（默认）只要有空值就删除，`how='all'` 则必须全部为空才删除。

In [44]:
# how='any': drop a row as soon as it has one missing value.
df.dropna(axis=0, how='any')

# how='all': drop a column only when every value in it is missing.
# No column of df is entirely missing, so the displayed frame is unchanged.
df.dropna(axis=1, how='all')

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


inplace=True 修改原数据

In [47]:
# Work on a copy so that df itself keeps its missing values.
df2 = df.copy()

# inplace=True makes dropna modify df2 directly instead of returning a new frame.
df2.dropna(axis=0, how='any', inplace=True)
df2

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
4,54,98.0,62.0,50,11


(3) 填充函数 Series/DataFrame
- fillna()

In [61]:
# Reference view before fillna: df has NaN at (2, 'B') and (3, 'C').
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [62]:
# Replace every missing value with 100. A new frame is returned; df is not modified.
df.fillna(100)

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,100.0,93.0,50,65
3,55,50.0,100.0,2,75
4,54,98.0,62.0,50,11


In [63]:
# df still contains its NaN values: fillna without inplace=True left it untouched.
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [64]:
# Copy df and blank out two more cells, so that columns B and C
# each contain two missing values.
df2 = df.copy()
df2.at[1, 'B'] = np.nan
df2.at[2, 'C'] = np.nan
df2

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,,69.0,34,48
2,8,,,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [67]:
# limit caps the number of fills: as the recorded result shows, only the first
# NaN in each column is replaced with 100. inplace=True writes into df2 itself.
df2.fillna(100, limit=1, inplace=True)

In [68]:
# df2 after the limited fill: one NaN remains in column B and one in column C.
df2

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,100.0,69.0,34,48
2,8,,100.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


可以选择前向填充还是后向填充

In [69]:
# Reference view before directional filling: df has NaN at (2, 'B') and (3, 'C').
df

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,,93.0,50,65
3,55,50.0,,2,75
4,54,98.0,62.0,50,11


In [71]:
# The `method` keyword of fillna is deprecated in recent pandas releases;
# the dedicated ffill()/bfill() methods perform the same fills.
df.ffill()  # forward fill: carry the last valid value down into the gap (not displayed)
df.bfill()  # backward fill: pull the next valid value up into the gap (displayed)

# Naming used by the old `method` argument:
#     'pad' / 'ffill': propagate the last valid observation forward to the next valid one.
#     'backfill' / 'bfill': use the next valid observation to fill the gap.

Unnamed: 0,A,B,C,D,E
0,60,7.0,49.0,87,46
1,45,80.0,69.0,34,48
2,8,50.0,93.0,50,65
3,55,50.0,62.0,2,75
4,54,98.0,62.0,50,11


In [73]:
# With axis=1 the fill runs across each row instead of down each column.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
df.ffill(axis=1)  # each gap takes the value from the column to its left (not displayed)
df.bfill(axis=1)  # each gap takes the value from the column to its right (displayed)

Unnamed: 0,A,B,C,D,E
0,60.0,7.0,49.0,87.0,46.0
1,45.0,80.0,69.0,34.0,48.0
2,8.0,93.0,93.0,50.0,65.0
3,55.0,50.0,2.0,2.0,75.0
4,54.0,98.0,62.0,50.0,11.0
