### 如何使用Pandas处理缺失值(如NA或NaN)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a 5x3 frame of standard-normal samples labelled a/c/e/f/h, then
# reindex it onto a..h. Labels missing from the original index (b, d, g)
# become rows filled with NaN ("Not a Number").
df = (
    pd.DataFrame(
        np.random.randn(5, 3),
        index=list('acefh'),
        columns=['one', 'two', 'three'],
    )
    .reindex(list('abcdefgh'))
)
df

Unnamed: 0,one,two,three
a,-0.628329,0.090403,2.934687
b,,,
c,0.383439,-0.174371,0.250727
d,,,
e,-2.298551,-0.867845,-0.128002
f,-1.483917,-1.070333,-0.510737
g,,,
h,0.086763,0.738312,-0.449155


#### 检查缺失值
Pandas提供了isnull()和notnull()函数，它们也是Series和DataFrame对象的方法

In [4]:
# Boolean mask for column 'one': True where the value is missing.
# The top-level pd.isnull() function is equivalent to the Series method.
print(pd.isnull(df['one']))

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [5]:
# Boolean mask for column 'two': True where a value is present.
# The top-level pd.notnull() function is equivalent to the Series method.
print(pd.notnull(df['two']))

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: two, dtype: bool


#### 缺失数据的计算
- 在对数据求和时，NA将被视为0
- 如果数据全部为NA，则求和结果为0

In [6]:
# sum() skips missing values (skipna=True is the default), so NaN entries
# contribute nothing to the total.
print(df['one'].sum(skipna=True))

-3.940594878034674


In [7]:
# A frame created with labels but no data: every cell is NaN.
df = pd.DataFrame(
    index=list(range(6)),
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,


In [9]:
# Every value in 'one' is NaN; with the default min_count=0 the sum of an
# all-NA column is reported as 0 rather than NaN.
print(df['one'].sum(min_count=0))

0


#### 清理/填充缺失数据
fillna()函数可以通过几种方法用非空数据“填充”NA值

##### 用标量替换

In [10]:
# 3x3 frame of standard-normal samples labelled a/c/e; it has no missing
# values yet.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
a,0.708479,0.409697,0.318239
c,0.943825,-1.205472,0.312219
e,-0.127514,0.15539,-0.188799


In [14]:
# Reindex onto a/b/c: the new label 'b' becomes a NaN row and any label not
# listed here is dropped.
df = df.reindex(index=['a', 'b', 'c'])
df

Unnamed: 0,one,two,three
a,0.708479,0.409697,0.318239
b,,,
c,0.943825,-1.205472,0.312219


In [15]:
# Replace every NaN with the scalar 0; fillna returns a new frame and leaves
# df itself unchanged.
print(df.fillna(value=0))

        one       two     three
a  0.708479  0.409697  0.318239
b  0.000000  0.000000  0.000000
c  0.943825 -1.205472  0.312219


##### 向前和向后填充NA
- pad/ffill    向前填充：用前一个有效值填充NA
- bfill/backfill    向后填充：用后一个有效值填充NA

In [16]:
# Fresh 5x3 random frame labelled a/c/e/f/h, reindexed onto a..h so that
# rows b, d and g are all-NaN and can be used to demonstrate filling.
df = (
    pd.DataFrame(
        np.random.randn(5, 3),
        index=list('acefh'),
        columns=['one', 'two', 'three'],
    )
    .reindex(index=list('abcdefgh'))
)
df

Unnamed: 0,one,two,three
a,-0.63022,-0.830203,0.784068
b,,,
c,1.457237,1.516881,-0.958695
d,,,
e,1.247532,-0.667112,-0.958705
f,0.174082,0.124903,-0.334668
g,,,
h,-0.316185,1.008244,-0.885601


In [17]:
# Forward fill: each NaN takes the last valid value above it in its column.
# df.ffill() replaces the deprecated fillna(method='pad') spelling, which
# emits a FutureWarning on pandas >= 2.1 and is rejected by newer releases.
print(df.ffill())

        one       two     three
a -0.630220 -0.830203  0.784068
b -0.630220 -0.830203  0.784068
c  1.457237  1.516881 -0.958695
d  1.457237  1.516881 -0.958695
e  1.247532 -0.667112 -0.958705
f  0.174082  0.124903 -0.334668
g  0.174082  0.124903 -0.334668
h -0.316185  1.008244 -0.885601


In [19]:
# Backward fill: each NaN takes the next valid value below it in its column.
# df.bfill() replaces the deprecated fillna(method='backfill') spelling,
# which emits a FutureWarning on pandas >= 2.1 and is rejected by newer
# releases.
print(df.bfill())

        one       two     three
a -0.630220 -0.830203  0.784068
b  1.457237  1.516881 -0.958695
c  1.457237  1.516881 -0.958695
d  1.247532 -0.667112 -0.958705
e  1.247532 -0.667112 -0.958705
f  0.174082  0.124903 -0.334668
g -0.316185  1.008244 -0.885601
h -0.316185  1.008244 -0.885601


##### 删除缺失值
使用dropna函数和axis参数。默认axis=0按行处理：只要某一行含有NA，整行都会被删除；axis=1则删除含有NA的列。

In [20]:
# Print the frame as plain text to see which rows currently hold NaN before
# dropping them.
print(df)

        one       two     three
a -0.630220 -0.830203  0.784068
b       NaN       NaN       NaN
c  1.457237  1.516881 -0.958695
d       NaN       NaN       NaN
e  1.247532 -0.667112 -0.958705
f  0.174082  0.124903 -0.334668
g       NaN       NaN       NaN
h -0.316185  1.008244 -0.885601


In [25]:
# Drop every row that contains at least one NaN (the defaults, written out).
df.dropna(axis=0, how='any')

Unnamed: 0,one,two,three
a,-0.63022,-0.830203,0.784068
c,1.457237,1.516881,-0.958695
e,1.247532,-0.667112,-0.958705
f,0.174082,0.124903,-0.334668
h,-0.316185,1.008244,-0.885601


In [26]:
# Drop every column that contains at least one NaN. Here each column has
# missing values, so only the index remains.
df.dropna(axis='columns')

a
b
c
d
e
f
g
h


##### 替换缺失值(或)通用值
很多时候需要用某个具体的值替换一个通用的值，可以使用replace方法来实现这一点

In [27]:
# Small integer frame; 2000 (column 'one') and 1000 (column 'two') are the
# placeholder values the following cells replace.
df = pd.DataFrame(
    dict(
        one=[10, 20, 30, 40, 50, 2000],
        two=[1000, 0, 30, 40, 50, 60],
    )
)
df

Unnamed: 0,one,two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


In [28]:
# Replace every occurrence of 1000, in any column, with 10.
print(df.replace(to_replace=1000, value=10))

    one  two
0    10   10
1    20    0
2    30   30
3    40   40
4    50   50
5  2000   60


In [29]:
# Replace several values at once with an {old: new} mapping.
print(df.replace(to_replace={1000: 77, 2000: 88}))

   one  two
0   10   77
1   20    0
2   30   30
3   40   40
4   50   50
5   88   60


In [35]:
# Format-spec demo: field index 1 selects the second positional argument
# ('hello adam'), and '<30' left-aligns it in a field 30 characters wide.
# The first argument ('left aligned') is not referenced by the template.
'{1:<30}'.format('left aligned','hello adam')

'hello adam                    '