# pandas缺失数据

In [1]:
#引入模块
import pandas as pd
import numpy as np


In [94]:
#如何处理使用Pandas的缺失值(如NA或NaN)。
df1 = pd.DataFrame(np.random.randint(10,size = (5,3)),index = ['a','c','d','f','h'],columns = ['One','Two','Three'])
df1

Unnamed: 0,One,Two,Three
a,0,5,2
c,3,8,9
d,9,4,6
f,9,9,7
h,1,4,0


In [127]:
#重置行索引  
df1 = df.reindex(['a','b','c','d','e','f','g','h'])
df1
#使用重构索引(reindexi)，创建了一个缺少值的DataFrame。 在输出中，NaN表示不是数字的值。加上参数columns = xxx  是重置列索引


Unnamed: 0,One,Two,Three
a,8.0,2.0,4.0
b,,,
c,9.0,1.0,9.0
d,1.0,2.0,6.0
e,,,
f,6.0,1.0,4.0
g,,,
h,6.0,1.0,6.0


# （1）检查缺失值

In [96]:
# 为了更容易地检测缺失值(以及跨越不同的数组dtype)，Pandas提供了isnull()和notnull()函数，
df1.isnull()

Unnamed: 0,One,Two,Three
a,False,False,False
b,True,True,True
c,False,False,False
d,False,False,False
e,True,True,True
f,False,False,False
g,True,True,True
h,False,False,False


In [97]:
df1.notnull()

Unnamed: 0,One,Two,Three
a,True,True,True
b,False,False,False
c,True,True,True
d,True,True,True
e,False,False,False
f,True,True,True
g,False,False,False
h,True,True,True


In [98]:
#缺少数据的计算
# 在求和数据时，NA将被视为0
# 如果数据全部是NA，那么结果将是NA
df1['One'].isnull()

a    False
b     True
c    False
d    False
e     True
f    False
g     True
h    False
Name: One, dtype: bool

In [99]:
df1['One'].sum()

30.0

# (2)丢失缺少的值
如果只想排除缺少的值，则使用dropna函数和axis参数。 默认情况下，axis = 0，即在行上应用，这意味着如果行内的任何值是NA，那么整个行被排除。

In [126]:
df2 = pd.DataFrame(np.random.randint(10,size = (5,3)),index = ['a','c','d','f','h'],columns = ['One','Two','Three'])
# a = range(0,101)
df2 = df2.reindex(['a','b','c','d','e','f','g','h'])
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
a,,,,,,,,,,,...,,,,,,,,,,
c,,,,,,,,,,,...,,,,,,,,,,
d,,,,,,,,,,,...,,,,,,,,,,
f,,,,,,,,,,,...,,,,,,,,,,
h,,,,,,,,,,,...,,,,,,,,,,


In [101]:
#删除NaN的行
df2.dropna()


Unnamed: 0,One,Two,Three
a,1.0,9.0,3.0
c,4.0,1.0,8.0
d,5.0,2.0,5.0
f,6.0,4.0,2.0
h,4.0,4.0,9.0


In [102]:
#删除NaN的列
df2 = pd.DataFrame(np.random.randint(10,size = (5,3)),index = ['a','c','d','f','h'],columns = ['One','Two','Three'])
df2


Unnamed: 0,One,Two,Three
a,3,3,3
c,4,4,1
d,6,0,8
f,9,3,0
h,5,3,9


In [103]:
df2['Three'] = np.nan
df2

Unnamed: 0,One,Two,Three
a,3,3,
c,4,4,
d,6,0,
f,9,3,
h,5,3,


In [104]:
df2.dropna(axis=1)

Unnamed: 0,One,Two
a,3,3
c,4,4
d,6,0
f,9,3
h,5,3


In [105]:
#给定阀值，给定只删除全部为缺失值的行
df2

Unnamed: 0,One,Two,Three
a,3,3,
c,4,4,
d,6,0,
f,9,3,
h,5,3,


In [106]:
df2.loc['h'] = np.nan
df2

Unnamed: 0,One,Two,Three
a,3.0,3.0,
c,4.0,4.0,
d,6.0,0.0,
f,9.0,3.0,
h,,,


In [107]:
df2.dropna(how='all')

Unnamed: 0,One,Two,Three
a,3.0,3.0,
c,4.0,4.0,
d,6.0,0.0,
f,9.0,3.0,


# (3)用指定值或插值的方式填充缺失数据
Pandas提供了各种方法来清除缺失的值。fillna()函数可以通过几种方法用非空数据“填充”NA值

In [93]:
df3 = pd.DataFrame(np.random.randint(10,size = (5,3)),index = ['a','c','d','f','h'],columns = ['One','Two','Three'])
df3 = df3.reindex(['a','b','c','d','e','f','g','h'])
df3

Unnamed: 0,One,Two,Three
a,7.0,1.0,5.0
b,,,
c,4.0,3.0,9.0
d,8.0,7.0,2.0
e,,,
f,4.0,5.0,1.0
g,,,
h,2.0,4.0,8.0


In [108]:
#使用0填充
df3.fillna(0)

Unnamed: 0,One,Two,Three
a,7.0,1.0,5.0
b,0.0,0.0,0.0
c,4.0,3.0,9.0
d,8.0,7.0,2.0
e,0.0,0.0,0.0
f,4.0,5.0,1.0
g,0.0,0.0,0.0
h,2.0,4.0,8.0


In [111]:
#向前填充
df3.fillna(method = 'pad')

Unnamed: 0,One,Two,Three
a,7.0,1.0,5.0
b,7.0,1.0,5.0
c,4.0,3.0,9.0
d,8.0,7.0,2.0
e,8.0,7.0,2.0
f,4.0,5.0,1.0
g,4.0,5.0,1.0
h,2.0,4.0,8.0


In [113]:
#向后填充
df3.fillna(method = 'bfill')

Unnamed: 0,One,Two,Three
a,7.0,1.0,5.0
b,4.0,3.0,9.0
c,4.0,3.0,9.0
d,8.0,7.0,2.0
e,4.0,5.0,1.0
f,4.0,5.0,1.0
g,2.0,4.0,8.0
h,2.0,4.0,8.0


In [123]:
#替换值
df3.replace({np.nan:100,1:'haha'})

Unnamed: 0,One,Two,Three
a,7,haha,5
b,100,100,100
c,4,3,9
d,8,7,2
e,100,100,100
f,4,5,haha
g,100,100,100
h,2,4,8
