## 7.1 处理缺失值
### 7.1.1 过滤缺失值

In [1]:
# Import numpy and pandas; NA is a short alias for numpy's nan
import numpy as np
import pandas as pd
from numpy import nan as NA

In [4]:
# dropna returns only the non-missing values, keeping their original index labels
data = pd.Series([1, NA, 3.5, NA, 7])
non_missing = data.dropna()
print(non_missing)
# Boolean indexing with notnull() is equivalent to the dropna() call above
present_mask = data.notnull()
print(data[present_mask])

0    1.0
2    3.5
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64


In [10]:
# With no arguments, dropna removes every row that contains a missing value.
# how="all" removes only the rows whose values are all NaN:  data.dropna(how="all")
# Pass axis=1 to drop columns instead of rows.


# thresh=k keeps only the rows that have at least k non-missing values.
df = pd.DataFrame(np.random.randn(7, 3))
# iloc selects by integer position (loc selects by label).
df.iloc[0:4, 1] = NA  # first four rows of column 1
df.iloc[0:2, 2] = NA  # first two rows of column 2
print(df)
rows_with_two_values = df.dropna(thresh=2)
print(rows_with_two_values)

          0         1         2
0 -1.636595       NaN       NaN
1 -1.123301       NaN       NaN
2 -0.554647       NaN  1.355724
3  0.470459       NaN -2.108274
4  1.631401  1.595943 -0.107675
5  0.677020 -1.406489 -1.265448
6  0.104145  1.093956  1.207495
          0         1         2
2 -0.554647       NaN  1.355724
3  0.470459       NaN -2.108274
4  1.631401  1.595943 -0.107675
5  0.677020 -1.406489 -1.265448
6  0.104145  1.093956  1.207495


### 7.1.2 补全缺失值

In [11]:
# fillna is the main tool for filling in missing values; here every NaN becomes the constant 0
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-1.636595,0.0,0.0
1,-1.123301,0.0,0.0
2,-0.554647,0.0,1.355724
3,0.470459,0.0,-2.108274
4,1.631401,1.595943,-0.107675
5,0.67702,-1.406489,-1.265448
6,0.104145,1.093956,1.207495


In [12]:
# A dict sets a different fill value per column: 0.5 for column 1, 0 for column 2
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-1.636595,0.5,0.0
1,-1.123301,0.5,0.0
2,-0.554647,0.5,1.355724
3,0.470459,0.5,-2.108274
4,1.631401,1.595943,-0.107675
5,0.67702,-1.406489,-1.265448
6,0.104145,1.093956,1.207495


In [13]:
# Fill the missing entries of a Series with the mean of its non-missing values
data = pd.Series([1.0, NA, 3.5, NA, 7])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 数据转换
### 7.2.1 删除重复值

In [14]:
# Small frame in which the last row repeats the row before it
data = pd.DataFrame(
    dict(
        k1=["one", "two"] * 3 + ["two"],
        k2=[1, 1, 2, 3, 3, 4, 4],
    )
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [15]:
# duplicated returns a boolean Series that marks each row repeating an earlier row
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [17]:
# Return only the rows that are not duplicates, keeping the first occurrence of each
data.drop_duplicates(keep="first")

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [18]:
# Add a helper column, then look for duplicates in column k1 only
data["v1"] = range(7)
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [19]:
# Deduplicate on (k1, k2); keep="last" retains the last matching row instead of the first
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 7.2.2 使用函数或映射进行数据转换

In [20]:
# Food names (with inconsistent capitalisation) and their weights in ounces
data = pd.DataFrame(
    dict(
        food=[
            "bacon", "pulled pork", "bacon", "Pastrami", "corned beef",
            "Bacon", "pastrami", "honey ham", "nova lox",
        ],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [21]:
# Add a column giving the animal each kind of meat comes from
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}
# Lower-case the names first so that 'Bacon' and 'Pastrami' match the lower-case dict keys
lowercased = data["food"].str.lower()
# map looks every lower-cased name up in the dict
data["animal"] = lowercased.map(meat_to_animal)
data


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### 7.2.3 替代值

In [22]:
# -999 acts as a sentinel for missing data here; replace swaps it for NaN
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data.replace(to_replace=-999, value=np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [23]:
# Replace several values at once: both sentinels become NaN
data.replace(to_replace=[-999, -1000], value=np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [25]:
# Pairwise replacement: -999 becomes NaN and -1000 becomes 0
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [26]:
# The same old-to-new pairs can also be passed as a dict
data.replace(to_replace={-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 7.2.4 重命名轴索引

In [31]:
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)


# Like a Series, an axis index also has a map method
def transform(x):
    """Return the first four characters of x in upper case."""
    return x[:4].upper()


# Assigning to data.index relabels the rows of data in place
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [32]:
# Relabel one row and one column by passing old-to-new mapping dicts
data.rename(
    columns={"three": "peekaboo"},
    index={"OHIO": "INDIANA"},
)

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [None]:
2