## 移除重复数据


In [3]:
from pandas import DataFrame
# Seven rows over two key columns; rows 1, 4 and 6 repeat the row just before them.
data = DataFrame(dict(
    k1=3 * ['one'] + 4 * ['two'],
    k2=[1, 1, 2, 3, 3, 4, 4],
))

In [5]:
# Display the frame so the repeated (k1, k2) rows are visible.
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [8]:
# DataFrame.duplicated returns a boolean Series that flags each row
# repeating an earlier row.
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [10]:
# drop_duplicates returns a new DataFrame without the repeated rows;
# the first occurrence of each row is the one that is kept.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [13]:
# Add a column of distinct values so that no two rows are identical as a whole.
# The range is sized from the frame itself instead of a hard-coded row count,
# so the assignment cannot fail with a length mismatch if the frame changes.
data['v1'] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [17]:
# Deduplicate on the k1 column alone: the first row seen for each k1 value is kept.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


## 利用函数或映射进行数据转换


In [19]:
# Nine food items with their weights in ounces; some foods appear more than once.
data = DataFrame(dict(
    food=[
        'bacon', 'pulled pork', 'bacon',
        'pastrami', 'corned beef', 'bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
))


In [21]:
# Display the food/ounces frame before any mapping is applied.
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [23]:
# Lookup table from each food to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [27]:
# Series.map accepts either a function or a dict-like object holding the mapping.
# Each food is looked up in meat_to_animal and the result is stored in a new
# 'animal' column on the frame.
data['animal'] = data['food'].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [29]:
 # We can also pass a function that does the whole job.
 # NOTE: the lambda indexes the dict directly, so a food that is missing from
 # meat_to_animal would raise KeyError here.
data['food'].map(lambda x: meat_to_animal[x])
  

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## 替换值
利用 fillna 方法填充缺失数据可以看做值替换的一种特殊情况。前面提到的 map 虽然可用于修改对象的数据子集，但 replace 提供了一种实现该功能的更简单、更灵活的方式。

In [34]:
from pandas import Series
import numpy as np
# Float Series containing -999 and -1000, the values the following cells replace.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

In [36]:
# Replace every -999 with NaN; the result is a new Series and -1000 is left as is.
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [38]:
# Pass a list to replace several values at once with the same substitute.
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [40]:
# Pass a second list to give each value its own substitute: -999 -> NaN, -1000 -> 0.
data.replace([-999,-1000],[np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [42]:
# The same per-value substitution written as a dict of {old: new}.
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 重命名轴索引
跟 Series 中的值一样，轴标签也可以通过函数或映射进行转换，从而得到一个新对象。轴还可以被就地修改，而无需新建一个数据结构。

In [44]:
# 3x4 frame holding the integers 0..11, with place names as row labels.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)


In [46]:
 # Like a Series, an axis index has a map method; here it yields an Index
 # with every row label upper-cased.
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [48]:
# The mapped labels can also be assigned back to index, which modifies the
# DataFrame in place.
data.index = data.index.map(str.upper)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [50]:
# To get a transformed version of the dataset without modifying the original,
# use rename: str.title is applied to the row labels, str.upper to the column names.
data.rename(index = str.title,columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [52]:
# rename also accepts dicts, updating only the axis labels named in them.
data.rename(
    index={'OHIO': 'INDIANA'},
    columns={'three': 'peekaboo'},
)

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11
