# 第05章 pandas入门

In [1]:
import pandas as pd
from pandas import Series, DataFrame

## 5.1 pandas的数据结构介绍

### Series

In [2]:
# 类似于dict
obj = pd.Series([4, 7, -5, 3])
obj, obj.values, obj.index

(0    4
 1    7
 2   -5
 3    3
 dtype: int64,
 array([ 4,  7, -5,  3], dtype=int64),
 RangeIndex(start=0, stop=4, step=1))

In [3]:
# 指定index
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2, obj2.index

(d    4
 b    7
 a   -5
 c    3
 dtype: int64,
 Index(['d', 'b', 'a', 'c'], dtype='object'))

In [4]:
# 赋值和索引
obj2['d'] = 6
obj2['a'], obj2[['c', 'a', 'd']]

(-5,
 c    3
 a   -5
 d    6
 dtype: int64)

In [5]:
obj2 > 0

d     True
b     True
a    False
c     True
dtype: bool

In [6]:
import numpy as np

obj2[obj2 > 0], obj2 * 2, np.exp(obj2)

(d    6
 b    7
 c    3
 dtype: int64,
 d    12
 b    14
 a   -10
 c     6
 dtype: int64,
 d     403.428793
 b    1096.633158
 a       0.006738
 c      20.085537
 dtype: float64)

In [7]:
'b' in obj2

True

In [8]:
# 传dict带索引
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [9]:
# 根据index 重排序
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [10]:
pd.isnull(obj4), pd.notnull(obj4), obj4.isnull()

(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California    False
 Ohio           True
 Oregon         True
 Texas          True
 dtype: bool,
 California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool)

In [11]:
# 计算自动对齐
obj3,obj4,obj3 + obj4

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 California        NaN
 Ohio          35000.0
 Oregon        16000.0
 Texas         71000.0
 dtype: float64,
 California         NaN
 Ohio           70000.0
 Oregon         32000.0
 Texas         142000.0
 Utah               NaN
 dtype: float64)

In [12]:
# 名称
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [13]:
# 修改索引
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [14]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [15]:
# 列的顺序随机
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [16]:
# 显示前5行
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [17]:
# 指定列的顺序
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [18]:
# 没有的列默认空
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],index=['one', 'two', 'three', 'four','five', 'six'])
frame2, frame2.columns

(       year   state  pop debt
 one    2000    Ohio  1.5  NaN
 two    2001    Ohio  1.7  NaN
 three  2002    Ohio  3.6  NaN
 four   2001  Nevada  2.4  NaN
 five   2002  Nevada  2.9  NaN
 six    2003  Nevada  3.2  NaN,
 Index(['year', 'state', 'pop', 'debt'], dtype='object'))

In [19]:
# 获取列
frame2['state'], frame2.state

(one        Ohio
 two        Ohio
 three      Ohio
 four     Nevada
 five     Nevada
 six      Nevada
 Name: state, dtype: object,
 one        Ohio
 two        Ohio
 three      Ohio
 four     Nevada
 five     Nevada
 six      Nevada
 Name: state, dtype: object)

In [20]:
# 获取行
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [21]:
# 赋值 支持传播
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [22]:
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [23]:
# 赋值不匹配时 默认NA
frame2['debt'] = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [24]:
# 删除列
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [25]:
del frame2['eastern']
frame2, frame2.columns

(       year   state  pop  debt
 one    2000    Ohio  1.5   NaN
 two    2001    Ohio  1.7  -1.2
 three  2002    Ohio  3.6   NaN
 four   2001  Nevada  2.4  -1.5
 five   2002  Nevada  2.9  -1.7
 six    2003  Nevada  3.2   NaN,
 Index(['year', 'state', 'pop', 'debt'], dtype='object'))

In [26]:
# 嵌套字典创建
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [27]:
# 转置
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [28]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [29]:
pdata = {'Ohio': frame3['Ohio'][:-1],'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [30]:
# name
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [31]:
# 类型自动兼容所有类型
frame3.values, frame2.values

(array([[2.4, 1.7],
        [2.9, 3.6],
        [nan, 1.5]]),
 array([[2000, 'Ohio', 1.5, nan],
        [2001, 'Ohio', 1.7, -1.2],
        [2002, 'Ohio', 3.6, nan],
        [2001, 'Nevada', 2.4, -1.5],
        [2002, 'Nevada', 2.9, -1.7],
        [2003, 'Nevada', 3.2, nan]], dtype=object))

### 索引对象

In [32]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index, index[1:]

(Index(['a', 'b', 'c'], dtype='object'), Index(['b', 'c'], dtype='object'))

In [33]:
# 不可变的
# index[1] = 'd'

labels = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2.index is labels

True

In [34]:
# 集合功能
frame3, frame3.columns, 'Ohio' in frame3.columns, 2003 in frame3.index

(state  Nevada  Ohio
 year               
 2001      2.4   1.7
 2002      2.9   3.6
 2000      NaN   1.5,
 Index(['Nevada', 'Ohio'], dtype='object', name='state'),
 True,
 False)

In [35]:
# 可以包含重复标签
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## 5.2 基本功能

### 重新索引

In [46]:
# series
# 没有的赋空值
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj,obj2

(d    4.5
 b    7.2
 a   -5.3
 c    3.6
 dtype: float64,
 a   -5.3
 b    7.2
 c    3.6
 d    4.5
 e    NaN
 dtype: float64)

In [37]:
# 可以填充
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj4 = obj3.reindex(range(6), method='ffill') # ffill=forward fill  bfill=back fill
obj3,obj4

(0      blue
 2    purple
 4    yellow
 dtype: object,
 0      blue
 1      blue
 2    purple
 3    purple
 4    yellow
 5    yellow
 dtype: object)

In [38]:
# dataFrame
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [40]:
# 默认对索引轴
frame.reindex(['a', 'b', 'c', 'd']), frame.reindex(columns=['Texas', 'Utah', 'California'])

(   Ohio  Texas  California
 a   0.0    1.0         2.0
 b   NaN    NaN         NaN
 c   3.0    4.0         5.0
 d   6.0    7.0         8.0,
    Texas  Utah  California
 a      1   NaN           2
 c      4   NaN           5
 d      7   NaN           8)

### 丢弃指定轴上的项

In [42]:
# serise
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj, obj.drop('c'), obj.drop(['d', 'c'])

(a    0.0
 b    1.0
 c    2.0
 d    3.0
 e    4.0
 dtype: float64,
 a    0.0
 b    1.0
 d    3.0
 e    4.0
 dtype: float64,
 a    0.0
 b    1.0
 e    4.0
 dtype: float64)

In [43]:
# dataFrame
data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])
# 默认删除索引轴
data, data.drop(['Colorado', 'Ohio']),data.drop('two', axis=1), data.drop(['two', 'four'], axis='columns')

(          one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15,
           one  two  three  four
 Utah        8    9     10    11
 New York   12   13     14    15,
           one  three  four
 Ohio        0      2     3
 Colorado    4      6     7
 Utah        8     10    11
 New York   12     14    15,
           one  three
 Ohio        0      2
 Colorado    4      6
 Utah        8     10
 New York   12     14)

In [47]:
# 原对象操作
obj.drop('c', inplace=True)
obj

d    4.5
b    7.2
a   -5.3
dtype: float64

### 索引、选取和过滤

In [56]:
# series
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [57]:
obj['b'],obj[1],obj[2:4],obj[['b', 'a', 'd']],obj[[1, 3]],obj[obj < 2]

(1.0,
 1.0,
 c    2.0
 d    3.0
 dtype: float64,
 b    1.0
 a    0.0
 d    3.0
 dtype: float64,
 b    1.0
 d    3.0
 dtype: float64,
 a    0.0
 b    1.0
 dtype: float64)

In [51]:
# 利用index做切片，末端是包含的
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [53]:
# 赋值
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [55]:
# dataFrame
data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [59]:
# 默认是列
data['two'], data[['three', 'one']]

(Ohio         1
 Colorado     5
 Utah         9
 New York    13
 Name: two, dtype: int32,
           three  one
 Ohio          2    0
 Colorado      6    4
 Utah         10    8
 New York     14   12)

In [61]:
# 行索引方式 切片或布尔数组
data[:2],data[data['three'] > 5]

(          one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7,
           one  two  three  four
 Colorado    4    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15)

In [62]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### 用loc和iloc进行选取
使用轴标签（loc）或整数索引（iloc）

In [67]:
# 行 + 列
data,data.loc['Colorado', ['two', 'three']], data.iloc[2, [3, 0, 1]], data.iloc[2], data.iloc[[1, 2], [3, 0, 1]]

(          one  two  three  four
 Ohio        0    0      0     0
 Colorado    0    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15,
 two      5
 three    6
 Name: Colorado, dtype: int32,
 four    11
 one      8
 two      9
 Name: Utah, dtype: int32,
 one       8
 two       9
 three    10
 four     11
 Name: Utah, dtype: int32,
           four  one  two
 Colorado     7    0    5
 Utah        11    8    9)

In [68]:
# 切片
data.loc[:'Utah', 'two'], data.iloc[:, :3][data.three > 5]

(Ohio        0
 Colorado    5
 Utah        9
 Name: two, dtype: int32,
           one  two  three
 Colorado    0    5      6
 Utah        8    9     10
 New York   12   13     14)

In [70]:
type(data.iloc[2:3])

pandas.core.frame.DataFrame

### 整数索引

In [73]:
# 整数索引会产生歧义
ser = pd.Series(np.arange(3.))
# ser[-1]

In [75]:
# 非整数索引
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

2.0

In [77]:
ser[:1], ser.loc[:1], ser.iloc[:1] # loc包含末端的 和 obj['b':'c']一致

(0    0.0
 dtype: float64,
 0    0.0
 1    1.0
 dtype: float64,
 0    0.0
 dtype: float64)

### 算术运算和数据对齐

In [79]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),columns=list('abcde'))
df1, df2

(     a    b     c     d
 0  0.0  1.0   2.0   3.0
 1  4.0  5.0   6.0   7.0
 2  8.0  9.0  10.0  11.0,
       a     b     c     d     e
 0   0.0   1.0   2.0   3.0   4.0
 1   5.0   6.0   7.0   8.0   9.0
 2  10.0  11.0  12.0  13.0  14.0
 3  15.0  16.0  17.0  18.0  19.0)

In [81]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


### 在算术方法中填充值

In [82]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [84]:
# 填充
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [86]:
# r被除数
1/df1, df1.rdiv(1)

(       a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909,
        a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909)

In [87]:
# 重新索引填充
df1.reindex(columns=df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


### DataFrame和Series之间的运算
广播

In [89]:
# numpy
import numpy as np
arr = np.arange(12.).reshape((3, 4))
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [90]:
# panda 行广播
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [92]:
# 对齐 没有为空
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [93]:
# 列广播
series3 = frame['d']
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### 函数应用和映射

In [94]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame, np.abs(frame)

(               b         d         e
 Utah   -0.969207 -0.226107  0.541730
 Ohio    0.498241 -1.176918  0.588220
 Texas  -0.615616 -0.132741  1.599282
 Oregon -1.259014 -0.119969 -0.272933,
                b         d         e
 Utah    0.969207  0.226107  0.541730
 Ohio    0.498241  1.176918  0.588220
 Texas   0.615616  0.132741  1.599282
 Oregon  1.259014  0.119969  0.272933)

In [96]:
# 默认是行 入参是series
f = lambda x: x.max() - x.min()
frame.apply(f), frame.apply(f, axis='columns')

(b    1.757255
 d    1.056949
 e    1.872215
 dtype: float64,
 Utah      1.510937
 Ohio      1.765138
 Texas     2.214898
 Oregon    1.139045
 dtype: float64)

In [97]:
# 返回serise
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.259014,-1.176918,-0.272933
max,0.498241,-0.119969,1.599282


In [99]:
# 对每个元素操作
format = lambda x: '%.2f' % x
# dataframe
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.97,-0.23,0.54
Ohio,0.5,-1.18,0.59
Texas,-0.62,-0.13,1.6
Oregon,-1.26,-0.12,-0.27


In [101]:
# series
frame['e'].map(format)

Utah       0.54
Ohio       0.59
Texas      1.60
Oregon    -0.27
Name: e, dtype: object

### 排序和排名

In [103]:
# 排序
# series 索引排序
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [105]:
# dataFrame
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),index=['three', 'one'],columns=['d', 'a', 'b', 'c'])
frame.sort_index(), frame.sort_index(axis=1)

(       d  a  b  c
 one    4  5  6  7
 three  0  1  2  3,
        a  b  c  d
 three  1  2  3  0
 one    5  6  7  4)

In [106]:
# 降序
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [107]:
# series 值排序
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [109]:
# nan会放在末尾
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [112]:
# dataFrame 值排序
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b'),frame.sort_values(by=['a', 'b'])

(   b  a
 2 -3  0
 3  2  1
 0  4  0
 1  7  1,
    b  a
 2 -3  0
 0  4  0
 3  2  1
 1  7  1)

In [116]:
# 排名 根据大小排名
# series
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank() # 相同的取平均

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [117]:
# 先看到的排名高
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [119]:
# 降序排名 相同的同一个名次取大值
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [121]:
# dataFrame
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],'c': [-2, 5, 8, -2.5]})
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### 带有重复标签的轴索引

In [124]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique, obj.loc['b']

(False,
 b    2
 b    3
 dtype: int64)

## 5.3 汇总和计算描述统计