# 索引

In [8]:
import pandas as pd
import numpy as np

In [11]:
df = pd.DataFrame(np.random.randn(30,5),columns=list('abcde'))
df.head()

Unnamed: 0,a,b,c,d,e
0,1.230422,1.222455,-0.12643,-2.334785,-1.905806
1,-0.481261,1.490968,-1.128478,-1.106196,1.142075
2,0.319797,-0.430345,-0.156376,-1.074382,1.12742
3,0.192125,0.29428,-0.442639,-0.258107,1.182989
4,-0.106528,0.595629,1.221878,-1.43001,-0.663897


### 1、loc会包含右边界

In [12]:
df.loc[:1]

Unnamed: 0,a,b,c,d,e
0,1.230422,1.222455,-0.12643,-2.334785,-1.905806
1,-0.481261,1.490968,-1.128478,-1.106196,1.142075


### 2、[3::-2]代表从3开始倒数,步长为2

In [14]:
df.iloc[3::-2]

Unnamed: 0,a,b,c,d,e
3,0.192125,0.29428,-0.442639,-0.258107,1.182989
1,-0.481261,1.490968,-1.128478,-1.106196,1.142075


### df.index.get_loc()得到某一索引对应的元素的位置

In [15]:
df.index.get_loc(3)

3

### isin()和all()

In [19]:
df1 = pd.read_csv('../joyful-pandas-master/data/table.csv',index_col='ID')
df1.head(3)

Unnamed: 0_level_0,School,Class,Gender,Address,Height,Weight,Math,Physics
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1101,S_1,C_1,M,street_1,173,63,34.0,A+
1102,S_1,C_1,F,street_2,192,73,32.5,B+
1103,S_1,C_1,M,street_2,186,82,87.2,B+


In [21]:
df1[df1[['Address','Physics']].isin({'Address':['street_1','street_2'],'Physics':['A+','B+']}).all(1)]
#all与&的思路是类似的，其中的1代表按照跨列方向判断是否全为True

Unnamed: 0_level_0,School,Class,Gender,Address,Height,Weight,Math,Physics
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1101,S_1,C_1,M,street_1,173,63,34.0,A+
1102,S_1,C_1,F,street_2,192,73,32.5,B+
1103,S_1,C_1,M,street_2,186,82,87.2,B+


### 只取一个元素时，用at和iat更高效

In [24]:
display(df1.at[1101,'Math'])
display(df1.iat[1,1])

34.0

'C_1'

## 区间索引

### 使用cut或者interval_index创建

In [27]:
pd.interval_range(start=0,end = 5,closed = 'neither')
#closed:'left','right','both','neither'

IntervalIndex([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]
              closed='neither',
              dtype='interval[int64]')

In [31]:
pd.interval_range(start=0,periods=3,freq=2)
#periods:区间个数，freq：步长

IntervalIndex([(0, 2], (2, 4], (4, 6]]
              closed='right',
              dtype='interval[int64]')

In [58]:
math_interval = pd.cut(df1.Math,bins=[0,40,60,80,100])
#默认区间右闭
#不是区间类型，而是category类型
math_interval.unique()

[(0, 40], (80, 100], (60, 80], (40, 60]]
Categories (4, interval[int64]): [(0, 40] < (40, 60] < (60, 80] < (80, 100]]

### 区间索引的选取

#### 只要包含该值就会被选取

In [68]:
df1_ = df1.join(math_interval,rsuffix='_interval').reset_index().set_index('Math_interval')
#rsuffix:后缀
df1_.head(3)

Unnamed: 0_level_0,ID,School,Class,Gender,Address,Height,Weight,Math,Physics
Math_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0, 40]",1101,S_1,C_1,M,street_1,173,63,34.0,A+
"(0, 40]",1102,S_1,C_1,F,street_2,192,73,32.5,B+
"(80, 100]",1103,S_1,C_1,M,street_2,186,82,87.2,B+


In [70]:
df1_.loc[[30,80]].head(3)

Unnamed: 0_level_0,ID,School,Class,Gender,Address,Height,Weight,Math,Physics
Math_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(0, 40]",1101,S_1,C_1,M,street_1,173,63,34.0,A+
"(0, 40]",1102,S_1,C_1,F,street_2,192,73,32.5,B+
"(0, 40]",1204,S_1,C_2,F,street_5,162,63,33.8,B


#### 区间覆盖需要先转变索引类型为interval，再用overlaps方法

In [73]:
df1_.index.astype('interval').overlap(pd.Interval(65,75))

AttributeError: 'IntervalIndex' object has no attribute 'overlap'

## 多级索引

### 创建多级索引from_tuples,from_arrays,from_product,set_index

In [85]:
tuples = list(zip(list('AABB'),list('abab')))
tuples

[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]

In [87]:
pd.MultiIndex.from_tuples(tuples,names=['upper','lower'])

MultiIndex(levels=[['A', 'B'], ['a', 'b']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['upper', 'lower'])

from_arrays会将array转换为tuple

In [88]:
L1 = ['A','B']
L2 = ['a','b']
pd.MultiIndex.from_product([L1,L2],names = ['upper','lower'])

MultiIndex(levels=[['A', 'B'], ['a', 'b']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['upper', 'lower'])

In [89]:
df1_multi = df1.set_index(['Class','Address'])
df1_multi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Class,Address,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C_1,street_1,S_1,M,173,63,34.0,A+
C_1,street_2,S_1,F,192,73,32.5,B+
C_1,street_2,S_1,M,186,82,87.2,B+
C_1,street_2,S_1,F,167,81,80.4,B-
C_1,street_4,S_1,F,159,64,84.8,B+


### 多级索引切片

#### 多级索引不排序时会发出性能警告，index.is_lexsorted()检查是否排序

In [91]:
df1_multi.index.is_lexsorted()

False

In [93]:
df1_multi.sort_index().loc['C_1','street_1']

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Class,Address,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C_1,street_1,S_1,M,173,63,34.0,A+


#### 只有排序后才能用多级切片

In [99]:
#元组表示精确到某一元素
df1_multi.sort_index().loc[('C_1','street_6'):('C_2','street_4')]

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Class,Address,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C_1,street_6,S_2,F,161,61,50.6,B+
C_1,street_7,S_2,M,174,84,83.3,C
C_2,street_1,S_2,M,175,74,47.2,B-
C_2,street_4,S_1,F,176,94,63.5,B-
C_2,street_4,S_2,M,155,91,73.8,A+


In [100]:
#非元组表示选中该层所有元素
df1_multi.sort_index().loc[('C_1','street_7'):'C_2']

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Class,Address,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C_1,street_7,S_2,M,174,84,83.3,C
C_2,street_1,S_2,M,175,74,47.2,B-
C_2,street_4,S_1,F,176,94,63.5,B-
C_2,street_4,S_2,M,155,91,73.8,A+
C_2,street_5,S_1,M,188,68,97.0,A-
C_2,street_5,S_1,F,162,63,33.8,B
C_2,street_5,S_2,M,193,100,39.1,B
C_2,street_6,S_1,M,160,53,58.8,A+
C_2,street_6,S_1,F,167,63,68.4,B-
C_2,street_7,S_2,F,194,77,68.5,B+


#### 选中某几个元素用元组的列表，或者列表的元组

In [101]:
#用元组构成列表,选取特定元素
df1_multi.sort_index().loc[[('C_1','street_7'),('C_2','street_1')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Class,Address,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C_1,street_7,S_2,M,174,84,83.3,C
C_2,street_1,S_2,M,175,74,47.2,B-


In [108]:
#用列表构成元组
df1_multi.sort_index().loc[(['C_1','C_2'],['street_2']),:]
#表示选取第一层在C_1和C_2，并且第二层在street_2中的行

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Class,Address,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C_1,street_2,S_1,F,192,73,32.5,B+
C_1,street_2,S_1,M,186,82,87.2,B+
C_1,street_2,S_1,F,167,81,80.4,B-


#### 多层索引中的slice对象

In [125]:
L1,L2 = ['A','B','C'],['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1,L2],names=('upper','lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=['big','small'])
df2 = pd.DataFrame(np.random.rand(9,9),index = mul_index1,columns=mul_index2)
df2.head()

Unnamed: 0_level_0,big,D,D,D,E,E,E,F,F,F
Unnamed: 0_level_1,small,d,e,f,d,e,f,d,e,f
upper,lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
A,a,0.163883,0.433254,0.122118,0.715612,0.34229,0.346622,0.751729,0.938435,0.883174
A,b,0.459174,0.127912,0.343108,0.485329,0.138395,0.750267,0.73117,0.096754,0.718412
A,c,0.701539,0.868987,0.675641,0.985695,0.096533,0.255113,0.429461,0.183611,0.706451
B,a,0.197623,0.152417,0.672849,0.983012,0.023894,0.064627,0.89106,0.728941,0.306409
B,b,0.730548,0.563862,0.035081,0.503374,0.081504,0.22218,0.61138,0.048749,0.290331


In [145]:
idx = pd.IndexSlice

In [146]:
#df2['D']['d']>0.3也是对DataFrame行的筛选
df2.loc[idx['B':,df2['D']['d']>0.3],idx[df2.sum()>4]]

Unnamed: 0_level_0,big,D,D,E,F,F
Unnamed: 0_level_1,small,d,e,d,d,f
upper,lower,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
B,b,0.730548,0.563862,0.503374,0.61138,0.290331
B,c,0.765739,0.507197,0.939281,0.852105,0.023725
C,a,0.529448,0.887932,0.43533,0.501448,0.588864
C,b,0.360148,0.391558,0.785073,0.432404,0.837107
C,c,0.506017,0.924557,0.298453,0.884174,0.245582


### 索引层交换，两层swaplevel，多层reorder_levels

In [116]:
#两层
df1_multi.swaplevel(i=1,j=0).sort_index().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Height,Weight,Math,Physics
Address,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
street_1,C_1,S_1,M,173,63,34.0,A+
street_1,C_2,S_2,M,175,74,47.2,B-
street_1,C_3,S_1,F,175,57,87.7,A-
street_2,C_1,S_1,F,192,73,32.5,B+
street_2,C_1,S_1,M,186,82,87.2,B+


In [121]:
#多层
df1.set_index(['School','Class','Address']).reorder_levels([2,1,0]).sort_index().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Gender,Height,Weight,Math,Physics
Address,Class,School,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
street_1,C_1,S_1,M,173,63,34.0,A+
street_1,C_2,S_2,M,175,74,47.2,B-
street_1,C_3,S_1,F,175,57,87.7,A-
street_2,C_1,S_1,F,192,73,32.5,B+
street_2,C_1,S_1,M,186,82,87.2,B+


In [122]:
#多层用名字
df1.set_index(['School','Class','Address']).reorder_levels(['Address','Class','School']).sort_index().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Gender,Height,Weight,Math,Physics
Address,Class,School,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
street_1,C_1,S_1,M,173,63,34.0,A+
street_1,C_2,S_2,M,175,74,47.2,B-
street_1,C_3,S_1,F,175,57,87.7,A-
street_2,C_1,S_1,F,192,73,32.5,B+
street_2,C_1,S_1,M,186,82,87.2,B+


### 取出内层索引指定的数据

## 索引设定

### reindex重新索引，重要特性在于索引对齐，很多时候用于重新排序

In [148]:
df1.head()

Unnamed: 0_level_0,School,Class,Gender,Address,Height,Weight,Math,Physics
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1101,S_1,C_1,M,street_1,173,63,34.0,A+
1102,S_1,C_1,F,street_2,192,73,32.5,B+
1103,S_1,C_1,M,street_2,186,82,87.2,B+
1104,S_1,C_1,F,street_2,167,81,80.4,B-
1105,S_1,C_1,F,street_4,159,64,84.8,B+


#### 排index和colums都可以

In [151]:
df1.reindex(index=[1105,1101,1102,1111],method='ffill') 

Unnamed: 0_level_0,School,Class,Gender,Address,Height,Weight,Math,Physics
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1105,S_1,C_1,F,street_4,159,64,84.8,B+
1101,S_1,C_1,M,street_1,173,63,34.0,A+
1102,S_1,C_1,F,street_2,192,73,32.5,B+
1111,S_1,C_1,F,street_4,159,64,84.8,B+


### 关于set_index

In [152]:
#append=True参数保留原来索引
df1.set_index('Class',append = True).head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,School,Gender,Address,Height,Weight,Math,Physics
ID,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1101,C_1,S_1,M,street_1,173,63,34.0,A+


**用数据框之外的列做索引要先转为Series**

### rename_axis用来修改多级索引某一层的索引名，而不是索引标签

## 抽样函数df.sample()

- n为样本数
- frac为抽样比
- axis为抽样维度，默认axis=0，即抽行
- weights为样本权重，自动归一化

In [158]:
df1.sample(n = 2,axis=1).head()

Unnamed: 0_level_0,Address,School
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1101,street_1,S_1
1102,street_2,S_1
1103,street_2,S_1
1104,street_2,S_1
1105,street_4,S_1


In [160]:
#以某一列为权重
#抽到的概率与Math的数值成正比
df1.sample(weights=df1['Math'],n = 2)

Unnamed: 0_level_0,School,Class,Gender,Address,Height,Weight,Math,Physics
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1304,S_1,C_3,M,street_2,195,70,85.2,A
2404,S_2,C_4,F,street_2,160,84,67.7,B
