# pandas层次化索引

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

## 1. 创建多层索引

### 1) 隐式构造

最常见的方法是给DataFrame构造函数的index参数传递两个或更多的数组

In [4]:
df = DataFrame(np.random.rand(4,2),
               index=[['a','a','b','b'],[1,2,1,2]],
              columns=['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.31694,0.841103
a,2,0.814286,0.196382
b,1,0.689398,0.572509
b,2,0.367352,0.381052


- Series也可以创建多层索引

(a, 1)    0.719628
(a, 2)    0.948441
(b, 3)    0.537870
(b, 4)    0.705189
dtype: float64

In [78]:
s = Series(np.random.rand(4),index = [['a','a','b','b'],[1,2,3,4]])
s

a  1    0.570408
   2    0.749222
b  3    0.768432
   4    0.322986
dtype: float64

### 2) 显示构造MultiIndex并reindex

- 使用数组

In [34]:
Mindex = pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])
Mindex
a = pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])
a

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

- 使用tuple

In [35]:
Mindex = pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])
Mindex
b = pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])
b

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

- 使用product

    最简单，推荐使用

In [36]:
Mindex = pd.MultiIndex.from_product([['a','b'],[1,2]])
Mindex


MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

- 直接指定参数

In [37]:
Mindex = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
Mindex

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

使用reindex构造多层索引对象

In [38]:
index = [('California',2000),('California',2010),
        ('New York',2000),('New York',2010),
        ('Texas',2000),('Texas',2010)]
populations = [33871648,37253956,
              18976457,19378102,
              20851820,25145561]
pop = DataFrame(populations)
pop

Unnamed: 0,0
0,33871648
1,37253956
2,18976457
3,19378102
4,20851820
5,25145561


In [10]:
type(pop)

pandas.core.frame.DataFrame

In [57]:
index = [('California',2000),('California',2010),
        ('New York',2000),('New York',2010),
        ('Texas',2000),('Texas',2010)]
Mindex = pd.MultiIndex.from_tuples(index)

Mindex = pd.MultiIndex.from_product([['California','New York','Texas'],[2000,2010]])

pop = DataFrame(populations,index=Mindex,columns = ['人口'])
pop

Unnamed: 0,Unnamed: 1,人口
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [62]:
Mindex = pd.MultiIndex.from_product([['California','New York','Texas'],[2000,2010]],names=['huan','le'])
pop = DataFrame(populations,index=Mindex)
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,0
huan,le,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


- 还能指定多层索引的名称

使用.index.names属性查看

In [70]:
pop.index.names

FrozenList(['huan', 'le'])

============================================

练习8：

1. 创建一个DataFrame，表示出张三李四期中期末各科成绩

============================================

In [14]:
a = np.random.randint(0,150,(8,2))
Mindex = pd.MultiIndex.from_product([['语文','数学','英语','理综'],['期中','期末']])
ddd = DataFrame(data = a,
                index = Mindex,
                columns = ['张三','李四'])
ddd

Unnamed: 0,Unnamed: 1,张三,李四
语文,期中,72,9
语文,期末,148,115
数学,期中,79,82
数学,期末,99,29
英语,期中,147,147
英语,期末,142,32
理综,期中,9,127
理综,期末,32,31


## 2. 多层列索引

除了行索引index，列索引columns也能用同样的方法创建多层索引

In [76]:
a = np.random.randint(0,150,(2,8))
Mindex = pd.MultiIndex.from_product([['语文','数学','英语','理综'],['期中','期末']])
bbb = DataFrame(data = a,
               index = ['张三','李四'],
               columns = Mindex)
bbb

Unnamed: 0_level_0,语文,语文,数学,数学,英语,英语,理综,理综
Unnamed: 0_level_1,期中,期末,期中,期末,期中,期末,期中,期末
张三,57,59,69,71,1,125,42,135
李四,51,39,31,60,11,91,32,119


## 3. 多层索引对象的索引与切片操作

### 1）Series的操作

【重要】对于Series来说，直接中括号[]与使用.loc()完全一样，因此，推荐使用中括号索引和切片。

In [92]:
s = Series(np.random.rand(4),index = [['a','a','b','b'],[1,2,1,2]])
s

a  1    0.522025
   2    0.831857
b  1    0.654684
   2    0.981633
dtype: float64

In [93]:
index = [('California',2000),('California',2010),
        ('New York',2000),('New York',2010),
        ('Texas',2000),('Texas',2010)]
index = [['California','California','New York','New York','Texas','Texas'],[2000,2010,2000,2010,2000,2010]]
# index = pd.MultiIndex.from_product([['California','New York','Texas'],[2000,2010]]) #相同的效果
populations = [33871648,37253956,
              18976457,19378102,
              20851820,25145561]
pop = Series(populations,index = index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [20]:
Mindex = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(Mindex)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

(1) 索引

In [94]:
pop['California']

2000    33871648
2010    37253956
dtype: int64

(2) 切片

In [22]:
pop['California':'New York']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [97]:
pop[::-1]

Texas       2010    25145561
            2000    20851820
New York    2010    19378102
            2000    18976457
California  2010    37253956
            2000    33871648
dtype: int64

In [100]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

### 2）DataFrame的操作

(1) 可以直接使用列名称来进行列索引

In [9]:
index = [('California',2000),('California',2010),
        ('New York',2000),('New York',2010),
        ('Texas',2000),('Texas',2010)]
populations = [33871648,37253956,
              18976457,19378102,
              20851820,25145561]
Mindex = pd.MultiIndex.from_product([['California','New York','Texas'],[2000,2010]])
pop = DataFrame(populations,index=Mindex,columns=['population'])
pop

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [107]:
pop['population']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: population, dtype: int64

(2) 使用行索引需要用ix()，loc()等函数

【极其重要】推荐使用loc()函数

In [112]:
pop.loc['California']


Unnamed: 0,population
2000,33871648
2010,37253956


In [115]:
pop.loc['California',2000]

population    33871648
Name: (California, 2000), dtype: int64

In [117]:
pop.loc['California',2000][0]

33871648

In [120]:
pop.loc['California'].loc[2000]

population    33871648
Name: 2000, dtype: int64

In [122]:
pop.loc['California',2000]

population    33871648
Name: (California, 2000), dtype: int64

In [133]:
pop.loc[('California',2000),'population']

33871648

注意在对行索引的时候，若一级行索引还有多个，对二级行索引会遇到问题！也就是说，无法直接对二级索引进行索引，必须让二级索引变成一级索引后才能对其进行索引！

In [141]:
pop.loc['California':'New York',2000]

TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [2000] of <class 'int'>

============================================

练习9：

1. 分析比较Series和DataFrame各种索引的方式，熟练掌握.loc()方法

2. 假设张三再一次在期中考试的时候因为特殊原因放弃英语考试，如何实现？

============================================

In [7]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
np.random.seed(0)
Mindex = pd.MultiIndex.from_product([['李四','张三','王五'],['期中','期末']])
df = DataFrame(data = np.random.randint(0,150,(6,3)),index =Mindex,columns = ['语文','数学','英语'])
df
df.loc['张三','期中']['英语'] = 0
display(df)

Unnamed: 0,Unnamed: 1,语文,数学,英语
李四,期中,47,117,67
李四,期末,103,9,21
张三,期中,36,87,0
张三,期末,88,140,58
王五,期中,39,87,88
王五,期末,81,25,77


## 4. 索引的堆（stack）

- ``stack()``
- ``unstack()``

In [11]:
populations = [33871648,37253956,
              18976457,19378102,
              20851820,25145561]
Mindex = pd.MultiIndex.from_product([['California','New York','Texas'],[2000,2010]])
pop = DataFrame(populations,index=Mindex,columns=['population'])
pop

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


【小技巧】使用unstack()的时候，level等于哪一个，哪一个就消失，出现在列里。

In [12]:
pop.unstack(level=0)

Unnamed: 0_level_0,population,population,population
Unnamed: 0_level_1,California,New York,Texas
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [13]:
pop.unstack(level=1)

Unnamed: 0_level_0,population,population
Unnamed: 0_level_1,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


============================================

练习10：

1. 使用unstack()将ddd变为两行，分别为期中期末

2. 使用unstack()将ddd变为四行，分别为四个科目

============================================

In [18]:
a = np.random.randint(0,150,(8,2))
Mindex = pd.MultiIndex.from_product([['语文','数学','英语','理综'],['期中','期末']])
ddd = DataFrame(data = a,
                index = Mindex,
                columns = ['张三','李四'])
ddd
# ddd.unstack(level = 0)
ddd.unstack(level = 1)

Unnamed: 0_level_0,张三,张三,李四,李四
Unnamed: 0_level_1,期中,期末,期中,期末
数学,48,131,93,98
理综,0,114,138,43
英语,42,149,112,127
语文,94,113,0,36


## 5. 聚合操作

【注意】

- 需要指定level

- 【小技巧】和unstack()相反，聚合的时候，level等于哪一个，哪一个就保留。

In [22]:
populations = [33871648,37253956,
              18976457,19378102,
              20851820,25145561]
Mindex = pd.MultiIndex.from_product([['California','New York','Texas'],[2000,2010]])
pop = DataFrame(populations,index=Mindex,columns=['population'])
pop

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [20]:
pop.mean(level=0)

Unnamed: 0,population
California,35562802.0
New York,19177279.5
Texas,22998690.5


In [21]:
pop.mean(level=1)

Unnamed: 0,population
2000,24566640.0
2010,27259210.0


============================================

练习11：

1. 计算各个科目期中期末平均成绩

2. 计算各科目张三李四的最高分

============================================

In [35]:
df = DataFrame(np.random.randint(0,150,(6,3)),
               index = pd.MultiIndex.from_product([['张三','李四','王五'],['期中','期末']]),
              columns = ['语文','数学','编程'])
display(df)
print("计算各个科目期中期末平均成绩")
display(df.mean(level = 1))
print("计算各科目张三李四的最高分")
display(df.loc['张三':'李四'].max(level = 1))

Unnamed: 0,Unnamed: 1,语文,数学,编程
张三,期中,5,0,136
张三,期末,139,4,92
李四,期中,26,74,52
李四,期末,51,105,18
王五,期中,117,34,51
王五,期末,58,55,18


计算各个科目期中期末平均成绩


Unnamed: 0,语文,数学,编程
期中,49.333333,36.0,79.666667
期末,82.666667,54.666667,42.666667


计算各科目张三李四的最高分


Unnamed: 0,语文,数学,编程
期中,26,74,136
期末,139,105,92
