## 8.1 階層式索引

In [1]:
import pandas as pd
import numpy as np
# Nine random draws labelled by a two-level index:
# outer level 'a'..'d', inner level 1..3 (not every combination is present).
data = pd.Series(
    data=np.random.randn(9),
    index=pd.MultiIndex.from_arrays([
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ]),
)
data

a  1    0.890653
   2    0.380326
   3   -0.899443
b  1    1.438287
   3    1.377960
c  1   -0.907391
   2   -0.993755
d  2    0.359150
   3    0.997621
dtype: float64

In [3]:
# Show the index object of `data` (built from two label arrays, so it has two levels).
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [4]:
# Select a subset by outer-level labels.
# For this data the label list gives the same rows as the slice data['b':'c'].
data.loc[['b', 'c']]

b  1    1.438287
   3    1.377960
c  1   -0.907391
   2   -0.993755
dtype: float64

In [6]:
# Select on the inner level: every entry whose second-level label is 2.
# The matched level is dropped, leaving only the outer labels.
data.xs(2, level=1)

a    0.380326
c   -0.993755
d    0.359150
dtype: float64

In [7]:
'''
Reshaping hierarchically indexed data
'''
# Pivot an index level into columns.
# level=-1 (the default) targets the innermost index level.
data.unstack(level=-1)

Unnamed: 0,1,2,3
a,0.890653,0.380326,-0.899443
b,1.438287,,1.37796
c,-0.907391,-0.993755,
d,,0.35915,0.997621


In [10]:
# Inverse operation: pivot the columns back into the innermost index level.
# Both calls act on the innermost level (level=-1 is the default for each).
data.unstack(level=-1).stack(level=-1)

a  1    0.890653
   2    0.380326
   3   -0.899443
b  1    1.438287
   3    1.377960
c  1   -0.907391
   2   -0.993755
d  2    0.359150
   3    0.997621
dtype: float64

In [11]:
'''
Either axis of a DataFrame can carry its own hierarchical index
'''
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)

frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [12]:
# Give every level of both hierarchical indexes a name.
# Note: these assignments modify `frame` in place.
frame.columns.names = ['state', 'color']
frame.index.names = ['key1', 'key2']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [31]:
'''
A pandas MultiIndex can also be created on its own and reused
to build multi-level data; the result is the same as above
'''
columns2 = pd.MultiIndex.from_tuples(
    [('Ohio', 'Green'), ('Ohio', 'Red'), ('Colorado', 'Green')],
    names=['state', 'color'],
)
columns2

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

In [32]:
# Row index: every combination of key1 in {'a', 'b'} with key2 in {1, 2},
# in the order (a,1), (a,2), (b,1), (b,2).
index2 = pd.MultiIndex.from_product(
    [['a', 'b'], [1, 2]],
    names=['key1', 'key2'],
)
index2

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['key1', 'key2'])

In [33]:
# Build the 4x3 frame directly from the two prepared MultiIndex objects.
frame2 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=index2,
    columns=columns2,
)
frame2

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 重排階層及依階層排序值

In [34]:
# Exchange two levels of the row index, referenced by name.
# A new object is returned; the row data itself is not reordered.
frame.swaplevel(i='key1', j='key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [39]:
# Sort the rows by one chosen level of the row index.
# level=1 is the second level (levels are numbered from 0).
frame.sort_index(axis=0, level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [41]:
# Exchange the first and second row-index levels, referenced by position.
frame.swaplevel(i=0, j=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [40]:
# Swap the two row-index levels, then sort by the new first level (level=0).
frame.swaplevel(i=0, j=1).sort_index(axis=0, level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### 指定階層統計資訊

In [42]:
# Sum the rows that share the same `key2` label.
# The `level=` keyword of DataFrame.sum was deprecated in pandas 1.3 and
# removed in pandas 2.0, so the aggregation is expressed as a groupby on the
# index level instead. sort=False keeps groups in order of first appearance,
# which is how the legacy `level=` aggregation ordered its result.
frame.groupby(level='key2', sort=False).sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [43]:
# Sum the columns that share the same `color` label.
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0, and
# groupby(axis=1) is itself deprecated, so group the transposed frame on the
# `color` level and transpose the result back. sort=False keeps the colours
# in order of first appearance.
frame.T.groupby(level='color', sort=False).sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### 用 DataFrame 的欄當 index

In [45]:
# Small flat table; columns 'c' and 'd' are used as index keys in the next cells.
# Note: this rebinds the name `frame` to a new, unrelated DataFrame.
frame = pd.DataFrame(dict(
    a=range(7),
    b=range(7, 0, -1),
    c=['one'] * 3 + ['two'] * 4,
    d=[0, 1, 2, 0, 1, 2, 3],
))
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [46]:
# columns --> index
# set_index returns a new object that uses one or more of the original
# columns as its (hierarchical) index; drop=True is the default and removes
# those columns from the data.
frame2 = frame.set_index(keys=['c', 'd'], drop=True)
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [51]:
# Keep the key columns in the data as well as in the index
# (by default they would be removed).
frame3 = frame.set_index(keys=['c', 'd'], drop=False)
frame3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [52]:
# index --> columns
# Move the index levels back into ordinary columns (drop=False is the default),
# leaving a plain integer index.
frame2.reset_index(drop=False)

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


---

## 8.2 合併資料集合