## pandas07; multi index

In [1]:
import numpy as np
import pandas as pd

### 멀티 인덱스 
- pandas는 multi index를 지원한다
- series dataframe 다 가능
- index 방향 column 방향 모두 multi 가능하다

#### 다중 인덱스의 좋지 않은 형태 

In [4]:
# Anti-pattern: a plain list of tuples used as row labels. The recorded output
# shows each tuple rendered as one flat label, e.g. "(Group01, col01)".
multi_df = pd.DataFrame(
    np.random.randn(4, 4),
    index=[(f'Group0{k}', f'col0{k}') for k in range(1, 5)],
)
display(multi_df)

Unnamed: 0,0,1,2,3
"(Group01, col01)",0.149436,-0.126465,-0.106692,1.327128
"(Group02, col02)",-1.225975,-0.828206,1.326796,0.056881
"(Group03, col03)",0.120482,-0.190184,-1.792524,0.499005
"(Group04, col04)",0.182951,-0.513043,0.359277,-1.357466


#### 다중 인덱스의 바람직한 형태

In [7]:
# Series: build the two-level index explicitly from (group, column) tuples
index = pd.MultiIndex.from_tuples([
    ('Group01', 'col01'),
    ('Group01', 'col02'),
    ('Group02', 'col03'),
    ('Group02', 'col04'),
])
pd.Series(range(4), index=index)

Group01  col01    0
         col02    1
Group02  col03    2
         col04    3
dtype: int64

In [9]:
# DataFrame: the same explicit MultiIndex works as the row index of a frame
index = pd.MultiIndex.from_tuples([
    ('Group01', 'col01'),
    ('Group01', 'col02'),
    ('Group02', 'col03'),
    ('Group02', 'col04'),
])
pd.DataFrame(np.random.randn(4, 4), index=index)

Unnamed: 0,Unnamed: 1,0,1,2,3
Group01,col01,0.731766,0.677621,0.839421,-1.690829
Group01,col02,-0.137007,0.78504,-0.046417,-0.959389
Group02,col03,1.158822,0.502073,1.468764,2.050496
Group02,col04,0.562004,-0.379638,-0.115096,-0.033078


### MultiIndex 생성하는법
- index columns 를 배열로
- 명시적인 MultiIndex => pd.MultiIndex의 생성 메서드(from_arrays, from_tuples 등) 이용

In [12]:
# Pass 2-D labels (one inner list per level) for both index and columns.
# Think of it as the tuples used earlier, split into one list per level.
np.random.seed(100)
multi_df = pd.DataFrame(
    np.random.randn(5, 4),
    index=[
        ['M', 'M', 'M', 'F', 'F'],
        list(range(5)),
    ],
    columns=[
        ['Group01', 'Group01', 'Group02', 'Group02'],
        ['col01', 'col02', 'col01', 'col02'],
    ],
)
display(multi_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Unnamed: 1_level_1,col01,col02,col01,col02
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163
F,3,-0.583595,0.816847,0.672721,-0.104411
F,4,-0.53128,1.029733,-0.438136,-1.118318


In [15]:
# Printed, the row index looks like an array of tuples,
# but its actual type is MultiIndex.
print(multi_df.index, type(multi_df.index), sep='\n')

MultiIndex([('M', 0),
            ('M', 1),
            ('M', 2),
            ('F', 3),
            ('F', 4)],
           )
<class 'pandas.core.indexes.multi.MultiIndex'>


In [16]:
# MultiIndex levels can be named too. Unlike a flat index (index.name /
# columns.name), use the plural index.names / columns.names and assign a list
# holding one name per level.
np.random.seed(100)
multi_df = pd.DataFrame(
    np.random.randn(5, 4),
    index=[
        ['M', 'M', 'M', 'F', 'F'],
        list(range(5)),
    ],
    columns=[
        ['Group01', 'Group01', 'Group02', 'Group02'],
        ['col01', 'col02', 'col01', 'col02'],
    ],
)
multi_df.index.names = ['Gender', 'Row_Id']
multi_df.columns.names = ['Group_Id', 'Col_Id']
display(multi_df)

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163
F,3,-0.583595,0.816847,0.672721,-0.104411
F,4,-0.53128,1.029733,-0.438136,-1.118318


#### 명시적으로 MultiIndex 선언
- MultiIndex 객체를 만드는 여러방법 존재
    - pd.MultiIndex.from_arrays()
    - pd.MultiIndex.from_tuples()
    - 이정도만 알아도 충분하다

In [71]:
# from_arrays: one list per level; position k of every list forms the k-th key
arrays = [list('abcd'), list(range(1, 5))]
pd.MultiIndex.from_arrays(arrays)

MultiIndex([('a', 1),
            ('b', 2),
            ('c', 3),
            ('d', 4)],
           )

In [73]:
# from_tuples: one tuple per entry, holding that entry's label at every level
tuples = list(zip('abcd', range(1, 5)))
pd.MultiIndex.from_tuples(tuples)

MultiIndex([('a', 1),
            ('b', 2),
            ('c', 3),
            ('d', 4)],
           )

In [77]:
# Build the MultiIndex from tuples, then use it as the index of a Series
tuples = list(zip('abcd', range(1, 5)))
index = pd.MultiIndex.from_tuples(tuples)
pd.Series(range(100, 104), index=index)

a  1    100
b  2    101
c  3    102
d  4    103
dtype: int64

### stack unstack
- stack : 열을 행으로
- unstack : 행을 열로

In [66]:
# Alternative spellings that pick the column level explicitly,
# by position or by name:
# multi_df.stack(0)
# multi_df.stack('Col_Id')
# With no argument, the recorded output shows the innermost column level
# ('Col_Id') moved into the row index.
multi_df.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Group_Id,Group01,Group02
Gender,Row_Id,Col_Id,Unnamed: 3_level_1,Unnamed: 4_level_1
M,0,col01,-1.749765,1.153036
M,0,col02,0.34268,-0.252436
M,1,col01,0.981321,0.22118
M,1,col02,0.514219,-1.070043
M,2,col01,-0.189496,-0.458027
M,2,col02,0.255001,0.435163
F,3,col01,-0.583595,0.672721
F,3,col02,0.816847,-0.104411
F,4,col01,-0.53128,-0.438136
F,4,col02,1.029733,-1.118318


In [65]:
# Move the 'Group_Id' column level into the row index; 'Col_Id' stays as columns.
multi_df.stack('Group_Id')

Unnamed: 0_level_0,Unnamed: 1_level_0,Col_Id,col01,col02
Gender,Row_Id,Group_Id,Unnamed: 3_level_1,Unnamed: 4_level_1
M,0,Group01,-1.749765,0.34268
M,0,Group02,1.153036,-0.252436
M,1,Group01,0.981321,0.514219
M,1,Group02,0.22118,-1.070043
M,2,Group01,-0.189496,0.255001
M,2,Group02,-0.458027,0.435163
F,3,Group01,-0.583595,0.816847
F,3,Group02,0.672721,-0.104411
F,4,Group01,-0.53128,1.029733
F,4,Group02,-0.438136,-1.118318


In [49]:
# Pivot row-index level 0 ('Gender') into the columns. (Row_Id, Gender)
# combinations that do not exist in the data come out as missing values.
multi_df.unstack(0)

Group_Id,Group01,Group01,Group01,Group01,Group02,Group02,Group02,Group02
Col_Id,col01,col01,col02,col02,col01,col01,col02,col02
Gender,F,M,F,M,F,M,F,M
Row_Id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
0,,-1.749765,,0.34268,,1.153036,,-0.252436
1,,0.981321,,0.514219,,0.22118,,-1.070043
2,,-0.189496,,0.255001,,-0.458027,,0.435163
3,-0.583595,,0.816847,,0.672721,,-0.104411,
4,-0.53128,,1.029733,,-0.438136,,-1.118318,


In [68]:
# Pivot the 'Row_Id' row level into the columns, selecting the level by name.
# Each Gender only has some Row_Id values, so the rest are missing values.
multi_df.unstack('Row_Id')

Group_Id,Group01,Group01,Group01,Group01,Group01,Group01,Group01,Group01,Group01,Group01,Group02,Group02,Group02,Group02,Group02,Group02,Group02,Group02,Group02,Group02
Col_Id,col01,col01,col01,col01,col01,col02,col02,col02,col02,col02,col01,col01,col01,col01,col01,col02,col02,col02,col02,col02
Row_Id,0,1,2,3,4,0,1,2,3,4,0,1,2,3,4,0,1,2,3,4
Gender,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
F,,,,-0.583595,-0.53128,,,,0.816847,1.029733,,,,0.672721,-0.438136,,,,-0.104411,-1.118318
M,-1.749765,0.981321,-0.189496,,,0.34268,0.514219,0.255001,,,1.153036,0.22118,-0.458027,,,-0.252436,-1.070043,0.435163,,


### MultiIndex의 name을 어떻게 바꿀까?
- rename의 형식과 같다
- level 과 axis를 조작해서 원하는 level에 진입하자

In [2]:
# Rebuild the named two-level frame used by the rename examples that follow.
np.random.seed(100)
multi_df = pd.DataFrame(
    np.random.randn(5, 4),
    index=[
        ['M', 'M', 'M', 'F', 'F'],
        list(range(5)),
    ],
    columns=[
        ['Group01', 'Group01', 'Group02', 'Group02'],
        ['col01', 'col02', 'col01', 'col02'],
    ],
)
multi_df.index.names = ['Gender', 'Row_Id']
multi_df.columns.names = ['Group_Id', 'Col_Id']
display(multi_df)

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163
F,3,-0.583595,0.816847,0.672721,-0.104411
F,4,-0.53128,1.029733,-0.438136,-1.118318


In [12]:
# No axis is passed, so the mapping is applied to the row index (axis=0);
# level='Gender' restricts it to that level's labels.
multi_df.rename({'M':'남자','F':'여자'},level='Gender')

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
남자,0,-1.749765,0.34268,1.153036,-0.252436
남자,1,0.981321,0.514219,0.22118,-1.070043
남자,2,-0.189496,0.255001,-0.458027,0.435163
여자,3,-0.583595,0.816847,0.672721,-0.104411
여자,4,-0.53128,1.029733,-0.438136,-1.118318


In [11]:
# axis=1 targets the column labels; level='Group_Id' selects the level by name.
multi_df.rename({'Group01':'그룹1','Group02':'그룹2'},level='Group_Id',axis=1)

Unnamed: 0_level_0,Group_Id,그룹1,그룹1,그룹2,그룹2
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163
F,3,-0.583595,0.816847,0.672721,-0.104411
F,4,-0.53128,1.029733,-0.438136,-1.118318


In [13]:
# Select the column level by position instead of by name:
# level 0 is the outermost column level ('Group_Id').
multi_df.rename({'Group01':'그룹1','Group02':'그룹2'},level=0,axis=1)

Unnamed: 0_level_0,Group_Id,그룹1,그룹1,그룹2,그룹2
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163
F,3,-0.583595,0.816847,0.672721,-0.104411
F,4,-0.53128,1.029733,-0.438136,-1.118318


### MultiIndex 인덱싱
- multiIndex는 튜플로 접근한다
- 앞서 배운 인덱싱, 슬라이싱 규칙과 return type은 모두 똑같이 적용된다
- 다만 MultiIndex도 하나의 차원으로 생각하면 좋다 

In [112]:
# Two-level Series used by the indexing examples that follow
index = pd.MultiIndex.from_tuples([
    ('Group01', 'col01'),
    ('Group01', 'col02'),
    ('Group02', 'col03'),
    ('Group02', 'col04'),
])
multi_series = pd.Series(range(4), index=index)
display(multi_series)

Group01  col01    0
         col02    1
Group02  col03    2
         col04    3
dtype: int64

In [113]:
# A key for the outer level only returns the sub-Series indexed by the inner level.
multi_series['Group01']

col01    0
col02    1
dtype: int64

In [124]:
# A full key (outer, inner) returns a single scalar value.
print(multi_series['Group01','col01'])

# Same lookup with the tuple written out explicitly, which reads more clearly.
print(multi_series[('Group01','col01')])

0
0


In [125]:
# Label-based slice between two full tuple keys; the recorded output includes
# both endpoints.
multi_series.loc[('Group01','col01'):('Group01','col02')]

Group01  col01    0
         col02    1
dtype: int64

In [78]:
# Rebuild the named two-level frame used by the DataFrame indexing examples.
np.random.seed(100)
multi_df = pd.DataFrame(
    np.random.randn(5, 4),
    index=[
        ['M', 'M', 'M', 'F', 'F'],
        list(range(5)),
    ],
    columns=[
        ['Group01', 'Group01', 'Group02', 'Group02'],
        ['col01', 'col02', 'col01', 'col02'],
    ],
)
multi_df.index.names = ['Gender', 'Row_Id']
multi_df.columns.names = ['Group_Id', 'Col_Id']
display(multi_df)

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163
F,3,-0.583595,0.816847,0.672721,-0.104411
F,4,-0.53128,1.029733,-0.438136,-1.118318


- 아래 두 예시는 모두 오류
    - multi_df['col01'] # KeyError: 최상위 level(Group_Id)에 'col01'이라는 key가 없다
    - multi_df[('col01','Group01')] # KeyError: 튜플은 level 순서(Group_Id, Col_Id)와 맞아야 된다

In [121]:
# As seen earlier for DataFrames, this bracket form selects a column; it is not
# row/column indexing. Both lines pass the same tuple key, and the notebook
# displays only the value of the last expression.
# NOTE(review): the recorded output shows integers, which match the randint
# frame built in the sorting section rather than the randn frame defined just
# above. It looks like the cell was run out of order; re-run top to bottom.
multi_df['Group01','col01']
multi_df[('Group01','col01')]

Gender  Row_Id
M       0         9
        1         8
        2         6
F       3         2
        4         1
Name: (Group01, col01), dtype: int64

In [92]:
# Column selection with a list holding one tuple key returns a DataFrame
multi_df[[('Group01','col01')]]

Unnamed: 0_level_0,Group_Id,Group01
Unnamed: 0_level_1,Col_Id,col01
Gender,Row_Id,Unnamed: 2_level_2
M,0,-1.749765
M,1,0.981321
M,2,-0.189496
F,3,-0.583595
F,4,-0.53128


In [85]:
# Column fancy indexing: select several columns with a list of tuple keys
multi_df[[('Group01','col01'),('Group01','col02')]]

Unnamed: 0_level_0,Group_Id,Group01,Group01
Unnamed: 0_level_1,Col_Id,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2
M,0,-1.749765,0.34268
M,1,0.981321,0.514219
M,2,-0.189496,0.255001
F,3,-0.583595,0.816847
F,4,-0.53128,1.029733


In [87]:
# Row selection: a list holding one (Gender, Row_Id) tuple returns a one-row DataFrame
multi_df.loc[[('M',0)]]

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436


In [90]:
# Row and column selection together: a full tuple key on each axis yields a scalar
multi_df.loc[('M',0),('Group02','col01')]

1.153035802563644

In [93]:
# A list with only the outer-level label selects every row of that group.
multi_df.loc[['M']]

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,-1.749765,0.34268,1.153036,-0.252436
M,1,0.981321,0.514219,0.22118,-1.070043
M,2,-0.189496,0.255001,-0.458027,0.435163


In [94]:
# Note that selecting by the outer column label alone returns a DataFrame:
# the remaining 'Col_Id' level becomes its columns.
multi_df['Group01']

Unnamed: 0_level_0,Col_Id,col01,col02
Gender,Row_Id,Unnamed: 2_level_1,Unnamed: 3_level_1
M,0,-1.749765,0.34268
M,1,0.981321,0.514219
M,2,-0.189496,0.255001
F,3,-0.583595,0.816847
F,4,-0.53128,1.029733


### MultiIndex 인덱스 정렬
- axis는 당연히 지정
- level 속성을 통해 기준을 정의 
- level을 숫자 혹은 name으로 지정해준다
- level 은 상위 속성이 0부터시작

In [95]:
# Integer-valued version of the named two-level frame, used for the sorting examples.
np.random.seed(100)
multi_df = pd.DataFrame(
    np.random.randint(1, 10, size=(5, 4)),
    index=[
        ['M', 'M', 'M', 'F', 'F'],
        list(range(5)),
    ],
    columns=[
        ['Group01', 'Group01', 'Group02', 'Group02'],
        ['col01', 'col02', 'col01', 'col02'],
    ],
)
multi_df.index.names = ['Gender', 'Row_Id']
multi_df.columns.names = ['Group_Id', 'Col_Id']
display(multi_df)

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,9,9,4,8
M,1,8,1,5,3
M,2,6,3,3,3
F,3,2,1,9,5
F,4,1,7,3,5


In [96]:
# Sort the rows by the 'Gender' index level (no axis passed, so rows are sorted).
multi_df.sort_index(level='Gender')

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
F,3,2,1,9,5
F,4,1,7,3,5
M,0,9,9,4,8
M,1,8,1,5,3
M,2,6,3,3,3


In [97]:
# Same sort with the row axis spelled out explicitly via axis=0.
multi_df.sort_index(axis=0,level='Gender')

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
F,3,2,1,9,5
F,4,1,7,3,5
M,0,9,9,4,8
M,1,8,1,5,3
M,2,6,3,3,3


In [98]:
# Several levels can be given as a list: sort by 'Gender', then by 'Row_Id'.
multi_df.sort_index(axis=0,level=['Gender','Row_Id'])

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
F,3,2,1,9,5
F,4,1,7,3,5
M,0,9,9,4,8
M,1,8,1,5,3
M,2,6,3,3,3


In [100]:
# Sort the columns (axis=1) by level 1, i.e. 'Col_Id'; levels are numbered
# from the outermost one starting at 0.
multi_df.sort_index(level=1,axis=1)

Unnamed: 0_level_0,Group_Id,Group01,Group02,Group01,Group02
Unnamed: 0_level_1,Col_Id,col01,col01,col02,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,9,4,9,8
M,1,8,5,1,3
M,2,6,3,3,3
F,3,2,9,1,5
F,4,1,3,7,5


### 다중 인덱스 집계 함수
- 집계 함수의 `level` 인자는 deprecated 된 방법이다 (groupby 사용을 권장)

In [101]:
# Integer-valued version of the named two-level frame, used for the aggregation examples.
np.random.seed(100)
multi_df = pd.DataFrame(
    np.random.randint(1, 10, size=(5, 4)),
    index=[
        ['M', 'M', 'M', 'F', 'F'],
        list(range(5)),
    ],
    columns=[
        ['Group01', 'Group01', 'Group02', 'Group02'],
        ['col01', 'col02', 'col01', 'col02'],
    ],
)
multi_df.index.names = ['Gender', 'Row_Id']
multi_df.columns.names = ['Group_Id', 'Col_Id']
display(multi_df)

Unnamed: 0_level_0,Group_Id,Group01,Group01,Group02,Group02
Unnamed: 0_level_1,Col_Id,col01,col02,col01,col02
Gender,Row_Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,0,9,9,4,8
M,1,8,1,5,3
M,2,6,3,3,3
F,3,2,1,9,5
F,4,1,7,3,5


In [102]:
# Without grouping, mean() returns one value per column, indexed by the
# column MultiIndex (Group_Id, Col_Id).
multi_df.mean()

Group_Id  Col_Id
Group01   col01     5.2
          col02     4.2
Group02   col01     4.8
          col02     4.8
dtype: float64

In [103]:
# Per-level aggregation. The original spelling, multi_df.mean(level='Gender'),
# was deprecated in pandas 1.3 and removed in pandas 2.0, where it raises
# TypeError. groupby on the index level is the supported replacement.
# sort=False keeps the groups in first-seen order (M, F), matching what the
# old call displayed; the default sort=True would order them F, M.
multi_df.groupby(level='Gender', sort=False).mean()

  multi_df.mean(level='Gender')


Group_Id,Group01,Group01,Group02,Group02
Col_Id,col01,col02,col01,col02
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M,7.666667,4.333333,4.0,4.666667
F,1.5,4.0,6.0,5.0


In [109]:
# Group the rows by the 'Gender' index level, then average within each group.
# The recorded output lists the groups in sorted order (F, M).
multi_df.groupby(level='Gender').mean()

Group_Id,Group01,Group01,Group02,Group02
Col_Id,col01,col02,col01,col02
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,1.5,4.0,6.0,5.0
M,7.666667,4.333333,4.0,4.666667


In [104]:
# The index level name can also be passed directly as the grouping key;
# the recorded output is the same as with level='Gender'.
multi_df.groupby('Gender').mean()

Group_Id,Group01,Group01,Group02,Group02
Col_Id,col01,col02,col01,col02
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,1.5,4.0,6.0,5.0
M,7.666667,4.333333,4.0,4.666667
