### Hierarchical Indexing (Multi-Indexing)

Data indexed by more than one or two keys. (Pandas : `Panel, Panel 4D Object`)

#### 1. Multi-Index Creation

###### The Bad Way

In [4]:
import pandas as pd
import numpy as np

index = [ ('California', 2000), ('California', 2010), ('New York', 2000), ('New York', 2010), 
        ('Texas', 2000), ('Texas', 2010) ]
populations = [ 33871648, 37253956, 20851820, 25145561, 18976457, 19378102]

pop = pd.Series(populations, index= index)
print(pop)

# messy code to find all event in 2010
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      20851820
(New York, 2010)      25145561
(Texas, 2000)         18976457
(Texas, 2010)         19378102
dtype: int64


(California, 2010)    37253956
(New York, 2010)      25145561
(Texas, 2010)         19378102
dtype: int64

###### The Better Way
**\- Pass a list of two or more index arrays**

In [5]:
df = pd.DataFrame(np.random.rand(4,2), index= [['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns = ['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.204101,0.654957
a,2,0.319888,0.587272
b,1,0.918936,0.264804
b,2,0.757801,0.530068


**\-Pass a dictionary with appropriate tuples as keys**

In [6]:
data = {('California', 2000): 33871648, ('California', 2010): 37253956, ('New York', 2000): 18976457, 
        ('New York', 2010): 19378102, ('Texas', 2000): 20851820, ('Texas', 2010): 25145561 }
pop = pd.Series(data)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [7]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

** \-Explicit Multi-Index Constructor**

In [8]:
# directly from its encoding
pd.MultiIndex(levels= [['a', 'b'], [1, 2]], labels= [[0, 0, 1, 1], [0, 1, 0, 1]])
    # 여기서 라벨은 앞의 level을 받는 것
    # 아래 Cartesian은 자동으로 교차 선택이라면, 여기서는 수동으로 골라주는 것이다.

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [9]:
# from a single list of array
index1 = pd.MultiIndex.from_arrays( [['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

In [10]:
# from a list of tuples
pd.MultiIndex.from_tuples([ ('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [11]:
# from a Cartesian product
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [12]:
# Index를 만든 다음, 아래처럼 reindex를 해준다.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

**\- Another way - add extra dimension**

In [13]:
pop_df = pop.unstack()
print(pop_df)

#opposite operation
pop_df.stack()
pop

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

**\-MultiIndex Level Names**

In [14]:
pop.index.names

FrozenList([None, None])

In [15]:
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

**\-MultiIndex For Columns**

In [16]:
# Hierarchical Indices and columns
index = pd.MultiIndex.from_product( [[2013, 2014], [1, 2]], names= ['year', 'visit'] )
columns = pd.MultiIndex.from_product( [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names= ['subject', 'type'] )

# mock some data
data = np.round(np.random.randn(4,6), 1)
data[:, ::2]*= 10

# Create the DataFrame
health_data = pd.DataFrame(data, index= index, columns= columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,8.0,-0.1,-3.0,0.6,-4.0,-0.0
2013,2,-9.0,-1.4,15.0,0.6,1.0,-0.3
2014,1,-14.0,-1.2,-0.0,1.0,1.0,-1.0
2014,2,-2.0,-0.1,11.0,0.4,8.0,-0.3


#### 2. Indexing and Slicing a MultiIndex

**\-Series**

In [17]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [18]:
pop['California', 2000]

33871648

In [19]:
# partial indexing
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [20]:
# partial slicing
pop['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [21]:
# partial indexing on lower level
pop[:, 2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [22]:
print( pop[ pop> 22000000])  # boolean mask
print( pop[ [('California', 2000), ('Texas', 2000)]])  # fancy indexing

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64
state       year
California  2000    33871648
Texas       2000    20851820
dtype: int64


**\-DataFrame**

In [23]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,8.0,-0.1,-3.0,0.6,-4.0,-0.0
2013,2,-9.0,-1.4,15.0,0.6,1.0,-0.3
2014,1,-14.0,-1.2,-0.0,1.0,1.0,-1.0
2014,2,-2.0,-0.1,11.0,0.4,8.0,-0.3


In [24]:
health_data['Guido', 'HR']

year  visit
2013  1        -3.0
      2        15.0
2014  1        -0.0
      2        11.0
Name: (Guido, HR), dtype: float64

In [25]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,8.0,-0.1
2013,2,-9.0,-1.4


In [26]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1         8.0
      2        -9.0
2014  1       -14.0
      2        -2.0
Name: (Bob, HR), dtype: float64

#### 3. Rearranging Multiple Indices

Many of the MultiIndex slicing operations ***will fail*** if the index is not sorted.

**\-Unsorted Indices**

In [27]:
# Unsorted Data
index = pd.MultiIndex.from_product([ ['a', 'c', 'b'], [1, 2]])
data = pd.Series( np.random.rand(6), index= index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.395969
      2      0.041776
c     1      0.684811
      2      0.836663
b     1      0.232567
      2      0.387720
dtype: float64

In [28]:
# will make an error
try : data['a' : 'b'] 
    
except KeyError as e :
    print("error!")
    print(type(e))
    print(e)

error!
<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


**\-Sorting**

In [29]:
data = data.sort_index()
print(data)

data = data.sortlevel()
print(data)

char  int
a     1      0.395969
      2      0.041776
b     1      0.232567
      2      0.387720
c     1      0.684811
      2      0.836663
dtype: float64
char  int
a     1      0.395969
      2      0.041776
b     1      0.232567
      2      0.387720
c     1      0.684811
      2      0.836663
dtype: float64


  after removing the cwd from sys.path.


**\-Stacking & Unstacking**

In [30]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [31]:
print(pop.unstack(level = 0))
print("\n", pop.unstack(level = 1))  # default is level 1
print("\n", pop.unstack().stack())

state  California  New York     Texas
year                                 
2000     33871648  18976457  20851820
2010     37253956  19378102  25145561

 year            2000      2010
state                         
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561

 state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


**\-Index setting and resetting**

In [32]:
# turn the index labels into columns
pop_flat = pop.reset_index(name = 'population')
print(pop_flat)

# Now the reverse
print(pop_flat.set_index(['state', 'year']))

        state  year  population
0  California  2000    33871648
1  California  2010    37253956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561
                 population
state      year            
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561


#### 4. Data Aggregation on Multi Indices
Pandas has built-in data aggregation methods.

In [34]:
data_mean = health_data.mean(level='year')
print(data_mean,"\n")
print(data_mean.mean(axis = 1, level='type'))

subject  Bob       Guido       Sue      
type      HR  Temp    HR Temp   HR  Temp
year                                    
2013    -0.5 -0.75   6.0  0.6 -1.5 -0.15
2014    -8.0 -0.65   5.5  0.7  4.5 -0.65 

type        HR  Temp
year                
2013  1.333333  -0.1
2014  0.666667  -0.2


In [35]:
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,-0.5,-0.75,6.0,0.6,-1.5,-0.15
2014,-8.0,-0.65,5.5,0.7,4.5,-0.65


# Indexers : loc, iloc, and ix

** These slicing and indexing conventions can be a source of confusion. If your Series has an explicit integer index, an indexing operation such as data[1] will use the explicit indices, while a slicing operation like data[1:3] will use the implicit Python-style index. **

In [2]:
import pandas as pd

In [3]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [4]:
# explicit index when indexing
data[1]

'a'

In [5]:
# implicit  index when slicing
data[1:3]

3    b
5    c
dtype: object

##### - loc
loc attribute allows indexing and slicing that always references the explicit index

In [6]:
print(data.loc[1])
print(data.loc[1:3])

a
1    a
3    b
dtype: object


##### - iloc
iloc attribute uses the Python-style index

In [7]:
print(data.iloc[1],"\n")
print(data.iloc[1:3])

b 

3    b
5    c
dtype: object


##### - ix
ix attribute is a hybrid of the two

In [8]:
col_1 = pd.Series(data = ['Hello', 'How', 'Are', 'You'])
col_2 = pd.Series(data = ['A', 'B', 'C', 'D'])

df = pd.DataFrame({'Str1' : col_1, 'Str2' : col_2})
df

Unnamed: 0,Str1,Str2
0,Hello,A
1,How,B
2,Are,C
3,You,D


In [9]:
df.ix['0':'2', 0:1]  # (exp, imp)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':


Unnamed: 0,Str1
0,Hello
1,How
2,Are


In [10]:
df.ix[0:2, 'Str1' :'Str2'] # (exp, exp)

Unnamed: 0,Str1,Str2
0,Hello,A
1,How,B
2,Are,C
