In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# from IPython.display import Image


### Hierarchical Indexes on DataFrames and Series


In [2]:
# np.random.seed(12345)
# pd.options.display.max_rows = 20
# plt.rc('figure', figsize=(10, 6))

### Hierarchical Indexing
* We briefly discussed hierarchical indexing in the context of `groupby`
  * 
- Hierarchical indexing allows you to have multiple index levels on an axis
- Despite the name (indexing), it applies to both indexes and columns

In [14]:
prog_languages = pd.DataFrame({ "Course":["Python", "Rust", "Python", "Rust"], 
              "Nb_participants": [10, 30, 27, 18]})
prog_languages.groupby("Course").sum()
              

Unnamed: 0_level_0,Nb_participants
Course,Unnamed: 1_level_1
Python,37
Rust,48


In [33]:
data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    1.327195
   2   -0.919262
   3   -1.549106
b  1    0.022185
   3    0.758363
c  1   -0.660524
   2    0.862580
d  2   -0.010032
   3    0.050009
dtype: float64

In [34]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [38]:
data[('b', 1)]


0.0221845986581725

In [39]:
data['b', 1]

0.0221845986581725

In [40]:
data['b':'c']


b  1    0.022185
   3    0.758363
c  1   -0.660524
   2    0.862580
dtype: float64

In [41]:
data.loc[['b', 'd']]

b  1    0.022185
   3    0.758363
d  2   -0.010032
   3    0.050009
dtype: float64

In [46]:
data.loc[: , 2]

a   -0.919262
c    0.862580
d   -0.010032
dtype: float64

### Stacked vs. Unstacked Data

* Hierarchical indexes represent how the data is often collected
 * Ex. if variables in a hospital are measured and recorded in different files

```python
    File 1
    patient_ABC LDL 112
    patient_ABC HDL 48
    patient_CCX  LDL 112
    patient_VDM  LDL 112
    patient_ABC  VO2 112
    patient_CCZ  RER 48
    ...
```

* This format is preferred  when not all the variables are measured in all the patients

  * I.e., using columns for the variables would result in a large number of empty cells
  
* You can convert back and forth between a data frame and a hierarchical index with `stack()` and `unstack()`


In [12]:
x = data.unstack()
x

Unnamed: 0,1,2,3
a,-0.204708,0.478943,-0.519439
b,-0.55573,,1.965781
c,1.393406,0.092908,
d,,0.281746,0.769023


In [13]:
x.stack()

a  1   -0.204708
   2    0.478943
   3   -0.519439
b  1   -0.555730
   3    1.965781
c  1    1.393406
   2    0.092908
d  2    0.281746
   3    0.769023
dtype: float64

### Hierarchical indexes on rows and columns


* In a `DataFrame`, both axes can have a hierarchical index.

In [47]:
data = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [48]:
data.index.names = ['key1', 'key2']
data.columns.names = ['state', 'color']
data

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [59]:
print(data.index.get_level_values("key1"))
print("*" * 55)
print(data.index.get_level_values("key2"))

Index(['a', 'a', 'b', 'b'], dtype='object', name='key1')
*******************************************************
Int64Index([1, 2, 1, 2], dtype='int64', name='key2')


- What type of variable would the following return (DataFrame, Series, np.array,...)?

```python
    data['Ohio']`
```


- How about the following?

```python
    frame['Ohio', 'Geeen']`
```


In [65]:
print(type(data['Ohio']))
print("*" * 40)
data['Ohio']

<class 'pandas.core.frame.DataFrame'>
****************************************


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [66]:
print(type(data['Ohio', 'Green']))
data['Ohio', 'Green']

<class 'pandas.core.series.Series'>


key1  key2
a     1       0
      2       3
b     1       6
      2       9
Name: (Ohio, Green), dtype: int64

- You can create a `MultiIndex` and assign it manually if needed


In [67]:
data = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
data.index.names = ["letter", "number"]
data

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
letter,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [26]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
                       names=['state', 'color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

In [69]:
data.columns = pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
                       names=['state', 'color'])
data

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
letter,number,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### Reordering and Sorting Levels
- Swap levels takes two levels `numbers` or `names` and returns a new object with the levels interchanged

In [70]:
data.index.names = ['key1', 'key2']
data

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [71]:
data.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


- You can also pass the level's index, instead of name

In [73]:
data.sort_index(level=0, ascending=False)


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
b,2,9,10,11
b,1,6,7,8
a,2,3,4,5
a,1,0,1,2


In [75]:
data.swaplevel(0, 1).sort_index(level=0, ascending=False)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,b,9,10,11
2,a,3,4,5
1,b,6,7,8
1,a,0,1,2


In [77]:
data

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [76]:
data.swaplevel('state', 'color', axis=1)

Unnamed: 0_level_0,color,Green,Red,Green
Unnamed: 0_level_1,state,Ohio,Ohio,Colorado
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### Summary Statistics by Level

- The summary statistics that we've seen before can also be applied by level
- When you apply a function on the level, the remaining levels get `"squashed"`


In [35]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### Setting and Resetting and Index
* It' common to read in a data frame file and use one of its columns as an index
  * Setting a column as an index is done using `set_index()`
    * By default, the columns are removed from the `DataFrame`, but you can choose to leave them in 

  * `reset_index()` does the opposite; moves the index as a new column of the `DataFrame`
    * Creates a new index from an `IndexRange()`
      

In [78]:
data = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                      'c': ['one', 'one', 'one', 'two', 'two',
                            'two', 'two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]})
data

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [80]:
data2 = data.set_index('b')
data2

Unnamed: 0_level_0,a,c,d
b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,0,one,0
6,1,one,1
5,2,one,2
4,3,two,0
3,4,two,1
2,5,two,2
1,6,two,3


In [82]:
data3 = data.set_index(['c', 'd'])
data3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [83]:
data.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [84]:
data4 = data2.reset_index()
data4

Unnamed: 0,b,a,c,d
0,7,0,one,0
1,6,1,one,1
2,5,2,one,2
3,4,3,two,0
4,3,4,two,1
5,2,5,two,2
6,1,6,two,3


In [44]:
frame4.index

RangeIndex(start=0, stop=7, step=1)