# Hierarchical Indexing

Hierarchical indexing lets you have multiple (two or more) index levels on an axis.

In [25]:
import pandas as pd
import numpy as np
# Seed a local generator so the values are identical on every
# Restart & Run All (re-run the notebook to refresh the saved outputs).
rng = np.random.default_rng(42)

# Passing two parallel lists as `index` builds a two-level MultiIndex:
# the letters form the outer level, the integers the inner level.
data = pd.Series(
    rng.standard_normal(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
data

a  1   -2.066752
   2    0.221352
   3   -1.729650
b  1   -0.493059
   3    0.101517
c  1    0.584712
   2   -0.337995
d  2    1.952769
   3   -0.655319
dtype: float64

# partial indexing

In [26]:
# The index is a MultiIndex: one (outer, inner) tuple per row
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [27]:
# Show the full Series first, then select by outer label only:
# the result is indexed by the remaining inner level.
print(data)
data.loc['a']

a  1   -2.066752
   2    0.221352
   3   -1.729650
b  1   -0.493059
   3    0.101517
c  1    0.584712
   2   -0.337995
d  2    1.952769
   3   -0.655319
dtype: float64


1   -2.066752
2    0.221352
3   -1.729650
dtype: float64

# inner level selection

In [28]:
# Show the full Series first, then take every outer label (":")
# whose inner label equals 3.
print(data)
data.loc[:, 3]

a  1   -2.066752
   2    0.221352
   3   -1.729650
b  1   -0.493059
   3    0.101517
c  1    0.584712
   2   -0.337995
d  2    1.952769
   3   -0.655319
dtype: float64


a   -1.729650
b    0.101517
d   -0.655319
dtype: float64

In [29]:
# Show the full Series first, then pivot the innermost index level
# into columns; missing (outer, inner) pairs show up as NaN.
print(data)
data.unstack(level=-1)

a  1   -2.066752
   2    0.221352
   3   -1.729650
b  1   -0.493059
   3    0.101517
c  1    0.584712
   2   -0.337995
d  2    1.952769
   3   -0.655319
dtype: float64


Unnamed: 0,1,2,3
a,-2.066752,0.221352,-1.72965
b,-0.493059,,0.101517
c,0.584712,-0.337995,
d,,1.952769,-0.655319


In [30]:
# Round trip: stack() folds the columns back into the inner index level,
# restoring the hierarchical index that unstack() removed
(
    data
    .unstack()
    .stack()
)

a  1   -2.066752
   2    0.221352
   3   -1.729650
b  1   -0.493059
   3    0.101517
c  1    0.584712
   2   -0.337995
d  2    1.952769
   3   -0.655319
dtype: float64

# DataFrame

In [31]:
# A DataFrame may carry a hierarchical index on both axes:
# rows are keyed by (letter, number), columns by (state, colour).
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 2, 1, 2],
    ],
    columns=[
        ['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green'],
    ],
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [32]:
# Partial indexing on the columns: selecting the outer label 'Ohio'
# returns the sub-frame of its inner columns (Green, Red)
frame['Ohio']

Unnamed: 0,Unnamed: 1,Green,Red
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


# Converting columns to index

In [57]:
# Seed a local generator so column 'a' is identical on every
# Restart & Run All (re-run the notebook to refresh the saved outputs).
rng = np.random.default_rng(42)

frame = pd.DataFrame({
    'a': rng.integers(0, high=10, size=7),  # 7 integers drawn from [0, 10)
    'b': range(7, 0, -1),                   # counts down 7..1
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3],
})
frame

Unnamed: 0,a,b,c,d
0,8,7,one,0
1,6,6,one,1
2,8,5,one,2
3,4,4,two,0
4,4,3,two,1
5,9,2,two,2
6,6,1,two,3


In [55]:
# Move columns 'a' and 'b' into a two-level row index.
# The result is bound to a new name on purpose: the following cells call
# frame.set_index(['a', 'b']) again, which only works while 'a' and 'b'
# are still columns of `frame`. Rebinding `frame` here made those cells
# fail on a fresh top-to-bottom run.
print(frame)
frame_indexed = frame.set_index(['a', 'b'])
frame_indexed

   a  b    c  d
0  5  7  one  0
1  2  6  one  1
2  3  5  one  2
3  3  4  two  0
4  4  3  two  1
5  9  2  two  2
6  4  1  two  3


Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
5,7,one,0
2,6,one,1
3,5,one,2
3,4,two,0
4,3,two,1
9,2,two,2
4,1,two,3


sorting index

In [58]:
# Index by ('a', 'b'), then order the rows by those index levels
(
    frame
    .set_index(['a', 'b'])
    .sort_index()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
4,3,two,1
4,4,two,0
6,1,two,3
6,6,one,1
8,5,one,2
8,7,one,0
9,2,two,2


In [59]:
# reset_index() reverts the hierarchical indexing: the index levels
# 'a' and 'b' become ordinary columns again
(
    frame
    .set_index(['a', 'b'])
    .sort_index()
    .reset_index()
)

Unnamed: 0,a,b,c,d
0,4,3,two,1
1,4,4,two,0
2,6,1,two,3
3,6,6,one,1
4,8,5,one,2
5,8,7,one,0
6,9,2,two,2


# Reordering and Sorting Levels

# Simple stats

In [61]:
# Rebind `frame` to its ('a', 'b')-indexed form; the cells below work on it.
# NOTE: this cell rebinds `frame`, so it expects 'a' and 'b' to still be
# columns -- run it once per freshly built `frame`.
frame = frame.set_index(['a', 'b'])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
8,7,one,0
6,6,one,1
8,5,one,2
4,4,two,0
4,3,two,1
9,2,two,2
6,1,two,3


In [62]:
# Give the two index levels (not the columns) new names: 'a' -> 'k1', 'b' -> 'k2'.
# The frame is printed first so the old level names are visible for comparison.
print(frame)
frame.index = frame.index.set_names(['k1', 'k2'])
frame

       c  d
a b        
8 7  one  0
6 6  one  1
8 5  one  2
4 4  two  0
  3  two  1
9 2  two  2
6 1  two  3


Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
8,7,one,0
6,6,one,1
8,5,one,2
4,4,two,0
4,3,two,1
9,2,two,2
6,1,two,3


In [63]:
# Sum within each value of index level 'k2'.
# frame.sum(level='k2') does not work anymore; group on the index level
# and aggregate instead.
frame.groupby(level="k2").sum()


Unnamed: 0_level_0,c,d
k2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,two,3
2,two,2
3,two,1
4,two,0
5,one,2
6,one,1
7,one,0


# Combining and Merging Datasets
- pandas.merge (SQL join) connects rows in DataFrames based on one or more keys. 
- pandas.concat concatenates or "stacks" together objects along an axis

In [13]:
# Left table: join key 'lkey' (with repeats) plus a running counter
df1 = pd.DataFrame(
    {
        'lkey': list('bbacaab'),
        'data1': range(7),
    }
)
df1

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [14]:
# Right table: one row per key in 'rkey'
df2 = pd.DataFrame(
    {
        'rkey': list('abd'),
        'data2': range(3),
    }
)
df2

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [15]:
# Join types reference: https://www.geeksforgeeks.org/different-types-of-joins-in-pandas/
# Inner join: keep only the rows whose lkey has a matching rkey
df1.merge(df2, how='inner', left_on='lkey', right_on='rkey')


Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [17]:
# Outer join: keep keys from either table; cells with no match are left as NaN
df1.merge(df2, how='outer', left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0
