# Data Fusion Types

## Hierarchical Indices

An index can have several levels (a hierarchical index, or `MultiIndex`). We create one by passing a list of lists, one inner list per level, as the index.

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Fixed seed so a fresh "Restart & Run All" reproduces the same values.
SEED = 42
rng = np.random.default_rng(SEED)

# Passing a list of two equal-length lists as the index builds a two-level
# MultiIndex: the outer level is the letter, the inner level is the number.
outer_labels = ["a", "a", "a", "b", "b", "c", "c", "d", "d"]
inner_labels = [1, 2, 3, 1, 2, 1, 2, 1, 2]
data = Series(rng.standard_normal(9), index=[outer_labels, inner_labels])
data

a  1   -0.327007
   2    2.173434
   3    1.131291
b  1    0.292659
   2    1.917039
c  1   -0.312174
   2    0.918347
d  1    0.496944
   2   -0.992298
dtype: float64

In [3]:
# Full key (outer label, inner label) -> a single scalar value.
data.loc[("a", 1)]

-0.3270074001676573

In [4]:
# A list on the outer level picks several groups; inner label 1 within each.
data.loc[(["a", "c"], 1)]

a  1   -0.327007
c  1   -0.312174
dtype: float64

In [5]:
# Label slice on the outer level ("a" through "c", both ends included),
# keeping only inner label 1.
data.loc[(slice("a", "c"), 1)]

a  1   -0.327007
b  1    0.292659
c  1   -0.312174
dtype: float64

In [6]:
# Same outer slice, but keep inner labels 1 and 2.
data.loc[(slice("a", "c"), [1, 2])]

a  1   -0.327007
   2    2.173434
b  1    0.292659
   2    1.917039
c  1   -0.312174
   2    0.918347
dtype: float64

In [7]:
# Inspect the index: a MultiIndex holding one (letter, number) tuple per row.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2),
            ('d', 1),
            ('d', 2)],
           )

In [8]:
# Every outer label, inner label 2 only: a2 b2 c2 d2.
data.loc[(slice(None), 2)]

a    2.173434
b    1.917039
c    0.918347
d   -0.992298
dtype: float64

In [9]:
# Third value of the whole Series, by position. Use .iloc for positional
# access: plain data[2] relies on the deprecated integer-position fallback of
# Series.__getitem__, which newer pandas treats as a label lookup instead.
data.iloc[2]

1.1312907815507591

In [11]:
# Pivot the innermost index level into columns: the two-level Series becomes
# a DataFrame with the outer level as rows. Combinations that do not exist
# in the Series show up as NaN.
df1 = data.unstack(level=-1)
df1

Unnamed: 0,1,2,3
a,-0.327007,2.173434,1.131291
b,0.292659,1.917039,
c,-0.312174,0.918347,
d,0.496944,-0.992298,


In [15]:
# Go back to the original two-level Series. unstack() filled the missing
# (letter, number) combinations with NaN; the legacy stack() drops them by
# default, but the newer stack implementation keeps them, so drop them
# explicitly to get the original back on any pandas version.
df1.stack().dropna()

a  1   -0.327007
   2    2.173434
   3    1.131291
b  1    0.292659
   2    1.917039
c  1   -0.312174
   2    0.918347
d  1    0.496944
   2   -0.992298
dtype: float64

In [17]:
# Both axes get a two-level index built from a list of lists:
# rows are (letter, number) pairs, columns are (flower, colour) pairs.
row_labels = [["a", "a", "b", "b"], [1, 2, 1, 2]]
column_labels = [["Tulip", "Tulip", "Rose"], ["Orange", "Purple", "Red"]]
values = np.arange(12).reshape(4, 3)

df2 = DataFrame(values, index=row_labels, columns=column_labels)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Tulip,Tulip,Rose
Unnamed: 0_level_1,Unnamed: 1_level_1,Orange,Purple,Red
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [19]:
# One cell addressed in a single .loc call: row key first, then column key.
df2.loc[("a", 2), ("Tulip", "Orange")]

3

In [21]:
# Name the levels on both axes so later cells can refer to them by name
# instead of by position.
ROW_LEVEL_NAMES = ["quality", "size"]
COLUMN_LEVEL_NAMES = ["flower", "color"]

df2.index.names = ROW_LEVEL_NAMES
df2.columns.names = COLUMN_LEVEL_NAMES
df2

Unnamed: 0_level_0,flower,Tulip,Tulip,Rose
Unnamed: 0_level_1,color,Orange,Purple,Red
quality,size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [22]:
# Exchange the two row-index levels; the row order itself is not changed.
df2.swaplevel(i="quality", j="size", axis=0)

Unnamed: 0_level_0,flower,Tulip,Tulip,Rose
Unnamed: 0_level_1,color,Orange,Purple,Red
size,quality,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [23]:
# Sort rows on the "size" level (level 0 is quality, level 1 is size).
df2.sort_index(axis=0, level="size")

Unnamed: 0_level_0,flower,Tulip,Tulip,Rose
Unnamed: 0_level_1,color,Orange,Purple,Red
quality,size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [25]:
# Put "size" first, then sort on it; after the swap it is level 0 of the
# row index.
(
    df2
    .swaplevel("quality", "size")
    .sort_index(level=0)
)

Unnamed: 0_level_0,flower,Tulip,Tulip,Rose
Unnamed: 0_level_1,color,Orange,Purple,Red
size,quality,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Statistics in more than two dimensions

In [26]:
# Sum down the rows: one total per (flower, color) column.
df2.sum(axis=0)

flower  color 
Tulip   Orange    18
        Purple    22
Rose    Red       26
dtype: int64

In [28]:
# Re-display df2 as a reference for the sums computed on it.
df2

Unnamed: 0_level_0,flower,Tulip,Tulip,Rose
Unnamed: 0_level_1,color,Orange,Purple,Red
quality,size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [27]:
# Sum across the columns: one total per (quality, size) row.
df2.sum(axis="columns")

quality  size
a        1        3
         2       12
b        1       21
         2       30
dtype: int64

In [29]:
# Sum the rows that share the same "size" label. The `level=` argument of
# DataFrame.sum was deprecated in pandas 1.3 and removed in 2.0, so group on
# the index level explicitly. sort=False keeps groups in order of appearance.
df2.groupby(level="size", sort=False).sum()

flower,Tulip,Tulip,Rose
color,Orange,Purple,Red
size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [30]:
# Sum the columns that share the same "color" label, then total each colour
# over all rows. `sum(level=..., axis=1)` no longer exists in pandas 2.x, so
# transpose, group on the level, and transpose back.
df2.T.groupby(level="color", sort=False).sum().T.sum()

color
Orange    18
Purple    22
Red       26
dtype: int64

In [32]:
# Group by flower: add up the colour columns belonging to each flower.
# `sum(level=..., axis=1)` no longer exists in pandas 2.x, so transpose,
# group on the column level, and transpose back. sort=False keeps the
# flowers in their original order (Tulip, Rose).
df2.T.groupby(level="flower", sort=False).sum().T

Unnamed: 0_level_0,flower,Tulip,Rose
quality,size,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,2
a,2,7,5
b,1,13,8
b,2,19,11


In [33]:
# Collapse both axes: columns by flower, then rows by size. Uses groupby on
# the levels because the `level=` argument of DataFrame.sum was removed in
# pandas 2.0. sort=False keeps groups in order of appearance.
(
    df2.T.groupby(level="flower", sort=False).sum().T
    .groupby(level="size", sort=False).sum()
)

flower,Tulip,Rose
size,Unnamed: 1_level_1,Unnamed: 2_level_1
1,14,10
2,26,16


In [34]:
# Flat frame with two value columns (a, b) and two key columns (c, d) that
# are turned into an index in the following cells.
df3 = DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["dog"] * 3 + ["cat"] * 4,
        "d": [*range(3), *range(4)],
    }
)
df3

Unnamed: 0,a,b,c,d
0,0,7,dog,0
1,1,6,dog,1
2,2,5,dog,2
3,3,4,cat,0
4,4,3,cat,1
5,5,2,cat,2
6,6,1,cat,3


In [39]:
# Move columns "c" and "d" into a two-level row index; with drop=True (the
# default) they no longer appear as ordinary columns.
df4 = df3.set_index(keys=["c", "d"], drop=True)
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
dog,0,0,7
dog,1,1,6
dog,2,2,5
cat,0,3,4
cat,1,4,3
cat,2,5,2
cat,3,6,1


In [40]:
# NOTE(review): the result of reset_index() is not assigned or displayed, so
# this cell shows df4 unchanged. Presumably that is the point (reset_index
# returns a new frame rather than modifying df4) - confirm. If the flattened
# frame was meant to be shown, make `df4.reset_index()` the last expression.
df4.reset_index()
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
dog,0,0,7
dog,1,1,6
dog,2,2,5
cat,0,3,4
cat,1,4,3
cat,2,5,2
cat,3,6,1


In [41]:
# Rebuild df4 with the same (c, d) row index, but keep "c" and "d" as
# ordinary columns too (drop=False).
df4 = df3.set_index(keys=["c", "d"], drop=False)
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dog,0,0,7,dog,0
dog,1,1,6,dog,1
dog,2,2,5,dog,2
cat,0,3,4,cat,0
cat,1,4,3,cat,1
cat,2,5,2,cat,2
cat,3,6,1,cat,3
