In [1]:
# Hierarchical Indexing
# Represent higher-dimensional data compactly with a multi-level (hierarchical) index.
import pandas as pd 
import numpy as np 

In [2]:
# Representing two-dimensional data (state x year) in a one-dimensional Series.

# The bad way: plain Python tuples as index keys.
state_names = ['California', 'New York', 'Texas']
census_years = [2010, 2020]
# Every (state, year) pair, grouped by state and ordered by year.
index = [(state, year) for state in state_names for year in census_years]
# Population counts, listed in the same order as `index`.
populations = [
    37253956, 39538223,   # California 2010, 2020
    19378102, 20201249,   # New York   2010, 2020
    25145561, 29145505,   # Texas      2010, 2020
]


In [3]:
# Build a Series whose index labels are the (state, year) tuples.
pop = pd.Series(populations, index=index)
pop

(California, 2010)    37253956
(California, 2020)    39538223
(New York, 2010)      19378102
(New York, 2020)      20201249
(Texas, 2010)         25145561
(Texas, 2020)         29145505
dtype: int64

In [5]:
# Slice the Series by tuple labels; both endpoint labels appear in the result.
pop[("California",2020):("Texas",2010)]

(California, 2020)    39538223
(New York, 2010)      19378102
(New York, 2020)      20201249
(Texas, 2010)         25145561
dtype: int64

In [7]:
# Selections on a tuple index are hard to read.

# Select every entry whose tuple key has 2010 as its second element (the year):
keys_2010 = [key for key in pop.index if key[1] == 2010]
pop[keys_2010]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [8]:
# The better way: the pandas MultiIndex,
# an index with multiple levels.

# Convert the list of (state, year) tuples into a MultiIndex.
index = pd.MultiIndex.from_tuples(index)

In [9]:
# Reindex the tuple-indexed Series with the MultiIndex; the values carry over
# and the display now groups entries by state.
pop = pop.reindex(index)
pop

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [10]:
# Select all entries for 2020: ":" spans the first level (state),
# 2020 picks from the second level (year).
pop[:,2020]

California    39538223
New York      20201249
Texas         29145505
dtype: int64

In [12]:
# MultiIndex as an extra dimension

# unstack() shows both dimensions at once as a table (states as rows, years as columns):
pop_df = pop.unstack()
pop_df

Unnamed: 0,2010,2020
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


In [14]:
# stack() turns the table back into a Series indexed by (state, year):
pop_df.stack()

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [15]:
# `under18` values, listed in the same row order as `pop`.
under18_counts = [
    9284094, 8898092,
    4318033, 4181528,
    6879014, 7432474,
]
# One column per measurement; the rows carry the (state, year) index of `pop`.
pop_df = pd.DataFrame({"total": pop, "under18": under18_counts})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2010,37253956,9284094
California,2020,39538223,8898092
New York,2010,19378102,4318033
New York,2020,20201249,4181528
Texas,2010,25145561,6879014
Texas,2020,29145505,7432474


In [16]:
# Ratio of the "under18" column to the "total" column, computed element-wise
# per (state, year) row, then displayed as a state-by-year table.
f_u18 = pop_df["under18"].div(pop_df["total"])
f_u18.unstack()

Unnamed: 0,2010,2020
California,0.249211,0.22505
New York,0.222831,0.206994
Texas,0.273568,0.255013


In [17]:
# Methods of MultiIndex creation

# Passing two parallel lists as `index` produces a two-level MultiIndex.
# A seeded generator keeps the random values identical on every run
# (Restart Kernel -> Run All reproduces the displayed table).
rng = np.random.default_rng(0)
df = pd.DataFrame(
    rng.random((4, 2)),
    index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
    columns=["data1", "data2"],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.027022,0.625457
a,2,0.916235,0.835333
b,1,0.565453,0.509843
b,2,0.194554,0.719801


In [18]:
# Per-state population counts for 2010 and 2020, in that order.
state_counts = {
    'California': (37253956, 39538223),
    'New York': (19378102, 20201249),
    'Texas': (25145561, 29145505),
}
# Flatten into a dict keyed by (state, year) tuples; passing such a dict to
# pd.Series yields a MultiIndex automatically (see the output).
data = {
    (state, year): count
    for state, counts in state_counts.items()
    for year, count in zip((2010, 2020), counts)
}
pd.Series(data)

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [19]:
# Explicit MultiIndex constructors

# from_arrays: one list per level, aligned by position.
pd.MultiIndex.from_arrays([["a","a","b","b"],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [20]:
# from_tuples: one tuple per index entry.
pd.MultiIndex.from_tuples([
    ("a",1),("a",2),("b",1),("b",2)
])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [21]:
# from_product: Cartesian product of the per-level values.
pd.MultiIndex.from_product([["a","b"],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [22]:
# Direct construction: `levels` lists each level's values and `codes` gives,
# for every entry, the position of its label within the corresponding level.
pd.MultiIndex(levels=[["a","b"],[1,2]],codes=[[0,0,1,1],[0,1,0,1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [25]:
# MultiIndex level names

# Label the two index levels; the names appear as a header row in the output.
# NOTE(review): this assigns names on the existing index object rather than on a copy;
# confirm no other object is expected to keep an unnamed version of this index.
pop.index.names = ["state","year"]
pop

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [26]:
# MultiIndex for columns

# Hierarchical labels for both axes: rows are (year, visit), columns are (subject, type).
row_levels = [[2013, 2014], [1, 2]]
column_levels = [["Bob", "Guido", "Sue"], ["HR", "Temp"]]

index = pd.MultiIndex.from_product(row_levels, names=["year", "visit"])
columns = pd.MultiIndex.from_product(column_levels, names=["subject", "type"])

In [27]:
# Mock some data: standard-normal draws rounded to one decimal place.
# A seeded generator makes the mock values reproducible across runs.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10   # scale every other column, starting with the first, by 10
data += 37           # shift all values by 37

In [29]:
# Create the DataFrame with hierarchical labels on both rows and columns.
health_data = pd.DataFrame(data,index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,38.3,37.0,37.0,35.0,37.6
2013,2,33.0,36.6,45.0,38.3,36.0,36.0
2014,1,54.0,37.0,53.0,37.3,38.0,37.6
2014,2,33.0,35.4,45.0,37.4,25.0,40.1


In [30]:
# Indexing by a top-level column label (subject) returns that subject's sub-frame.
health_data["Guido"]    # addressing from the top level of the column hierarchy

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,37.0,37.0
2013,2,45.0,38.3
2014,1,53.0,37.3
2014,2,45.0,37.4


In [31]:
# Indexing and slicing a MultiIndex

# Multiply indexed Series (shown again for reference)

pop

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [32]:
pop["California",2010]  # index by the first-level key (state), then the second-level key (year)

37253956

In [33]:
# Partial indexing: giving only the first-level key returns a Series
# indexed by the remaining level (year).
pop["California"]

year
2010    37253956
2020    39538223
dtype: int64

In [34]:
# Label-based slice on the first level; the end label "New York" is included in the result.
pop.loc["California":"New York"]

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
dtype: int64

In [35]:
# An empty slice ":" on the first level selects every state for the year 2010.
pop[:,2010]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [36]:
# Boolean-mask selection: keep only entries greater than 22 million.
pop[pop > 22_000_000]

state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

In [38]:
# Fancy indexing: a list of first-level labels selects all rows for those states.
pop[["California","Texas"]]

state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

In [39]:
# Multiply indexed DataFrames
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,38.3,37.0,37.0,35.0,37.6
2013,2,33.0,36.6,45.0,38.3,36.0,36.0
2014,1,54.0,37.0,53.0,37.3,38.0,37.6
2014,2,33.0,35.4,45.0,37.4,25.0,40.1


In [40]:
# A (subject, type) pair addresses a single column, returned as a Series.
health_data["Guido","HR"]

year  visit
2013  1        37.0
      2        45.0
2014  1        53.0
      2        45.0
Name: (Guido, HR), dtype: float64

In [47]:
health_data.iloc[:2,:2] # first two rows and first two columns (positions 0 and 1)

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,25.0,38.3
2013,2,33.0,36.6


In [48]:
# .loc with a column tuple: all rows, the single ("Bob", "HR") column.
health_data.loc[:, ("Bob","HR")]

year  visit
2013  1        25.0
      2        33.0
2014  1        54.0
      2        33.0
Name: (Bob, HR), dtype: float64

In [49]:
# Build per-level slices for a MultiIndex with pd.IndexSlice:
idx = pd.IndexSlice
first_visits = idx[:, 1]      # any year, visit == 1
hr_columns = idx[:, "HR"]     # any subject, type == "HR"
health_data.loc[first_visits, hr_columns]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,25.0,37.0,35.0
2014,1,54.0,53.0,38.0


In [58]:
# Sorted and unsorted indices

# Deliberately unsorted: the first level is built in the order "a", "c", "b".
# A seeded generator keeps the random values identical on every run.
rng = np.random.default_rng(0)
index = pd.MultiIndex.from_product([["a", "c", "b"], [1, 2]])
data = pd.Series(rng.random(6), index=index)
data.index.names = ["char", "int"]
data

char  int
a     1      0.042914
      2      0.359681
c     1      0.201771
      2      0.862872
b     1      0.512310
      2      0.523097
dtype: float64

In [59]:
# Sort by index so the first level appears in the order a, b, c.
data = data.sort_index()
data

char  int
a     1      0.042914
      2      0.359681
b     1      0.512310
      2      0.523097
c     1      0.201771
      2      0.862872
dtype: float64

In [60]:
# Label slice on the first level of the sorted index; both "a" and "c" are included.
data["a":"c"]

char  int
a     1      0.042914
      2      0.359681
b     1      0.512310
      2      0.523097
c     1      0.201771
      2      0.862872
dtype: float64

In [61]:
# Stacking and unstacking indices

# Move index level 0 (state) to the columns.
pop.unstack(level=0)


state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,37253956,19378102,25145561
2020,39538223,20201249,29145505


In [62]:
# Move index level 1 (year) to the columns.
pop.unstack(level=1)

year,2010,2020
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


In [63]:
# stack() is the opposite of unstack():
# the round trip below reproduces the original (state, year) Series.
pop.unstack().stack()

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [64]:
# Index setting and resetting

# Turn the index levels into ordinary columns; `name` labels the value column.
pop_flat = pop.reset_index(name = "population")
pop_flat

Unnamed: 0,state,year,population
0,California,2010,37253956
1,California,2020,39538223
2,New York,2010,19378102
3,New York,2020,20201249
4,Texas,2010,25145561
5,Texas,2020,29145505


In [65]:
# Rebuild the two-level index from the "state" and "year" columns.
pop_flat.set_index(["state","year"])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505
