# Multi-Indexing
- A multi indexing or hierarchial indexing in pandas a way to create multiple index level for rows and columns in Series or DataFrame.
- It allows us to work with multi-dimensional data in a tabular structure.

### Types of Multi-Indexing
- pd.MultiIndex.from_tuples()
- pd.MultiIndex.from_produt()
- .set_index() : setting multiple columns as index

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
index_val = [('cse',2021),('cse',2022),('cse',2023),('cse',2024),\
            ('ese',2021),('ese',2022),('ese',2023),('ese',2024)]

In [4]:
a = pd.Series([1,2,3,4,5,6,7,8],index = index_val)
a

(cse, 2021)    1
(cse, 2022)    2
(cse, 2023)    3
(cse, 2024)    4
(ese, 2021)    5
(ese, 2022)    6
(ese, 2023)    7
(ese, 2024)    8
dtype: int64

In [6]:
a[('ese',2021)]

np.int64(5)

#### It returns error while passing single key rather than tuple
#### a['ece]

# Create Multi-Index

#### From Tuples


In [9]:
multi_index = pd.MultiIndex.from_tuples(index_val)
multi_index

MultiIndex([('cse', 2021),
            ('cse', 2022),
            ('cse', 2023),
            ('cse', 2024),
            ('ese', 2021),
            ('ese', 2022),
            ('ese', 2023),
            ('ese', 2024)],
           )

In [12]:
series_data = pd.Series([1,2,3,4,5,6,7,8],index = multi_index)
series_data

cse  2021    1
     2022    2
     2023    3
     2024    4
ese  2021    5
     2022    6
     2023    7
     2024    8
dtype: int64

In [13]:
series_data[('ese',2024)]

np.int64(8)

In [14]:
series_data['ese']

2021    5
2022    6
2023    7
2024    8
dtype: int64

#### From Products

In [17]:
m_i = pd.MultiIndex.from_product([['cse','ese'],[2021,2022,2023,2024]],names=['dept','year'])
m_i

MultiIndex([('cse', 2021),
            ('cse', 2022),
            ('cse', 2023),
            ('cse', 2024),
            ('ese', 2021),
            ('ese', 2022),
            ('ese', 2023),
            ('ese', 2024)],
           names=['dept', 'year'])

In [18]:
pd.DataFrame({
    'values' : [1,2,3,4,5,6,7,8]
},index= m_i)

Unnamed: 0_level_0,Unnamed: 1_level_0,values
dept,year,Unnamed: 2_level_1
cse,2021,1
cse,2022,2
cse,2023,3
cse,2024,4
ese,2021,5
ese,2022,6
ese,2023,7
ese,2024,8


## Multi - Col

In [19]:
multi_col = pd.MultiIndex.from_product([
    ['KTM','PKR'],
    ['avg_pac','students']
])
multi_col

MultiIndex([('KTM',  'avg_pac'),
            ('KTM', 'students'),
            ('PKR',  'avg_pac'),
            ('PKR', 'students')],
           )

## Use for both side of DF

In [21]:
placement = pd.DataFrame([
    ['20k',50,'25k',40],
    ['21k',45,'25k',42],
    ['25k',55,'30k',50],
    ['25k',50,'28k',48],
    ['20k',50,'25k',40],
    ['21k',45,'25k',42],
    ['25k',55,'30k',50],
    ['25k',50,'28k',48],
],index= m_i,columns=multi_col)
placement

Unnamed: 0_level_0,Unnamed: 1_level_0,KTM,KTM,PKR,PKR
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_pac,students,avg_pac,students
dept,year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
cse,2021,20k,50,25k,40
cse,2022,21k,45,25k,42
cse,2023,25k,55,30k,50
cse,2024,25k,50,28k,48
ese,2021,20k,50,25k,40
ese,2022,21k,45,25k,42
ese,2023,25k,55,30k,50
ese,2024,25k,50,28k,48


## .set_index()

In [26]:
tips = sns.load_dataset('tips')
tips_subset = tips.set_index(['time','day'])
tips_subset

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,size
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,Sun,16.99,1.01,Female,No,2
Dinner,Sun,10.34,1.66,Male,No,3
Dinner,Sun,21.01,3.50,Male,No,3
Dinner,Sun,23.68,3.31,Male,No,2
Dinner,Sun,24.59,3.61,Female,No,4
Dinner,...,...,...,...,...,...
Dinner,Sat,29.03,5.92,Male,No,3
Dinner,Sat,27.18,2.00,Female,Yes,2
Dinner,Sat,22.67,2.00,Male,Yes,2
Dinner,Sat,17.82,1.75,Male,No,2


In [30]:
tips_subset.loc[('Dinner','Sun')]

  tips_subset.loc[('Dinner','Sun')]


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,size
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,Sun,16.99,1.01,Female,No,2
Dinner,Sun,10.34,1.66,Male,No,3
Dinner,Sun,21.01,3.50,Male,No,3
Dinner,Sun,23.68,3.31,Male,No,2
Dinner,Sun,24.59,3.61,Female,No,4
Dinner,...,...,...,...,...,...
Dinner,Sun,20.90,3.50,Female,Yes,3
Dinner,Sun,30.46,2.00,Male,Yes,5
Dinner,Sun,18.15,3.50,Female,Yes,3
Dinner,Sun,23.10,4.00,Male,Yes,3


In [31]:
tips_subset.xs('Dinner')

Unnamed: 0_level_0,total_bill,tip,sex,smoker,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sun,16.99,1.01,Female,No,2
Sun,10.34,1.66,Male,No,3
Sun,21.01,3.50,Male,No,3
Sun,23.68,3.31,Male,No,2
Sun,24.59,3.61,Female,No,4
...,...,...,...,...,...
Sat,29.03,5.92,Male,No,3
Sat,27.18,2.00,Female,Yes,2
Sat,22.67,2.00,Male,Yes,2
Sat,17.82,1.75,Male,No,2


In [33]:
tips_subset

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,size
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,Sun,16.99,1.01,Female,No,2
Dinner,Sun,10.34,1.66,Male,No,3
Dinner,Sun,21.01,3.50,Male,No,3
Dinner,Sun,23.68,3.31,Male,No,2
Dinner,Sun,24.59,3.61,Female,No,4
Dinner,...,...,...,...,...,...
Dinner,Sat,29.03,5.92,Male,No,3
Dinner,Sat,27.18,2.00,Female,Yes,2
Dinner,Sat,22.67,2.00,Male,Yes,2
Dinner,Sat,17.82,1.75,Male,No,2


In [45]:
tips_subset.index.names= ['Time','Day']

In [47]:
tips_subset.rename_axis(index={'time':'day'},inplace=True)

In [48]:
tips_subset.index.names

FrozenList(['Time', 'Day'])

In [49]:
FrozenList(['time','day'])

NameError: name 'FrozenList' is not defined

## Swaplevel

In [55]:
tips_subset.swaplevel('day', 'time')

KeyError: 'Level day not found'

In [54]:
placement.swapaxes(axis1='dept',axis2='time')

  placement.swapaxes(axis1='dept',axis2='time')


ValueError: No axis named dept for object type DataFrame

In [None]:
# axix = 0 , default : for swap index value
# axix =1 , for swap columnIndex value
placement.swaplevel(0,1),axis =1

## Sort Values

In [57]:
placement.sort_index(level='year').swaplevel('year','dept')

Unnamed: 0_level_0,Unnamed: 1_level_0,KTM,KTM,PKR,PKR
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_pac,students,avg_pac,students
year,dept,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2021,cse,20k,50,25k,40
2021,ese,20k,50,25k,40
2022,cse,21k,45,25k,42
2022,ese,21k,45,25k,42
2023,cse,25k,55,30k,50
2023,ese,25k,55,30k,50
2024,cse,25k,50,28k,48
2024,ese,25k,50,28k,48


In [58]:
placement['KTM','students'].groupby(level='dept').mean()

dept
cse    50.0
ese    50.0
Name: (KTM, students), dtype: float64

## Reshaping
- To transform structure of DF to make it more suitable for analysis and visualization.
- To invloves changing the layout of rows and columns without altering the underlying data.
- Common reshaping techinques:
   - pivot_tables : summarizing data with aggregation, long-format to wide format
   - melting : convert wide format data to lonf format
   - stacking/unstacking : moving index levels btn rows and columns.

In [61]:
death = pd.read_csv("time_series_covid19_deaths_global.csv")
confirm = pd.read_csv("time_series_covid19_confirmed_global.csv")


In [62]:
print(death.shape,confirm.shape)

(289, 1081) (289, 1081)


In [63]:
death.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '12/24/22', '12/25/22', '12/26/22', '12/27/22', '12/28/22', '12/29/22',
       '12/30/22', '12/31/22', '1/1/23', '1/2/23'],
      dtype='object', length=1081)

In [64]:
confirm.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '12/24/22', '12/25/22', '12/26/22', '12/27/22', '12/28/22', '12/29/22',
       '12/30/22', '12/31/22', '1/1/23', '1/2/23'],
      dtype='object', length=1081)

In [68]:
death_new = death.melt(id_vars=['Province/State','Country/Region','Lat','Long'],\
           var_name='Date',value_name='Num_deaths')
death_new

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Num_deaths
0,,Afghanistan,33.939110,67.709953,1/22/20,0
1,,Albania,41.153300,20.168300,1/22/20,0
2,,Algeria,28.033900,1.659600,1/22/20,0
3,,Andorra,42.506300,1.521800,1/22/20,0
4,,Angola,-11.202700,17.873900,1/22/20,0
...,...,...,...,...,...,...
311248,,West Bank and Gaza,31.952200,35.233200,1/2/23,5708
311249,,Winter Olympics 2022,39.904200,116.407400,1/2/23,0
311250,,Yemen,15.552727,48.516388,1/2/23,2159
311251,,Zambia,-13.133897,27.849332,1/2/23,4024


In [69]:
confirm_new = confirm.melt(id_vars=['Province/State','Country/Region','Lat','Long'],\
           var_name='Date',value_name='Num_deaths')
confirm_new

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Num_deaths
0,,Afghanistan,33.939110,67.709953,1/22/20,0
1,,Albania,41.153300,20.168300,1/22/20,0
2,,Algeria,28.033900,1.659600,1/22/20,0
3,,Andorra,42.506300,1.521800,1/22/20,0
4,,Angola,-11.202700,17.873900,1/22/20,0
...,...,...,...,...,...,...
311248,,West Bank and Gaza,31.952200,35.233200,1/2/23,703228
311249,,Winter Olympics 2022,39.904200,116.407400,1/2/23,535
311250,,Yemen,15.552727,48.516388,1/2/23,11945
311251,,Zambia,-13.133897,27.849332,1/2/23,334661


In [70]:
death_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311253 entries, 0 to 311252
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Province/State  98007 non-null   object 
 1   Country/Region  311253 non-null  object 
 2   Lat             309099 non-null  float64
 3   Long            309099 non-null  float64
 4   Date            311253 non-null  object 
 5   Num_deaths      311253 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 14.2+ MB


In [71]:
confirm_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311253 entries, 0 to 311252
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Province/State  98007 non-null   object 
 1   Country/Region  311253 non-null  object 
 2   Lat             309099 non-null  float64
 3   Long            309099 non-null  float64
 4   Date            311253 non-null  object 
 5   Num_deaths      311253 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 14.2+ MB


In [77]:
final = confirm_new.merge(death_new, on = ['Province/State', 'Country/Region', 'Lat', 'Long','Date'])
final.to_csv('Covid_2019.csv')

## Stacking and Unstacking
 - stack() can't use for series obj

In [84]:
placement.stack(future_stack=True).stack(future_stack=True)

dept  year               
cse   2021  avg_pac   KTM    20k
                      PKR    25k
            students  KTM     50
                      PKR     40
      2022  avg_pac   KTM    21k
                      PKR    25k
            students  KTM     45
                      PKR     42
      2023  avg_pac   KTM    25k
                      PKR    30k
            students  KTM     55
                      PKR     50
      2024  avg_pac   KTM    25k
                      PKR    28k
            students  KTM     50
                      PKR     48
ese   2021  avg_pac   KTM    20k
                      PKR    25k
            students  KTM     50
                      PKR     40
      2022  avg_pac   KTM    21k
                      PKR    25k
            students  KTM     45
                      PKR     42
      2023  avg_pac   KTM    25k
                      PKR    30k
            students  KTM     55
                      PKR     50
      2024  avg_pac   KTM    25k
                 

In [87]:
placement.unstack().unstack().unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,dept,cse,ese
Unnamed: 0_level_1,Unnamed: 1_level_1,year,Unnamed: 3_level_1,Unnamed: 4_level_1
KTM,avg_pac,2021,20k,20k
KTM,avg_pac,2022,21k,21k
KTM,avg_pac,2023,25k,25k
KTM,avg_pac,2024,25k,25k
KTM,students,2021,50,50
KTM,students,2022,45,45
KTM,students,2023,55,55
KTM,students,2024,50,50
PKR,avg_pac,2021,25k,25k
PKR,avg_pac,2022,25k,25k
