# Data Wrangling

## Join, Combine and Reshape

## Importance of Hierarchical Indexing

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Nine random draws stored under a two-level index:
# outer level = letters, inner level = integers.
data_hi = pd.Series(
    np.random.randn(9),
    index=[list('AAABBCCDD'), [1, 2, 3, 1, 4, 1, 2, 2, 4]],
)
data_hi

In [None]:
pd.Series.index?

In [None]:
data_hi.index

In [None]:
data_hi['A']

In [None]:
data_hi['A':'C']

In [None]:
data_hi[['A', 'C']]

In [None]:
data_hi.loc[:, 1]

In [None]:
pd.Series.unstack?

In [None]:
data_hi.unstack()

In [None]:
data_hi.unstack(fill_value=0)

In [None]:
data_hi.unstack().stack()

In [None]:
# 4x3 frame holding 0..11, labelled with two-level indexes on both axes.
df_hi = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['one'] * 2 + ['three'], ['Green', 'Red', 'Green']],
)
df_hi

In [None]:
pd.names?

In [None]:
#help(pd.MultiIndex)
pd.MultiIndex?

In [None]:
df_hi.index.names = ['val1', 'val2']

In [None]:
df_hi.columns.names = ['number', 'color']

In [None]:
df_hi

In [None]:
df_hi['one']

### How Do Reordering and Sorting of Index Levels Take Place?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.swaplevel?

In [None]:
df_hi

In [None]:
df_hi.swaplevel('val1', 'val2', axis=0)

In [None]:
df_hi.swaplevel('number', 'color', axis=1)

In [None]:
pd.DataFrame.sort_index?

In [None]:
df_hi.sort_index(level=1)

In [None]:
df_hi.sort_index(level=0)

In [None]:
df_hi.swaplevel(0, 1).sort_index(level=0) 

### How To Get The Summary Statistics By Level?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.sum?

In [None]:
print(df_hi)
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0,
# so group on the index level explicitly and sum each group.
df_hi.groupby(level='val1').sum()  # rows labelled 'a' are summed together, likewise 'b'

In [None]:
print(df_hi)
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0,
# so group on the index level explicitly and sum each group.
df_hi.groupby(level='val2').sum()  # rows labelled 1 are summed together, likewise 2

In [None]:
print(df_hi)
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0. To aggregate over a
# column level, transpose so 'color' becomes a row level, group and sum, then
# transpose back: all 'Green' columns are added together, and likewise 'Red'.
df_hi.T.groupby(level='color').sum().T

### How To Index With a DataFrame's Columns?

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Flat frame whose 'c' and 'd' columns are later promoted to a two-level index.
df_c = pd.DataFrame(dict(
    a=range(7),
    b=range(14, 7, -1),
    c=['one'] * 3 + ['two'] * 4,
    d=[0, 1, 2, 0, 1, 2, 3],
))
df_c

In [None]:
pd.DataFrame.set_index?

In [None]:
df_si = df_c.set_index(['c', 'd'])
df_si

In [None]:
df_c.set_index(['c', 'd'], drop=False)

In [None]:
df_si.reset_index()

### How To Combine and Merge Datasets?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.merge?

In [None]:
# Left table: the 'key' column contains repeated labels.
df1 = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)})
df1

In [None]:
# Right table: every 'key' label occurs exactly once.
df2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})
df2

#### Let's see Database-Style DataFrame Joins

In [None]:
# 'many to one join' 
pd.merge(df1, df2)

In [None]:
df1.merge(df2)

In [None]:
pd.merge(df1, df2, on='key')

In [None]:
# Same data as df1, but the join column is called 'lkey'.
df_l = pd.DataFrame({'lkey': list('bbacab'), 'data1': range(6)})
df_l

In [None]:
# Same data as df2, but the join column is called 'rkey'.
df_r = pd.DataFrame({'rkey': list('abd'), 'data2': range(3)})
df_r

In [None]:
# Echo both inputs, then join them on their differently named key columns.
print('df_l', df_l)
print('df_r', df_r)
pd.merge(df_l, df_r, left_on='lkey', right_on='rkey')

In [None]:
pd.merge(df_l, df_r,  left_on='lkey', right_on='rkey', how='outer')

In [None]:
# Let's see  'many to many merge' operation

In [None]:
# Left side of the many-to-many example; the label 'd' does not occur here.
df_m1 = pd.DataFrame({'1key': list('bbacab'), 'data1': range(6)})
df_m1

In [None]:
# Right side of the many-to-many example; the label 'c' does not occur here.
df_m2 = pd.DataFrame({'2key': list('ababd'), 'data2': range(5)})
df_m2

In [None]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key')                  # let's omit "how" here (pd.merge then defaults to how='inner')

In [None]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='outer')     # let's include "how='outer'" or "how='inner'"

In [None]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='left')      # let's include "how='left'"

In [None]:
pd.merge(df_m1, df_m2, left_on='1key', right_on='2key', how='right')      # let's include "how='right'"

#### Let's see how we can merge on multiple key columns

In [None]:
# Two frames that share the key columns 'key1' and 'key2'.
dfleft = pd.DataFrame({
    'key1': ['raga'] * 2 + ['anuraga'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
dfright = pd.DataFrame({
    'key1': ['raga'] * 2 + ['anuraga'] * 2,
    'key2': ['one'] * 3 + ['two'],
    'rval': [4, 5, 6, 7],
})

# Show each frame under its name, with one blank line between them.
print('dfleft', dfleft, '', 'dfright', dfright, sep='\n')

In [None]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='outer') # 'outer' includes all the values and associated data

In [None]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='inner') # 'inner' includes only the  common values and associated data

In [None]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='left') # 'left' includes priority for left values and associated data

In [None]:
pd.merge(dfleft, dfright, on=['key1', 'key2'], how='right') # 'right' includes priority for right values and associated data

In [None]:
print('dfleft')
print(dfleft)
print()
print('dfright')
print(dfright)
pd.merge(dfleft, dfright, on='key1')

In [None]:
pd.merge(dfleft, dfright, on='key1', suffixes=('_left', '_right'))

### Merging on Row Index

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Left frame carries the join key as a column ...
dfril = pd.DataFrame({
    'key1': ['raga', 'raga', 'anuraga', 'raga', 'adiraga'],
    'lval': range(5),
})
# ... while the right frame carries it in its row index.
dfrir = pd.DataFrame({'rval': [1, 2]}, index=['raga', 'anuraga'])

# Show each frame under its name, with one blank line between them.
print('dfril', dfril, '', 'dfrir', dfrir, sep='\n')

In [None]:
pd.merge(dfril, dfrir, left_on='key1', right_index=True)

In [None]:
# Let's see what happens if we switch to left_index=True instead of right_index=True
# NOTE(review): presumably left commented out because pandas does not accept
# left_on together with left_index for the same side -- confirm before enabling.
#pd.merge(dfril, dfrir, left_on='key1', left_index=True) 

In [None]:
pd.merge(dfril, dfrir, left_on='key1', right_index=True, how='outer')

#### Let's work with hierarchically indexed DataFrames

In [None]:
# With hierarchically indexed data, joining on the index is implicitly a multiple-key merge

In [None]:
# Left frame: the two join keys are ordinary columns.
df_l = pd.DataFrame({
    'key1': ['raga'] * 3 + ['anuraga'] * 2,
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5, dtype=float),
})

# Right frame: the same two keys form a two-level row index.
df_r = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    index=[['anuraga'] * 2 + ['raga'] * 4,
           [2001, 2000, 2000, 2000, 2001, 2002]],
    columns=['prog1', 'prog2'],
)

# The '\n' after each label pushes the frame onto the following line;
# end='\n\n' leaves one blank line between the two frames.
print('df_l:\n', df_l, end='\n\n')
print('df_r:\n', df_r)

In [None]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True)

In [None]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='outer')

In [None]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='left')

In [None]:
print('df_l:\n', df_l)
print()
print('df_r:\n', df_r)
pd.merge(df_l, df_r, left_on=['key1', 'key2'], right_index=True, how='right')

In [None]:
# Two frames with partly overlapping row labels ('c' and 'e' are shared).
df_li = pd.DataFrame(
    {'raga': [10, 30, 50], 'anuraga': [20, 40, 60]},
    index=list('ace'),
)
df_ri = pd.DataFrame(
    {'braga': [70, 90, 110, 130], 'sraga': [80, 100, 120, 140]},
    index=list('bcde'),
)

# end='\n\n' leaves one blank line between the two frames.
print('df_li\n', df_li, end='\n\n')
print('df_ri\n', df_ri)

In [None]:
pd.merge(df_li, df_ri, how='outer', left_index=True, right_index=True)

In [None]:
pd.merge(df_li, df_ri, how='inner', left_index=True, right_index=True)

In [None]:
pd.DataFrame.join?

In [None]:
df_li.join(df_ri, how='outer')

In [None]:
# 'caller' has keys K0..K5; 'other' only covers K0..K2.
caller = pd.DataFrame({
    'key': [f'K{i}' for i in range(6)],
    'A': [f'A{i}' for i in range(6)],
})
other = pd.DataFrame({
    'key': [f'K{i}' for i in range(3)],
    'B': [f'B{i}' for i in range(3)],
})

# end='\n\n' leaves one blank line between the two frames.
print('caller:\n', caller, end='\n\n')
print('other:\n', other)

In [None]:
caller.join(other, lsuffix='_caller', rsuffix='_other') 

In [None]:
caller.join(other, lsuffix='_caller', rsuffix='_other', how='right')

In [None]:
caller.set_index('key').join(other.set_index('key'))

In [None]:
#caller.join(other.set_index('key'), on='key')

### How To Concatenate DataFrames Along The Row or Column Axis?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.concat?

In [None]:
# Three Series whose index labels do not overlap.
ser1 = pd.Series({'A': 0, 'B': 1})
ser2 = pd.Series({'C': 2, 'D': 3, 'E': 4})
ser3 = pd.Series({'F': 5, 'G': 6})
print(ser1, ser2, ser3, sep='\n')

In [None]:
pd.concat([ser1, ser2, ser3])

In [None]:
pd.concat([ser1, ser2, ser3], sort=True) 

In [None]:
print(ser1); print(ser2); print(ser3)
pd.concat([ser1, ser2, ser3], axis=1, sort=True) 

In [None]:
pd.concat([ser1, ser2, ser3], axis=1, sort=True, join='inner') 

In [None]:
ser4 = pd.concat([ser1, ser3]) 
ser4

In [None]:
print(ser1); print(ser4)
pd.concat([ser1, ser4], axis=1, sort=True) 

In [None]:
print(ser1); print(ser4)
pd.concat([ser1, ser4], axis=1, join='inner') 

In [None]:
# The join_axes argument was removed from pd.concat in pandas 1.0;
# concatenate first, then reindex the result to the wanted row labels.
pd.concat([ser1, ser4], axis=1).reindex(['A', 'B', 'F', 'G'])

In [None]:
idc = pd.concat([ser1, ser2, ser3], axis=0, keys=['1', '2', '3'])
idc

In [None]:
idc.unstack()

In [None]:
pd.concat([ser1, ser2, ser3], axis=1, keys=['1', '2', '3'], sort=True)

#### Let's see the same logic on DataFrame objects

In [None]:
# Two frames with partly overlapping row labels ('c' and 'e' are shared).
df_li = pd.DataFrame(
    {'raga': [10, 30, 50], 'anuraga': [20, 40, 60]},
    index=list('ace'),
)
df_ri = pd.DataFrame(
    {'braga': [70, 90, 110, 130], 'sraga': [80, 100, 120, 140]},
    index=list('bcde'),
)

# end='\n\n' leaves one blank line between the two frames.
print('df_li\n', df_li, end='\n\n')
print('df_ri\n', df_ri)

In [None]:
pd.concat([df_li, df_ri], axis=1, keys=['one', 'two'], sort=True, join='inner') 

In [None]:
# One key per concatenated object: two frames, so two keys. Extra keys are
# silently dropped by older pandas; a length mismatch is best avoided.
pd.concat([df_li, df_ri], axis=1, keys=['1', '2'], sort=True, join='outer')

In [None]:
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer')

In [None]:
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer', axis=1) 

In [None]:
pd.concat({'level1':df_li, 'level2':df_ri}, sort=True, join='outer', axis=1, names=['first', 'second'])

In [None]:
# Same values as before, but with the default integer row index.
df_li = pd.DataFrame({'raga': [10, 30, 50], 'anuraga': [20, 40, 60]})
df_ri = pd.DataFrame({'braga': [70, 90, 110, 130], 'sraga': [80, 100, 120, 140]})

# end='\n\n' leaves one blank line between the two frames.
print('df_li\n', df_li, end='\n\n')
print('df_ri\n', df_ri)

In [None]:
pd.concat([df_li, df_ri], axis=0, join='outer', ignore_index=True, sort=True)

In [None]:
pd.concat([df_li, df_ri], axis=1, join='outer', ignore_index=True, sort=True)

### How To Combine Data With Overlap?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.combine_first?

In [None]:
# df1 has a missing value in its second column; df2 has values in both columns.
df1, df2 = pd.DataFrame([[1, np.nan]]), pd.DataFrame([[3, 4]])
print('df1:\n', df1)
print('df2:\n', df2)

In [None]:
df1.combine_first(df2)

In [None]:
pd.DataFrame.combine?

In [None]:
# Two frames with the same columns, used to demonstrate a column-wise combine().
df1 = pd.DataFrame(dict(A=[0] * 2, B=[4] * 2))
df2 = pd.DataFrame(dict(A=[1] * 2, B=[3] * 2))
print('df1:\n', df1)
print('df2:\n', df2)

In [None]:
df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2)

### How To Reshape and Pivot Pandas Data?

In [None]:
# Pandas provides many ways to rearrange tabular data; these are known as 'reshape' or 'pivot' operations

#### Reshaping Hierarchically Indexed DataFrames

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.stack?

In [None]:
# 2x3 frame whose row index is named 'state' and column index 'number'.
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(['raga', 'mmraga'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
df_s = data  # both names refer to the same frame
df_s

In [None]:
df_s.stack()

In [None]:
pd.DataFrame.unstack?

In [None]:
df_s.stack().unstack() # the default level=-1

In [None]:
df_s.stack().unstack(level=0) # level=0 is the outer index level ('state'), so the states become the columns

In [None]:
df_s.stack().unstack('state')

In [None]:
# Two named Series that only partly share labels ('c' and 'd'), stacked
# end to end under an outer key taken from each Series' own name.
s1 = pd.Series(range(4), index=list('abcd'), name='one')
s2 = pd.Series(range(4, 7), index=list('cde'), name='two')
data = pd.concat([s1, s2], keys=[s1.name, s2.name])
data

In [None]:
data.unstack()

In [None]:
data.unstack().stack()

In [None]:
data.unstack().stack(dropna=False)  # dropna=False keeps the NaN entries that unstack() introduced for missing labels

In [None]:
df_s.stack().unstack('state')