In [1]:
import numpy  as np
import pandas as pd

# for displaying
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# <a id='0'>Combining and Reshaping Data</a>
    
    
## Highlights

## Content
Wrangle data into a form that is more suitable for storage or analysis.

- <a href='#1'>Hierarchical Indexing</a>  
- <a href='#2'>Joining and Cancatenating</a>  
    - <a href='#21'> Database-style joins: Merge
    - <a href='#22'> Join: Merge on Index
    - <a href='#23'> Concatenating Along an Axis
- <a href='#3'>Reshaping Data </a>    
    - <a href='#32'> Pivoting "Long" Data to "Wide"
    - <a href='#33'> Melting  "Wide" Data to "Long" 

## <a id='1'> Hierarchical Indexing: Multi-Index
Let's get warmed up by handling the multi-index of a **single DF** before we start to combine multiple DFs. <br>

### Create MultiIndex object

In [2]:
# from arrays
df1 = pd.DataFrame(np.random.randn(2), 
                   index=pd.MultiIndex.from_arrays([['Lab_1', 'Lab_2'], ['Test_1', 'Test_2']]) )
df1
# from product
df2 = pd.DataFrame(np.random.randn(4), 
                   index=pd.MultiIndex.from_product([['Lab_1', 'Lab_2'], ['Test_1', 'Test_2']]) )
df2

Unnamed: 0,Unnamed: 1,0
Lab_1,Test_1,-0.288575
Lab_2,Test_2,-1.075899


Unnamed: 0,Unnamed: 1,0
Lab_1,Test_1,-0.786891
Lab_1,Test_2,-1.385873
Lab_2,Test_1,-1.62353
Lab_2,Test_2,-1.672981


#### Reindex a DF

In [3]:
df2.reindex(pd.MultiIndex.from_product([['Lab 1', 'Lab 2', 'Lab 3'], 
                                        ['Test 1', 'Test 2', 'Test 3']]), fill_value=np.nan)

Unnamed: 0,Unnamed: 1,0
Lab 1,Test 1,
Lab 1,Test 2,
Lab 1,Test 3,
Lab 2,Test 1,
Lab 2,Test 2,
Lab 2,Test 3,
Lab 3,Test 1,
Lab 3,Test 2,
Lab 3,Test 3,


### Unstacking of Multi-Indexed Series
How can this be useful?

In [4]:
se = pd.Series(np.random.rand(9),
               index=[
                         ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                         [1, 2, 3, 1, 3, 1, 2, 2, 3],
                         ['A', 'A', 'B', 'B', 'B', 'A', 'B', 'A', 'B'],
                       ]
              )
se.index.names = ['Patient','Test','Lab']
se

Patient  Test  Lab
a        1     A      0.237543
         2     A      0.042841
         3     B      0.913638
b        1     B      0.738532
         3     B      0.162580
c        1     A      0.061244
         2     B      0.833774
d        2     A      0.668201
         3     B      0.341299
dtype: float64

In [5]:
df = se.unstack() # se -> df, unstack the inner-most index.
df

Unnamed: 0_level_0,Lab,A,B
Patient,Test,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.237543,
a,2,0.042841,
a,3,,0.913638
b,1,,0.738532
b,3,,0.16258
c,1,0.061244,
c,2,,0.833774
d,2,0.668201,
d,3,,0.341299


#### Access all information of patient a

In [6]:
df.loc['a']

Lab,A,B
Test,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.237543,
2,0.042841,
3,,0.913638


#### Access all information of test 1 from patient a

In [7]:
df.loc[('a',1)]

Lab
A    0.237543
B         NaN
Name: (a, 1), dtype: float64

#### Multi-index Slicing
ft. pd.IndexSlice( )

In [8]:
df.loc[pd.IndexSlice['a':'b']]

Unnamed: 0_level_0,Lab,A,B
Patient,Test,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.237543,
a,2,0.042841,
a,3,,0.913638
b,1,,0.738532
b,3,,0.16258


In [9]:
se.loc[pd.IndexSlice['a':'b',:, 'A']]  # slicing of inner levels works only for Series

Patient  Test  Lab
a        1     A      0.237543
         2     A      0.042841
dtype: float64

### <a id='13'> Indexing with a DataFrame's column: column to index
ft. df.set_index( ) 
    
<a href='#0'> Back to TOC

In [10]:
# set_index, reset_index

frame = pd.DataFrame({'A': range(7), 
                      'B': range(7, 0, -1),
                      'C': ['one', 'one', 'one', 'two', 'two',
                            'two', 'two'],
                      'D': [0, 1, 2, 0, 1, 2, 3]})
frame

frame2 = frame.set_index(['C', 'D'])
frame2

Unnamed: 0,A,B,C,D
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
C,D,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


## <a id='2'> Joining and Cancatenating Datasets
 <a href='#0'> Go Back To TOC</a>

### <a id='21'> Database-Style Join of DFs

In [11]:
df1 = pd.DataFrame({'Key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'Data1': range(7)})
df2 = pd.DataFrame({'Key': ['a', 'b', 'd'],
                    'Data2': range(3)})
df1
df2

Unnamed: 0,Key,Data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


Unnamed: 0,Key,Data2
0,a,0
1,b,1
2,d,2


#### Merge two (and only two) DFs

In [12]:
# Make parameters explict: 
pd.merge(df1, df2, on='Key', how='inner')

Unnamed: 0,Key,Data1,Data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


#### When the shared Key columns have different names

In [13]:
df2 = df2.rename(columns={'Key': 'Key1'})  # df2 now has a different key

pd.merge(df1, df2.rename(columns={'Key1':'Key'}),  how='inner')

#pd.merge(df1, df2, left_on='Key', right_on='Key1', how='inner') # less desirable b/o duplicated columns

Unnamed: 0,Key,Data1,Data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


### <a id='22'> Merging on Index: 
ft. df.join( )
    
Join is a special merge when the shared key is the index for all the involved DFs. You can join DFs or named Series. You can also join multiple DFs all at once, unlike merge can only do two DFs at a time.

<a href='#0'> Back to TOC

#### Join DFs

In [14]:
df1 = pd.DataFrame({'Key': ['c', 'b', 'a', 'a', 'b', 'c'], 'Value': range(6)})
df2 = pd.DataFrame({'Group_val': [3.5, 7]}, index=['b', 'a'])
df1
df2

Unnamed: 0,Key,Value
0,c,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


Unnamed: 0,Group_val
b,3.5
a,7.0


In [15]:
df1.set_index('Key').join(df2, how='outer') # set_index sorts the new index

Unnamed: 0,Value,Group_val
a,2,7.0
a,3,7.0
b,1,3.5
b,4,3.5
c,0,
c,5,


#### Join named Series

In [16]:
se1 = pd.Series(range(3), index=list('abc'))
se1

a    0
b    1
c    2
dtype: int64

In [17]:
# Note: no series.join
df1.set_index('Key').join(se1.rename('s1'), how='outer')

Unnamed: 0,Value,s1
a,2,0
a,3,0
b,1,1
b,4,1
c,0,2
c,5,2


#### Join multiple DFs

In [18]:
df3 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])
df3

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [19]:
df1.set_index('Key').join([pd.DataFrame(se1.rename('s1')), df3], how='outer')

Unnamed: 0,Value,s1,New York,Oregon
a,2.0,0.0,7.0,8.0
a,3.0,0.0,7.0,8.0
b,1.0,1.0,,
b,4.0,1.0,,
c,0.0,2.0,9.0,10.0
c,5.0,2.0,9.0,10.0
e,,,11.0,12.0
f,,,16.0,17.0


### <a id='23'> Concatenating Along an Axis

Concatenate Series, DFs, or a mixture of both. <br>
It's widely used when engineering features during machine learning: you generate additional columns of features and horizontally attach them to the original DF. Or you have additional rows of data coming in, you need to append  them vertically. <br>
Note: it can be tricky when you horizontally stack multiple DFs that have different indices, as shown below.

<a href='#0'> Back to TOC

In [20]:
# on axis 0 by default, more rows
s1 = pd.Series([0, 1], index=['a', 'b'], name='s1')
s2 = pd.Series([2, 3], index=['a', 'b'], name='s2')
s3 = pd.Series([5, 6], index=['a', 'g'], name='s1')

In [21]:
pd.concat( [s1, s2, s3], axis=0 ) # vertical

a    0
b    1
a    2
b    3
a    5
g    6
dtype: int64

In [22]:
pd.concat([s1, s2, s3], axis=0, ignore_index=True) # ignore original index

0    0
1    1
2    2
3    3
4    5
5    6
dtype: int64

In [23]:
pd.concat([s1, s2, s3], axis=1) # horizontal, note the NAs created due to the mismatch of the indices.

Unnamed: 0,s1,s2,s1.1
a,0.0,2.0,5.0
b,1.0,3.0,
g,,,6.0


In [24]:
pd.concat([s1,s2,s3], axis=1, ignore_index=True) # ignore_index: ignores the labels on the chosen axis

Unnamed: 0,0,1,2
a,0.0,2.0,5.0
b,1.0,3.0,
g,,,6.0


#### Ignore the original index of s2 and s3
If you simply want to stack the values and don't want matching index, then the NAs created above is not what you desire. One solution is to ignore the index of s2 and s3 by assigning s1's index to them.

In [25]:
s2.index = s1.index
s3.index = s1.index
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,s1,s2,s1.1
a,0,2,5
b,1,3,6


## <a id='3'>Reshaping Data
 <a href='#0'> Go Back To TOC</a>

### <a id='32'> Pivoting “Long” Format to “Wide” Format
"Long" data are commonly used in storing multiple time series into a database. The features are stacked into a single column instead of being unstacked into different columns. Pivot it into the unstacked ("wide") form so that we can apply further analysis.

In [26]:
long_df = pd.read_csv('examples/macrodata_cleaned_long.csv')
long_df.head()

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,unemp,5.8
2,1959-03-31,infl,0.0
3,1959-06-30,realgdp,2778.801
4,1959-06-30,unemp,5.1


In [27]:
pivoted = long_df.pivot(index='date', columns='item', values='value')
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


### <a id='33'> Melting “Wide” Format to “Long” Format
This time we reverse the process for data storage.
    
<a href='#0'> Back to TOC

In [28]:
pd.melt(pivoted.reset_index(), id_vars='date', value_vars=['infl', 'realgdp','unemp']).head()

Unnamed: 0,date,item,value
0,1959-03-31,infl,0.0
1,1959-06-30,infl,2.34
2,1959-09-30,infl,2.74
3,1959-12-31,infl,0.27
4,1960-03-31,infl,2.31
