# Summary
- __This chapter focuses on tools that help combine, join, and rearrange data__

In [1]:
import pandas as pd
import numpy as np

# Hierarchical Indexing
- __This feature lets us have multiple index levels on an axis__
- __It gives us a way to work with higher-dimensional data in a lower-dimensional form__
- __Example__

In [2]:
# Build a 9-element Series with a two-level index: the first list supplies the
# outer labels ('a'..'d') and the second list the inner labels (1..3).
# A seeded generator replaces the unseeded np.random.randn call so the values
# are identical on every Restart & Run All.
data = pd.Series(
    np.random.default_rng(0).standard_normal(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
data

a  1   -0.474592
   2    0.264177
   3    1.622772
b  1   -1.508143
   3    0.734738
c  1   -0.089981
   2    0.521986
d  2    1.163670
   3    1.004994
dtype: float64

In [3]:
# Inspect the index object behind `data`; each entry pairs an outer label with an inner label.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [5]:
# Partial indexing: pick every entry whose outer label is 'b'.
data.loc['b']

1   -1.508143
3    0.734738
dtype: float64

In [6]:
# Label-based slice over the outer level; both endpoints ('b' and 'c') are included.
data.loc['b':'c']

b  1   -1.508143
   3    0.734738
c  1   -0.089981
   2    0.521986
dtype: float64

In [8]:
# Positional slice: the first two entries, regardless of their labels.
data.iloc[:2]

a  1   -0.474592
   2    0.264177
dtype: float64

- __`unstack` reshapes the hierarchically indexed Series into a DataFrame__

In [11]:
# Move the inner index level into the columns; label pairs that do not exist
# in `data` show up as NaN.
result = data.unstack(level=-1)
result

Unnamed: 0,1,2,3
a,-0.474592,0.264177,1.622772
b,-1.508143,,0.734738
c,-0.089981,0.521986,
d,,1.16367,1.004994


- __The inverse of `unstack` is `stack`__

In [12]:
# Pivot the columns back into an inner index level to recover the original Series.
# The explicit dropna() removes the NaN cells that unstack() introduced: the legacy
# stack() dropped them implicitly, but the newer stack implementation keeps them,
# so relying on the implicit behaviour would break the round trip.
result.stack().dropna()

a  1   -0.474592
   2    0.264177
   3    1.622772
b  1   -1.508143
   3    0.734738
c  1   -0.089981
   2    0.521986
d  2    1.163670
   3    1.004994
dtype: float64

- __Another example__

In [13]:
# A 4x3 frame of the integers 0..11 with two index levels on the rows
# and two index levels on the columns.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]),
    columns=pd.MultiIndex.from_arrays(
        [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
    ),
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [14]:
# Partial column indexing: keep all rows and every column under the outer label 'Colorado'.
frame.loc[:, 'Colorado']

Unnamed: 0,Unnamed: 1,Green
a,1,2
a,2,5
b,1,8
b,2,11


- __Let's give names to the index levels and the column levels__

In [25]:
# Attach names to both row-index levels and both column-index levels of `frame`,
# then display it so the new level names are visible.
frame.index = frame.index.set_names(['key1', 'key2'])
frame.columns = frame.columns.set_names(['State', 'Color'])
frame

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


- __We can also swap the order of the index levels__

In [19]:
# Exchange the two row-index levels by name; the data itself is left untouched.
frame.swaplevel(i='key1', j='key2', axis=0)

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


- __Sorting the index based on a single level (here level 1, i.e. `key2`)__

In [23]:
# Order the rows by the second row-index level (position 1).
frame.sort_index(axis=0, level=1)

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


- __We can also swap the levels and sort in a single chained expression__

In [26]:
# Step 1: swap the two row-index levels by position.
# Step 2: sort the rows on what is now the level at position 1.
(
    frame
    .swaplevel(0, 1)
    .sort_index(level=1)
)

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


# Summary Statistics by Level

In [30]:
# Sum the rows that share the same label on row-index level 1.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the level and then summing gives the same result on all versions.
frame.groupby(level=1).sum()

State,Ohio,Ohio,Colorado
Color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [31]:
# Sum the rows that share the same label on row-index level 0.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the level and then summing gives the same result on all versions.
frame.groupby(level=0).sum()

State,Ohio,Ohio,Colorado
Color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [32]:
# Sum the columns that share the same 'State' label, keeping every row.
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0, and column-wise
# groupby (axis=1) is deprecated as well, so transpose, group the resulting rows
# on 'State', sum, and transpose back.
# sort=False keeps the states in their order of first appearance (Ohio, Colorado).
frame.T.groupby(level='State', sort=False).sum().T

Unnamed: 0_level_0,State,Ohio,Colorado
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,2
a,2,7,5
b,1,13,8
b,2,19,11


# Combining and Merging Datasets

- __`pandas.merge` connects rows in DataFrames based on one or more keys__
- __`pandas.concat` concatenates (stacks) objects along an axis__
- __example__

In [39]:

# Left-hand table for the merge examples: a repeated 'key' column plus a running counter.
df1 = pd.DataFrame({
    'key': list('bbacaab'),
    'data1': range(7),
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [40]:
# Right-hand table for the merge examples: one row per key.
df2 = pd.DataFrame({
    'key': list('abd'),
    'data2': range(3),
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [41]:
# Join the two tables without naming a key; the shared column name is used.
df1.merge(df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


- __The 'd' key from `df2` (and the 'c' key from `df1`) is missing from the result because, by default, `merge` does an inner join__
- __Note that I didn’t specify which column to join on. If that information is not specified, merge uses the overlapping column names as the keys. It’s a good practice to specify explicitly, though:__

In [42]:
# Same join as before, with the join column spelled out explicitly.
df1.merge(df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


- __If the column names are different in each object, you can specify them separately:__

In [43]:
# Same content as the earlier left-hand table, but the key column is called 'lkey'.
df3 = pd.DataFrame({
    'lkey': list('bbacaab'),
    'data1': range(7),
})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [44]:
# Same content as the earlier right-hand table, but the key column is called 'rkey'.
df4 = pd.DataFrame({
    'rkey': list('abd'),
    'data2': range(3),
})
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [45]:
# The key columns have different names, so name one per side.
df3.merge(df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


- __We can use other join types too, for example an outer join__

In [47]:
# Outer join: keys that appear on only one side are kept, with NaN filling the other side.
df3.merge(df4, how='outer', left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


- __To merge with multiple keys, pass a list of column names:__

In [48]:
# Left table for the multi-key merge, written one record per row.
left = pd.DataFrame(
    [
        ('foo', 'one', 1),
        ('foo', 'two', 2),
        ('bar', 'one', 3),
    ],
    columns=['key1', 'key2', 'lval'],
)
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [49]:
# Right table for the multi-key merge, written one record per row.
right = pd.DataFrame(
    [
        ('foo', 'one', 4),
        ('foo', 'one', 5),
        ('bar', 'one', 6),
        ('bar', 'two', 7),
    ],
    columns=['key1', 'key2', 'rval'],
)
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [51]:
# Outer join on the (key1, key2) pair; unmatched pairs get NaN on the missing side.
left.merge(right, how='outer', on=['key1', 'key2'])

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


- __merge has a suffixes option for specifying strings to append to overlapping names in the left and right DataFrame objects__

In [57]:
# Join on 'key1' only; the overlapping 'key2' columns are told apart by the given suffixes.
left.merge(right, on='key1', suffixes=('__left', '__right'))

Unnamed: 0,key1,key2__left,lval,key2__right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


# Merging on Index

- __In some cases, the merge key(s) in a DataFrame will be found in its index.__
- __In this case, you can pass left_index=True or right_index=True (or both) to indicate that the index should be used as the merge key:__

In [58]:
# Table whose merge key lives in an ordinary column.
left1 = pd.DataFrame({
    'key': list('abaabc'),
    'value': range(6),
})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [59]:
# Table whose merge key lives in the index ('a', 'b') rather than in a column.
right1 = pd.DataFrame(
    [3.5, 7],
    index=['a', 'b'],
    columns=['group_val'],
)
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [63]:
# Match the 'key' column of the left table against the index of the right table.
left1.merge(right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


# Concatenating Along an Axis

In [2]:
# The integers 0..11 laid out as 3 rows by 4 columns.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

- __`numpy` has a `concatenate` function to concatenate arrays__
- __`pandas` has a `concat` function to concatenate Series and DataFrames__
- __example__


In [5]:
# Append the rows of `arr` below itself (axis 0 is the default for concatenate).
np.concatenate((arr, arr), axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [6]:
# Place `arr` next to itself, doubling the number of columns.
np.concatenate((arr, arr), axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

- __Let's concatenate the series__
- __example__

In [7]:
# Three Series with non-overlapping labels, written as label -> value mappings.
s1 = pd.Series({'a': 0, 'b': 1})
s2 = pd.Series({'c': 2, 'd': 3, 'e': 4})
s3 = pd.Series({'f': 5, 'g': 6})

In [10]:
# Glue the three Series end to end (axis 0 is the default for concat).
pd.concat([s1, s2, s3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [11]:
# Put each Series in its own column; labels missing from a Series become NaN.
pd.concat([s1, s2, s3], axis='columns')

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [12]:
# Tag each input with a key; the keys become the outer level of the result's index.
pd.concat(
    [s1, s2, s3],
    axis=0,
    keys=['one', 'two', 'three'],
)

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

- __Let's concat dataframe using the keys__
- __example__

In [13]:
# 3x2 frame holding 0..5, with rows labelled 'a', 'b', 'c'.
df1 = pd.DataFrame(
    np.arange(6).reshape((3, 2)),
    columns=['one', 'two'],
    index=list('abc'),
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [14]:
# 2x2 frame holding 5..8, with rows labelled 'a' and 'c' only.
df2 = pd.DataFrame(
    np.arange(5, 9).reshape((2, 2)),
    columns=['three', 'four'],
    index=['a', 'c'],
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [17]:
# Side-by-side concatenation; each key becomes the outer column level of its frame.
pd.concat(
    [df1, df2],
    axis=1,
    keys=['Level1', 'Levele2'],
)

Unnamed: 0_level_0,Level1,Level1,Levele2,Levele2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [18]:
# Same concatenation as before, additionally naming the two column levels.
pd.concat(
    [df1, df2],
    axis='columns',
    keys=['Level1', 'Levele2'],
    names=['Upper', 'Lower'],
)

Upper,Level1,Level1,Levele2,Levele2
Lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


- __A last consideration concerns DataFrames in which the row index does not contain
any relevant data:__
- __example__

In [19]:
# 3x4 frame of standard-normal draws with a default integer row index.
# A seeded generator replaces the unseeded np.random.randn call so the values
# are identical on every Restart & Run All.
df1 = pd.DataFrame(
    np.random.default_rng(1).standard_normal((3, 4)),
    columns=['a', 'b', 'c', 'd'],
)
df1

Unnamed: 0,a,b,c,d
0,0.671623,0.62448,1.968591,-1.252256
1,-1.00868,0.149854,-0.221783,1.323102
2,1.550677,0.871162,0.122487,1.500623


In [20]:
# 2x3 frame of standard-normal draws; its columns are a reordered subset of a/b/c/d.
# A seeded generator replaces the unseeded np.random.randn call so the values
# are identical on every Restart & Run All.
df2 = pd.DataFrame(
    np.random.default_rng(2).standard_normal((2, 3)),
    columns=['b', 'd', 'a'],
)
df2

Unnamed: 0,b,d,a
0,0.17988,1.450719,-0.693402
1,0.319003,-0.325505,-0.60099


In [22]:
# Stack the rows of both frames and discard the original row labels,
# giving the result a fresh 0..n-1 index.
pd.concat([df1, df2], axis=0, ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.671623,0.62448,1.968591,-1.252256
1,-1.00868,0.149854,-0.221783,1.323102
2,1.550677,0.871162,0.122487,1.500623
3,-0.693402,0.17988,,1.450719
4,-0.60099,0.319003,,-0.325505


# Reshaping with Hierarchical Indexing

- __`stack`: this “rotates” or pivots from the columns in the data to the rows__
- __`unstack`: this pivots from the rows into the columns__

In [24]:

# 2x3 frame holding 0..5; the row axis is named 'state' and the column axis 'number'.
data = (
    pd.DataFrame(
        np.arange(6).reshape(2, 3),
        index=['Ohio', 'Colorado'],
        columns=['one', 'two', 'three'],
    )
    .rename_axis(index='state', columns='number')
)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [26]:
# Pivot the column labels into an inner index level, producing a Series.
result = data.stack(level=-1)
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

- __`unstack` is the opposite of `stack`__

In [27]:
# Unstack a specific level by name ('state') rather than the default innermost one,
# so the states become the columns.
result.unstack(level='state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


# Pivoting “Wide” to “Long” Format

In [28]:
# Small "wide" table, written one record per row: an identifier plus three value columns.
df = pd.DataFrame(
    [
        ('foo', 1, 4, 7),
        ('bar', 2, 5, 8),
        ('baz', 3, 6, 9),
    ],
    columns=['key', 'A', 'B', 'C'],
)
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


- __The 'key' column may be a group indicator, and the other columns are data values.
When using pandas.melt , we must indicate which columns (if any) are group indicators. Let’s use 'key' as the only group indicator here:__

In [None]:
# Go from wide to long: 'key' stays as the identifier column and the
# remaining columns are folded into variable/value pairs.
pd.melt(df, id_vars=['key'])