# Data Wrangling: Join, Combine and Reshape

Data may be spread across a number of files or datasets.

Hierarchical indexing 
It lets you have multiple index levels on an axis.

It also provides a way to work with higher-dimensional data in a lower-dimensional form.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Series of nine random values with a two-level index:
# letters on the outer level, integers on the inner level.
data = pd.Series(
    np.random.uniform(size=9),
    index=pd.MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "c", "c", "d", "d"],
            [1, 2, 3, 1, 3, 1, 2, 2, 3],
        ]
    ),
)

data

In [None]:
# The index of a hierarchically indexed Series is a MultiIndex
data.index

In [None]:
# Partial indexing: select every entry whose outer-level label is "b"
data["b"]

In [None]:
# Slice a range of outer-level labels; label slices include both endpoints
data["b": "c"]

In [None]:
# Selection from the inner level: select all values whose second-level label is 2
data.loc[:, 2]

In [None]:
# Pivot the inner index level into columns; combinations missing from the index become NaN
data.unstack()

In [None]:
# stack is the inverse of unstack: the column labels move back into the inner index level
data.unstack().stack()

In [17]:
# Either axis of a DataFrame can have a hierarchical index.
# Every label list of a level must be as long as the axis it labels
# (4 rows and 3 columns here).
# Repeated outer labels group the rows/columns that share them.
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]),
    columns=pd.MultiIndex.from_arrays(
        [["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]]
    ),
)


In [22]:
# Name the levels of the row index and of the column index
frame.index.names = ["key1", "key2"]
frame.columns.names = ["state", 'color']

In [23]:
# Display the frame; the level names now label both axes
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [24]:
# nlevels reports how many levels an index has
frame.index.nlevels

2

In [25]:
# Partial column indexing: select the group of columns under the outer label "Ohio"
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [27]:
# A MultiIndex can be built on its own and then reused on several objects.
# Each tuple is one (state, color) label pair.
pd.MultiIndex.from_tuples(
    [("Ohio", "Green"), ("Ohio", "Red"), ("Colorado", "Green")],
    names=["state", "color"],
)


MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

### Reordering and sorting levels
Rearrange the order of the levels on the axis or sort the data by the values in one specific level.

Use the `swaplevel` method: it takes two level numbers or names and returns a new object with the levels interchanged.



In [28]:
# Swap the two row-index levels (key2 becomes the outer level), then sort the rows by the new outer level
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [29]:
# Aggregate by index level: sum the rows that share the same key2 value
frame.groupby(level="key2").sum()


state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [31]:
# Sum across the columns, giving one total per (key1, key2) row
frame.sum(axis='columns')

key1  key2
a     1        3
      2       12
b     1       21
      2       30
dtype: int64

In [32]:
# Indexing with a DataFrame's columns:
# one or more columns can be turned into the row index, and the row index
# can be moved back into the DataFrame's columns.

frame = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 4 + ["two"] * 3,
        "d": [1, 2, 3, 0, 1, 2, 3],
    }
)


In [38]:
# set_index returns a new DataFrame that uses columns c and d as a two-level row index
frame2 = frame.set_index(["c", "d"])

In [36]:
# By default, set_index removes the columns that become the index;
# pass drop=False to keep them as regular columns as well.
# The result is only displayed, not bound to frame2: a frame2 that still holds
# columns c and d would make the later frame2.reset_index() raise a ValueError,
# because the index levels cannot be inserted as columns that already exist.
frame.set_index(["c", "d"], drop=False)

In [42]:
# reset_index does the opposite of set_index: the hierarchical index levels are moved back into columns
frame2.reset_index()


Unnamed: 0,c,d,a,b
0,one,1,0,7
1,one,2,1,6
2,one,3,2,5
3,one,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## 8.2 Combining and Merging Datasets

### pandas.merge
Connect rows in DataFrames based on one or more keys 

Many-to-Many merge forms the Cartesian product of the matching keys. 
All possible pairs of matching rows are combined.

### pandas.concat
Concatenate or "stack" objects together along an axis

### combine_first
Splice overlapping data to fill in missing values in one object with values from another


In [48]:
# Left-hand table: seven rows keyed by "a", "b" or "c".
df1 = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "a", "b"],
        "data1": np.arange(7, dtype="int64"),
    }
)

# Right-hand table: four rows keyed by "a", "b" or "d".
df2 = pd.DataFrame(
    {
        "key": ["a", "b", "d", "b"],
        "data2": np.arange(4, dtype="int64"),
    }
)

# Inner join (the default): only keys present in both frames are kept,
# so "c" (only in df1) and "d" (only in df2) are dropped.
# Every matching pair of rows is combined into one output row that carries
# both data1 and data2, e.g. "b" gives 3 x 2 = 6 rows.
df1.merge(df2, on="key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,6,1
5,b,6,3
6,a,2,0
7,a,4,0
8,a,5,0


In [54]:

# how="left", "right" or "outer" keeps keys that have no match in the other frame;
# the values missing on the unmatched side are filled with NaN
pd.merge(df1, df2, on='key', how="outer")

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,0.0,3.0
2,b,1.0,1.0
3,b,1.0,3.0
4,b,6.0,1.0
5,b,6.0,3.0
6,a,2.0,0.0
7,a,4.0,0.0
8,a,5.0,0.0
9,c,3.0,
