# Data Wrangling with Pandas - merge and concat

In [1]:
import pandas as pd

### Pandas merge() method for joining tables

* **merge(params)** -> Merges the columns of the left DataFrame with the columns of the right DataFrame by matching the values in one or more key columns (the keys are ordinary columns; they do not have to be the index).

Parameters:

* **right** -> The DataFrame to merge with the current DataFrame.
* **on** -> The column or list of columns to merge on. By default, it uses the column names that are the same in both DataFrames.
* **how** -> The type of join to perform: ‘inner’, ‘left’, ‘right’ or ‘outer’. Works like the `how` parameter of the join() method, but merge() uses ‘inner’ by default.
* **suffixes** -> A tuple that provides the values to append to columns with the same name in the left and right DataFrames.

In [2]:
# Fact table: last week's customer sales, one purchase per row.
customer_sales_lw = pd.DataFrame(
    [
        (1000, 1000033, 340),
        (1001, 2004457, 404),
        (1002, 4002245, 230),
        (1003, 2003547, 500),
    ],
    columns=["customer_id", "item_id", "Sales"],
)

# Dimension table: demographics per customer, one customer per row.
customer_demographics = pd.DataFrame(
    [
        (701, 35, "M"),
        (805, 43, "M"),
        (1000, 29, "F"),
        (1001, 44, "M"),
        (1003, 25, "F"),
        (1004, 45, "F"),
        (1006, 33, "F"),
    ],
    columns=["customer_id", "Age", "Sex"],
)

# Render both input tables, one after the other, so they can be compared before joining.
display(customer_sales_lw, customer_demographics)

Unnamed: 0,customer_id,item_id,Sales
0,1000,1000033,340
1,1001,2004457,404
2,1002,4002245,230
3,1003,2003547,500


Unnamed: 0,customer_id,Age,Sex
0,701,35,M
1,805,43,M
2,1000,29,F
3,1001,44,M
4,1003,25,F
5,1004,45,F
6,1006,33,F


![image](img/tables.png)

### Inner Join

![image](img/inner_join.png)

In [3]:
# Inner join on customer_id: only customers present in BOTH tables are kept.
# The DataFrame method gives the same result:
#   customer_sales_lw.merge(customer_demographics, on="customer_id", how="inner")
pd.merge(
    left=customer_sales_lw,
    right=customer_demographics,
    on="customer_id",
    how="inner",
)

Unnamed: 0,customer_id,item_id,Sales,Age,Sex
0,1000,1000033,340,29,F
1,1001,2004457,404,44,M
2,1003,2003547,500,25,F


### Left Join

![image](img/left_join.png)

In [4]:
# Left join on customer_id: every sales row is kept; customers without a
# demographics record get NaN in Age and Sex.
# The DataFrame method gives the same result:
#   customer_sales_lw.merge(customer_demographics, on="customer_id", how="left")
pd.merge(
    left=customer_sales_lw,
    right=customer_demographics,
    on="customer_id",
    how="left",
)

Unnamed: 0,customer_id,item_id,Sales,Age,Sex
0,1000,1000033,340,29.0,F
1,1001,2004457,404,44.0,M
2,1002,4002245,230,,
3,1003,2003547,500,25.0,F


### Right Join

![image](img/right_join.png)

In [5]:
# Right join on customer_id: every demographics row is kept; customers with no
# sale in customer_sales_lw get NaN in item_id and Sales.
# The DataFrame method gives the same result:
#   customer_sales_lw.merge(customer_demographics, on="customer_id", how="right")
pd.merge(
    left=customer_sales_lw,
    right=customer_demographics,
    on="customer_id",
    how="right",
)

Unnamed: 0,customer_id,item_id,Sales,Age,Sex
0,701,,,35,M
1,805,,,43,M
2,1000,1000033.0,340.0,29,F
3,1001,2004457.0,404.0,44,M
4,1003,2003547.0,500.0,25,F
5,1004,,,45,F
6,1006,,,33,F


### Outer Join

![image](img/outer_join.png)

In [6]:
# Outer join on customer_id: customers from EITHER table are kept; whichever
# side has no match is filled with NaN.
# The DataFrame method gives the same result:
#   customer_sales_lw.merge(customer_demographics, on="customer_id", how="outer")
pd.merge(
    left=customer_sales_lw,
    right=customer_demographics,
    on="customer_id",
    how="outer",
)

Unnamed: 0,customer_id,item_id,Sales,Age,Sex
0,1000,1000033.0,340.0,29.0,F
1,1001,2004457.0,404.0,44.0,M
2,1002,4002245.0,230.0,,
3,1003,2003547.0,500.0,25.0,F
4,701,,,35.0,M
5,805,,,43.0,M
6,1004,,,45.0,F
7,1006,,,33.0,F


### Joining multiple tables

In [7]:
# Dimension table: item details, mapping each item_id to its brand.
item_id = pd.DataFrame(
    [
        (1000015, "Coke"),
        (1000033, "Pepsi"),
        (2004457, "Lays"),
        (4002245, "Hersley"),
        (2003547, "Mars"),
        (3000434, "Ozarka"),
    ],
    columns=["item_id", "item_brand"],
)

# Render the item lookup table.
display(item_id)

Unnamed: 0,item_id,item_brand
0,1000015,Coke
1,1000033,Pepsi
2,2004457,Lays
3,4002245,Hersley
4,2003547,Mars
5,3000434,Ozarka


In [8]:
# Enrich last week's sales with customer demographics and the brand of each
# purchased item. Both steps are left joins, so every sales row is kept even
# when a lookup table has no matching key.
(
    customer_sales_lw
    .merge(customer_demographics, on="customer_id", how="left")
    .merge(item_id, on="item_id", how="left")
)

Unnamed: 0,customer_id,item_id,Sales,Age,Sex,item_brand
0,1000,1000033,340,29.0,F,Pepsi
1,1001,2004457,404,44.0,M,Lays
2,1002,4002245,230,,,Hersley
3,1003,2003547,500,25.0,F,Mars


### Pandas concat() function to concatenate tables

* **concat(params)** -> Concatenates (adds) the data in one DataFrame to another DataFrame.

Parameters:

* **objs** -> A list of the DataFrames that you want to concatenate.
* **axis** -> The default of 0 adds rows to the bottom of the first DataFrame. Setting it to 1 adds columns to the right side of the first DataFrame.
* **ignore_index** -> If True, don’t keep the index values along the concatenation axis. Instead, reset the index on that axis. 
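* **join** -> How to handle the labels on the other axis (the columns, when concatenating rows). ‘outer’ (the default) keeps the union of all labels and fills the gaps with NaN; ‘inner’ keeps only the labels shared by every DataFrame.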

In [9]:
# Redisplay customer_sales_lw as the starting point for the concat examples.
customer_sales_lw

Unnamed: 0,customer_id,item_id,Sales
0,1000,1000033,340
1,1001,2004457,404
2,1002,4002245,230
3,1003,2003547,500


In [10]:
# A second sales table with the same three columns as customer_sales_lw.
customer_sales_lw_2 = pd.DataFrame(
    [
        (1004, 1000032, 140),
        (1005, 2000357, 432),
        (1006, 1002245, 233),
        (1007, 3003531, 400),
        (1008, 1200324, 213),
    ],
    columns=["customer_id", "item_id", "Sales"],
)

In [11]:
# Inspect the second sales table.
customer_sales_lw_2

Unnamed: 0,customer_id,item_id,Sales
0,1004,1000032,140
1,1005,2000357,432
2,1006,1002245,233
3,1007,3003531,400
4,1008,1200324,213


In [12]:
# Stack the two sales tables row-wise (the default axis=0). Each frame keeps
# its own index labels, so labels 0-3 appear twice in the result.
pd.concat(
    objs=[customer_sales_lw, customer_sales_lw_2],
)

Unnamed: 0,customer_id,item_id,Sales
0,1000,1000033,340
1,1001,2004457,404
2,1002,4002245,230
3,1003,2003547,500
0,1004,1000032,140
1,1005,2000357,432
2,1006,1002245,233
3,1007,3003531,400
4,1008,1200324,213


In [13]:
# Same row-wise stack, but discard the original index labels and number the
# result 0..n-1 instead.
pd.concat(
    objs=[customer_sales_lw, customer_sales_lw_2],
    ignore_index=True,
)

Unnamed: 0,customer_id,item_id,Sales
0,1000,1000033,340
1,1001,2004457,404
2,1002,4002245,230
3,1003,2003547,500
4,1004,1000032,140
5,1005,2000357,432
6,1006,1002245,233
7,1007,3003531,400
8,1008,1200324,213


In [14]:
# Shown again for reference: this table is the first input of the next concatenation.
customer_sales_lw_2

Unnamed: 0,customer_id,item_id,Sales
0,1004,1000032,140
1,1005,2000357,432
2,1006,1002245,233
3,1007,3003531,400
4,1008,1200324,213


In [15]:
# A third sales table; unlike the other two it carries an extra "Units" column.
customer_sales_lw_3 = pd.DataFrame(
    [
        (1009, 1000132, 180, 3),
        (1010, 2002357, 532, 5),
        (1011, 1012245, 213, 4),
        (1012, 3103531, 343, 6),
        (1013, 1203324, 313, 2),
    ],
    columns=["customer_id", "item_id", "Sales", "Units"],
)

In [16]:
# Inspect the third sales table, which has a Units column the others lack.
customer_sales_lw_3

Unnamed: 0,customer_id,item_id,Sales,Units
0,1009,1000132,180,3
1,1010,2002357,532,5
2,1011,1012245,213,4
3,1012,3103531,343,6
4,1013,1203324,313,2


In [17]:
# Stack two tables whose columns differ. With the default join="outer" the
# result keeps the union of the columns, so rows coming from
# customer_sales_lw_2 get NaN in Units.
pd.concat(
    objs=[customer_sales_lw_2, customer_sales_lw_3],
    ignore_index=True,
)

Unnamed: 0,customer_id,item_id,Sales,Units
0,1004,1000032,140,
1,1005,2000357,432,
2,1006,1002245,233,
3,1007,3003531,400,
4,1008,1200324,213,
5,1009,1000132,180,3.0
6,1010,2002357,532,5.0
7,1011,1012245,213,4.0
8,1012,3103531,343,6.0
9,1013,1203324,313,2.0
