# Lesson 23b: Joining data from a few data frames: merge

## Import libraries and load data

In [16]:
import numpy as np
import pandas as pd

# Two partial category tables: each one is missing some of the category rows.
category_columns = ["CategoryID", "CategoryName"]
cat1 = pd.read_csv("categories_del_1.csv", usecols=category_columns)
cat2 = pd.read_csv("categories_del_2.csv", usecols=category_columns)

# Complete tables used by the later merge examples.
categories = pd.read_csv("categories1.csv")
products = pd.read_csv(
    "products.csv",
    usecols=["ProductID", "ProductName", "UnitPrice", "CategoryID"],
)
orders = pd.read_csv("order-details.csv")

In [2]:
# Preview the first rows of the first partial category table.
cat1.head()

Unnamed: 0,CategoryID,CategoryName
0,2,Condiments
1,3,Confections
2,4,Dairy Products
3,5,Grains/Cereals
4,6,Meat/Poultry


In [3]:
# Preview the first rows of the second partial category table.
cat2.head()

Unnamed: 0,CategoryID,CategoryName
0,1,Beverages
1,2,Condiments
2,3,Confections
3,4,Dairy Products
4,7,Produce


In [4]:
# Note that each of the two tables is missing some of the category rows.

# An important difference between join() and merge(): merge() can match rows on ordinary
# columns that are not indexes in either table, whereas join() matches against the index
# of the other table.
 
cat1.merge(cat2)

# With no extra arguments, merge() returns only the rows that appear in both tables (an inner join).

Unnamed: 0,CategoryID,CategoryName
0,2,Condiments
1,3,Confections
2,4,Dairy Products
3,7,Produce
4,8,Seafood


In [5]:
# I can add the "on" parameter to name the key column, but then the other column that exists
# in both tables is kept from each of them (with the default suffixes _x and _y):

cat1.merge(cat2, on = "CategoryID")

Unnamed: 0,CategoryID,CategoryName_x,CategoryName_y
0,2,Condiments,Condiments
1,3,Confections,Confections
2,4,Dairy Products,Dairy Products
3,7,Produce,Produce
4,8,Seafood,Seafood


In [6]:
# Sometimes I need the result in this form, but with my own suffixes instead of the default ones:

cat1.merge(cat2, on = "CategoryID", suffixes = ["_1","_2"])


Unnamed: 0,CategoryID,CategoryName_1,CategoryName_2
0,2,Condiments,Condiments
1,3,Confections,Confections
2,4,Dairy Products,Dairy Products
3,7,Produce,Produce
4,8,Seafood,Seafood


In [7]:
# To decide which rows from the two tables are kept, I can use the "how" parameter.
# The main options are: left (all rows of cat1), right (all rows of cat2),
# inner (only rows which show up in both tables) and outer (all rows from both tables).

cat1.merge(cat2, on = "CategoryID", suffixes = ["_1","_2"], how = "outer")

Unnamed: 0,CategoryID,CategoryName_1,CategoryName_2
0,2,Condiments,Condiments
1,3,Confections,Confections
2,4,Dairy Products,Dairy Products
3,5,Grains/Cereals,
4,6,Meat/Poultry,
5,7,Produce,Produce
6,8,Seafood,Seafood
7,1,,Beverages


In [8]:
# indicator = True adds a "_merge" column that records where each row came from:
# "both", "left_only" or "right_only".
cat_merged = cat1.merge(cat2, on = "CategoryID", suffixes = ["_1","_2"], how = "outer", indicator = True)
cat_merged.head()

Unnamed: 0,CategoryID,CategoryName_1,CategoryName_2,_merge
0,2,Condiments,Condiments,both
1,3,Confections,Confections,both
2,4,Dairy Products,Dairy Products,both
3,5,Grains/Cereals,,left_only
4,6,Meat/Poultry,,left_only


In [9]:
# Boolean mask selecting the rows whose CategoryID was found only in cat1 (the left table).
# The mask is called left_only_mask rather than "filter" so that the built-in filter()
# is not shadowed for the rest of the session.
left_only_mask = cat_merged["_merge"] == "left_only"

cat_merged[left_only_mask]

Unnamed: 0,CategoryID,CategoryName_1,CategoryName_2,_merge
3,5,Grains/Cereals,,left_only
4,6,Meat/Poultry,,left_only


In [10]:
# Count how many rows fall into each "_merge" category.
cat_merged["_merge"].value_counts()

both          5
left_only     2
right_only    1
Name: _merge, dtype: int64

In [11]:
# Mark the rows matched in both tables, then show everything else
# (rows present in only one of the two tables).
filter2 = cat_merged["_merge"].eq("both")
cat_merged.loc[~filter2]

Unnamed: 0,CategoryID,CategoryName_1,CategoryName_2,_merge
3,5,Grains/Cereals,,left_only
4,6,Meat/Poultry,,left_only
7,1,,Beverages,right_only


In [12]:
# Some checks of options of merge()

# products.merge(categories, on = "CategoriesID")

In [13]:
# What happens when the key column has a different name in each table?
# Here the key column of "categories" is renamed from "CategoryID" to "ID".

categories = categories.rename(columns={"CategoryID": "ID"})

# The key column then has to be named separately for each side:
# left_on for "products" and right_on for "categories".

products.merge(categories, left_on="CategoryID", right_on="ID")

Unnamed: 0,ProductID,ProductName,CategoryID,UnitPrice,ID,CategoryName,Description
0,1,Chai,1,18.00,1,Beverages,Soft drinks
1,2,Chang,1,19.00,1,Beverages,Soft drinks
2,24,Guaraná Fantástica,1,4.50,1,Beverages,Soft drinks
3,34,Sasquatch Ale,1,14.00,1,Beverages,Soft drinks
4,35,Steeleye Stout,1,18.00,1,Beverages,Soft drinks
...,...,...,...,...,...,...,...
72,42,Singaporean Hokkien Fried Mee,5,14.00,5,Grains/Cereals,Breads
73,52,Filo Mix,5,7.00,5,Grains/Cereals,Breads
74,56,Gnocchi di nonna Alice,5,38.00,5,Grains/Cereals,Breads
75,57,Ravioli Angelo,5,19.50,5,Grains/Cereals,Breads


In [14]:
# Note that if other, non-key columns have the same name in both tables,
# merge() keeps both of them and tells them apart with suffixes, so there is no conflict.

# The "sort" parameter (sort = True) sorts the result by the join keys.
products.merge(categories, left_on = "CategoryID", right_on = "ID", sort = True)

Unnamed: 0,ProductID,ProductName,CategoryID,UnitPrice,ID,CategoryName,Description
0,1,Chai,1,18.00,1,Beverages,Soft drinks
1,2,Chang,1,19.00,1,Beverages,Soft drinks
2,24,Guaraná Fantástica,1,4.50,1,Beverages,Soft drinks
3,34,Sasquatch Ale,1,14.00,1,Beverages,Soft drinks
4,35,Steeleye Stout,1,18.00,1,Beverages,Soft drinks
...,...,...,...,...,...,...,...
72,41,Jack's New England Clam Chowder,8,9.65,8,Seafood,Seaweed and fish
73,45,Rogede sild,8,9.50,8,Seafood,Seaweed and fish
74,46,Spegesild,8,12.00,8,Seafood,Seaweed and fish
75,58,Escargots de Bourgogne,8,13.25,8,Seafood,Seaweed and fish


In [15]:
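# The first rows look the same, but the end of the table differs: without sort the result
# ended with category 5 (Grains/Cereals), whereas with sort = True the rows are ordered by
# the join key and the result ends with category 8 (Seafood).
# Without sort, an inner merge (the default) keeps the key order of the left table.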

# And what happens if the key column is turned into the index?

categories = categories.set_index("ID")

# products.merge(categories, left_on = "CategoryID", right_on = "ID", sort = True)

# Error
# NOTE(review): recent pandas versions accept index level names in right_on,
# so this call may no longer fail - verify against the installed version.

# The merge works when the right-hand table is matched on its index instead:

products.merge(categories, left_on="CategoryID", right_index=True, sort=True)

Unnamed: 0,ProductID,ProductName,CategoryID,UnitPrice,CategoryName,Description
0,1,Chai,1,18.00,Beverages,Soft drinks
1,2,Chang,1,19.00,Beverages,Soft drinks
23,24,Guaraná Fantástica,1,4.50,Beverages,Soft drinks
33,34,Sasquatch Ale,1,14.00,Beverages,Soft drinks
34,35,Steeleye Stout,1,18.00,Beverages,Soft drinks
...,...,...,...,...,...,...
40,41,Jack's New England Clam Chowder,8,9.65,Seafood,Seaweed and fish
44,45,Rogede sild,8,9.50,Seafood,Seaweed and fish
45,46,Spegesild,8,12.00,Seafood,Seaweed and fish
57,58,Escargots de Bourgogne,8,13.25,Seafood,Seaweed and fish


## Merging more than 2 data frames

In [17]:
# merge() combines exactly two objects per call, so joining three tables takes two steps.

# Reload the tables first: "categories" was renamed and re-indexed in the earlier cells.
categories = pd.read_csv("categories1.csv")
products = pd.read_csv(
    "products.csv",
    usecols=["ProductID", "ProductName", "UnitPrice", "CategoryID"],
)
orders = pd.read_csv("order-details.csv")

In [19]:
# Step 1: add the category details to each product, matching on the shared "CategoryID" column.
cat_prod = products.merge(categories, on = "CategoryID")
cat_prod.head()

Unnamed: 0,ProductID,ProductName,CategoryID,UnitPrice,CategoryName,Description
0,1,Chai,1,18.0,Beverages,Soft drinks
1,2,Chang,1,19.0,Beverages,Soft drinks
2,24,Guaraná Fantástica,1,4.5,Beverages,Soft drinks
3,34,Sasquatch Ale,1,14.0,Beverages,Soft drinks
4,35,Steeleye Stout,1,18.0,Beverages,Soft drinks


In [21]:
# Step 2: add the order lines, matching on "ProductID". "UnitPrice" exists in both tables,
# so the suffixes mark which value comes from the products and which from the orders.
cat_prod.merge(orders, on = "ProductID", suffixes = ["_Prod","_Order"])

Unnamed: 0,ProductID,ProductName,CategoryID,UnitPrice_Prod,CategoryName,Description,OrderID,UnitPrice_Order,Quantity,Discount
0,1,Chai,1,18.00,Beverages,Soft drinks,10285,14.40,45,0.20
1,1,Chai,1,18.00,Beverages,Soft drinks,10294,14.40,18,0.00
2,1,Chai,1,18.00,Beverages,Soft drinks,10317,14.40,20,0.00
3,1,Chai,1,18.00,Beverages,Soft drinks,10348,14.40,15,0.15
4,1,Chai,1,18.00,Beverages,Soft drinks,10354,14.40,12,0.00
...,...,...,...,...,...,...,...,...,...,...
2150,64,Wimmers gute Semmelknödel,5,33.25,Grains/Cereals,Breads,10968,33.25,4,0.00
2151,64,Wimmers gute Semmelknödel,5,33.25,Grains/Cereals,Breads,11031,33.25,20,0.00
2152,64,Wimmers gute Semmelknödel,5,33.25,Grains/Cereals,Breads,11053,33.25,25,0.20
2153,64,Wimmers gute Semmelknödel,5,33.25,Grains/Cereals,Breads,11072,33.25,130,0.00
