## Example of merging data with pandas

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
input_file = Path.cwd() / 'data' / 'raw' / 'sample_sales.xlsx'
level_file = Path.cwd() / 'data' / 'raw' / 'customer_levels.xlsx'
extra_transactions = Path.cwd() / 'data' / 'raw' / 'sample_sales_50_extra.xlsx'

In [3]:
sales = pd.read_excel(input_file)
levels = pd.read_excel(level_file)
extra_sales = pd.read_excel(extra_transactions)

In [4]:
sales.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


In [5]:
sales.shape

(1000, 7)

In [6]:
levels.head()

Unnamed: 0,Company Name,level
0,Abatz,diamond
1,Agivu,silver
2,Aibox,platinum
3,Ailane,silver
4,Aimbo,diamond


In [7]:
levels.shape

(351, 2)

In [8]:
extra_sales.shape

(50, 7)

In [9]:
extra_sales.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-293,Linkbuzz,2019-01-28,shirt,28,17,476
1,JQ-501-633,Aimbu,2019-10-04,book,21,14,294
2,FI-165-583,Dablist,2019-05-05,poster,4,23,92
3,XP-005-553,Rhycero,2019-11-03,pen,49,29,1421
4,NB-917-183,Skinder,2019-07-17,poster,18,19,342


In [10]:
pd.concat([sales, extra_sales])

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684
...,...,...,...,...,...,...,...
45,PS-680-563,Quimm,2019-09-19,poster,0,35,0
46,MU-441-943,Podcat,2019-09-15,shirt,7,15,105
47,RR-924-583,Oyoloo,2019-12-24,book,17,10,170
48,NM-907-403,Thoughtbeat,2019-08-02,poster,8,26,208


In [11]:
# DataFrame.append() has been removed
# See https://stackoverflow.com/questions/75956209/dataframe-object-has-no-attribute-append for details.
# sales.append(extra_sales)
# New Pandas 2.0 syntax:

sales = pd.concat([sales, extra_sales])

In [12]:
all_sales = pd.concat([sales, extra_sales])

In [13]:
all_sales.shape

(1100, 7)

In [14]:
all_sales.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


In [15]:
# This will give an error
#pd.merge(all_sales, levels)

In [16]:
pd.merge(all_sales, levels, left_on='company', right_on='Company Name', how='left')

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount,Company Name,level
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,Realcube,gold
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420,Zooxo,platinum
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161,Dabtype,gold
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203,Skipfire,gold
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684,Bluezoom,gold
...,...,...,...,...,...,...,...,...,...
1095,PS-680-563,Quimm,2019-09-19,poster,0,35,0,Quimm,gold
1096,MU-441-943,Podcat,2019-09-15,shirt,7,15,105,Podcat,silver
1097,RR-924-583,Oyoloo,2019-12-24,book,17,10,170,Oyoloo,silver
1098,NM-907-403,Thoughtbeat,2019-08-02,poster,8,26,208,Thoughtbeat,silver


In [17]:
levels.head()

Unnamed: 0,Company Name,level
0,Abatz,diamond
1,Agivu,silver
2,Aibox,platinum
3,Ailane,silver
4,Aimbo,diamond


In [18]:
levels = levels.rename(columns={'Company Name': 'company'})

In [19]:
levels.head()

Unnamed: 0,company,level
0,Abatz,diamond
1,Agivu,silver
2,Aibox,platinum
3,Ailane,silver
4,Aimbo,diamond


In [20]:
pd.merge(all_sales, levels)

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount,level
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,gold
1,ZK-911-48,Realcube,2019-01-31,book,33,35,1155,gold
2,CM-131-53,Realcube,2019-12-28,poster,46,35,1610,gold
3,TZ-788-30,Realcube,2019-03-07,poster,22,31,682,gold
4,GF-172-18,Realcube,2019-09-07,poster,42,34,1428,gold
...,...,...,...,...,...,...,...,...
1095,UZ-411-71,Zoovu,2019-03-29,book,15,11,165,platinum
1096,PZ-354-19,Centizu,2019-04-19,pen,0,27,0,gold
1097,JF-350-29,Voonyx,2019-06-13,book,14,24,336,platinum
1098,EH-047-08,Voonyx,2019-11-16,pen,38,13,494,platinum


In [21]:
final_data = pd.merge(all_sales, levels)

In [22]:
final_data.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount,level
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,gold
1,ZK-911-48,Realcube,2019-01-31,book,33,35,1155,gold
2,CM-131-53,Realcube,2019-12-28,poster,46,35,1610,gold
3,TZ-788-30,Realcube,2019-03-07,poster,22,31,682,gold
4,GF-172-18,Realcube,2019-09-07,poster,42,34,1428,gold


In [23]:
final_data.shape

(1100, 8)

In [24]:
final_data.tail()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount,level
1095,UZ-411-71,Zoovu,2019-03-29,book,15,11,165,platinum
1096,PZ-354-19,Centizu,2019-04-19,pen,0,27,0,gold
1097,JF-350-29,Voonyx,2019-06-13,book,14,24,336,platinum
1098,EH-047-08,Voonyx,2019-11-16,pen,38,13,494,platinum
1099,FU-587-89,Omba,2019-09-05,book,9,17,153,gold
