https://towardsdatascience.com/20-examples-to-master-merging-dataframes-in-python-pandas-22ffcd6059d1

In [1]:
import numpy as np
import pandas as pd


# Left-hand demo table: five people keyed by `id`.
# Ages are drawn from the unseeded global NumPy RNG, so they differ between runs.
names_columns = dict(
    id=[1, 2, 3, 4, 10],
    name=["Emily", "Jane", "Joe", "Matt", "Lucas"],
    age=np.random.randint(20, 30, size=5),
)
names = pd.DataFrame(names_columns)

# Right-hand demo table: ids 1..7, so ids 1-4 overlap with `names`,
# id 10 exists only in `names` and ids 5-7 exist only in `scores`.
# Scores are random as well (80..99 inclusive).
scores_columns = dict(
    id=np.arange(1, 8),
    score=np.random.randint(80, 100, size=7),
    group=["A", "B", "C", "A", "A", "C", "A"],
)
scores = pd.DataFrame(scores_columns)

In [3]:
# Inner join (the default `how`): only ids present in both frames survive.
merged_df = pd.merge(left=names, right=scores, how="inner", on="id")
print(merged_df)

   id   name  age  score group
0   1  Emily   26     98     A
1   2   Jane   28     84     B
2   3    Joe   24     88     C
3   4   Matt   25     93     A


In [4]:
# Left join: every row of `names` is kept; ids missing from `scores`
# get NaN in the score/group columns.
merged_df = pd.merge(left=names, right=scores, how="left", on="id")
print(merged_df)

   id   name  age  score group
0   1  Emily   26   98.0     A
1   2   Jane   28   84.0     B
2   3    Joe   24   88.0     C
3   4   Matt   25   93.0     A
4  10  Lucas   23    NaN   NaN


In [5]:
# The following two joins are equivalent: a left join starting from `names`
# mirrors a right join starting from `scores` (both keep every row of `names`).
merged_df = pd.merge(left=names, right=scores, how="left", on="id")
merged_df = pd.merge(left=scores, right=names, how="right", on="id")

In [6]:
# Outer join: keep the ids of both frames; whichever side has no match is NaN.
merged_df = pd.merge(left=names, right=scores, how="outer", on="id")
print(merged_df)

   id   name   age  score group
0   1  Emily  26.0   98.0     A
1   2   Jane  28.0   84.0     B
2   3    Joe  24.0   88.0     C
3   4   Matt  25.0   93.0     A
4  10  Lucas  23.0    NaN   NaN
5   5    NaN   NaN   81.0     A
6   6    NaN   NaN   87.0     C
7   7    NaN   NaN   87.0     A


In [8]:
# Same outer join, plus a `_merge` column saying where each row came from
# (both / left_only / right_only).
merged_df = pd.merge(
    left=names,
    right=scores,
    how="outer",
    on="id",
    indicator=True,
)
print(merged_df)

   id   name   age  score group      _merge
0   1  Emily  26.0   98.0     A        both
1   2   Jane  28.0   84.0     B        both
2   3    Joe  24.0   88.0     C        both
3   4   Matt  25.0   93.0     A        both
4  10  Lucas  23.0    NaN   NaN   left_only
5   5    NaN   NaN   81.0     A  right_only
6   6    NaN   NaN   87.0     C  right_only
7   7    NaN   NaN   87.0     A  right_only


In [9]:
# Passing a string to `indicator` names the provenance column ("source")
# instead of using the default `_merge`.
merged_df = pd.merge(
    left=names,
    right=scores,
    how="left",
    on="id",
    indicator="source",
)
display(merged_df)

Unnamed: 0,id,name,age,score,group,source
0,1,Emily,26,98.0,A,both
1,2,Jane,28,84.0,B,both
2,3,Joe,24,88.0,C,both
3,4,Matt,25,93.0,A,both
4,10,Lucas,23,,,left_only


In [10]:
# Give the key column of `scores` a different name than the one in `names`.
# Note that `scores` itself is rebound here, so it no longer has an "id" column.
scores = scores.rename({"id": "id_number"}, axis="columns")

# With differently named keys, spell out the key on each side;
# both key columns end up in the result.
merged_df = pd.merge(left=names, right=scores, left_on="id", right_on="id_number")
print(merged_df)

   id   name  age  id_number  score group
0   1  Emily   26          1     98     A
1   2   Jane   28          2     84     B
2   3    Joe   24          3     88     C
3   4   Matt   25          4     93     A


In [11]:
# Product catalogue keyed by (pg, id): note that the same `id` appears under
# both "A" and "B", so `id` alone is not a unique key.
# price/cost come from the unseeded global NumPy RNG.
products_columns = dict(
    pg=["A", "A", "A", "B", "B", "B"],
    id=[101, 102, 103, 101, 102, 104],
    price=np.random.randint(50, 80, size=6),
    cost=np.random.randint(40, 50, size=6),
    discount=[0.1, 0.1, 0, 0, 0.2, 0],
)
products = pd.DataFrame(products_columns)

# Sales records keyed by the same (pg, id) pair; also carries its own
# `discount` column, which collides with the one in `products`.
sales_columns = dict(
    pg=["A", "A", "A", "B", "B", "B"],
    id=[101, 102, 105, 101, 102, 106],
    sales_qty=np.random.randint(1, 10, size=6),
    discount=[0, 0.1, 0.1, 0.2, 0, 0],
)
sales = pd.DataFrame(sales_columns)

In [12]:
# Join on the composite key (pg, id). The `discount` column exists on both
# sides, so pandas disambiguates it with the default _x / _y suffixes.
merged_df = pd.merge(left=products, right=sales, on=["pg", "id"])
print(merged_df)

  pg   id  price  cost  discount_x  sales_qty  discount_y
0  A  101     74    44         0.1          2         0.0
1  A  102     68    49         0.1          5         0.1
2  B  101     74    40         0.0          2         0.2
3  B  102     57    40         0.2          7         0.0


In [13]:
# Same composite-key join, but with readable suffixes for the clashing
# `discount` columns instead of the default _x / _y.
merged_df = pd.merge(
    left=products,
    right=sales,
    on=["pg", "id"],
    suffixes=["_products", "_sales"],
)
print(merged_df)

  pg   id  price  cost  discount_products  sales_qty  discount_sales
0  A  101     74    44                0.1          2             0.0
1  A  102     68    49                0.1          5             0.1
2  B  101     74    40                0.0          2             0.2
3  B  102     57    40                0.2          7             0.0


In [14]:
# Rename the product key in `sales` so the two frames no longer share it.
# `sales` is rebound, so from here on it has "product_id" instead of "id".
sales = sales.rename({"id": "product_id"}, axis="columns")

# Left join on a composite key whose second component is named differently
# on each side; the i-th entry of `left_on` pairs with the i-th of `right_on`.
merged_df = pd.merge(
    left=products,
    right=sales,
    how="left",
    left_on=["pg", "id"],
    right_on=["pg", "product_id"],
    suffixes=["_products", "_sales"],
)

In [15]:
# Two 5x4 frames of random digits (0..9) for the index-based merges.
# `df1` keeps the default 0..4 index; `df2` is indexed 2..6, so only
# labels 2, 3 and 4 are shared.
left_digits = np.random.randint(0, 10, size=(5, 4))
df1 = pd.DataFrame(left_digits, columns=["A", "B", "C", "D"])

right_digits = np.random.randint(0, 10, size=(5, 4))
df2 = pd.DataFrame(
    right_digits,
    index=[2, 3, 4, 5, 6],
    columns=["E", "F", "G", "H"],
)

In [17]:
# Join on the row labels of both frames instead of on a column (inner by default).
merged_df = pd.merge(left=df1, right=df2, left_index=True, right_index=True)

In [18]:
# Index-on-index left join: all rows of `df1` are kept, and labels that
# `df2` does not have show NaN in the E..H columns.
merged_df = pd.merge(
    left=df1,
    right=df2,
    how="left",
    left_index=True,
    right_index=True,
)
print(merged_df)

   A  B  C  D    E    F    G    H
0  3  2  2  8  NaN  NaN  NaN  NaN
1  5  7  3  0  NaN  NaN  NaN  NaN
2  8  7  9  9  7.0  6.0  9.0  6.0
3  2  8  1  9  2.0  8.0  8.0  0.0
4  9  7  4  9  4.0  7.0  4.0  0.0


In [21]:
# Two small time series that start at the same instant but are sampled on
# different grids (every 2 s vs. every 3 s), used for the ordered/as-of merges.
# Values are uniform random numbers rounded to two decimals (unseeded RNG).
#
# The frequency strings use the lowercase seconds alias "s": the uppercase
# "S" alias is deprecated since pandas 2.2 and emits a FutureWarning there.
df1 = pd.DataFrame(
    {
        "time": pd.date_range(start="2022-12-09", periods=7, freq="2s"),
        "left_value": np.round(np.random.random(7), 2),
    }
)

df2 = pd.DataFrame(
    {
        "time": pd.date_range(start="2022-12-09", periods=6, freq="3s"),
        "right_value": np.round(np.random.random(6), 2),
    }
)
# Show both time series (rich display), left frame first.
display(df1, df2)

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.86
1,2022-12-09 00:00:02,0.44
2,2022-12-09 00:00:04,0.07
3,2022-12-09 00:00:06,0.9
4,2022-12-09 00:00:08,0.93
5,2022-12-09 00:00:10,0.91
6,2022-12-09 00:00:12,0.26


Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.39
1,2022-12-09 00:00:03,0.35
2,2022-12-09 00:00:06,0.36
3,2022-12-09 00:00:09,0.54
4,2022-12-09 00:00:12,0.44
5,2022-12-09 00:00:15,0.83


In [22]:
# As-of join with the default backward direction: each left row takes the
# most recent right row whose `time` is at or before its own.
merged_df = pd.merge_asof(left=df1, right=df2, on="time", direction="backward")
print(merged_df)

                 time  left_value  right_value
0 2022-12-09 00:00:00        0.86         0.39
1 2022-12-09 00:00:02        0.44         0.39
2 2022-12-09 00:00:04        0.07         0.35
3 2022-12-09 00:00:06        0.90         0.36
4 2022-12-09 00:00:08        0.93         0.36
5 2022-12-09 00:00:10        0.91         0.54
6 2022-12-09 00:00:12        0.26         0.44


In [24]:
# As-of join picking the right row closest in time, whether earlier or later.
merged_df = pd.merge_asof(left=df1, right=df2, on="time", direction="nearest")
print(merged_df)

                 time  left_value  right_value
0 2022-12-09 00:00:00        0.86         0.39
1 2022-12-09 00:00:02        0.44         0.35
2 2022-12-09 00:00:04        0.07         0.35
3 2022-12-09 00:00:06        0.90         0.36
4 2022-12-09 00:00:08        0.93         0.54
5 2022-12-09 00:00:10        0.91         0.54
6 2022-12-09 00:00:12        0.26         0.44


In [25]:
# Forward as-of join, but only accept a right row that lies within
# one second of the left timestamp.
max_gap = pd.Timedelta("1s")
merged_df = pd.merge_asof(
    left=df1,
    right=df2,
    on="time",
    tolerance=max_gap,
    direction="forward",
)

In [26]:
# Backward as-of join that refuses rows with an identical timestamp:
# only strictly earlier right rows qualify.
merged_df = pd.merge_asof(
    left=df1,
    right=df2,
    on="time",
    allow_exact_matches=False,
)

In [27]:
# Tag every row of both time series with a group label (modifies the frames
# in place): the first 4 rows of `df1` and the first 3 rows of `df2` are
# "AA", the remaining 3 rows of each are "BB".
df1["group"] = ["AA", "AA", "AA", "AA", "BB", "BB", "BB"]
df2["group"] = ["AA", "AA", "AA", "BB", "BB", "BB"]

In [30]:
# As-of join restricted to matching groups: a left row is only paired with
# right rows carrying the same `group`, so a "BB" row that precedes every
# "BB" row of `df2` gets NaN.
merged_df = pd.merge_asof(left=df1, right=df2, on="time", by="group")
print(merged_df)

                 time  left_value group  right_value
0 2022-12-09 00:00:00        0.86    AA         0.39
1 2022-12-09 00:00:02        0.44    AA         0.39
2 2022-12-09 00:00:04        0.07    AA         0.35
3 2022-12-09 00:00:06        0.90    AA         0.36
4 2022-12-09 00:00:08        0.93    BB          NaN
5 2022-12-09 00:00:10        0.91    BB         0.54
6 2022-12-09 00:00:12        0.26    BB         0.44


In [29]:
# Ordered merge with no explicit key: the result contains the rows of both
# frames sorted by time, with NaN where one side has no value.
merged_df = pd.merge_ordered(left=df1, right=df2)
print(merged_df)

                 time  left_value group  right_value
0 2022-12-09 00:00:00        0.86    AA         0.39
1 2022-12-09 00:00:02        0.44    AA          NaN
2 2022-12-09 00:00:03         NaN    AA         0.35
3 2022-12-09 00:00:04        0.07    AA          NaN
4 2022-12-09 00:00:06        0.90    AA         0.36
5 2022-12-09 00:00:08        0.93    BB          NaN
6 2022-12-09 00:00:09         NaN    BB         0.54
7 2022-12-09 00:00:10        0.91    BB          NaN
8 2022-12-09 00:00:12        0.26    BB         0.44
9 2022-12-09 00:00:15         NaN    BB         0.83


In [31]:
# Ordered merge where the gaps are forward-filled with the last seen value.
merged_df = pd.merge_ordered(left=df1, right=df2, fill_method="ffill")
print(merged_df)

                 time  left_value group  right_value
0 2022-12-09 00:00:00        0.86    AA         0.39
1 2022-12-09 00:00:02        0.44    AA         0.39
2 2022-12-09 00:00:03        0.44    AA         0.35
3 2022-12-09 00:00:04        0.07    AA         0.35
4 2022-12-09 00:00:06        0.90    AA         0.36
5 2022-12-09 00:00:08        0.93    BB         0.36
6 2022-12-09 00:00:09        0.93    BB         0.54
7 2022-12-09 00:00:10        0.91    BB         0.54
8 2022-12-09 00:00:12        0.26    BB         0.44
9 2022-12-09 00:00:15        0.26    BB         0.83


In [33]:
# Forward-filled ordered merge performed per `group` of the left frame, so
# a fill never carries a value across the boundary between groups.
merged_df = pd.merge_ordered(
    left=df1,
    right=df2,
    left_by="group",
    fill_method="ffill",
)
print(merged_df)

                 time  left_value group  right_value
0 2022-12-09 00:00:00        0.86    AA         0.39
1 2022-12-09 00:00:02        0.44    AA         0.39
2 2022-12-09 00:00:03        0.44    AA         0.35
3 2022-12-09 00:00:04        0.07    AA         0.35
4 2022-12-09 00:00:06        0.90    AA         0.36
5 2022-12-09 00:00:08        0.93    BB          NaN
6 2022-12-09 00:00:09        0.93    BB         0.54
7 2022-12-09 00:00:10        0.91    BB         0.54
8 2022-12-09 00:00:12        0.26    BB         0.44
9 2022-12-09 00:00:15        0.26    BB         0.83
