# Combining DataFrames

In [1]:
import polars as pl

## Concatenating

In [2]:
# Trade records for the year 2000, declared column by column.
df_2000 = pl.DataFrame(
    {
        "year": [2000, 2000],
        "exporter": ["India", "India"],
        "importer": ["USA", "USA"],
        "quantity": [0, 1],
    }
)
df_2000

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1


In [3]:
# Trade records for the year 2001, with the same columns as df_2000.
df_2001 = pl.DataFrame(
    {
        "year": [2001, 2001],
        "exporter": ["India", "India"],
        "importer": ["USA", "USA"],
        "quantity": [2, 3],
    }
)
df_2001

year,exporter,importer,quantity
i64,str,str,i64
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [4]:
# Stack the 2001 rows underneath the 2000 rows; the result is a new frame
# (df_2000 still has two rows when it is reused in the next cell).
df_2000.vstack(df_2001)

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [5]:
# Same vertical stack, followed by rechunk() on the stacked result.
# The displayed rows are identical to the plain vstack above.
df_2000.vstack(df_2001).rechunk()

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [6]:
# extend() appends df_2001's rows to df_2000 itself: displaying df_2000
# afterwards shows all four rows.
df_2000.extend(df_2001)

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [7]:
# Display df_2000 again to inspect it after the extend() call above.
df_2000

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [8]:
# Rebuild df_2000 with only the year-2000 rows, since the earlier extend()
# call added the 2001 rows to it.
df_2000 = pl.DataFrame(
    {
        "year": [2000, 2000],
        "exporter": ["India", "India"],
        "importer": ["USA", "USA"],
        "quantity": [0, 1],
    }
)
df_2000

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1


In [9]:
# Vertical concatenation through the top-level pl.concat function.
pl.concat([df_2000, df_2001], how="vertical")

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [10]:
# Concatenate without passing `how`, explicitly disabling rechunking,
# and keep the result for display.
df_vertical = pl.concat([df_2000, df_2001], rechunk=False)
df_vertical

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [11]:
# Variant of df_2001 whose quantity column is Float64 instead of Int64,
# used below to show concatenation with mismatched dtypes.
df_2001_float = df_2001.with_columns(pl.col("quantity").cast(pl.Float64))
df_2001_float

year,exporter,importer,quantity
i64,str,str,f64
2001,"""India""","""USA""",2.0
2001,"""India""","""USA""",3.0


In [12]:
# Cast the float quantity back to Int64 first so both frames have the same
# column dtypes when they are concatenated.
pl.concat(
    [df_2000, df_2001_float.with_columns(pl.col("quantity").cast(pl.Int64))]
)

year,exporter,importer,quantity
i64,str,str,i64
2000,"""India""","""USA""",0
2000,"""India""","""USA""",1
2001,"""India""","""USA""",2
2001,"""India""","""USA""",3


In [13]:
# "vertical_relaxed" accepts the i64 / f64 mismatch in `quantity`;
# the combined column comes out as f64.
pl.concat([df_2000, df_2001_float], how="vertical_relaxed")

year,exporter,importer,quantity
i64,str,str,f64
2000,"""India""","""USA""",0.0
2000,"""India""","""USA""",1.0
2001,"""India""","""USA""",2.0
2001,"""India""","""USA""",3.0


In [14]:
# Extra per-row details (item and value), one row per row of df_2000.
df_2000_details = pl.DataFrame(
    {
        "item": ["Clothes", "Machinery"],
        "value": [10, 100],
    }
)
df_2000_details

item,value
str,i64
"""Clothes""",10
"""Machinery""",100


In [15]:
# Append the columns of df_2000_details to the right of df_2000.
df_2000.hstack(df_2000_details)

year,exporter,importer,quantity,item,value
i64,str,str,i64,str,i64
2000,"""India""","""USA""",0,"""Clothes""",10
2000,"""India""","""USA""",1,"""Machinery""",100


In [16]:
# hstack also accepts a list of Series; pull the two columns out by name.
df_2000.hstack([df_2000_details[name] for name in ("item", "value")])

year,exporter,importer,quantity,item,value
i64,str,str,i64,str,i64
2000,"""India""","""USA""",0,"""Clothes""",10
2000,"""India""","""USA""",1,"""Machinery""",100


In [17]:
# Horizontal concatenation through pl.concat: columns are placed side by side.
pl.concat([df_2000, df_2000_details], how="horizontal")

year,exporter,importer,quantity,item,value
i64,str,str,i64,str,i64
2000,"""India""","""USA""",0,"""Clothes""",10
2000,"""India""","""USA""",1,"""Machinery""",100


In [18]:
# Redefine both yearly frames for the diagonal example: df_2001 now carries
# two extra columns (item, value) that df_2000 does not have.
df_2000 = pl.DataFrame(
    {
        "year": [2000, 2000],
        "exporter": ["China", "China"],
        "importer": ["USA", "USA"],
        "quantity": [0, 1],
    }
)
df_2001 = pl.DataFrame(
    {
        "year": [2001, 2001],
        "exporter": ["China", "China"],
        "importer": ["USA", "USA"],
        "quantity": [2, 3],
        "item": ["Clothes", "Machinery"],
        "value": [10, 100],
    }
)

In [19]:
# Diagonal concatenation: the result has the union of the columns, and the
# rows from df_2000 get nulls for item / value.
pl.concat([df_2000, df_2001], how="diagonal")

year,exporter,importer,quantity,item,value
i64,str,str,i64,str,i64
2000,"""China""","""USA""",0,,
2000,"""China""","""USA""",1,,
2001,"""China""","""USA""",2,"""Clothes""",10.0
2001,"""China""","""USA""",3,"""Machinery""",100.0


## Join

In [20]:
# Left-hand frame for the join examples; note the null key in the last row.
df_left = pl.DataFrame({"id": ["A", "B", "C", None], "val": [0, 1, 2, 3]})
df_left

id,val
str,i64
"""A""",0
"""B""",1
"""C""",2
,3


In [21]:
# Right-hand frame for the join examples: it has no "B", adds "D",
# and also contains a null key.
df_right = pl.DataFrame({"id": ["A", "C", None, "D"], "val": [10, 11, 12, 13]})
df_right

id,val
str,i64
"""A""",10
"""C""",11
,12
"""D""",13


In [22]:
# Left join on id: every left row is kept. The null key on the left does not
# match the null key on the right, so its val_right stays null.
df_left.join(df_right, on="id", how="left")

id,val,val_right
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,


In [24]:
# Same left join, but with nulls_equal=True the null keys match each other
# and the last row picks up val_right = 12.
df_left.join(df_right, on="id", how="left", nulls_equal=True)

id,val,val_right
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,12.0


In [25]:
# `suffix` controls how the clashing right-hand column is renamed:
# val from df_right appears as val_r instead of val_right.
df_left.join(df_right, on="id", how="left", suffix="_r")

id,val,val_r
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,


In [26]:
# No `how` given: only ids present in both frames ("A" and "C") are returned.
df_left.join(df_right, on="id")

id,val,val_right
str,i64,i64
"""A""",0,10
"""C""",2,11


In [27]:
# With nulls_equal=True the null keys also count as a match,
# adding a third row to the result.
df_left.join(df_right, on="id", nulls_equal=True)

id,val,val_right
str,i64,i64
"""A""",0,10
"""C""",2,11
,3,12


In [28]:
# Cross join: no `on` key is given, and left rows are paired with the
# right rows regardless of id.
df_left.join(df_right, how="cross")

id,val,id_right,val_right
str,i64,str,i64
"""A""",0,"""A""",10
"""A""",0,"""C""",11
"""A""",0,,12
"""A""",0,"""D""",13
"""B""",1,"""A""",10
…,…,…,…
"""C""",2,"""D""",13
,3,"""A""",10
,3,"""C""",11
,3,,12


In [30]:
# Full join: unmatched rows from both sides are kept, and the right key is
# reported separately as id_right. Null keys are not matched to each other.
df_left.join(df_right, on="id", how="full")

id,val,id_right,val_right
str,i64,str,i64
"""A""",0.0,"""A""",10.0
"""C""",2.0,"""C""",11.0
,,,12.0
,,"""D""",13.0
,3.0,,
"""B""",1.0,,


In [31]:
# Full join where null keys match each other, so the two null-key rows
# collapse into a single row.
df_left.join(df_right, on="id", how="full", nulls_equal=True)

id,val,id_right,val_right
str,i64,str,i64
"""A""",0.0,"""A""",10.0
"""C""",2.0,"""C""",11.0
,3.0,,12.0
,,"""D""",13.0
"""B""",1.0,,


In [33]:
# Full join with coalesce=True: the two key columns are merged into a single
# id column (no id_right in the output).
df_left.join(df_right, on="id", how="full", coalesce=True)

id,val,val_right
str,i64,i64
"""A""",0.0,10.0
"""C""",2.0,11.0
,,12.0
"""D""",,13.0
,3.0,
"""B""",1.0,


In [34]:
# Left frame for the join-validation example; every id appears once.
df_left_valid = pl.DataFrame({"id": ["A", "B", "C", None], "val": [0, 1, 2, 3]})
df_left_valid

id,val
str,i64
"""A""",0
"""B""",1
"""C""",2
,3


In [35]:
# Right frame for the join-validation example; every id appears once.
df_right_valid = pl.DataFrame(
    {"id": ["A", "C", None, "D"], "val": [10, 11, 12, 13]}
)
df_right_valid

id,val
str,i64
"""A""",10
"""C""",11
,12
"""D""",13


In [37]:
# Left join that additionally requests a one-to-one check of the join keys
# via validate="1:1"; with these frames the join completes normally.
df_left_valid.join(df_right_valid, on="id", how="left", validate="1:1")

id,val,val_right
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,


In [38]:
# Left frame in which the key "A" is repeated (the "many" side).
df_left_m = pl.DataFrame({"id": ["A", "A"], "val": [0, 1]})
df_left_m

id,val
str,i64
"""A""",0
"""A""",1


In [39]:
# Right frame in which each key appears exactly once (the "one" side).
df_right_m = pl.DataFrame({"id": ["A", "B"], "val": [10, 11]})
df_right_m

id,val
str,i64
"""A""",10
"""B""",11


In [40]:
# Many-to-one validated left join: the repeated left key "A" is allowed,
# and both left rows receive the single matching right value.
df_left_m.join(df_right_m, on="id", how="left", validate="m:1")

id,val,val_right
str,i64,i64
"""A""",0,10
"""A""",1,10


In [41]:
# Left frame keyed by (id, year), with upper-case ids.
df_left_multiple = pl.DataFrame(
    {"id": ["A", "B", "A", "B"], "year": [2020, 2020, 2021, 2021], "val": [0, 1, 2, 3]}
)
df_left_multiple

id,year,val
str,i64,i64
"""A""",2020,0
"""B""",2020,1
"""A""",2021,2
"""B""",2021,3


In [42]:
# Right frame keyed by (id, year), with lower-case ids.
df_right_multiple = pl.DataFrame(
    {"id": ["a", "b", "a", "b"], "year": [2020, 2020, 2021, 2021], "val": [10, 11, 12, 13]}
)
df_right_multiple

id,year,val
str,i64,i64
"""a""",2020,10
"""b""",2020,11
"""a""",2021,12
"""b""",2021,13


In [43]:
# Join on two keys, one of them an expression: the id key is upper-cased so
# that "A"/"B" on the left line up with "a"/"b" on the right.
df_left_multiple.join(
    df_right_multiple,
    on=[pl.col("id").str.to_uppercase(), "year"],
    how="inner",
)

id,year,val,id_right,year_right,val_right
str,i64,i64,str,i64,i64
"""A""",2020,0,"""a""",2020,10
"""B""",2020,1,"""b""",2020,11
"""A""",2021,2,"""a""",2021,12
"""B""",2021,3,"""b""",2021,13


In [44]:
# The same inner join expressed through the lazy API, keeping only id and
# val_right; collect() runs the query and returns the resulting frame.
(
    df_left.lazy()
    .join(df_right.lazy(), on="id", how="inner")
    .select("id", "val_right")
    .collect()
)

id,val_right
str,i64
"""A""",10
"""C""",11


## Join on strings

In [45]:
# Left frame with string ids (note "id3" appears twice).
df_left = pl.DataFrame(
    {"id": ["id3", "id3", "id1", "id2"], "values": [3, 3, 1, 2]}
)
df_left

id,values
str,i64
"""id3""",3
"""id3""",3
"""id1""",1
"""id2""",2


In [46]:
# Right frame holding one metadata value per string id.
df_right = pl.DataFrame({"id": ["id1", "id2", "id3"], "metadata": [1, 2, 3]})
df_right

id,metadata
str,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [47]:
# Baseline: join directly on the plain string id column.
df_left.join(df_right, on="id")

id,values,metadata
str,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


In [48]:
# Copy of df_left with the id column cast to Categorical.
df_left_cat = df_left.with_columns(pl.col("id").cast(pl.Categorical))
df_left_cat

id,values
cat,i64
"""id3""",3
"""id3""",3
"""id1""",1
"""id2""",2


In [49]:
# Copy of df_right with the id column cast to Categorical
# (cast independently of df_left_cat).
df_right_cat = df_right.with_columns(pl.col("id").cast(pl.Categorical))
df_right_cat

id,metadata
cat,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [50]:
# Join on the categorical id columns that were cast independently.
# NOTE(review): the stray `.join(` in the recorded output looks like the tail
# of a warning emitted by this call — confirm by re-running.
df_left_cat.join(df_right_cat, on="id")

  .join(


id,values,metadata
cat,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


In [51]:
# Perform both Categorical casts inside one pl.StringCache() context, then
# join the resulting frames on id.
with pl.StringCache():
    df_left_cat = df_left.with_columns(pl.col("id").cast(pl.Categorical))
    df_right_cat = df_right.with_columns(pl.col("id").cast(pl.Categorical))

df_left_cat.join(df_right_cat, on="id")

id,values,metadata
cat,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


In [53]:
# Enum dtype listing the three id values; both frames are cast to it below.
enum_dtype = pl.Enum(["id1", "id2", "id3"])

In [54]:
# Copy of df_left with the id column cast to the shared Enum dtype.
df_left_enum = df_left.with_columns(pl.col("id").cast(enum_dtype))
df_left_enum

id,values
enum,i64
"""id3""",3
"""id3""",3
"""id1""",1
"""id2""",2


In [55]:
# Copy of df_right with the id column cast to the shared Enum dtype.
df_right_enum = df_right.with_columns(pl.col("id").cast(enum_dtype))
df_right_enum

id,metadata
enum,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [56]:
# Join on the Enum-typed id columns; both sides use the same enum_dtype.
df_left_enum.join(df_right_enum, on="id")

id,values,metadata
enum,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


## Filter DataFrame by another DataFrame

In [57]:
# Frame to be filtered; "id4" has no counterpart in df_right below.
df_left = pl.DataFrame(
    {"id": ["id1", "id2", "id3", "id4"], "values": [1, 2, 3, 4]}
)
df_left

id,values
str,i64
"""id1""",1
"""id2""",2
"""id3""",3
"""id4""",4


In [59]:
# Frame whose ids decide which rows of df_left are kept or dropped.
df_right = pl.DataFrame({"id": ["id1", "id2", "id3"], "metadata": [1, 2, 3]})
df_right

id,metadata
str,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [60]:
# Semi join: keep the df_left rows whose id exists in df_right.
# Only df_left's own columns appear in the result.
df_left.join(df_right, on="id", how="semi")

id,values
str,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [61]:
# Anti join: keep the df_left rows whose id does NOT exist in df_right
# (here only "id4").
df_left.join(df_right, on="id", how="anti")

id,values
str,i64
"""id4""",4
