https://duckdb.org/docs/guides/python/polars.html

In [2]:
!pip install duckdb

Collecting duckdb
  Downloading duckdb-0.6.1-cp39-cp39-win_amd64.whl (9.0 MB)
     ---------------------------------------- 9.0/9.0 MB 3.8 MB/s eta 0:00:00
Installing collected packages: duckdb
Successfully installed duckdb-0.6.1


In [3]:
import duckdb
import polars as pl
import pandas as pd

# Five-row demo frame: "A" counts up 1..5, "B" counts down 5..1, and the two
# string columns ("fruits", "cars") hold repeated labels.
df = pd.DataFrame(
    data={
        "A": list(range(1, 6)),
        "fruits": ["banana"] * 2 + ["apple"] * 2 + ["banana"],
        "B": list(range(5, 0, -1)),
        "cars": ["beetle", "audi"] + ["beetle"] * 3,
    }
)

# Build a Polars DataFrame from the pandas frame `df`.
polars_df = pl.DataFrame(df)

In [4]:
# Show the pandas frame (rendered together with its integer row index).
df

Unnamed: 0,A,fruits,B,cars
0,1,banana,5,beetle
1,2,banana,4,audi
2,3,apple,3,beetle
3,4,apple,2,beetle
4,5,banana,1,beetle


In [5]:
# Show the Polars frame; its header lists each column's dtype under the name.
polars_df

A,fruits,B,cars
i64,str,i64,str
1,"""banana""",5,"""beetle"""
2,"""banana""",4,"""audi"""
3,"""apple""",3,"""beetle"""
4,"""apple""",2,"""beetle"""
5,"""banana""",1,"""beetle"""


In [16]:
# Sort the rows by "fruits", then select the four original columns plus a set
# of derived columns that contrast whole-column expressions with windowed
# ones (`.over(<column>)` evaluates the expression per group of that column).
polars_df2 = (
    polars_df
    .sort("fruits")
    .select(
        [
            "A", "B",
            "fruits",
            "cars",
            # constant string column: the literal text "fruits" on every row
            pl.lit("fruits").alias("literal_string_fruits"),
            # sum of B restricted to rows where cars == "beetle" (one value for the whole frame)
            pl.col("B").filter(pl.col("cars") == "beetle").sum().alias("sum_B_by_beetle"),
            pl.col("A").filter(pl.col("B") > 2).sum().over("cars").alias("sum_A_by_cars"),     # groups by "cars"; only rows with B > 2 are summed
            pl.col("A").sum().over("fruits").alias("sum_A_by_fruits"),                         # groups by "fruits"
            # A in reverse row order across the whole (sorted) frame
            pl.col("A").reverse().alias("rev_A"),
            pl.col("A").reverse().over("fruits").alias("rev_A_by_fruits"),                     # groups by "fruits"; reversed within each group
            # A reordered by ascending B across the whole frame
            pl.col("A").sort_by("B").alias("sort_A_by_B"),
            pl.col("A").sort_by("B").over("fruits").alias("sort_A_by_B_by_fruits"),            # groups by "fruits"; reordered within each group
        ]
    )
)

In [17]:
# Show the frame with the derived columns.
polars_df2

A,B,fruits,cars,literal_string_fruits,sum_B_by_beetle,sum_A_by_cars,sum_A_by_fruits,rev_A,rev_A_by_fruits,sort_A_by_B,sort_A_by_B_by_fruits
i64,i64,str,str,str,i64,i64,i64,i64,i64,i64,i64
3,3,"""apple""","""beetle""","""fruits""",11,4,7,5,4,5,4
4,2,"""apple""","""beetle""","""fruits""",11,4,7,2,3,4,3
1,5,"""banana""","""beetle""","""fruits""",11,4,8,1,5,3,5
2,4,"""banana""","""audi""","""fruits""",11,2,8,4,2,2,2
5,1,"""banana""","""beetle""","""fruits""",11,4,8,3,1,1,1


In [18]:
# Convert the Polars frame with `to_arrow()`; the DuckDB SQL query refers to
# this variable by name in its FROM clause.
polars_to_arrow = polars_df2.to_arrow()

In [19]:
# Query the Python variable `polars_to_arrow` directly from SQL: one row per
# fruit (GROUP BY ALL groups on the non-aggregated column), taking the first
# sum_A_by_fruits value in each group, ordered by the selected columns.
# `.arrow()` fetches the result as a pyarrow Table.
output = duckdb.query("""
  SELECT 
    fruits,
    first(sum_A_by_fruits) as sum_A
  FROM polars_to_arrow
  GROUP BY ALL
  ORDER BY ALL
""").arrow()

In [20]:
# Show the aggregated query result.
output

pyarrow.Table
fruits: string
sum_A: int64
----
fruits: [["apple","banana"]]
sum_A: [[7,8]]