In [1]:
import numpy as np
import functools
import sys
print(sys.version)
from timeit import Timer

import pyarrow as pa
import pandas as pd
import polars as pl
import duckdb
from datafusion import SessionContext

# Module-level engine handles shared by the SQL benchmarks defined later in
# this notebook: `ctx` is queried by analyze_datafusion_dataframe and `con`
# by analyze_duckdb_dataframe. Both are created with default arguments.
ctx = SessionContext()
con = duckdb.connect()


def generate_data(number_of_rows, seed=None):
    """Generate a synthetic sales data set as a mapping of column name to values.

    Args:
        number_of_rows: Number of rows to generate.
        seed: Optional seed forwarded to ``np.random.default_rng``. Pass a
            fixed value to make the generated data reproducible between runs.
            The default ``None`` keeps the original behavior of drawing fresh,
            unpredictable data on every call.

    Returns:
        A dict with the keys ``order_id``, ``region``, ``sales_person``,
        ``product`` and ``sales_income``. ``order_id`` is
        ``range(1, number_of_rows + 1)``; the remaining columns are NumPy
        arrays of length ``number_of_rows``.
    """
    rng = np.random.default_rng(seed)

    # The columns are drawn in a fixed order (region, sales_person, product,
    # sales_income) so that a given seed always maps to the same data set.
    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # `integers` excludes the upper bound, so values fall in 1..5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }

3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]


In [2]:
def create_pandas_dataframe(test_data):
    """Build a pandas DataFrame from ``test_data`` and convert its dtypes
    using the ``pyarrow`` dtype backend."""
    plain_frame = pd.DataFrame(test_data)
    return plain_frame.convert_dtypes(dtype_backend="pyarrow")

def create_pyarrow_dataframe(test_data):
    """Build a PyArrow table from the column mapping ``test_data``."""
    arrow_table = pa.table(test_data)
    return arrow_table

def create_polars_dataframe(test_data):
    """Build a Polars frame from ``test_data`` by way of a PyArrow table."""
    arrow_table = pa.table(test_data)
    return pl.from_arrow(arrow_table)

def create_polars_lazyframe(test_data):
    """Build a Polars frame from ``test_data`` by way of a PyArrow table and
    return its lazy counterpart."""
    arrow_table = pa.table(test_data)
    eager_frame = pl.from_arrow(arrow_table)
    return eager_frame.lazy()

In [3]:
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` for every (region, product, sales_person) group.

    Returns the result of the pandas group-by sum on the ``sales_income``
    column.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(group_keys)
    return grouped["sales_income"].sum()

def analyze_pyarrow_dataframe(pyarrow_df):
    """Group ``pyarrow_df`` by region, product and sales_person and apply the
    ``sum`` aggregation to ``sales_income``."""
    group_keys = ["region", "product", "sales_person"]
    aggregations = [("sales_income", "sum")]
    return pyarrow_df.group_by(group_keys).aggregate(aggregations)

def analyze_datafusion_dataframe():
    """Run the grouped ``SUM(sales_income)`` query against table ``t`` through
    the module-level DataFusion context ``ctx`` and return an Arrow table.

    Table ``t`` must already be registered with ``ctx`` before this is called.
    """
    query = """
        SELECT region, product, sales_person, SUM(sales_income) AS total_sales
        FROM t
        GROUP BY region, product, sales_person
        """
    result_frame = ctx.sql(query)
    return result_frame.to_arrow_table()

def analyze_duckdb_dataframe():
    """Run the grouped ``SUM(sales_income)`` query against ``t`` through the
    module-level DuckDB connection ``con`` and fetch the result as an Arrow
    table.

    ``t`` must already be registered with ``con`` before this is called.
    """
    query = """
        SELECT region, product, sales_person, SUM(sales_income) AS total_sales
        FROM t
        GROUP BY region, product, sales_person
        """
    executed = con.execute(query)
    return executed.fetch_arrow_table()

def analyze_polars_dataframe(polars_df):
    """Group ``polars_df`` by region, product and sales_person and sum
    ``sales_income`` into a ``total_sales`` column."""
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    return polars_df.group_by(group_keys).agg(total_sales=income_total)

def analyze_polars_lazyframe(polars_lf):
    """Group ``polars_lf`` by region, product and sales_person, sum
    ``sales_income`` into ``total_sales`` and collect the query with the
    ``streaming`` engine."""
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(group_keys).agg(total_sales=income_total)
    return lazy_query.collect(engine="streaming")

In [4]:
print("Creating DataFrames...")

# Row count of the synthetic data set; also used in the analysis cell's labels.
n = 5_000_000
test_data = generate_data(n)

# (label, constructor) pairs, timed one after another in this order.
creation_benchmarks = [
    ("Pandas dataframe", create_pandas_dataframe),
    ("Pyarrow dataframe", create_pyarrow_dataframe),
    ("Polars dataframe", create_polars_dataframe),
    ("Polars lazyframe", create_polars_lazyframe),
]

for position, (label, constructor) in enumerate(creation_benchmarks):
    # A blank line separates every report from the previous one; the first
    # report follows the banner directly.
    separator = "\n" if position else ""
    print(f"{separator}{label} creation time for {n:,} rows:")
    # Each constructor is executed exactly once, so the figure is a single
    # wall-clock sample rather than an average.
    print(Timer(functools.partial(constructor, test_data)).timeit(1))

Creating DataFrames...
Pandas dataframe creation time for 5,000,000 rows:
2.90731290000258

Pyarrow dataframe creation time for 5,000,000 rows:
2.6035517000127584

Polars dataframe creation time for 5,000,000 rows:
2.7015297999896575

Polars lazyframe creation time for 5,000,000 rows:
2.7471058999944944


In [5]:
print("-" * 50)

# Build every input up front so that only the aggregation itself is timed.
pandas_df = create_pandas_dataframe(test_data)
pyarrow_df = create_pyarrow_dataframe(test_data)
# Drop any "t" left over from a previous run of this cell before registering
# the Arrow table with DataFusion again.
if ctx.table_exist("t"):
    ctx.deregister_table("t")
ctx.from_arrow(pyarrow_df, "t")
con.register("t", pyarrow_df)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

# (label, zero-argument callable) pairs, timed one after another in this
# order. The two SQL benchmarks take no arguments: they query table "t"
# through the module-level `ctx` / `con` handles.
analysis_benchmarks = [
    ("Pandas dataframe", functools.partial(analyze_pandas_dataframe, pandas_df)),
    ("Pyarrow dataframe", functools.partial(analyze_pyarrow_dataframe, pyarrow_df)),
    ("Datafusion dataframe", analyze_datafusion_dataframe),
    ("Duckdb dataframe", analyze_duckdb_dataframe),
    ("Polars dataframe", functools.partial(analyze_polars_dataframe, polars_df)),
    ("Polars lazyframe", functools.partial(analyze_polars_lazyframe, polars_lf)),
]

for position, (label, task) in enumerate(analysis_benchmarks):
    # A blank line separates every report from the previous one; the first
    # report follows the rule directly.
    separator = "\n" if position else ""
    print(f"{separator}{label} analysis time for {n:,} rows:")
    # Each task is executed exactly once, so the figure is a single
    # wall-clock sample rather than an average.
    print(Timer(task).timeit(1))

--------------------------------------------------
Pandas dataframe analysis time for 5,000,000 rows:
0.630527000001166

Pyarrow dataframe analysis time for 5,000,000 rows:
0.09193279998726211

Datafusion dataframe analysis time for 5,000,000 rows:
0.033187299995915964

Duckdb dataframe analysis time for 5,000,000 rows:
0.056433400022797287

Polars dataframe analysis time for 5,000,000 rows:
0.1142857999948319

Polars lazyframe analysis time for 5,000,000 rows:
0.06341190001694486
