In [1]:
import numpy as np
import functools
import sys
print(sys.version)
from timeit import Timer

import pyarrow as pa
import pandas as pd
import polars as pl
import duckdb
from datafusion import SessionContext

# Shared DataFusion session; the analysis cell registers the Arrow table "t" on
# it and analyze_datafusion_dataframe() queries that table.
ctx = SessionContext()
# Shared DuckDB connection opened with default arguments; the analysis cell
# registers the same Arrow table on it as "t".
con = duckdb.connect()


def generate_data(number_of_rows, seed=None):
    """Build a column-oriented dict of synthetic sales records.

    Args:
        number_of_rows: Number of rows to generate.
        seed: Optional seed forwarded to ``np.random.default_rng``. Passing
            the same seed yields the same columns, which makes benchmark
            runs comparable. The default ``None`` keeps the previous
            behaviour of drawing fresh entropy on every call.

    Returns:
        A dict with the keys ``order_id`` (a ``range`` starting at 1),
        ``region``, ``sales_person``, ``product`` (NumPy arrays of strings)
        and ``sales_income`` (NumPy array of integers in ``[1, 5000]``).
    """
    rng = np.random.default_rng(seed)

    # The draw order below is part of the contract for a given seed:
    # reordering these calls would change the generated columns.
    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # Upper bound of integers() is exclusive, so incomes are 1..5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }

3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:09:02) [GCC 11.2.0]


In [2]:
def create_pandas_dataframe(test_data):
    """Wrap the column dict ``test_data`` in a default-dtype pandas DataFrame."""
    frame = pd.DataFrame(test_data)
    return frame

def create_pandas_dataframe_with_pyarrow(test_data):
    """Build a pandas DataFrame, then convert its columns to the pyarrow dtype backend."""
    numpy_backed = pd.DataFrame(test_data)
    arrow_backed = numpy_backed.convert_dtypes(dtype_backend="pyarrow")
    return arrow_backed

def create_pyarrow_dataframe(test_data):
    """Build a pyarrow table from the column dict ``test_data`` via ``pa.table``."""
    arrow_table = pa.table(test_data)
    return arrow_table

def create_polars_dataframe(test_data):
    """Build a Polars DataFrame by first creating a pyarrow table from ``test_data``.

    Any timing of this function therefore includes the ``pa.table`` call.
    """
    arrow_table = pa.table(test_data)
    return pl.from_arrow(arrow_table)

def create_polars_lazyframe(test_data):
    """Build a Polars frame from a pyarrow table of ``test_data`` and return its ``.lazy()`` view."""
    arrow_table = pa.table(test_data)
    eager_frame = pl.from_arrow(arrow_table)
    return eager_frame.lazy()

In [3]:
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    Returns the Series produced by ``groupby(...)["sales_income"].sum()``.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(group_keys)
    return grouped["sales_income"].sum()

def analyze_pyarrow_dataframe(pyarrow_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    Calls ``group_by(...).aggregate(...)`` on ``pyarrow_df`` and returns the result.
    """
    group_keys = ["region", "product", "sales_person"]
    aggregations = [("sales_income", "sum")]
    grouped = pyarrow_df.group_by(group_keys)
    return grouped.aggregate(aggregations)

def analyze_datafusion_dataframe():
    """Run the grouped-sum SQL query against table ``t`` on the module-level ``ctx``.

    Table ``t`` must already be registered on ``ctx``. The result is
    returned through ``to_arrow_table()``.
    """
    query = """
        SELECT region, product, sales_person, SUM(sales_income) AS total_sales
        FROM t
        GROUP BY region, product, sales_person
        """
    result_frame = ctx.sql(query)
    return result_frame.to_arrow_table()

def analyze_duckdb_dataframe():
    """Run the grouped-sum SQL query against ``t`` on the module-level ``con``.

    ``t`` must already be registered on ``con``. The result is returned
    through ``fetch_arrow_table()``.
    """
    query = """
        SELECT region, product, sales_person, SUM(sales_income) AS total_sales
        FROM t
        GROUP BY region, product, sales_person
        """
    executed = con.execute(query)
    return executed.fetch_arrow_table()

def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per (region, product, sales_person) group as ``total_sales``."""
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    return polars_df.group_by(group_keys).agg(total_sales=income_total)

def analyze_polars_lazyframe(polars_lf):
    """Build the grouped-sum query on ``polars_lf`` and collect it.

    The query is collected with ``engine="streaming"``; the summed column
    is named ``total_sales``.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(group_keys).agg(total_sales=income_total)
    return lazy_query.collect(engine="streaming")


# Row count used for every benchmark below; it is also interpolated into the
# printed labels.
n = 10_000_000

# Generated once and shared by all creation benchmarks so each library is fed
# the same input columns.
test_data = generate_data(n)

In [4]:
print("Creating DataFrames...")

# (label, factory) pairs, timed in this order. Every label after the first
# starts with a newline so consecutive results are separated by a blank line.
creation_benchmarks = [
    (f"Pandas dataframe creation time for {n:,} rows:", create_pandas_dataframe),
    (f"\nPandas dataframe with pyarrow creation time for {n:,} rows:", create_pandas_dataframe_with_pyarrow),
    (f"\nPyarrow dataframe creation time for {n:,} rows:", create_pyarrow_dataframe),
    (f"\nPolars dataframe creation time for {n:,} rows:", create_polars_dataframe),
    (f"\nPolars lazyframe creation time for {n:,} rows:", create_polars_lazyframe),
]

for label, factory in creation_benchmarks:
    print(label)
    # Single timed run of factory(test_data); the result is in seconds.
    print(Timer(functools.partial(factory, test_data)).timeit(1))

Creating DataFrames...
Pandas dataframe creation time for 10,000,000 rows:
3.134043699999893

Pandas dataframe with pyarrow creation time for 10,000,000 rows:
5.336611126000207

Pyarrow dataframe creation time for 10,000,000 rows:
5.04147997400014

Polars dataframe creation time for 10,000,000 rows:
4.945008783000048

Polars lazyframe creation time for 10,000,000 rows:
4.76657176900062


# 1. Data analysis
--------------

In [5]:
print("-" * 50)

# Build every input before timing so only the aggregation itself is measured.
pandas_df = create_pandas_dataframe(test_data)
pandas_df_with_pyarrow = create_pandas_dataframe_with_pyarrow(test_data)
pyarrow_df = create_pyarrow_dataframe(test_data)
# Drop a previously registered "t" so re-running this cell does not collide.
if ctx.table_exist("t"):
    ctx.deregister_table("t")
ctx.from_arrow(pyarrow_df, "t")
con.register("t", pyarrow_df)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

# (label, zero-argument callable) pairs, timed in this order. Every label
# after the first starts with a newline to separate the results.
analysis_benchmarks = [
    (
        f"Pandas dataframe analysis time for {n:,} rows:",
        functools.partial(analyze_pandas_dataframe, pandas_df),
    ),
    (
        f"\nPandas dataframe with pyarrow analysis time for {n:,} rows:",
        functools.partial(analyze_pandas_dataframe, pandas_df_with_pyarrow),
    ),
    (
        f"\nPyarrow dataframe analysis time for {n:,} rows:",
        functools.partial(analyze_pyarrow_dataframe, pyarrow_df),
    ),
    (
        f"\nDatafusion dataframe analysis time for {n:,} rows:",
        functools.partial(analyze_datafusion_dataframe),
    ),
    (
        f"\nDuckdb dataframe analysis time for {n:,} rows:",
        functools.partial(analyze_duckdb_dataframe),
    ),
    (
        f"\nPolars dataframe analysis time for {n:,} rows:",
        functools.partial(analyze_polars_dataframe, polars_df),
    ),
    (
        f"\nPolars lazyframe analysis time for {n:,} rows:",
        functools.partial(analyze_polars_lazyframe, polars_lf),
    ),
]

for label, benchmark in analysis_benchmarks:
    print(label)
    # Single timed run; the result is in seconds.
    print(Timer(benchmark).timeit(1))

--------------------------------------------------
Pandas dataframe analysis time for 10,000,000 rows:
2.208175615000073

Pandas dataframe with pyarrow analysis time for 10,000,000 rows:
1.3885868320003283

Pyarrow dataframe analysis time for 10,000,000 rows:
0.07580342599976575

Datafusion dataframe analysis time for 10,000,000 rows:
0.06742536899946572

Duckdb dataframe analysis time for 10,000,000 rows:
0.09878986600051576

Polars dataframe analysis time for 10,000,000 rows:
0.2783320210000966

Polars lazyframe analysis time for 10,000,000 rows:
0.10997739200047363


# 2. Serialization/Deserialization
------------------

In [6]:
import pickle

# Rebuild the three frames from test_data so this section starts from freshly
# created objects. The names are rebound one at a time, in this order.
pandas_df = create_pandas_dataframe(test_data)
pandas_df_with_pyarrow = create_pandas_dataframe_with_pyarrow(test_data)
pyarrow_df = create_pyarrow_dataframe(test_data)

In [7]:
def serialize_pandas_df(pandas_df):
    """Pickle ``pandas_df`` with the highest available protocol and return the bytes."""
    payload = pickle.dumps(pandas_df, protocol=pickle.HIGHEST_PROTOCOL)
    return payload

def serialize_pyarrow_df(pyarrow_df):
    """Write ``pyarrow_df`` in the Arrow IPC file format and return the bytes.

    The table is written to an in-memory ``pa.BufferOutputStream`` and the
    resulting buffer is converted with ``to_pybytes()``.
    """
    stream = pa.BufferOutputStream()
    # The writer must be closed (end of the with-block) before reading the
    # buffer back from the stream.
    with pa.ipc.new_file(stream, pyarrow_df.schema) as ipc_writer:
        ipc_writer.write_table(pyarrow_df)

    return stream.getvalue().to_pybytes()

In [8]:
# (label, zero-argument callable) pairs, timed in this order. Every label
# after the first starts with a newline to separate the results.
serialize_benchmarks = [
    (
        f"Pandas dataframe serialize time for {n:,} rows:",
        functools.partial(serialize_pandas_df, pandas_df),
    ),
    (
        f"\nPandas dataframe with pyarrow serialize time for {n:,} rows:",
        functools.partial(serialize_pandas_df, pandas_df_with_pyarrow),
    ),
    (
        f"\nPyarrow dataframe serialize time for {n:,} rows:",
        functools.partial(serialize_pyarrow_df, pyarrow_df),
    ),
]

for label, benchmark in serialize_benchmarks:
    print(label)
    # Single timed run; the result is in seconds.
    print(Timer(benchmark).timeit(1))

Pandas dataframe serialize time for 10,000,000 rows:
8.297720185999424

Pandas dataframe with pyarrow serialize time for 10,000,000 rows:
0.5178799360000994

Pyarrow dataframe serialize time for 10,000,000 rows:
0.4243422239997017


In [9]:
# Serialize each frame once more outside the timer; the resulting bytes are
# reused by the deserialization benchmarks in the cells that follow.
pandas_df_bytes = serialize_pandas_df(pandas_df)
pandas_df_with_pyarrow_bytes = serialize_pandas_df(pandas_df_with_pyarrow)
pyarrow_df_bytes = serialize_pyarrow_df(pyarrow_df)

# Report payload sizes in bytes: the first two are pickle payloads, the third
# is the Arrow IPC file payload.
print(f"pandas bytes              : {len(pandas_df_bytes)}")
print(f"pandas with pyarrow bytes : {len(pandas_df_with_pyarrow_bytes)}")
print(f"pyarrow_df_bytes bytes    : {len(pyarrow_df_bytes)}")

pandas bytes              : 425929524
pandas with pyarrow bytes : 455833770
pyarrow_df_bytes bytes    : 455837770


In [10]:
def deserialize_pandas_df(pandas_df_bytes):
    """Unpickle and return the object stored in ``pandas_df_bytes``.

    NOTE: ``pickle.loads`` can execute arbitrary code, so only pass bytes
    produced by this notebook (e.g. by ``serialize_pandas_df``), never
    untrusted input.
    """
    restored = pickle.loads(pandas_df_bytes)
    return restored

def deserialize_pyarrow_df(pyarrow_df_bytes):
    """Open ``pyarrow_df_bytes`` as an Arrow IPC file and return ``read_all()``'s result."""
    with pa.ipc.open_file(pyarrow_df_bytes) as ipc_reader:
        table = ipc_reader.read_all()
    return table

In [11]:
# (label, zero-argument callable) pairs, timed in this order. Every label
# after the first starts with a newline to separate the results.
deserialize_benchmarks = [
    (
        f"Pandas dataframe deserialize time for {n:,} rows:",
        functools.partial(deserialize_pandas_df, pandas_df_bytes),
    ),
    (
        f"\nPandas dataframe with pyarrow deserialize time for {n:,} rows:",
        functools.partial(deserialize_pandas_df, pandas_df_with_pyarrow_bytes),
    ),
    (
        f"\nPyarrow dataframe deserialize time for {n:,} rows:",
        functools.partial(deserialize_pyarrow_df, pyarrow_df_bytes),
    ),
]

for label, benchmark in deserialize_benchmarks:
    print(label)
    # Single timed run; the result is in seconds.
    print(Timer(benchmark).timeit(1))

Pandas dataframe deserialize time for 10,000,000 rows:
4.088341994000075

Pandas dataframe with pyarrow deserialize time for 10,000,000 rows:
0.11177832100020169

Pyarrow dataframe deserialize time for 10,000,000 rows:
0.00032485600058862474
