# Pandas Vs Polars


## Aggregation

* count 
* max 
* min
* mean
* median
* product
* sum

In [7]:
import numpy as np
import pandas as pd
import polars as pl
import time

def generate_df(n_rows=10_000_000):
    df = pd.DataFrame({
        'id': np.arange(n_rows),
        'category': np.random.choice(['A', 'B', 'C', 'D'], size=n_rows),
        'value': np.random.rand(n_rows) * 100,
        'timestamp': pd.date_range('2020-01-01', periods=n_rows, freq='s')
    })
    return df


In [3]:
pd_df = generate_df()
pl_df = pl.from_pandas(pd_df)

In [31]:
import plotly.graph_objects as go

def compare_times(pandas_count_time, polars_count_time):
    """
    Plots a horizontal, thin bar chart comparing Pandas and Polars count operation times,
    and prints which one is faster and by how much.
    """
    # Data setup
    frameworks = ['Pandas', 'Polars']
    times = [pandas_count_time, polars_count_time]
    colors = ['steelblue', 'seagreen']

    # Plotly horizontal bar chart with thinner bars
    fig = go.Figure(data=[
        go.Bar(
            y=frameworks,
            x=times,
            orientation='h',
            marker_color=colors,
            text=[f"{t:.2f}s" for t in times],
            textposition='auto',
            width=0.3  # thinner bar width (default is 0.8)
        )
    ])

    fig.update_layout(
        title='Pandas vs Polars: Count Operation Time',
        xaxis_title='Time (seconds)',
        yaxis_title='Framework',
        template='plotly_white'
    )

    fig.show()

    # Print faster framework
    if pandas_count_time > polars_count_time:
        speedup = pandas_count_time / polars_count_time
        print(f"⚡ Polars is {speedup:.2f}x faster than Pandas at counting. 🚀")
    elif polars_count_time > pandas_count_time:
        speedup = polars_count_time / pandas_count_time
        print(f"📈 Pandas is {speedup:.2f}x faster than Polars at counting. 📊")
    else:
        print("🤝 It's a tie! Count time is the same for both. Possibly plotting a merger...")



### Count

In [35]:
start = time.time()
pl_df.count()
end = time.time()
polars_count_time = end - start
print(f"Polars count time: {polars_count_time:.4f} seconds")

Polars count time: 0.0004 seconds


In [36]:
start = time.time()
pd_df.count()
end = time.time()
pandas_count_time = end - start
print(f"Pandas count time: {pandas_count_time:.4f} seconds")

Pandas count time: 0.2960 seconds


In [37]:
compare_times(pandas_count_time, polars_count_time)

⚡ Polars is 701.38x faster than Pandas at counting. 🚀


### Max

In [38]:
start = time.time()
pl_df.max()
end = time.time()
polars_max_time = end - start
print(f"Polars max time: {polars_max_time:.4f} seconds")

Polars max time: 0.0224 seconds


In [39]:
start = time.time()
pd_df.max()
end = time.time()
pandas_max_time = end - start
print(f"Pandas max time: {pandas_max_time:.4f} seconds")

Pandas max time: 0.4410 seconds


In [40]:
compare_times(pandas_max_time, polars_max_time)

⚡ Polars is 19.66x faster than Pandas at counting. 🚀


### min

In [41]:
start = time.time()
pl_df.min()
end = time.time()
polars_min_time = end - start
print(f"Polars min time: {polars_min_time:.4f} seconds")

Polars min time: 0.0174 seconds


In [43]:
start = time.time()
pd_df.min()
end = time.time()
pandas_min_time = end - start
print(f"Pandas min time: {pandas_min_time:.4f} seconds")

Pandas min time: 0.4259 seconds


In [44]:
compare_times(pandas_min_time, polars_min_time)

⚡ Polars is 24.44x faster than Pandas at counting. 🚀


### mean

In [56]:
start = time.time()
pl_df.min()
end = time.time()
polars_min_time = end - start
print(f"Polars min time: {polars_min_time:.4f} seconds")

start = time.time()
pd_df.min()
end = time.time()
pandas_min_time = end - start
print(f"Pandas min time: {pandas_min_time:.4f} seconds")

compare_times(pandas_min_time, polars_min_time)

Polars min time: 0.0148 seconds
Pandas min time: 0.4586 seconds


⚡ Polars is 30.91x faster than Pandas at counting. 🚀


### Median

In [46]:
pl_df.median()

id,category,value,timestamp
f64,str,f64,datetime[ns]
4999999.5,,49.954752,2020-02-27 20:53:19.500


In [None]:
start = time.time()
pl_df.drop('category').median()
end = time.time()
polars_median_time = end - start
print(f"Polars median time: {polars_median_time:.4f} seconds")

start = time.time()
pd_df.drop('category', axis=1).median()
end = time.time()
pandas_median_time = end - start
print(f"Pandas median time: {pandas_median_time:.4f} seconds")

compare_times(pandas_median_time, polars_median_time)

Polars median time: 0.1044 seconds
Pandas median time: 0.3066 seconds


⚡ Polars is 2.94x faster than Pandas at counting. 🚀


### Sum

In [65]:
start = time.time()
pl_df.drop(['category', 'timestamp']).sum()
end = time.time()
polars_sum_time = end - start
print(f"Polars sum time: {polars_sum_time:.4f} seconds")

start = time.time()
pd_df.drop(columns=['category', 'timestamp']).sum()
end = time.time()
pandas_sum_time = end - start
print(f"Pandas sum time: {pandas_sum_time:.4f} seconds")

compare_times(pandas_sum_time, polars_sum_time)

Polars sum time: 0.0119 seconds
Pandas sum time: 0.0664 seconds


⚡ Polars is 5.57x faster than Pandas at counting. 🚀


## Computation (Fold)

In [85]:
import pandas as pd
import numpy as np
import polars as pl
from functools import reduce


def create_pandas_df(n_rows=10_000_000):
    df = pd.DataFrame({
        'a': np.random.randint(1, 101, size=n_rows),
        'b': np.random.randint(1, 101, size=n_rows),
        'c': np.random.randint(1, 101, size=n_rows),
        'd': np.random.randint(1, 101, size=n_rows),
        'e': np.random.randint(1, 101, size=n_rows),
        'f': np.random.randint(1, 101, size=n_rows),
        'g': np.random.randint(1, 101, size=n_rows),
        'h': np.random.randint(1, 101, size=n_rows),
    })
    return df



In [97]:
import plotly.graph_objects as go

def compare_timings(**kwargs):
    """
    Accepts multiple named timing arguments as kwargs.
    Plots a horizontal bar chart of all timings and highlights the fastest.
    """
    if not kwargs:
        print("🛑 No timings provided. You had one job...")
        return

    labels = list(kwargs.keys())
    times = list(kwargs.values())

    # Sort by time (ascending) for better readability
    sorted_pairs = sorted(zip(times, labels))
    times_sorted, labels_sorted = zip(*sorted_pairs)

    # Bar chart with thinner bars
    fig = go.Figure(data=[
        go.Bar(
            y=labels_sorted,
            x=times_sorted,
            orientation='h',
            marker_color='mediumpurple',
            text=[f"{t:.2f}s" for t in times_sorted],
            textposition='auto',
            width=0.3
        )
    ])

    fig.update_layout(
        title='Benchmark Comparison of Methods',
        xaxis_title='Time (seconds)',
        yaxis_title='Method',
        template='plotly_white',
        height=50 * len(labels_sorted) + 200  # Dynamically adjust height
    )

    fig.show()

    # Celebrate the winner (fastest method)
    winner = labels_sorted[0]
    speedup = times_sorted[1] / times_sorted[0] if len(times_sorted) > 1 and times_sorted[0] > 0 else None

    print(f"🏆 Fastest method: **{winner}** at {times_sorted[0]:.2f}s.")
    if speedup:
        print(f"⚡ It's ~{speedup:.2f}x faster than the next best contender. Time to update your toolchain!")


In [81]:
pd_df = create_pandas_df()
pl_df = pl.from_pandas(pd_df)

In [102]:
start = time.time()
pl_df.with_columns(
    pl.fold(
        acc=pl.lit(0),
        function=lambda x,y: x + y,
        exprs= pl.all(),
    ).alias('sum_columns')
)
end = time.time()
polars_fold_time = end - start
print(f"Polars fold time: {polars_fold_time:.4f} seconds")

start = time.time()
pd_df.sum(axis=1)
end = time.time()
pandas_fold_using_sum = end - start
print(f"Pandas fold time (using sum): {pandas_fold_using_sum:.4f} seconds")


start = time.time()
pd_df['sum_columns'] = reduce(lambda x, y: x + y, pd_df.values.T)
end = time.time()
python_reduce = end - start
print(f"Python fold time (method reduce): {python_reduce:.4f} seconds")

pd_df.drop(columns='sum_columns', inplace=True)  # Clean up the DataFrame for the next test

start = time.time()
pd_df.apply(lambda row: reduce(lambda x, y: x + y, row), axis=1)
end = time.time()
pandas_fold_using_apply = end - start
print(f"Pandas fold time (method apply): {pandas_fold_using_apply:.4f} seconds")

compare_timings(polars_fold_time=polars_fold_time,
                pandas_fold_using_sum=pandas_fold_using_sum,
                pandas_fold_using_apply=pandas_fold_using_apply,
                python_reduce=python_reduce)



Polars fold time: 0.2269 seconds
Pandas fold time (using sum): 0.6565 seconds
Python fold time (method reduce): 0.1068 seconds
Pandas fold time (method apply): 29.0694 seconds


🏆 Fastest method: **python_reduce** at 0.11s.
⚡ It's ~2.12x faster than the next best contender. Time to update your toolchain!


## Size

In [111]:
pl_df.estimated_size('mb')

610.3515625

In [108]:
pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 8 columns):
 #   Column  Dtype
---  ------  -----
 0   a       int64
 1   b       int64
 2   c       int64
 3   d       int64
 4   e       int64
 5   f       int64
 6   g       int64
 7   h       int64
dtypes: int64(8)
memory usage: 610.4 MB


## Groupby

In [113]:
import numpy as np
import pandas as pd
import polars as pl
import time

def generate_df(n_rows=10_000_000):
    df = pd.DataFrame({
        'id': np.arange(n_rows),
        'category': np.random.choice(
                [
                'A',
                'B',
                'C',
                'D',
                'E',
                'F',
                'G',
                'H',
                'I',
                'J',
                'K',
                'L',
                ],
            size=n_rows),
        'value_a': np.random.rand(n_rows) * 100,
        'value_b': np.random.randint(1, 101, size=n_rows),

    })
    return df

In [116]:
pd_df = generate_df()
pl_df = pl.from_pandas(pd_df)

* agg
* all
* count (len)
* max
* mean
* median
* min
* n_unique
* sum

### Agg

In [125]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a'),
    pl.col('value_b')
)
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")

start = time.time()
# Perform a groupby operation in Polars
pd_df.groupby('category')[['value_a', 'value_b']].agg(list).reset_index()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")

compare_times(pandas_groupby_time, polars_groupby_time)

Polars groupby time: 0.3133 seconds
Pandas groupby time: 1.7071 seconds


⚡ Polars is 5.45x faster than Pandas at counting. 🚀


### Count

In [135]:
start = time.time()
# Perform a count operation in Polars
pl_df.group_by('category').len().sort('category')
end = time.time()
polars_count_groupby_time = end - start
print(f"Polars count groupby time: {polars_count_groupby_time:.4f} seconds")


start = time.time()
# Perform a count operation in Pandas
pd_df.groupby('category').size().reset_index(name='count')
end = time.time()
pandas_count_groupby_time = end - start
print(f"Pandas count groupby time: {pandas_count_groupby_time:.4f} seconds")
compare_times(pandas_count_groupby_time, polars_count_groupby_time)

Polars count groupby time: 0.0907 seconds
Pandas count groupby time: 0.3277 seconds


⚡ Polars is 3.61x faster than Pandas at counting. 🚀


'2.2.3'

### Max

In [None]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a').mean().alias('mean_value_a'),
)
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")

start = time.time()
# Perform a groupby operation in Pandas
pd_df[['category','value_a']].groupby('category').mean()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")

compare_times(pandas_groupby_time, polars_groupby_time)

Polars groupby time: 0.0534 seconds
Pandas groupby time: 0.4000 seconds


⚡ Polars is 7.49x faster than Pandas at counting. 🚀


### Min

In [154]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a').min().alias('min_value_a'),
)
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")

start = time.time()
# Perform a groupby operation in Pandas
pd_df[['category','value_a']].groupby('category').min()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")

compare_times(pandas_groupby_time, polars_groupby_time)

Polars groupby time: 0.0654 seconds
Pandas groupby time: 0.4384 seconds


⚡ Polars is 6.71x faster than Pandas at counting. 🚀


In [171]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a').sum().alias('sum_value_a'),
)
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")
start = time.time()
# Perform a groupby operation in Pandas
pd_df[['category','value_a']].groupby('category').sum()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")
compare_times(pandas_groupby_time, polars_groupby_time) 

Polars groupby time: 0.1721 seconds
Pandas groupby time: 0.4857 seconds


⚡ Polars is 2.82x faster than Pandas at counting. 🚀


In [156]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a').mean().alias('mean_value_a'),
)
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")
start = time.time()
# Perform a groupby operation in Pandas
pd_df[['category','value_a']].groupby('category').mean()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")
compare_times(pandas_groupby_time, polars_groupby_time)

Polars groupby time: 0.1429 seconds
Pandas groupby time: 0.4917 seconds


⚡ Polars is 3.44x faster than Pandas at counting. 🚀


In [170]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a').n_unique().alias('n_unique_value_a'),
).sort('category')
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")
start = time.time()
# Perform a groupby operation in Pandas
pd_df[['category','value_a']].groupby('category').nunique()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")
compare_times(pandas_groupby_time, polars_groupby_time)


Polars groupby time: 0.2003 seconds
Pandas groupby time: 2.5074 seconds


⚡ Polars is 12.52x faster than Pandas at counting. 🚀


In [175]:
start = time.time()
# Perform a groupby operation in Polars
pl_df.group_by('category').agg(
    pl.col('value_a').sum().alias('sum_value_a'),
)
end = time.time()
polars_groupby_time = end - start
print(f"Polars groupby time: {polars_groupby_time:.4f} seconds")
start = time.time()
# Perform a groupby operation in Pandas
pd_df[['category','value_a']].groupby('category').sum()
end = time.time()
pandas_groupby_time = end - start
print(f"Pandas groupby time: {pandas_groupby_time:.4f} seconds")
compare_times(pandas_groupby_time, polars_groupby_time)

Polars groupby time: 0.0444 seconds
Pandas groupby time: 0.3852 seconds


⚡ Polars is 8.67x faster than Pandas at counting. 🚀


## Manupulation/Selection

* cast
* explode
* filter
* iter_columns
* iter_rows
* sort
* unique
* when

In [208]:
import pandas as pd
import numpy as np

# Number of rows
n_rows = 10_000_000

# Generate data
pd_df = pd.DataFrame({
    'random_float': np.random.rand(n_rows) * 100,  # Random floats 0–100
    'timestamp': pd.date_range('2020-01-01', periods=n_rows, freq='s')  # 1-second intervals
})
pd_df['timestamp_str'] = pd_df['timestamp'].astype(str)

pl_df = pl.from_pandas(pd_df)


### Cast

In [209]:
pd.to_datetime(pd_df['timestamp_str'], errors='coerce')

0         2020-01-01 00:00:00
1         2020-01-01 00:00:01
2         2020-01-01 00:00:02
3         2020-01-01 00:00:03
4         2020-01-01 00:00:04
                  ...        
9999995   2020-04-25 17:46:35
9999996   2020-04-25 17:46:36
9999997   2020-04-25 17:46:37
9999998   2020-04-25 17:46:38
9999999   2020-04-25 17:46:39
Name: timestamp_str, Length: 10000000, dtype: datetime64[ns]

In [192]:
start = time.time()
# Perform a cast operation in Polars
pl_df.select('random_float').cast({'random_float': pl.Int16,}, strict=False)
end = time.time()
polars_cast_time = end - start
print(f"Polars cast time: {polars_cast_time:.4f} seconds")
start = time.time()
# Perform a cast operation in Pandas
pd_df['random_float'].astype(np.int16)
end = time.time()
pandas_cast_time = end - start
print(f"Pandas cast time: {pandas_cast_time:.4f} seconds")
compare_times(pandas_cast_time, polars_cast_time)


Polars cast time: 0.0343 seconds
Pandas cast time: 0.0092 seconds


📈 Pandas is 3.75x faster than Polars at counting. 📊


#### Float to int

#### Date to datetime

In [205]:
start = time.time()
# Perform a cast operation in Polars
pl_df.select('timestamp').cast({'timestamp': pl.Date,}, strict=False)
end = time.time()
polars_cast_time = end - start
print(f"Polars cast time: {polars_cast_time:.4f} seconds")
start = time.time()
# Perform a cast operation in Pandas
pd_df['timestamp'].dt.date
end = time.time()
pandas_cast_time = end - start
print(f"Pandas cast time: {pandas_cast_time:.4f} seconds")
compare_times(pandas_cast_time, polars_cast_time)

Polars cast time: 0.0751 seconds
Pandas cast time: 1.3050 seconds


⚡ Polars is 17.38x faster than Pandas at counting. 🚀


### Explode

### Filter

### iter_columns

### iter_rows

### unique

### sort

### when