In [1]:
#reference: https://medium.com/cuenex/pandas-2-0-vs-polars-the-ultimate-battle-a378eb75d6d1

In [2]:
import pandas as pd
import polars as pl
import numpy as np
import time

# 1. Read a large dataset (30,000,000+ rows, 15 columns)

## Pandas

In [3]:
%%time
demo_pd = pd.read_parquet("demo_data/train.parquet")

# calculate execution time
st = time.process_time()
sum_x = 0
for i in range(1000000):
    sum_x += i
et = time.process_time()

CPU times: total: 51.9 s
Wall time: 28.7 s


## Polars

In [4]:
%%time
demo_pl = pl.read_parquet("demo_data/train.parquet")

# calculate execution time
st = time.process_time()
sum_x = 0
for i in range(1000000):
    sum_x += i
et = time.process_time()

CPU times: total: 4.61 s
Wall time: 4.03 s


# 2. Test common functions

###  Query 1: The mean of all numerical columns when cat_1 equals 1.

## Pandas

In [5]:
%%time
demo_pd[demo_pd["cat_1"] == 1].select_dtypes(include = "number").mean()

# calculate execution time
st = time.process_time()
sum_x = 0
for i in range(1000000):
    sum_x += i
et = time.process_time()

CPU times: total: 1 s
Wall time: 3.3 s


## Polars

In [6]:
%%time
demo_pl.filter(pl.col("cat_1") == 1).select(pl.col("num_7", "num_8", "num_9", "num_10", "num_11", "num_12", "num_13", "num_14", "num_15").mean())

# calculate execution time
st = time.process_time()
sum_x = 0
for i in range(1000000):
    sum_x += i
et = time.process_time()

CPU times: total: 4.17 s
Wall time: 684 ms


### Query 2: Group by user and calculate the mean of all numerical columns

## Pandas

In [7]:
%%time
nums = ["num_7", "num_8", "num_9", "num_10", "num_11", "num_12", "num_13", "num_14", "num_15"]
demo_pd.groupby(["user"])[nums].agg("mean") 

# calculate execution time
st = time.process_time()
sum_x = 0
for i in range(1000000):
    sum_x += i
et = time.process_time()

CPU times: total: 3.42 s
Wall time: 10.1 s


## Polars

In [8]:
%%time
demo_pl.groupby("user").agg(pl.col(nums).mean())  

# calculate execution time
st = time.process_time()
sum_x = 0
for i in range(1000000):
    sum_x += i
et = time.process_time()



CPU times: total: 19.8 s
Wall time: 1.78 s


# Lazy API

In [9]:
# Build a lazy query: pl.scan_*() starts the lazy API, so this cell only
# describes the work; a later .collect() is what produces the data.
# Replace the file path with the location of your own copy of the data.
lazy_api = (
    pl.scan_csv("data/cast.csv")
    .with_columns(pl.col("name").str.to_uppercase())  # convert the "name" column to uppercase
    .filter(pl.col("n") > 0)  # keep only rows whose "n" value is larger than 0
)

lazy_api

In [10]:
# Ask Polars for the optimized query plan of the lazy query and print it as text.
optimized_plan = lazy_api.explain(optimized=True)
print(optimized_plan)

 WITH_COLUMNS:
 [col("name").str.uppercase()]

    Csv SCAN data/cast.csv
    PROJECT */6 COLUMNS
    SELECTION: [(col("n")) > (0)]


In [11]:
# Same lazy query as before, now executed with .collect().
# Note: after this cell `lazy_api` holds the collected result rather than a
# lazy query; the name is kept so any later cells keep working.
# Replace the file path with the location of your own copy of the data.
lazy_api = (
    pl.scan_csv("data/cast.csv")
    .with_columns(pl.col("name").str.to_uppercase())  # convert the "name" column to uppercase
    .filter(pl.col("n") > 0)  # keep only rows whose "n" value is larger than 0
    .collect()  # run the query and materialize the result
)

lazy_api

title,year,name,type,character,n
str,i64,str,str,str,i64
"""Closet Monster…",2015,"""BUFFY #1""","""actor""","""Buffy 4""",31
"""Suuri illusion…",1985,"""HOMO $""","""actor""","""Guests""",22
"""Battle of the …",2017,"""$HUTTER""","""actor""","""Bobby Riggs Fa…",10
"""Lapis, Ballpen…",2014,"""JORI ' DANILO'…","""actor""","""Jaime (young)""",9
"""When the Man W…",2014,"""TAIPALETI 'ATU…","""actor""","""Two Palms - Ua…",8
"""Little Angel (…",2015,"""MICHAEL 'BABEE…","""actor""","""Chico""",9
"""My Song for Yo…",2010,"""GEORGE 'BOOTSY…","""actor""","""Cooley's Custo…",16
"""My Song for Yo…",2010,"""GEORGE 'BOOTSY…","""actor""","""Celebration Gu…",16
"""Mariano Mison.…",1997,"""JOSEPH 'BYRON'…","""actor""","""Putik's Son""",31
"""Pelotazo nacio…",1993,"""FÉLIX 'EL GATO…","""actor""","""Rebolledo""",12
