# Polars - Getting Started

In [3]:
import polars as pl
import numpy as np
import random
import string

# Create Parquet
### Create a Parquet file with `n` rows and `m` columns that are assigned round-robin as int, float, or string

In [4]:
%%time

# ---- Dataset parameters ----------------------------------------------------
n = 100_000_000  # rows; the recorded run of this cell at 100M rows took ~2 min 49 s wall time
m = 12           # columns, typed round-robin: int, float, str, int, float, str, ...

# Parquet path used by the write / list / read cells further down.
# With the defaults this is "sample.12x100000000.parquet".
filename = f"sample.{m}x{n}.parquet"

# Seed both generators so a Restart & Run All regenerates the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


def _random_column(kind, size):
    """Return `size` random values for one column.

    kind: "int"   -> NumPy array of integers in [0, 100)
          "float" -> NumPy array of floats in [0.0, 1.0)
          "str"   -> list of 5-letter lowercase strings
    Raises ValueError for any other kind.
    """
    if kind == "int":
        return np.random.randint(0, 100, size)
    if kind == "float":
        return np.random.random(size)
    if kind == "str":
        # Pure-Python loop: this is the slow part of the cell at large `size`.
        return ["".join(random.choices(string.ascii_lowercase, k=5)) for _ in range(size)]
    raise ValueError(f"unknown column kind: {kind!r}")


# Column i gets kind i % 3 and is named "<kind>_col_<i>", which yields
# int_col_0, float_col_1, str_col_2, int_col_3, ... in insertion order.
_kinds = ("int", "float", "str")
# `data` is kept as a name because later cells build frames from it and then `del data`.
data = {
    f"{_kinds[i % 3]}_col_{i}": _random_column(_kinds[i % 3], n)
    for i in range(m)
}

df = pl.DataFrame(data)

df

CPU times: user 2min 43s, sys: 6.31 s, total: 2min 49s
Wall time: 2min 49s


int_col_0,float_col_1,str_col_2,int_col_3,float_col_4,str_col_5,int_col_6,float_col_7,str_col_8,int_col_9,float_col_10,str_col_11
i64,f64,str,i64,f64,str,i64,f64,str,i64,f64,str
93,0.652884,"""pntcm""",8,0.811316,"""nbxyc""",87,0.367002,"""ariyq""",51,0.030269,"""ppoav"""
11,0.333926,"""uvrqb""",65,0.499348,"""giuem""",62,0.551787,"""pimrp""",64,0.178633,"""khtfv"""
31,0.017871,"""ezaiv""",50,0.677592,"""lxtox""",67,0.601547,"""sjbft""",33,0.757437,"""tsroh"""
51,0.657203,"""ckcwh""",89,0.708234,"""oqash""",51,0.971546,"""cumrs""",65,0.173563,"""fauyl"""
9,0.742658,"""fdxkx""",90,0.115617,"""wlihc""",42,0.778501,"""oktvk""",38,0.317014,"""vodkv"""
…,…,…,…,…,…,…,…,…,…,…,…
12,0.617882,"""fwjwk""",12,0.274015,"""ozwnf""",67,0.786789,"""akuhk""",81,0.08208,"""blzdt"""
42,0.695621,"""kqdpg""",46,0.453561,"""vvnpv""",8,0.342684,"""wwglw""",56,0.56422,"""ekjud"""
53,0.17968,"""wuohd""",60,0.43204,"""apdzg""",96,0.264924,"""oarof""",10,0.259905,"""auuil"""
33,0.130465,"""zpcyg""",19,0.503983,"""vlqxx""",82,0.839144,"""znctb""",70,0.377668,"""pxqpc"""


In [5]:
# Materialize the column mapping as an eager Polars DataFrame.
# Expects `data` (column name -> values) to already exist in the kernel.
df = pl.DataFrame(data=data)

In [6]:
%%time
# Persist the eager DataFrame as a Parquet file at `filename`.
df.write_parquet(file=filename)

CPU times: user 19.7 s, sys: 1.69 s, total: 21.4 s
Wall time: 18.2 s


In [12]:
# Quick look at the generator used for the str columns: build a few random
# 5-letter lowercase words.
# NOTE: this rebinds the notebook-level name `n`.
n = 3
samples = []
for _ in range(n):
    samples.append("".join(random.choices(string.ascii_lowercase, k=5)))
samples

['xemvz', 'hdxmb', 'cxkrc']

CPU times: user 41.5 s, sys: 1.59 s, total: 43.1 s
Wall time: 43.1 s


int_col_0,float_col_1,str_col_2
i64,f64,str
23,0.665972,"""gnnlr"""
42,0.069425,"""xszhk"""
82,0.874121,"""mlldz"""
70,0.244381,"""pndgg"""
11,0.618358,"""moqah"""
…,…,…
61,0.031972,"""uekyx"""
24,0.444002,"""pgmfz"""
89,0.134568,"""mnmpk"""
97,0.826676,"""eeccc"""


In [None]:
# 2) Draw random letters with the standard library's random.choices()
letters = list("abcde")
picked = random.choices(letters, k=5)   # 5 draws, with replacement
df2 = pl.DataFrame({"rand_letter": picked})
# One print call: header line, then the frame on the following line.
print("\nRandom data from Python random:", df2, sep="\n")

In [4]:
# Wrap the column mapping in a LazyFrame so work is deferred until .collect().
# Expects `data` (column name -> values) to already exist in the kernel.
lf = pl.LazyFrame(data=data)

In [5]:
# Remove the name `data`; the LazyFrame `lf` was already built from it.
# Presumably done to free the Python-side copy of the columns — TODO confirm.
del data

In [9]:
%%time
# Execute the lazy frame with the GPU engine, then write the result to Parquet.
# Reference timing: ~22 s wall time on the GPU noted as "A6K".
(
    lf
    .collect(engine="gpu")
    .write_parquet(filename)
)

CPU times: user 23.9 s, sys: 4.63 s, total: 28.6 s
Wall time: 22 s


In [10]:
# Shell out to list the Parquet file and check its size on disk.
!ls -l {filename}

-rw-rw-r-- 1 will will 4932501216 Feb 24 13:58 sample.12x100000000.parquet


# Parquet Demo 
### Restart the session (kernel) before running this section

In [None]:
import polars as pl
import time

# Parquet file queried in this section (12 columns x 100,000,000 rows, per its name).
filename = 'sample.12x100000000.parquet'

### Get the Schema Without Loading the File

In [3]:
# scan_parquet builds a lazy frame, so the schema can be resolved without
# materializing the rows.
schema_source = pl.scan_parquet(filename)
schema_source.collect_schema()

Schema([('int_col_0', Int64),
        ('float_col_1', Float64),
        ('str_col_2', String),
        ('int_col_3', Int64),
        ('float_col_4', Float64),
        ('str_col_5', String),
        ('int_col_6', Int64),
        ('float_col_7', Float64),
        ('str_col_8', String),
        ('int_col_9', Int64),
        ('float_col_10', Float64),
        ('str_col_11', String)])

### Slow Operations accelerated with GPU 
* Large aggregations (e.g., `group_by(...).agg(...)`)  
* Sorting (especially multi-column sorts on large datasets)  
* Joins (hash or sort-merge joins)  
* Projections (selecting or transforming multiple columns in parallel)  
* Filters (predicate pushdowns where the GPU can scan/filter billions of rows quickly)  

---

# Lazy Execution

In [5]:
import polars as pl
import time

# Parquet file used by the lazy and eager queries below.
filename = 'sample.12x100000000.parquet'

In [6]:
# Columns whose per-group mean is computed in the aggregation step.
mean_cols = [
    "int_col_0",
    "float_col_1",
    "int_col_3",
    "float_col_4",
    "int_col_6",
    "float_col_7",
]

# Lazy query definition only: nothing runs until .collect() is called on `lf`.
lf = (
    pl.scan_parquet(filename)            # lazy source (read_parquet is the eager one)
    .filter(pl.col("int_col_0") > 50)    # keep rows where int_col_0 > 50
    .sort(                               # float_col_7 descending, then int_col_6 ascending
        ["float_col_7", "int_col_6"],
        descending=[True, False],
    )
    .group_by("str_col_2")               # one output row per distinct string
    .agg([pl.col(name).mean() for name in mean_cols])
    .select(["int_col_0", "float_col_1", "str_col_2", "int_col_3"])  # final columns
)

In [7]:
# Run the lazy query with the GPU engine and print the elapsed seconds.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start = time.perf_counter()

df = lf.collect(engine="gpu")

print(round(time.perf_counter() - start, 2), end="\n\n")

0.99



# Eager Execution

In [12]:
# Eager filter -> sort -> group_by -> select pipeline, timed end to end.
# Reference timing: roughly 5 s (varies from run to run); no GPU engine is
# requested in this cell.
# perf_counter() is a monotonic clock intended for measuring intervals, unlike
# time.time(), which follows the (adjustable) system clock.
start = time.perf_counter()

df = (
    # read_parquet is the eager reader (scan_parquet is the lazy counterpart)
    pl.read_parquet(filename)

    # row filter
    .filter(pl.col("int_col_0") > 50)

    # sorting: float_col_7 descending, then int_col_6 ascending
    .sort(["float_col_7", "int_col_6"], descending=[True, False])

    # grouping & aggregating: per-group mean of each listed column
    .group_by("str_col_2").agg([
        pl.col("int_col_0").mean(),
        pl.col("float_col_1").mean(),
        pl.col("int_col_3").mean(),
        pl.col("float_col_4").mean(),
        pl.col("int_col_6").mean(),
        pl.col("float_col_7").mean(),
    ])

    # column selection
    .select(["int_col_0", "float_col_1", "str_col_2", "int_col_3"])
)

print(round(time.perf_counter() - start, 4), end="\n\n")

5.0305



In [13]:
# Render the aggregated result; the notebook output shows a truncated preview of the rows.
display(df)

int_col_0,float_col_1,str_col_2,int_col_3
f64,f64,str,f64
71.0,0.509543,"""zpmwr""",63.0
75.363636,0.524108,"""ztgjs""",46.181818
75.666667,0.477204,"""kfasx""",62.833333
74.0,0.057641,"""eicll""",85.0
87.0,0.207514,"""uzryj""",74.333333
…,…,…,…
75.0,0.67616,"""bwwib""",41.0
74.2,0.294168,"""iguka""",50.4
97.0,0.266422,"""qzoib""",26.0
76.5,0.65473,"""tjbeu""",58.375
