<a href="https://colab.research.google.com/github/xpdlaldam/data_science/blob/master/polars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import polars as pl
import datetime as dt
from datetime import date

In [3]:
# Sample people table, written one record per row.
# Column order: name, birthdate, weight (kg), height (m).
df = pl.DataFrame(
    data=[
        ("Alice Archer", dt.date(1997, 1, 10), 57.9, 1.56),
        ("Ben Brown", dt.date(1985, 2, 15), 72.5, 1.77),
        ("Chloe Cooper", dt.date(1983, 3, 22), 53.6, 1.65),
        ("Daniel Donovan", dt.date(1981, 4, 30), 83.1, 1.75),
    ],
    schema=["name", "birthdate", "weight", "height"],  # names only; dtypes are inferred
    orient="row",
)
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [5]:
# BMI = weight (kg) divided by height (m) squared, built as a Polars expression.
# Expressions are lazy: nothing is computed here. An expression only produces
# a result once it is executed inside a context (e.g. select, with_columns).
bmi_expr = pl.col("weight") / pl.col("height").pow(2)
bmi_expr

In [10]:
# select context: evaluate the expression and keep only its result,
# named "bmi" via alias.
result_select = df.select(bmi_expr.alias("bmi"))
result_select

bmi
f64
23.791913
23.141498
19.687787
27.134694


In [11]:
# with_columns context: keep every existing column and append "bmi".
df_new = df.with_columns(bmi_expr.alias("bmi"))
df_new

name,birthdate,weight,height,bmi
str,date,f64,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,27.134694


In [13]:
# Keep only the rows that satisfy both predicates (multiple predicates passed
# to filter are combined with AND).
# Dates are written as dt.date, matching the cell that builds df, instead of
# going through a second alias for the same class.
result_filter = df.filter(
    # is_between includes both bounds by default (closed="both").
    pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
    pl.col("height") > 1.7,  # height in metres
)
result_filter

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77


In [None]:
## names grouped by birth decade & whether height < 1.7 m ("short?")
result_groupby = df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),  # e.g. 1985 -> 1980
    (pl.col("height") < 1.7).alias("short?"),
    # Keep groups in order of first appearance so that re-running the cell
    # always displays the rows in the same order.
    maintain_order=True,
).agg(pl.col("name"))  # collect the names of each group into a list
result_groupby