# Polars Select & Filter

- Column Selection
- Row Filtering
- Chaining Operations

Goal: select columns and filter rows with expressions.

## Column Selection


In [24]:
import polars as pl

# Small demo frame used throughout the column-selection section:
# two string columns (`name`, `level`) and one integer column (`age`).
df = pl.DataFrame(
    {
        "name": ["James", "Mary", "Tom", "Mary"],
        "age": [38, 24, 45, 19],
        "level": ["beginner", "advanced", "intermediate", "intermediate"],
    }
)

In [25]:
# Square-bracket indexing with a list of column names, as known from pandas.
# It works and returns the `name` and `age` columns, but it is not the
# idiomatic Polars way.
df[["name", "age"]]

name,age
str,i64
"""James""",38
"""Mary""",24
"""Tom""",45
"""Mary""",19


In [26]:
# The `.select` context is the preferred Polars way to pick columns.
# It accepts more than one spelling:

# 1) plain column names as strings
#    (not the last expression of the cell, so its result is not displayed)
df.select("name", "age")

# 2) column expressions -- the idiomatic, recommended form
df.select(
    pl.col("age"),
    pl.col("name"),
)

age,name
i64,str
38,"""James"""
24,"""Mary"""
45,"""Tom"""
19,"""Mary"""


In [27]:
# An expression is declarative: it describes WHAT Polars should do.
# `pl.col` with several names builds one expression covering those columns;
# it is not tied to any DataFrame yet.
col_expression = pl.col("name", "age")
# the bare name on the last line displays the expression object itself
col_expression

In [28]:
# A context supplies the scope and triggers execution: here `select`
# evaluates `col_expression` against `df` and returns the `name` and
# `age` columns.
df.select(col_expression)

name,age
str,i64
"""James""",38
"""Mary""",24
"""Tom""",45
"""Mary""",19


In [29]:
# `pl.col()` can select by more than an exact name.

# By column-name pattern: the string below is a regular expression.
# In this frame only `name` matches.
df.select(pl.col("^na.*$"))        # regex: column names starting with 'na'


name
str
"""James"""
"""Mary"""
"""Tom"""
"""Mary"""


In [30]:
# By dtype: keep every column whose dtype is Int64 (here only `age`).
# Several dtypes can be passed at once, e.g. pl.col(pl.Int64, pl.Float64).
df.select(pl.col(pl.Int64))


age
i64
38
24
45
19


In [31]:
# `pl.all()` is syntactic sugar for `pl.col('*')`, i.e. every column.
# The `.select` context returns a NEW DataFrame, stored here as `selection`.
selection = df.select(pl.all())
selection

name,age,level
str,i64,str
"""James""",38,"""beginner"""
"""Mary""",24,"""advanced"""
"""Tom""",45,"""intermediate"""
"""Mary""",19,"""intermediate"""


In [32]:
# `df` and `selection` are two distinct DataFrame objects: `.select` did not
# mutate `df` in place. `is` compares object identity, not contents.
# (The former bare `df is selection` statement was dropped: it was not the
# last expression of the cell, so its value was never displayed.)
print("Is same object?", df is selection)  # expect False

Is same object? False


In [56]:
# Expressions can do more than pick a column: this one aggregates `age`
# to its mean and names the result `average_age`.
average_age = (
    pl.col("age")
    .mean()
    .alias("average_age")
)
average_age

In [None]:
# Two takeaways from this cell:
#   1. the frame returned by `.select` need not have the input's shape --
#      two aggregations over four rows give a single row here;
#   2. expressions compose, so complex actions are built from small parts.
#
# In this frame "Mary" is the only most frequent name.
most_frequent_name = (
    pl.col("name")
    .mode()
    .alias("most_frequent_name")
)

df.select(average_age, most_frequent_name)

average_age,most_frequent_name
f64,str
31.5,"""Mary"""


In [None]:
# `.select` can also build columns on the fly; each argument becomes one
# output column, in the order given.
df.select(
    # existing column, passed through unchanged
    pl.col("name"),
    # derived column: every age shifted by 2, under a new name
    (pl.col("age") + 2).alias("corrected_age"),
    # brand-new column holding the same literal value in every row
    pl.lit("Lesson 3").alias("participants"),
)

name,corrected_age,participants
str,i64,str
"""James""",40,"""Lesson 3"""
"""Mary""",26,"""Lesson 3"""
"""Tom""",47,"""Lesson 3"""
"""Mary""",21,"""Lesson 3"""


## Row Filtering


In [53]:
import polars as pl
from pathlib import Path

# Read the sales data when the parquet file sits next to the notebook;
# otherwise fall back to a minimal inline frame with the same four columns
# so the lesson stays self-contained.
sales_path = Path("coffee_sales.parquet")

if not sales_path.exists():
    # one timestamp per day, 2024-01-01 through 2024-01-06
    fallback_timestamps = pl.datetime_range(
        start=pl.datetime(2024, 1, 1),
        end=pl.datetime(2024, 1, 6),
        interval="1d",
        eager=True,  # evaluate immediately instead of returning an expression
    )
    coffee_sales = pl.DataFrame(
        {
            "drink": ["espresso", "water", "espresso", "water", "latte", "tea"],
            "price": [3.5, 2.0, 3.5, 2.0, 4.0, 2.5],
            "timestamp": fallback_timestamps,
            "city": [
                "Berlin",
                "Austin",
                "New York",
                "Los Angeles",
                "Austin",
                "San Francisco",
            ],
        }
    )
else:
    coffee_sales = pl.read_parquet(sales_path)

print(f"Length of full DataFrame: {len(coffee_sales)}")
coffee_sales.head(8)


Length of full DataFrame: 1000


drink,price,timestamp,city
str,f64,datetime[μs],str
"""cappuccino""",3.5,2025-09-17 08:15:00,"""New York"""
"""water""",2.0,2025-09-17 03:43:00,"""Los Angeles"""
"""lemonade""",3.0,2025-09-07 13:02:00,"""New York"""
"""cappuccino""",3.5,2025-08-14 07:32:00,"""New York"""
"""lemonade""",3.0,2025-09-05 06:45:00,"""New York"""
"""lemonade""",3.0,2025-08-15 14:37:00,"""Austin"""
"""espresso""",3.0,2025-09-18 05:44:00,"""New York"""
"""tea""",2.5,2025-08-18 04:13:00,"""San Francisco"""


In [51]:
# `.filter()` keeps the rows for which the expression is true and returns
# a DataFrame.
#
# Comparison operators available in expressions:
#   <   >   <=   >=   ==   !=
water_sales = coffee_sales.filter(
    pl.col("drink") == "water"
)

# compare the row counts before and after filtering
print(f"Length of full DataFrame: {len(coffee_sales)}")
print(f"Length of filtered DataFrame: {len(water_sales)}")

water_sales


Length of full DataFrame: 1000
Length of filtered DataFrame: 202


drink,price,timestamp,city
str,f64,datetime[μs],str
"""water""",2.0,2025-09-17 03:43:00,"""Los Angeles"""
"""water""",2.0,2025-09-17 18:27:00,"""San Francisco"""
"""water""",2.0,2025-09-08 10:31:00,"""New York"""
"""water""",2.0,2025-08-26 21:41:00,"""San Francisco"""
"""water""",2.0,2025-09-07 23:34:00,"""New York"""
…,…,…,…
"""water""",2.0,2025-09-24 15:55:00,"""Austin"""
"""water""",2.0,2025-09-18 15:20:00,"""New York"""
"""water""",2.0,2025-08-18 01:32:00,"""San Francisco"""
"""water""",2.0,2025-08-26 07:05:00,"""Austin"""


In [38]:

# `!=` comparison: keep every row whose city is not "Los Angeles"
coffee_sales.filter(pl.col("city") != "Los Angeles")

drink,price,timestamp,city
str,f64,datetime[μs],str
"""cappuccino""",3.5,2025-09-17 08:15:00,"""New York"""
"""lemonade""",3.0,2025-09-07 13:02:00,"""New York"""
"""cappuccino""",3.5,2025-08-14 07:32:00,"""New York"""
"""lemonade""",3.0,2025-09-05 06:45:00,"""New York"""
"""lemonade""",3.0,2025-08-15 14:37:00,"""Austin"""
…,…,…,…
"""lemonade""",3.0,2025-09-26 16:35:00,"""San Francisco"""
"""water""",2.0,2025-09-30 10:23:00,"""Austin"""
"""lemonade""",3.0,2025-09-15 21:23:00,"""Austin"""
"""tea""",2.5,2025-08-07 21:14:00,"""New York"""


In [None]:
# Compound filters look much like pandas: conditions are joined with
# `&` (AND) and `|` (OR), and each comparison is wrapped in parentheses.

# price condition
price_at_most_3 = pl.col("price") <= 3

# city condition
in_berlin_or_new_york = (
    (pl.col("city") == "Berlin")
    | (pl.col("city") == "New York")
)

coffee_sales.filter(price_at_most_3 & in_berlin_or_new_york)

drink,price,timestamp,city
str,f64,datetime[μs],str
"""lemonade""",3.0,2025-09-07 13:02:00,"""New York"""
"""lemonade""",3.0,2025-09-05 06:45:00,"""New York"""
"""espresso""",3.0,2025-09-18 05:44:00,"""New York"""
"""espresso""",3.0,2025-08-06 12:06:00,"""New York"""
"""tea""",2.5,2025-09-05 09:53:00,"""New York"""
…,…,…,…
"""lemonade""",3.0,2025-08-28 21:46:00,"""New York"""
"""lemonade""",3.0,2025-08-01 20:59:00,"""New York"""
"""water""",2.0,2025-09-18 15:20:00,"""New York"""
"""tea""",2.5,2025-08-07 12:26:00,"""New York"""


In [40]:
# A compound filter can be stored as a plain expression and reused later.
# Unlike pandas, building it needs no DataFrame and no index; it only
# describes the condition.
water_or_espresso = (
    (pl.col("drink") == "water")
    | (pl.col("drink") == "espresso")
)
water_or_espresso

In [41]:
# Hand the stored expression to the `filter` context: only now is it
# evaluated against `coffee_sales` (a boolean mask is used under the hood),
# keeping the water and espresso rows.
coffee_sales.filter(water_or_espresso)

drink,price,timestamp,city
str,f64,datetime[μs],str
"""water""",2.0,2025-09-17 03:43:00,"""Los Angeles"""
"""espresso""",3.0,2025-09-18 05:44:00,"""New York"""
"""espresso""",3.0,2025-08-06 12:06:00,"""New York"""
"""espresso""",3.0,2025-09-08 08:51:00,"""San Francisco"""
"""espresso""",3.0,2025-09-24 07:55:00,"""New York"""
…,…,…,…
"""water""",2.0,2025-09-24 15:55:00,"""Austin"""
"""water""",2.0,2025-09-18 15:20:00,"""New York"""
"""water""",2.0,2025-08-18 01:32:00,"""San Francisco"""
"""water""",2.0,2025-08-26 07:05:00,"""Austin"""


In [42]:
# Expressions come with built-in predicates such as
# `is_in`, `is_between`, `is_null`, ...
# Here `is_in` tests city membership, and the result is AND-ed with the
# drink condition defined earlier.
in_austin_or_new_york = pl.col("city").is_in(["New York", "Austin"])
water_or_espresso_in_austin_new_york = water_or_espresso & in_austin_or_new_york

coffee_sales.filter(water_or_espresso_in_austin_new_york)


drink,price,timestamp,city
str,f64,datetime[μs],str
"""espresso""",3.0,2025-09-18 05:44:00,"""New York"""
"""espresso""",3.0,2025-08-06 12:06:00,"""New York"""
"""espresso""",3.0,2025-09-24 07:55:00,"""New York"""
"""espresso""",3.0,2025-09-08 20:10:00,"""New York"""
"""espresso""",3.0,2025-08-18 02:13:00,"""Austin"""
…,…,…,…
"""water""",2.0,2025-08-23 21:43:00,"""Austin"""
"""water""",2.0,2025-09-24 15:55:00,"""Austin"""
"""water""",2.0,2025-09-18 15:20:00,"""New York"""
"""water""",2.0,2025-08-26 07:05:00,"""Austin"""


In [43]:
# `is_between` keeps prices from 2.0 to 3.0; both bounds are inclusive,
# so rows priced exactly 2.0 or 3.0 are kept as well
coffee_sales.filter(pl.col("price").is_between(2.0, 3.0))

drink,price,timestamp,city
str,f64,datetime[μs],str
"""water""",2.0,2025-09-17 03:43:00,"""Los Angeles"""
"""lemonade""",3.0,2025-09-07 13:02:00,"""New York"""
"""lemonade""",3.0,2025-09-05 06:45:00,"""New York"""
"""lemonade""",3.0,2025-08-15 14:37:00,"""Austin"""
"""espresso""",3.0,2025-09-18 05:44:00,"""New York"""
…,…,…,…
"""tea""",2.5,2025-08-07 12:26:00,"""New York"""
"""lemonade""",3.0,2025-09-26 16:35:00,"""San Francisco"""
"""water""",2.0,2025-09-30 10:23:00,"""Austin"""
"""lemonade""",3.0,2025-09-15 21:23:00,"""Austin"""


In [44]:
# `~` (not) negates a filter expression: keep everything that is neither
# water nor espresso, and show only the first 5 rows
coffee_sales.filter(~water_or_espresso).head(5)

drink,price,timestamp,city
str,f64,datetime[μs],str
"""cappuccino""",3.5,2025-09-17 08:15:00,"""New York"""
"""lemonade""",3.0,2025-09-07 13:02:00,"""New York"""
"""cappuccino""",3.5,2025-08-14 07:32:00,"""New York"""
"""lemonade""",3.0,2025-09-05 06:45:00,"""New York"""
"""lemonade""",3.0,2025-08-15 14:37:00,"""Austin"""


In [45]:
# "not `is_in`": negate the membership test with `~` to keep only the
# cities outside the list
outside_ny_and_austin = ~pl.col("city").is_in(["New York", "Austin"])
coffee_sales.filter(outside_ny_and_austin)


drink,price,timestamp,city
str,f64,datetime[μs],str
"""water""",2.0,2025-09-17 03:43:00,"""Los Angeles"""
"""tea""",2.5,2025-08-18 04:13:00,"""San Francisco"""
"""espresso""",3.0,2025-09-08 08:51:00,"""San Francisco"""
"""lemonade""",3.0,2025-09-06 06:45:00,"""San Francisco"""
"""espresso""",3.0,2025-08-24 11:13:00,"""Los Angeles"""
…,…,…,…
"""tea""",2.5,2025-09-23 17:57:00,"""Los Angeles"""
"""water""",2.0,2025-08-18 01:32:00,"""San Francisco"""
"""lemonade""",3.0,2025-09-26 16:35:00,"""San Francisco"""
"""cappuccino""",3.5,2025-09-24 14:21:00,"""Los Angeles"""


## Chaining Operations


In [47]:
# Chaining is safe because every step hands back a new DataFrame:
#   `.filter()` -> DataFrame
#   `.select()` -> DataFrame
filtered_selection = (
    coffee_sales
    .filter(water_or_espresso_in_austin_new_york)   # rows first
    .select(pl.col("price", "drink", "city"))       # then columns
)
filtered_selection


price,drink,city
f64,str,str
3.0,"""espresso""","""New York"""
3.0,"""espresso""","""New York"""
3.0,"""espresso""","""New York"""
3.0,"""espresso""","""New York"""
3.0,"""espresso""","""Austin"""
…,…,…
2.0,"""water""","""Austin"""
2.0,"""water""","""Austin"""
2.0,"""water""","""New York"""
2.0,"""water""","""Austin"""


## Wrap Up


- `pl.col()` → creates column expressions
- `.select()` → subset columns
- `.filter()` → subset rows
- Use `&`, `|`, and `~` for logic
- Operations return **new DataFrames**, enabling safe chaining
- Parallels pandas, but with a cleaner, index-free syntax


In [None]:
# 🔍 Polars Select & Filter -- quick reference
# Cheatsheet on GitHub
#
# Every statement below runs, but none is the cell's last expression,
# so only the final `print` produces visible output.

# --- SELECT ---
df.select(pl.col("name", "age"))    # by name
df.select(pl.col("^na.*$"))         # by regex pattern
df.select(pl.col(pl.String))        # by dtype

# --- FILTER ---
coffee_sales.filter(pl.col("price") > 3)
coffee_sales.filter(
    (pl.col("price") <= 3)
    & pl.col("drink").is_in(["water", "espresso"])
)
coffee_sales.filter(~(pl.col("city") == "Los Angeles"))

# --- CHAIN ---
(
    coffee_sales
    .filter(pl.col("city").is_in(["Austin", "New York"]))
    .select(pl.col("drink", "price", "timestamp"))
)

print("Done - Happy Coding")

Done - Happy Coding
