# Grouping and Aggregation

In [1]:
import polars as pl

In [2]:
# Relative path to the Titanic CSV that the notebook loads with pl.read_csv.
CSV_FILE = "./data/titanic.csv"

# Statistics

In [3]:
# Read the Titanic CSV into a Polars DataFrame and preview its first three rows.
df = pl.read_csv(source=CSV_FILE)
df.head(n=3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


In [4]:
# Column-wise mean of the whole frame; string columns come back as null
# (see the output row below).
df.mean()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,


In [5]:
# Per-column summary: count, null_count, mean, std, min, 25%/50%/75% and max.
df.describe()

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. Anthony""","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""25%""",224.0,0.0,2.0,,,20.0,0.0,0.0,,7.925,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""75%""",669.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,
"""max""",891.0,1.0,3.0,"""van Melkebeke, Mr. Philemon""","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""


In [6]:
# Same summary as describe(), but with custom percentile rows
# instead of the default quartiles.
custom_percentiles = (0.1, 0.3, 0.5, 0.7, 0.9)
df.describe(percentiles=custom_percentiles)

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. Anthony""","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""30%""",268.0,0.0,2.0,,,22.0,0.0,0.0,,8.05,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""70%""",624.0,1.0,3.0,,,36.0,1.0,0.0,,27.0,,
"""90%""",802.0,1.0,3.0,,,50.0,1.0,2.0,,77.9583,,


In [7]:
# Mean of a single column, written as a reusable Polars expression.
mean_fare = pl.col("Fare").mean()
df.select(mean_fare)

Fare
f64
32.204208


In [8]:
# Small single-column frame holding the integers 0..11, used by the
# window-function examples.
df_rolling = pl.DataFrame({"value": range(12)})
df_rolling

value
i64
0
1
2
3
4
…
7
8
9
10


In [9]:
# Mean over a 3-row window ending at the current row; the first two rows have
# no full window and are null in the output.
trailing_mean = pl.col("value").rolling_mean(window_size=3)
df_rolling.with_columns(trailing_mean.alias("rolling_mean_value"))

value,rolling_mean_value
i64,f64
0,
1,
2,1.0
3,2.0
4,3.0
…,…
7,6.0
8,7.0
9,8.0
10,9.0


In [11]:
# Compare the strict 3-row window with one that already produces a value when
# only a single sample is available (min_samples=1).
value_col = pl.col("value")
df_rolling.with_columns(
    value_col.rolling_mean(window_size=3).alias("rolling_mean_value"),
    value_col.rolling_mean(window_size=3, min_samples=1).alias("rolling_mean_value_min_periods"),
)

value,rolling_mean_value,rolling_mean_value_min_periods
i64,f64,f64
0,,0.0
1,,0.5
2,1.0,1.0
3,2.0,2.0
4,3.0,3.0
…,…,…
7,6.0,6.0
8,7.0,7.0
9,8.0,8.0
10,9.0,9.0


In [12]:
# Rolling mean with center=True, so the 3-row window is centred on each row
# rather than ending at it.
centered_mean = pl.col("value").rolling_mean(window_size=3, center=True)
df_rolling.with_columns(centered_mean.alias("rolling_mean_value_center"))

value,rolling_mean_value_center
i64,f64
0,
1,1.0
2,2.0
3,3.0
4,4.0
…,…
7,7.0
8,8.0
9,9.0
10,10.0


In [13]:
# Put the plain rolling mean next to exponentially weighted means with two
# different spans for comparison.
value_col = pl.col("value")
df_rolling.with_columns(
    value_col.rolling_mean(window_size=3).alias("rolling_mean_value"),
    value_col.ewm_mean(span=3).alias("ewm_mean_value"),
    value_col.ewm_mean(span=5).alias("ewm_mean_high_span"),
)

value,rolling_mean_value,ewm_mean_value,ewm_mean_high_span
i64,f64,f64,f64
0,,0.0,0.0
1,,0.666667,0.6
2,1.0,1.428571,1.263158
3,2.0,2.266667,1.984615
4,3.0,3.16129,2.758294
…,…,…,…
7,6.0,6.031373,5.324822
8,7.0,7.017613,6.240363
9,8.0,8.009775,7.176476
10,9.0,9.005374,8.128659


In [15]:
# Min and max of every Int64 column; name.suffix keeps the source column name
# and appends a tag so the two results do not collide.
int_columns = pl.col(pl.Int64)
df_rolling.select(
    int_columns.min().name.suffix("_min"),
    int_columns.max().name.suffix("_max"),
)

value_min,value_max
i64,i64
0,11


In [17]:
# Min-max scale Fare into [0, 1]: (x - min) / (max - min).
fare = pl.col("Fare")
fare_min = fare.min()
fare_range = fare.max() - fare_min
fare_scaled = ((fare - fare_min) / fare_range).name.suffix("_scaled")

# Show the original and scaled values side by side, largest fares first.
(
    df
    .with_columns(fare_scaled)
    .select("Fare", "Fare_scaled")
    .sort("Fare", descending=True)
)

Fare,Fare_scaled
f64,f64
512.3292,1.0
512.3292,1.0
512.3292,1.0
263.0,0.513342
263.0,0.513342
…,…
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0


In [18]:
# Two-column integer frame used by the horizontal (row-wise) examples.
df_hor = pl.DataFrame({"vals1": [0, 1, 2], "val2": [3, 4, 5]})
df_hor

vals1,val2
i64,i64
0,3
1,4
2,5


In [19]:
# Row-wise max, min and sum across all original columns; each keyword becomes
# the name of the new column.
df_hor.with_columns(
    max=pl.max_horizontal(pl.all()),
    min=pl.min_horizontal(pl.all()),
    sum=pl.sum_horizontal(pl.all()),
)

vals1,val2,max,min,sum
i64,i64,i64,i64,i64
0,3,3,0,3
1,4,4,1,5
2,5,5,2,7


In [20]:
# DataFrame-level shortcut: row-wise maximum returned as a single column
# named "max" (see output).
df_hor.max_horizontal()

max
i64
3
4
5


In [21]:
# Row-wise running total across the columns; the output shows it as one struct
# column named "cum_sum".
running_total = pl.cum_sum_horizontal(pl.all())
df_hor.with_columns(running_total)

vals1,val2,cum_sum
i64,i64,struct[2]
0,3,"{0,3}"
1,4,"{1,5}"
2,5,"{2,7}"


In [22]:
# Gather each row's values from all columns into a single list column.
df_hor.with_columns(concat=pl.concat_list(pl.all()))

vals1,val2,concat
i64,i64,list[i64]
0,3,"[0, 3]"
1,4,"[1, 4]"
2,5,"[2, 5]"
