## [Polars Expressions](https://pola-rs.github.io/polars-book/user-guide/dsl/intro.html)

In [10]:
!pip install polars --upgrade --user



In [3]:
import polars as pl

In [4]:
pl.__version__

'0.16.9'

In [5]:
# then let's load some csv data with information about pokemon
df = pl.read_csv(
    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
)

In [6]:
", ".join(df.columns)

'#, Name, Type 1, Type 2, Total, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed, Generation, Legendary'

In [9]:
df

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,false
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,false
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,false
3,"""VenusaurMega V...","""Grass""","""Poison""",625,80,100,123,122,120,80,1,false
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,false
5,"""Charmeleon""","""Fire""",,405,58,64,58,80,65,80,1,false
6,"""Charizard""","""Fire""","""Flying""",534,78,84,78,109,85,100,1,false
6,"""CharizardMega ...","""Fire""","""Dragon""",634,78,130,111,130,85,100,1,false
6,"""CharizardMega ...","""Fire""","""Flying""",634,78,104,78,159,115,100,1,false
7,"""Squirtle""","""Water""",,314,44,48,65,50,64,43,1,false


### [Expressions](https://pola-rs.github.io/polars-book/user-guide/dsl/expressions.html)

In [8]:
df.select(
    [
        pl.col("Name").n_unique().alias("unique_names_1"),
        pl.col("Name").unique().count().alias("unique_names_2"),
    ]
)

unique_names_1,unique_names_2
u32,u32
163,163


In [14]:
df.select([
    pl.col("Name").sort().head(5).alias("first_names"),
    pl.col("Attack").filter(pl.col("Type 1") == "Fire").sort().head(5).alias("top_attack")
])

# Note: first_names and top_attack are independent

first_names,top_attack
str,i64
"""Abra""",41
"""Aerodactyl""",52
"""AerodactylMega...",64
"""Alakazam""",70
"""AlakazamMega A...",76


In [15]:
df.filter(pl.col("Name") == "Abra")

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
63,"""Abra""","""Psychic""",,310,25,20,15,105,55,90,1,False


### [Windows functions](https://pola-rs.github.io/polars-book/user-guide/dsl/window_functions.html)

The power of window expressions is that you often don't need a groupby -> explode combination, but you can put the logic in a single expression.

- groupby -> marks that groups are aggregated and we expect a DataFrame of size n_groups
- over -> marks that we want to compute something within a group, but doesn't modify the original size of the DataFrame

In [4]:
df

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,false
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,false
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,false
3,"""VenusaurMega V...","""Grass""","""Poison""",625,80,100,123,122,120,80,1,false
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,false
5,"""Charmeleon""","""Fire""",,405,58,64,58,80,65,80,1,false
6,"""Charizard""","""Fire""","""Flying""",534,78,84,78,109,85,100,1,false
6,"""CharizardMega ...","""Fire""","""Dragon""",634,78,130,111,130,85,100,1,false
6,"""CharizardMega ...","""Fire""","""Flying""",634,78,104,78,159,115,100,1,false
7,"""Squirtle""","""Water""",,314,44,48,65,50,64,43,1,false


#### [Groupby Aggregations in selection](https://pola-rs.github.io/polars-book/user-guide/dsl/window_functions.html#groupby-aggregations-in-selection)

In [5]:
df.select(
    [
        "Type 1",
        "Type 2",
        pl.col("Attack").mean().over(["Type 1"]).alias("avg_attack_by_type"),
        pl.col("Defense").mean().over(["Type 1", "Type 2"]).alias("avg_defense_by_type_combination"),
        pl.col("Attack").mean().alias("avg_attack"),
    ]
)


Type 1,Type 2,avg_attack_by_type,avg_defense_by_type_combination,avg_attack
str,str,f64,f64,f64
"""Grass""","""Poison""",72.923077,67.8,75.349693
"""Grass""","""Poison""",72.923077,67.8,75.349693
"""Grass""","""Poison""",72.923077,67.8,75.349693
"""Grass""","""Poison""",72.923077,67.8,75.349693
"""Fire""",,88.642857,58.3,75.349693
"""Fire""",,88.642857,58.3,75.349693
"""Fire""","""Flying""",88.642857,82.0,75.349693
"""Fire""","""Dragon""",88.642857,111.0,75.349693
"""Fire""","""Flying""",88.642857,82.0,75.349693
"""Water""",,74.193548,74.526316,75.349693


In [10]:
filtered = df.filter(pl.col("Type 2") == "Psychic").select(
    [
        "Type 1",
        "Name",
        "Speed",
    ]
)
# print(filtered)
filtered

Type 1,Name,Speed
str,str,i64
"""Water""","""Slowpoke""",15
"""Water""","""Slowbro""",30
"""Water""","""SlowbroMega Sl...",30
"""Grass""","""Exeggcute""",40
"""Grass""","""Exeggutor""",55
"""Water""","""Starmie""",115
"""Ice""","""Jynx""",95


In [11]:
out = filtered.with_columns(
    [
        pl.col(["Name", "Speed"]).sort(descending=True).over("Type 1"),
    ]
)
print(out)

shape: (7, 3)
┌────────┬─────────────────────┬───────┐
│ Type 1 ┆ Name                ┆ Speed │
│ ---    ┆ ---                 ┆ ---   │
│ str    ┆ str                 ┆ i64   │
╞════════╪═════════════════════╪═══════╡
│ Water  ┆ Starmie             ┆ 115   │
│ Water  ┆ Slowpoke            ┆ 30    │
│ Water  ┆ SlowbroMega Slowbro ┆ 30    │
│ Grass  ┆ Exeggutor           ┆ 55    │
│ Grass  ┆ Exeggcute           ┆ 40    │
│ Water  ┆ Slowbro             ┆ 15    │
│ Ice    ┆ Jynx                ┆ 95    │
└────────┴─────────────────────┴───────┘


### [Window expression rules](https://pola-rs.github.io/polars-book/user-guide/dsl/window_functions.html#window-expression-rules)

'#, Name, Type 1, Type 2, Total, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed, Generation, Legendary'

In [27]:
# use Lazy API in order to reference new columns in one-go

def add_new_col(df):
    return (
        df.select(
            pl.col(["Name", "Type 1", "Attack", "Speed"])
        )
        .with_columns([
            pl.sum("Attack").alias("total_attack"),
        ])
        .with_columns([  # derived column from new column "total_attack" must be defined separately
            (pl.col("total_attack")*pl.col("Speed")).alias("attack_speed"),
        ])  
    )
df_tmp = df.filter(pl.col("Type 1") == "Grass").lazy()
df_tmp.pipe(add_new_col).collect()

Name,Type 1,Attack,Speed,total_attack,attack_speed
str,str,i64,i64,i64,i64
"""Bulbasaur""","""Grass""",49,45,948,42660
"""Ivysaur""","""Grass""",62,60,948,56880
"""Venusaur""","""Grass""",82,80,948,75840
"""VenusaurMega V...","""Grass""",100,80,948,75840
"""Oddish""","""Grass""",50,30,948,28440
"""Gloom""","""Grass""",65,40,948,37920
"""Vileplume""","""Grass""",80,50,948,47400
"""Bellsprout""","""Grass""",75,40,948,37920
"""Weepinbell""","""Grass""",90,55,948,52140
"""Victreebel""","""Grass""",105,70,948,66360


In [32]:
df.select(
    pl.col("Type 1"),
    pl.col("Name"),
    pl.col("Attack"),
    pl.col("Speed"),
    pl.sum("Attack").over("Type 1").alias("grouped_total_attacks"),
    (pl.sum("Attack")*pl.col("Speed")).over("Type 1").alias("grouped_total_attacks_speed"),
    (pl.sum("Attack")*pl.col("Speed")).list().over("Type 1").alias("grouped_total_attacks_speed_list"),
#     (pl.sum("Attack")*pl.col("Speed")).list().over("Type 1").flatten().alias("grouped_total_attacks_speed_flat"),
)

# note only a few rows per Type 1 are displayed

Type 1,Name,Attack,Speed,grouped_total_attacks,grouped_total_attacks_speed,grouped_total_attacks_speed_list
str,str,i64,i64,i64,i64,list[i64]
"""Grass""","""Bulbasaur""",49,45,948,42660,"[42660, 56880, ... 56880]"
"""Grass""","""Ivysaur""",62,60,948,56880,"[42660, 56880, ... 56880]"
"""Grass""","""Venusaur""",82,80,948,75840,"[42660, 56880, ... 56880]"
"""Grass""","""VenusaurMega V...",100,80,948,75840,"[42660, 56880, ... 56880]"
"""Fire""","""Charmander""",52,65,1241,80665,"[80665, 99280, ... 111690]"
"""Fire""","""Charmeleon""",64,80,1241,99280,"[80665, 99280, ... 111690]"
"""Fire""","""Charizard""",84,100,1241,124100,"[80665, 99280, ... 111690]"
"""Fire""","""CharizardMega ...",130,100,1241,124100,"[80665, 99280, ... 111690]"
"""Fire""","""CharizardMega ...",104,100,1241,124100,"[80665, 99280, ... 111690]"
"""Water""","""Squirtle""",48,43,2300,98900,"[98900, 133400, ... 149500]"


- sort all pokemon by type
- select the first 3 pokemon per type as "Type 1"
- sort the pokemon within a type by speed and select the first 3 as "fastest/group"
- sort the pokemon within a type by attack and select the first 3 as "strongest/group"
- sort the pokemon by name within a type and select the first 3 as "sorted_by_alphabet"

In [34]:
df.sort("Type 1").select(
    [
        pl.col("Type 1").head(3).list().over("Type 1").flatten(),
        pl.col("Name").sort_by(pl.col("Speed")).head(3).list().over("Type 1").flatten().alias("fastest-3/group"),
        pl.col("Name").sort_by(pl.col("Attack")).head(3).list().over("Type 1").flatten().alias("strongest-3/group"),
        pl.col("Name").sort().head(3).list().over("Type 1").flatten().alias("name_alphabetically"),
    ]
)


Type 1,fastest-3/group,strongest-3/group,name_alphabetically
str,str,str,str
"""Bug""","""Paras""","""Metapod""","""Beedrill"""
"""Bug""","""Metapod""","""Kakuna""","""BeedrillMega B..."
"""Bug""","""Parasect""","""Caterpie""","""Butterfree"""
"""Dragon""","""Dratini""","""Dratini""","""Dragonair"""
"""Dragon""","""Dragonair""","""Dragonair""","""Dragonite"""
"""Dragon""","""Dragonite""","""Dragonite""","""Dratini"""
"""Electric""","""Magnemite""","""Voltorb""","""Electabuzz"""
"""Electric""","""Magneton""","""Magnemite""","""Electrode"""
"""Electric""","""Pikachu""","""Electrode""","""Jolteon"""
"""Fairy""","""Clefairy""","""Clefairy""","""Clefable"""


### [Flattened window function](https://pola-rs.github.io/polars-book/user-guide/dsl/window_functions.html#flattened-window-function)

In [6]:
df.sort("Type 1").select(
    [
        pl.col("Type 1").head(3).list().over("Type 1").flatten(),
        pl.col("Name").sort_by(pl.col("Speed")).head(3).list().over("Type 1").flatten().alias("fastest/group"),
        pl.col("Name").sort_by(pl.col("Attack")).head(3).list().over("Type 1").flatten().alias("strongest/group"),
        pl.col("Name").sort().head(3).list().over("Type 1").flatten().alias("sorted_by_alphabet"),
    ]
)


Type 1,fastest/group,strongest/group,sorted_by_alphabet
str,str,str,str
"""Bug""","""Paras""","""Metapod""","""Beedrill"""
"""Bug""","""Metapod""","""Kakuna""","""BeedrillMega B..."
"""Bug""","""Parasect""","""Caterpie""","""Butterfree"""
"""Dragon""","""Dratini""","""Dratini""","""Dragonair"""
"""Dragon""","""Dragonair""","""Dragonair""","""Dragonite"""
"""Dragon""","""Dragonite""","""Dragonite""","""Dratini"""
"""Electric""","""Magnemite""","""Voltorb""","""Electabuzz"""
"""Electric""","""Magneton""","""Magnemite""","""Electrode"""
"""Electric""","""Pikachu""","""Electrode""","""Jolteon"""
"""Fairy""","""Clefairy""","""Clefairy""","""Clefable"""
