In [3]:
import polars as pl

df = pl.read_csv("iris.csv")
df

sepal_length,sepal_width,petal_length,petal_width,species
f64,f64,f64,f64,str
5.1,3.5,1.4,0.2,"""setosa"""
4.9,3.0,1.4,0.2,"""setosa"""
4.7,3.2,1.3,0.2,"""setosa"""
4.6,3.1,1.5,0.2,"""setosa"""
5.0,3.6,1.4,0.2,"""setosa"""
5.4,3.9,1.7,0.4,"""setosa"""
4.6,3.4,1.4,0.3,"""setosa"""
5.0,3.4,1.5,0.2,"""setosa"""
4.4,2.9,1.4,0.2,"""setosa"""
4.9,3.1,1.5,0.1,"""setosa"""


In [6]:
(
    df.filter(
        pl.col("sepal_length") > 5  #筛选出sepal_length大于5的行
        )
      .groupby("species")  #按species分组
      .agg(pl.all().sum()) #计算所有列的组内求和
)

species,sepal_length,sepal_width,petal_length,petal_width
str,f64,f64,f64,f64
"""setosa""",116.9,81.7,33.2,6.1
"""versicolor""",281.9,131.8,202.9,63.3
"""virginica""",324.5,146.2,273.1,99.6


## 构建示例数据

In [31]:
import polars as pl
import numpy as np

np.random.seed(12)

df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.154163,"""A"""
2.0,"""ham""",0.74,"""A"""
3.0,"""spam""",0.263315,"""B"""
,"""egg""",0.533739,"""C"""
5.0,,0.014575,"""B"""


## 表达式

#### 计算唯一值

In [8]:
out = df.select(
    [
        pl.col("names").n_unique().alias("unique_names_1"),
        # 计算name列中的唯一值，并把结果存为名为unique_names_1的列中
        pl.col("names").unique().count().alias("unique_names_2"),
    ]
)
out

unique_names_1,unique_names_2
u32,u32
5,5


### 各种聚合

In [9]:
out = df.select(
    [
        pl.sum("random").alias("sum"), #对random列求和，并将结果存在sum列中
        pl.min("random").alias("min"),
        pl.max("random").alias("max"),
        pl.col("random").max().alias("other_max"),
        pl.std("random").alias("std dev"),
        pl.var("random").alias("variance"),
    ]
)
out

sum,min,max,other_max,std dev,variance
f64,f64,f64,f64,f64,f64
1.705842,0.014575,0.74,0.74,0.293209,0.085971


### 过滤器和条件

In [11]:
out = df.select(
    [
        pl.col("names").filter( #选出names列，然后按条件筛选
            pl.col("names").str.contains(r"am$")  # 条件为该列的值包含am字符
        ).count(),
    ]
)
out

names
u32
2


### 二进制函数和修改

In the example below   
we use a conditional to create a new expression in the following  
 `when -> then -> otherwise` construct.  
The `when` function requires a predicate expression(断言，可推导表达式) (and thus leads to a boolean Series).   
The `then` function expects an expression that will be used in case the predicate evaluates to true,  
and the `otherwise` function expects an expression that will be used in case the predicate evaluates to false.

In [13]:
out = df.select(
    [
        pl.when(
            pl.col("random") > 0.5
        ).then(0).otherwise( pl.col("random") ) * pl.sum("nrs"),
        #如果random的值大于0.5，则取0
        #否则，令值为random的值
        #所有的结果乘以pl.sum("nrs") = 11
    ]
)
out

literal
f64
1.695791
0.0
2.896465
0.0
0.160325


In [None]:
# 注意，直接打印pl.sum("nrs")是没用的
df.select(
    pl.sum("nrs")
)

### 窗口表达式

A polars expression can also do an implicit GROUPBY, AGGREGATION, and JOIN in a single expression.   
In the examples below we do a GROUPBY OVER "groups" and AGGREGATE SUM of "random",   
按groups列分组，然后对random列进行组内求和  
and in the next expression we GROUPBY OVER "names" and AGGREGATE a LIST of "random".   
按name列分组，然后对random列转为list  
These window functions can be combined with other expressions and are an efficient way to determine group statistics.   
See more on those group statistics here.

In [32]:
out = df[
    [
        pl.col("*"),  # select all，即输出结果的前4列
        pl.col("random").sum().over("groups").alias("sum[random]/groups"),
        # 
        pl.col("random").list().over("names").alias("random/name"),
    ]
]
out

nrs,names,random,groups,sum[random]/groups,random/name
i64,str,f64,str,f64,list
1.0,"""foo""",0.154163,"""A""",0.894213,[0.154163]
2.0,"""ham""",0.74,"""A""",0.894213,[0.74]
3.0,"""spam""",0.263315,"""B""",0.2778,[0.263315]
,"""egg""",0.533739,"""C""",0.533739,[0.533739]
5.0,,0.014575,"""B""",0.2778,[0.014575]


## Expression contexts

### 选择context

In [26]:
out = df.select(
    [
        pl.sum("nrs"), #求和，被自动扩充
        pl.col("names").sort(),
        pl.col("names").first().alias("first name"), # first()得到第一个元素
        (pl.mean("nrs") * 10).alias("10xnrs"),
    ]
)
out

nrs,names,first name,10xnrs
i64,str,str,f64
11,,"""foo""",27.5
11,"""egg""","""foo""",27.5
11,"""foo""","""foo""",27.5
11,"""ham""","""foo""",27.5
11,"""spam""","""foo""",27.5


### 添加列df.with_columns

In [34]:
out = df.with_columns(
    [
        pl.sum("nrs").alias("nrs_sum"),
        pl.col("random").count().alias("count"),
    ]
)
out
# 示例代码有误

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
1.0,"""foo""",0.154163,"""A""",11,5
2.0,"""ham""",0.74,"""A""",11,5
3.0,"""spam""",0.263315,"""B""",11,5
,"""egg""",0.533739,"""C""",11,5
5.0,,0.014575,"""B""",11,5


### 分组上下文df.groupby().agg()

In [35]:
out = df.groupby("groups").agg(
    [
        pl.sum("nrs"),  # sum nrs by groups，对nrs分组求和
        pl.col("random").count().alias("count"),  
        # count group members
        # 对random分组计数

        pl.col("random").filter(
            pl.col("names").is_not_null()
        ).sum().suffix("_sum"),
        # sum random where name != null
        # 如果name值不为空，就选出random的值，然后求和，并添加后缀名
        
        pl.col("names").reverse().alias(("reversed names")),
        # 反向排序
    ]
)
out

groups,nrs,count,random_sum,reversed names
str,i64,u32,f64,list
"""B""",8.0,2,0.263315,"[null, ""spam""]"
"""A""",3.0,2,0.894213,"[""ham"", ""foo""]"
"""C""",,1,0.533739,"[""egg""]"


## 分组

### 导入数据
说明，数据来自于 https://github.com/unitedstates/congress-legislators  
但是直接下载整个包没啥数据
我是在页面中选择
|File|Download|Description|
|:----|:----|:----|
|legislators-current|YAML JSON CSV|Currently serving Members of Congress.|


然后下载了其中的CSV文件

用这个工具
https://markdown-convert.com/zh/tool/table


In [1]:
import polars as pl

In [2]:
dataset = pl.read_csv('./dataset/legislators-current.csv')
dataset.head(10)

last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,district,senate_class,party,url,address,phone,contact_form,rss_url,twitter,facebook,youtube,youtube_id,bioguide_id,thomas_id,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
str,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,i64,i64,str,str,i64,str
"""Brown""","""Sherrod""",,,,"""Sherrod Brown""","""1952-11-09""","""M""","""sen""","""OH""",,1,"""Democrat""","""https://www.brown.senate.gov""","""503 Hart Senate Office Building Washington DC 20510""","""202-224-2315""","""https://www.brown.senate.gov/contact/""","""http://www.brown.senate.gov/rss/feeds/?type=all&""","""SenSherrodBrown""","""SenatorSherrodBrown""","""SherrodBrownOhio""","""UCgy8jfERh-t_ixkKKoCmglQ""","""B000944""",136,"""N00003535""","""S307""","""H2OH13033,S6OH00163""",5051,400050,27018,"""Sherrod Brown""",,29389,"""Sherrod Brown"""
"""Cantwell""","""Maria""",,,,"""Maria Cantwell""","""1958-10-13""","""F""","""sen""","""WA""",,1,"""Democrat""","""https://www.cantwell.senate.gov""","""511 Hart Senate Office Building Washington DC 20510""","""202-224-3441""","""https://www.cantwell.senate.gov/public/index.cfm/email-maria""","""http://www.cantwell.senate.gov/public/index.cfm/rss/feed""","""SenatorCantwell""","""senatorcantwell""","""SenatorCantwell""","""UCN52UDqKgvHRk39ncySrIMw""","""C000127""",172,"""N00007836""","""S275""","""S8WA00194,H2WA01054""",26137,300018,27122,"""Maria Cantwell""",,39310,"""Maria Cantwell"""
"""Cardin""","""Benjamin""","""L.""",,,"""Benjamin L. Cardin""","""1943-10-05""","""M""","""sen""","""MD""",,1,"""Democrat""","""https://www.cardin.senate.gov""","""509 Hart Senate Office Building Washington DC 20510""","""202-224-4524""","""https://www.cardin.senate.gov/contact/""","""http://www.cardin.senate.gov/rss/feeds/?type=all""","""SenatorCardin""","""senatorbencardin""","""senatorcardin""","""UCiQaJnMzlfzzG3VESgyZChA""","""C000141""",174,"""N00001955""","""S308""","""H6MD03177,S6MD03177""",4004,400064,26888,"""Ben Cardin""",,15408,"""Ben Cardin"""
"""Carper""","""Thomas""","""Richard""",,,"""Thomas R. Carper""","""1947-01-23""","""M""","""sen""","""DE""",,1,"""Democrat""","""https://www.carper.senate.gov/public""","""513 Hart Senate Office Building Washington DC 20510""","""202-224-2441""","""https://www.carper.senate.gov/public/index.cfm/email-senator-carper""","""http://www.carper.senate.gov/public/index.cfm/rss/feed""","""SenatorCarper""","""tomcarper""","""senatorcarper""","""UCgLnvbKwu4B3navofj6Qvvw""","""C000174""",179,"""N00012508""","""S277""","""S8DE00079""",663,300019,22421,"""Tom Carper""",,15015,"""Tom Carper"""
"""Casey""","""Robert""","""P.""","""Jr.""","""Bob""","""Robert P. Casey, Jr.""","""1960-04-13""","""M""","""sen""","""PA""",,1,"""Democrat""","""https://www.casey.senate.gov""","""393 Russell Senate Office Building Washington DC 20510""","""202-224-6324""","""https://www.casey.senate.gov/contact""","""http://www.casey.senate.gov/rss/feeds/?all""","""SenBobCasey""","""SenatorBobCasey""","""SenatorBobCasey""","""UCtVssXhx-KuZa-hSvnsnJ0A""","""C001070""",1828,"""N00027503""","""S309""","""S6PA00217""",47036,412246,2541,"""Bob Casey, Jr.""",,40703,"""Bob Casey Jr."""
"""Feinstein""","""Dianne""",,,,"""Dianne Feinstein""","""1933-06-22""","""F""","""sen""","""CA""",,1,"""Democrat""","""https://www.feinstein.senate.gov""","""331 Hart Senate Office Building Washington DC 20510""","""202-224-3841""","""https://www.feinstein.senate.gov/public/index.cfm/e-mail-me""","""http://www.feinstein.senate.gov/public/?a=rss.feed""","""SenFeinstein""","""senatorfeinstein""","""SenatorFeinstein""","""UCtVC--6LR0ff2aOP8THpuEw""","""F000062""",1332,"""N00007364""","""S221""","""S0CA00199""",13061,300043,53273,"""Dianne Feinstein""",,49300,"""Dianne Feinstein"""
"""Klobuchar""","""Amy""","""Jean""",,,"""Amy Klobuchar""","""1960-05-25""","""F""","""sen""","""MN""",,1,"""Democrat""","""https://www.klobuchar.senate.gov""","""425 Dirksen Senate Office Building Washington DC 20510""","""202-224-3244""","""https://www.klobuchar.senate.gov/public/index.cfm/contact""",,"""SenAmyKlobuchar""",,"""senatorklobuchar""","""UCvdeJsDsV51tFb_hVqvtYGA""","""K000367""",1826,"""N00027500""","""S311""","""S6MN00267""",83701,412242,65092,"""Amy Klobuchar""",,40700,"""Amy Klobuchar"""
"""Menendez""","""Robert""",,,"""Bob""","""Robert Menendez""","""1954-01-01""","""M""","""sen""","""NJ""",,1,"""Democrat""","""https://www.menendez.senate.gov""","""528 Hart Senate Office Building Washington DC 20510""","""202-224-4744""","""https://www.menendez.senate.gov/contact""","""http://www.menendez.senate.gov/rss/feeds/index.cfm?type=news""","""SenatorMenendez""","""senatormenendez""","""SenatorMenendezNJ""","""UC0PV0K9Z5a9p3D5917KF5fw""","""M000639""",791,"""N00000699""","""S306""","""H2NJ13075,S6NJ00289""",29608,400272,26961,"""Bob Menendez""",,29373,"""Bob Menendez"""
"""Sanders""","""Bernard""",,,"""Bernie""","""Bernard Sanders""","""1941-09-08""","""M""","""sen""","""VT""",,1,"""Independent""","""https://www.sanders.senate.gov""","""332 Dirksen Senate Office Building Washington DC 20510""","""202-224-5141""","""https://www.sanders.senate.gov/contact/""","""http://www.sanders.senate.gov/rss/""","""SenSanders""","""senatorsanders""","""senatorsanders""","""UCD_DaKNac0Ta-2PeHuoQ1uA""","""S000033""",1010,"""N00000528""","""S313""","""H8VT01016,S4VT00033""",994,400357,27110,"""Bernie Sanders""",,29147,"""Bernie Sanders"""
"""Stabenow""","""Debbie""","""Ann""",,,"""Debbie Stabenow""","""1950-04-29""","""F""","""sen""","""MI""",,1,"""Democrat""","""https://www.stabenow.senate.gov""","""731 Hart Senate Office Building Washington DC 20510""","""202-224-4822""","""https://www.stabenow.senate.gov/contact""","""http://stabenow.senate.gov/rss/?p=news""","""SenStabenow""","""SenatorStabenow""","""senatorstabenow""","""UCFoDKCvxSwCUfDv-4Eg4K5A""","""S000770""",1531,"""N00004118""","""S284""","""S8MI00281,H6MI08163""",45451,300093,515,"""Debbie Stabenow""",,29732,"""Debbie Stabenow"""


In [3]:
dataset.columns

['last_name',
 'first_name',
 'middle_name',
 'suffix',
 'nickname',
 'full_name',
 'birthday',
 'gender',
 'type',
 'state',
 'district',
 'senate_class',
 'party',
 'url',
 'address',
 'phone',
 'contact_form',
 'rss_url',
 'twitter',
 'facebook',
 'youtube',
 'youtube_id',
 'bioguide_id',
 'thomas_id',
 'opensecrets_id',
 'lis_id',
 'fec_ids',
 'cspan_id',
 'govtrack_id',
 'votesmart_id',
 'ballotpedia_id',
 'washington_post_id',
 'icpsr_id',
 'wikipedia_id']

In [4]:
q = (
    dataset.lazy()
    .groupby("first_name") #按first_name列分组，即第2列
    .agg(
        [
            pl.count(),               #组内计数
            pl.col("gender"),         #组内值列表化 .list()有没有都一样
            pl.first("last_name"),    #last_name列组内的第一个值
        ]
    )
    .sort("count", reverse=True)      #按上面聚合得到的count函数排序
    .limit(5)                         #只取结果的前5行
)
# type(q)
# polars.internals.lazy_frame.LazyFrame
# 所以通过collect函数得到
df = q.collect()
df

first_name,count,gender,last_name
str,u32,list,str
"""John""",19,"[""M"", ""M"", ... ""M""]","""Barrasso"""
"""Mike""",13,"[""M"", ""M"", ... ""M""]","""Kelly"""
"""Michael""",11,"[""M"", ""M"", ... ""M""]","""Bennet"""
"""David""",10,"[""M"", ""M"", ... ""M""]","""Cicilline"""
"""James""",9,"[""M"", ""M"", ... ""M""]","""Inhofe"""


### 条件句

假设我们想知道一个“州”有多少代表是民主党或共和党。  
然后按Republican降序排序

In [5]:
q = (
    dataset.lazy()
    .groupby("state")
    .agg(
        [
            (
                pl.col("party") == "Democrat"
            ).sum().alias("Democrat"),
            (
                pl.col("party") == "Republican"
            ).sum().alias("Republican"),
        ]
    )
    .sort("Republican", reverse=True)
    .limit(10)
)

df = q.collect()
df

state,Democrat,Republican
str,u32,u32
"""TX""",12,25
"""FL""",11,18
"""OH""",5,13
"""PA""",10,10
"""NC""",5,10
"""CA""",44,10
"""TN""",2,9
"""IN""",2,9
"""MO""",2,8
"""SC""",1,8


上述代码的另一种实现方式：嵌套分组  
这个逻辑我不是很喜欢

In [6]:
q = (
    dataset.lazy()
    .groupby(["state", "party"])
    .agg( 
            [pl.count("party").alias("count")]
        )    # 至此已经完成分组，并设定要求为计数
    .filter( # 设定在上面的基础上的计数对象
        (pl.col("party") == "Democrat") | 
            (pl.col("party") == "Republican")
    )
    .sort("count", reverse=True)
    .limit(10)
)

df = q.collect()
df

state,party,count
str,str,u32
"""CA""","""Democrat""",44
"""TX""","""Republican""",25
"""NY""","""Democrat""",21
"""FL""","""Republican""",18
"""IL""","""Democrat""",15
"""OH""","""Republican""",13
"""NJ""","""Democrat""",12
"""TX""","""Democrat""",12
"""MA""","""Democrat""",11
"""FL""","""Democrat""",11


### 过滤

在解决问题之前，学下时间处理的知识  
否则代码会报错  
下面的代码，我们将date列转换为时间格式

In [22]:
import polars as pl

dated = pl.DataFrame(
    {
        "date": ["2020-01-02", "2020-01-03", "2020-01-04"], 
        "index": [1, 2, 3]
    }
)

# 注意，这里with_column比较好用
# 本来是新增一列的，但是，如果我们没有使用alias，则会是替换掉原来的同名列
q = dated.lazy().with_column(
        pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").alias("b_squared")
    )

df = q.collect()
df

date,index,b_squared
str,i64,date
"""2020-01-02""",1,2020-01-02
"""2020-01-03""",2,2020-01-03
"""2020-01-04""",3,2020-01-04


如果使用select，则应书写如下

In [23]:
q = dated.lazy().select(
        [
        pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
        pl.col('index'),
        ]
    )

df = q.collect()
df

date,index
date,i64
2020-01-02,1
2020-01-03,2
2020-01-04,3


In [8]:
df.dtypes
# polars.datatypes.Date

[polars.datatypes.Date, polars.datatypes.Int64]

下面就是年月日时分秒的形式  
其中format可以查看datetime的api或者rust的api

In [26]:
import polars as pl

dated = pl.DataFrame(
    {
        "date": ["2020-01-02 01:11:11", "2020-01-03 02:22:22", "2020-01-04 03:33:33"], 
        "index": [1, 2, 3]
    }
)

q = dated.lazy().with_column(
        pl.col("date").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
    )

df = q.collect()
df

date,index
datetime,i64
2020-01-02 01:11:11,1
2020-01-03 02:22:22,2
2020-01-04 03:33:33,3


In [10]:
q = dataset.lazy().with_column(
        pl.col("birthday").str.strptime(pl.Date, "%Y-%m-%d")
    )

datasetn = q.collect()
datasetn.head(3)

last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,district,senate_class,party,url,address,phone,contact_form,rss_url,twitter,facebook,youtube,youtube_id,bioguide_id,thomas_id,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
str,str,str,str,str,str,date,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,i64,i64,str,str,i64,str
"""Brown""","""Sherrod""",,,,"""Sherrod Brown""",1952-11-09,"""M""","""sen""","""OH""",,1,"""Democrat""","""https://www.brown.senate.gov""","""503 Hart Senate Office Building Washington DC 20510""","""202-224-2315""","""https://www.brown.senate.gov/contact/""","""http://www.brown.senate.gov/rss/feeds/?type=all&""","""SenSherrodBrown""","""SenatorSherrodBrown""","""SherrodBrownOhio""","""UCgy8jfERh-t_ixkKKoCmglQ""","""B000944""",136,"""N00003535""","""S307""","""H2OH13033,S6OH00163""",5051,400050,27018,"""Sherrod Brown""",,29389,"""Sherrod Brown"""
"""Cantwell""","""Maria""",,,,"""Maria Cantwell""",1958-10-13,"""F""","""sen""","""WA""",,1,"""Democrat""","""https://www.cantwell.senate.gov""","""511 Hart Senate Office Building Washington DC 20510""","""202-224-3441""","""https://www.cantwell.senate.gov/public/index.cfm/email-maria""","""http://www.cantwell.senate.gov/public/index.cfm/rss/feed""","""SenatorCantwell""","""senatorcantwell""","""SenatorCantwell""","""UCN52UDqKgvHRk39ncySrIMw""","""C000127""",172,"""N00007836""","""S275""","""S8WA00194,H2WA01054""",26137,300018,27122,"""Maria Cantwell""",,39310,"""Maria Cantwell"""
"""Cardin""","""Benjamin""","""L.""",,,"""Benjamin L. Cardin""",1943-10-05,"""M""","""sen""","""MD""",,1,"""Democrat""","""https://www.cardin.senate.gov""","""509 Hart Senate Office Building Washington DC 20510""","""202-224-4524""","""https://www.cardin.senate.gov/contact/""","""http://www.cardin.senate.gov/rss/feeds/?type=all""","""SenatorCardin""","""senatorbencardin""","""senatorcardin""","""UCiQaJnMzlfzzG3VESgyZChA""","""C000141""",174,"""N00001955""","""S308""","""H6MD03177,S6MD03177""",4004,400064,26888,"""Ben Cardin""",,15408,"""Ben Cardin"""


In [41]:
from datetime import date

# 计算截止到2021.1.1的年龄
def compute_age() -> pl.Expr:
    # 函数后面跟着的-> pl.Expr是函数返回值的类型建议符，
    # 用来说明该函数返回的值是什么类型。
    # 返回出生日期距离2021,。1.1的天数
    return date(2021, 1, 1).year - pl.col("birthday").dt.year()

# 计算性别=gender的组内均值
def avg_birthday(gender: str) -> pl.Expr:
    return compute_age().filter(
            pl.col("gender") == gender
        ).mean().alias(f"avg {gender} birthday")


q = (
    datasetn.lazy()
    .groupby(["state"],maintain_order=True)
    .agg(
        [
            avg_birthday("M"), #女性年龄均值
            avg_birthday("F"),
            (pl.col("gender") == "M").count().alias("# male"), #女性的数量
            (pl.col("gender") == "F").sum().alias("# female"),
        ]
    )
)
# 卡住我了。
# 1 为啥要对lpsum
# 2 为啥每次输出的结果在随机变化，是因为并行了吧
df = q.collect()
df

state,avg M birthday,avg F birthday,# male,# female
str,f64,f64,u32,u32
"""OH""",59.466667,64.0,18,3
"""WA""",56.25,57.0,12,8
"""MD""",67.8,,10,0
"""DE""",66.0,59.0,3,1
"""PA""",56.9375,60.5,20,4
"""CA""",57.323529,67.75,54,20
"""MN""",55.666667,56.0,9,6
"""NJ""",62.166667,62.5,14,2
"""VT""",78.333333,,3,0
"""MI""",60.444444,55.571429,16,7


由于并行的原因，结果每次都不通  
所以如果需要每次结果都相同，则要使用下面的语句
> 
> ```python
> .groupby(["state"],maintain_order=True)
> ```

但是这会增加计算时间的开销

### 排序

假设我们想得到每个州最年长和最年轻的政治家的名字。

In [12]:
def get_person() -> pl.Expr:
    return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")

#
q = (
    dataset.lazy()
    .sort("birthday")
    .groupby(["state"])
    .agg(
        [
            get_person().first().alias("youngest"),
            get_person().last().alias("oldest"),
        ]
    )
    .limit(5)
)

df = q.collect()
df

state,youngest,oldest
str,str,str
"""CA""","""Dianne Feinstein""","""Sara Jacobs"""
"""NM""","""Teresa Leger Fernandez""","""Melanie Stansbury"""
"""ID""","""James Risch""","""Russ Fulcher"""
"""VT""","""Patrick Leahy""","""Peter Welch"""
"""GA""","""David Scott""","""Jon Ossoff"""


如果我们还想按字母顺序对名称进行排序

In [13]:
def get_person() -> pl.Expr:
    return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")


q = (
    dataset.lazy()
    .sort("birthday")
    .groupby(["state"])
    .agg(
        [
            get_person().first().alias("youngest"),
            get_person().last().alias("oldest"),
            get_person().sort().first().alias("alphabetical_first"),
        ]
    )
    .limit(5)
)

df = q.collect()
df

state,youngest,oldest,alphabetical_first
str,str,str,str
"""VT""","""Patrick Leahy""","""Peter Welch""","""Bernard Sanders"""
"""TX""","""Eddie Johnson""","""Dan Crenshaw""","""Al Green"""
"""GA""","""David Scott""","""Jon Ossoff""","""A. Ferguson"""
"""LA""","""John Kennedy""","""Julia Letlow""","""Bill Cassidy"""
"""SC""","""James Clyburn""","""William Timmons""","""James Clyburn"""


我们甚至可以按groupby上下文中的另一列进行排序。 

如果我们还想知道按字母顺序排列的第一个名字名字对应的人是男性还是女性

In [43]:
def get_person() -> pl.Expr:
    return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")


q = (
    dataset.lazy()
    .sort("birthday")
    .groupby(["state"])
    .agg(
        [
            get_person().first().alias("youngest"),
            get_person().last().alias("oldest"),
            get_person().sort().first().alias("alphabetical_first"),
            pl.col("gender").sort_by("first_name").first().alias("gender"),
        ]
    )
    .sort("state")
    .limit(5)
)

df = q.collect()
df

state,youngest,oldest,alphabetical_first,gender
str,str,str,str,str
"""AK""","""Lisa Murkowski""","""Dan Sullivan""","""Dan Sullivan""","""M"""
"""AL""","""Richard Shelby""","""Barry Moore""","""Barry Moore""","""M"""
"""AR""","""John Boozman""","""Tom Cotton""","""Bruce Westerman""","""M"""
"""AS""","""Aumua Amata Radewagen""","""Aumua Amata Radewagen""","""Aumua Amata Radewagen""","""F"""
"""AZ""","""Tom O’Halleran""","""Ruben Gallego""","""Andy Biggs""","""M"""


注意：  
sort是对DataFrame操作的  
而sort_by是对column操作的，其实也是联动排序