In [1]:
# 数据聚合
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Build a small demo frame: two categorical key columns plus two columns of
# standard-normal noise. A locally seeded generator is used (instead of the
# unseeded global one) so that Restart-and-Run-All reproduces the same numbers.
rng = np.random.RandomState(0)  # fixed seed -> deterministic sample data
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : rng.randn(5),
                'data2' : rng.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,2.449918,-1.012479,a,one
1,-1.427127,0.351162,a,two
2,1.097066,0.924132,b,one
3,0.161633,-0.808331,b,two
4,0.182924,0.184227,a,one


In [3]:
# Split the demo frame by key1, then compute the 0.9 quantile of data1
# within each group.
grouped = df.groupby(by='key1')
grouped['data1'].quantile(q=0.9)

key1
a    1.996519
b    1.003523
Name: data1, dtype: float64

In [4]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` may be any object exposing ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply the custom aggregation to every group. Only the numeric columns are
# selected: key2 holds strings, for which max() - min() is undefined, and
# pandas >= 2.0 raises instead of silently dropping such columns.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.877045,1.363642
b,0.935434,1.732463


In [5]:
grouped.describe() # summary statistics computed separately for each group

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.401905,1.947777,-1.427127,-0.622101,0.182924,1.316421,2.449918,3.0,-0.15903,0.743807,-1.012479,-0.414126,0.184227,0.267695,0.351162
b,2.0,0.629349,0.661451,0.161633,0.395491,0.629349,0.863208,1.097066,2.0,0.0579,1.225036,-0.808331,-0.375215,0.0579,0.491016,0.924132


In [6]:
# 优化过的聚合函数：
# count：     非NA值的数量
# sum：       非NA值的和
# mean：      非NA值的平均数
# median：    非NA值的中位数
# std/var：   无偏（分母为n - 1）的标准差和方差
# min/max：   非NA值的最小/最大值
# prod：      非NA值的积
# first/last：第一个/最后一个非NA值

In [7]:
# Load the tips dataset and append tip_pct, the tip expressed as a fraction
# of the total bill.
tips = pd.read_csv('../data/tips.csv').assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [8]:
# 面向列的多函数应用

In [9]:
# The book's original example groups by sex and smoker. The sex column is
# absent from this copy of tips.csv (the author suspects it was removed from
# the data), so the groups are formed from smoker and time instead.
grouped = tips.groupby(by=['smoker', 'time'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')  # same result as grouped_pct.mean()

smoker  time  
No      Dinner    0.158653
        Lunch     0.160920
Yes     Dinner    0.160828
        Lunch     0.170404
Name: tip_pct, dtype: float64

In [10]:
# Run three aggregations at once: two built-ins named by string plus the
# custom peak_to_peak function; each one becomes an output column.
grouped_pct.aggregate(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Dinner,0.158653,0.040458,0.235193
No,Lunch,0.16092,0.038989,0.19335
Yes,Dinner,0.160828,0.095153,0.674707
Yes,Lunch,0.170404,0.04277,0.1693


In [11]:
 grouped_pct.agg([('foo', 'mean'), ('bar', np.std)]) # 列重命名

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
No,Dinner,0.158653,0.040458
No,Lunch,0.16092,0.038989
Yes,Dinner,0.160828,0.095153
Yes,Lunch,0.170404,0.04277


In [12]:
functions = ['count', 'mean', 'max']
# Apply every function in `functions` to both selected columns. The columns
# are selected with a list: selecting with a bare tuple of labels was
# deprecated and is rejected by pandas >= 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No,Dinner,106,0.158653,0.29199,106,20.09566,48.33
No,Lunch,45,0.16092,0.266312,45,17.050889,41.19
Yes,Dinner,70,0.160828,0.710345,70,21.859429,50.81
Yes,Lunch,23,0.170404,0.259314,23,17.39913,43.11


In [13]:
# Indexing by the outer column label returns only the tip_pct statistics.
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Dinner,106,0.158653,0.29199
No,Lunch,45,0.16092,0.266312
Yes,Dinner,70,0.160828,0.710345
Yes,Lunch,23,0.170404,0.259314


In [14]:
# (label, function) pairs: each label becomes the output column name for the
# paired aggregation. 'var' is given as a string rather than np.var because
# passing NumPy callables to agg relies on an implicit mapping that newer
# pandas releases deprecate.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# Select the columns with a list; tuple selection is rejected by pandas >= 2.0.
grouped[['tip_pct', 'total_bill']].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
No,Dinner,0.158653,0.001637,20.09566,69.604821
No,Lunch,0.16092,0.00152,17.050889,59.587154
Yes,Dinner,0.160828,0.009054,21.859429,104.148753
Yes,Lunch,0.170404,0.001829,17.39913,61.958436


In [15]:
# A dict maps each column to its own aggregation: the largest tip and the
# total party size per group. 'max' is given as a string rather than np.max,
# since newer pandas releases deprecate passing NumPy callables to agg.
grouped.agg({'tip' : 'max', 'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
No,Dinner,9.0,290
No,Lunch,6.7,113
Yes,Dinner,10.0,173
Yes,Lunch,5.0,51


In [16]:
# Columns may receive different numbers of functions: tip_pct gets four
# statistics while size gets a single sum.
grouped.aggregate({
    'tip_pct': ['min', 'max', 'mean', 'std'],
    'size': 'sum',
})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
No,Dinner,0.056797,0.29199,0.158653,0.040458,290
No,Lunch,0.072961,0.266312,0.16092,0.038989,113
Yes,Dinner,0.035638,0.710345,0.160828,0.095153,173
Yes,Lunch,0.090014,0.259314,0.170404,0.04277,51


In [17]:
# 以“无索引”的形式返回聚合数据

In [18]:
# as_index=False returns the group keys as ordinary columns instead of as the
# index. numeric_only=True restricts the mean to numeric columns: tips also
# has the string column 'day', and pandas >= 2.0 raises on it instead of
# silently dropping it.
tips.groupby(['smoker', 'time'], as_index=False).mean(numeric_only=True)

Unnamed: 0,smoker,time,total_bill,tip,size,tip_pct
0,No,Dinner,20.09566,3.126887,2.735849,0.158653
1,No,Lunch,17.050889,2.673778,2.511111,0.16092
2,Yes,Dinner,21.859429,3.066,2.471429,0.160828
3,Yes,Lunch,17.39913,2.834348,2.217391,0.170404
