[Group By: split-apply-combine](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html)

In [29]:
import pandas as pd 
import numpy as np
import seaborn as sns
# Example 'planets' dataset loaded through seaborn; it is used by the planets cells further down.
planets = sns.load_dataset('planets')

# Seeded generator so the random data2 column is reproducible across runs.
rng = np.random.RandomState(0)

# Small demo frame: a group label column plus two integer value columns.
df = pd.DataFrame(
    {'key': ['A', 'C', 'A', 'B', 'C', 'C', 'C', 'A', 'B', 'C']}
).assign(
    data1=range(10),
    data2=rng.randint(0, 10, 10),
)
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,C,1,0
2,A,2,3
3,B,3,3
4,C,4,7
5,C,5,9
6,C,6,3
7,A,7,5
8,B,8,2
9,C,9,4


In [14]:
# Sort rows by group key so that each group's rows are contiguous in the display.
# A stable algorithm (mergesort) is requested explicitly: the default quicksort
# does not guarantee that rows sharing a key keep their original relative order,
# which the output shown for this cell relies on.
df = df.sort_values(by='key', kind='mergesort').reset_index(drop=True)
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,A,2,3
2,A,7,5
3,B,3,3
4,B,8,2
5,C,1,0
6,C,4,7
7,C,5,9
8,C,6,3
9,C,9,4


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of each value column, per key.
df.groupby('key').describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
A,3.0,3.0,3.605551,0.0,1.0,2.0,4.5,7.0,3.0,4.333333,1.154701,3.0,4.0,5.0,5.0,5.0
B,2.0,5.5,3.535534,3.0,4.25,5.5,6.75,8.0,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0
C,5.0,5.0,2.915476,1.0,4.0,5.0,6.0,9.0,5.0,4.6,3.507136,0.0,3.0,4.0,7.0,9.0


In [11]:
# A different reducer per column: smallest data1 and largest data2 within each key.
df.groupby('key').agg({'data1': 'min', 'data2': 'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,3,3
C,1,9


In [54]:
# Aggregate with several functions, then rename the resulting column labels.
# String aliases replace np.sum / np.mean / np.std: recent pandas releases
# deprecate passing NumPy reducers to groupby.agg, and the aliases pin the
# groupby implementations (sample std, ddof=1) that the output below shows.
# rename(columns=...) matches the labels on the inner level of the column
# MultiIndex, so every value column gets my_sum / my_mean / my_std.
df.groupby('key').agg(['sum', 'mean', 'std']).rename(columns={'sum': 'my_sum',
                                                             'mean': 'my_mean',
                                                             'std': 'my_std'})


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,my_sum,my_mean,my_std,my_sum,my_mean,my_std
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,9,3.0,3.605551,13,4.333333,1.154701
B,11,5.5,3.535534,5,2.5,0.707107
C,25,5.0,2.915476,23,4.6,3.507136


In [56]:
# Aggregate with anonymous functions: the value range and (median - mean) of data1 per key.
# The unnamed lambdas show up as <lambda_0>, <lambda_1> in the resulting column labels.
df.groupby('key')['data1'].agg([lambda x: x.max() - x.min(),
                  lambda x: x.median() - x.mean()])


Unnamed: 0_level_0,<lambda_0>,<lambda_1>
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,7,-1
B,5,0
C,8,0


## NamedAgg

In [57]:
# Demo frame for named aggregation: one row per animal, two animals per kind.
animals = pd.DataFrame(
    [
        ('cat', 9.1, 7.9),
        ('dog', 6.0, 7.5),
        ('cat', 9.5, 9.9),
        ('dog', 34.0, 198.0),
    ],
    columns=['kind', 'height', 'weight'],
)

animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [65]:
# Named aggregation: each keyword becomes an output column, and its NamedAgg
# names the source column and the reducer.
# Reducers are given as string aliases ('min', 'mean') rather than np.min /
# np.mean, because recent pandas releases deprecate passing NumPy reducers to
# groupby.agg; the results are unchanged.
animals.groupby("kind").agg(
    YourNamedCol=pd.NamedAgg(column='height', aggfunc='min'),
    min_height=pd.NamedAgg(column='height', aggfunc='min'),
    max_height=pd.NamedAgg(column='height', aggfunc='max'),
    # NOTE(review): labelled "std_height" but computed from the 'weight' column;
    # the label is kept so it still matches the recorded output.
    std_height=pd.NamedAgg(column='weight', aggfunc='std'),
    average_weight=pd.NamedAgg(column='weight', aggfunc='mean'))

Unnamed: 0_level_0,YourNamedCol,min_height,max_height,std_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cat,9.1,9.1,9.5,1.414214,8.9
dog,6.0,6.0,34.0,134.703842,102.75


In [68]:
# pandas.NamedAgg is just a namedtuple, so plain (column, aggfunc) tuples are
# accepted as well.
# Reducers are given as string aliases rather than np.min / np.mean, because
# recent pandas releases deprecate passing NumPy reducers to groupby.agg.
animals.groupby("kind").agg(
    YourNamedCol=('height', 'min'),
    min_height=('height', 'min'),
    max_height=('height', 'max'),
    # NOTE(review): labelled "std_height" but computed from the 'weight' column;
    # the label is kept so it still matches the recorded output.
    std_height=('weight', 'std'),
    average_weight=('weight', 'mean'))

Unnamed: 0_level_0,YourNamedCol,min_height,max_height,std_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cat,9.1,9.1,9.5,1.414214,8.9
dog,6.0,6.0,34.0,134.703842,102.75


In [73]:
# Output column names that cannot be written as Python keyword arguments can be
# supplied by unpacking a dict that maps name -> NamedAgg.
# The 'max' alias replaces the builtin max, which recent pandas releases
# deprecate as a groupby.agg argument; the result is the same per-kind maximum.
animals.groupby("kind").agg(**{'我是列名': pd.NamedAgg(column='height', aggfunc='max')})

Unnamed: 0_level_0,我是列名
kind,Unnamed: 1_level_1
cat,9.5
dog,34.0


In [74]:
# For a Series groupby aggregation the column is already selected, so each
# keyword only needs the aggregation function.
animals.groupby("kind").height.agg(
    min_height='min',
    max_height='max')

Unnamed: 0_level_0,min_height,max_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.1,9.5
dog,6.0,34.0


In [76]:
# This dict-renaming form is deprecated (see the warning in this cell's output);
# prefer keyword-style named aggregation: .agg(min_height='min', max_height='max').
# NOTE(review): newer pandas releases may reject this form outright — confirm
# against the installed version before relying on it.
animals.groupby("kind").height.agg({'min_height':'min','max_height':'max'})

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,min_height,max_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.1,9.5
dog,6.0,34.0


In [83]:
# With as_index=False the same dict form ran in this session without the
# deprecation warning, and 'kind' comes back as a regular column.
# NOTE(review): this relies on version-specific behaviour — confirm on the
# installed pandas, or use keyword-style named aggregation instead.
animals.groupby("kind",as_index=False).height.agg({'min_height':'min','max_height':'max'})

Unnamed: 0,kind,min_height,max_height
0,cat,9.1,9.5
1,dog,6.0,34.0


## filter

In [16]:
# Filter by a group-level condition: groups that satisfy it are kept whole.
# The key=B group does not meet the condition, so it is dropped.
def filter_func(x):
    """Return True when the group's largest ``data2`` value is at least 5.

    ``x`` is the sub-frame holding one group's rows.
    """
    largest_data2 = x['data2'].max()
    return largest_data2 >= 5

# Keep only the rows belonging to groups for which filter_func returns True.
df.groupby('key').filter(filter_func)

Unnamed: 0,key,data1,data2
0,A,0,5
1,A,2,3
2,A,7,5
5,C,1,0
6,C,4,7
7,C,5,9
8,C,6,3
9,C,9,4


## transform

In [20]:
# transform: apply an operation within each group and recombine the pieces;
# the result has the same length as the original frame.
# Here every value is centred on the mean of its own group.
df.groupby('key').transform(lambda x: x - x.mean())

Unnamed: 0,data1,data2
0,-3.0,0.666667
1,-1.0,-1.333333
2,4.0,0.666667
3,-2.5,0.5
4,2.5,-0.5
5,-4.0,-4.6
6,-1.0,2.4
7,0.0,4.4
8,1.0,-1.6
9,4.0,-0.6


In [37]:
# Per-key column means: one row per group.
df.groupby('key').mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3.0,4.333333
B,5.5,2.5
C,5.0,4.6


In [35]:
# Broadcast each group's mean back onto every row of that group.
# The built-in 'mean' alias is used instead of `lambda x: x.mean()`: it gives
# the same values while letting pandas use its own groupby implementation
# rather than calling a Python function once per group and column.
df.groupby('key').transform('mean')

Unnamed: 0,data1,data2
0,3.0,4.333333
1,5.0,4.6
2,3.0,4.333333
3,5.5,2.5
4,5.0,4.6
5,5.0,4.6
6,5.0,4.6
7,3.0,4.333333
8,5.5,2.5
9,5.0,4.6


In [40]:
# Display df again.
# NOTE(review): the output recorded for this cell is in the original, unsorted
# row order, so df was apparently rebuilt after the sort cell — the cells were
# not executed top to bottom. Re-run with Restart & Run All to confirm.
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,C,1,0
2,A,2,3
3,B,3,3
4,C,4,7
5,C,5,9
6,C,6,3
7,A,7,5
8,B,8,2
9,C,9,4


In [51]:
# Rank every value within its own group, ascending; method='first' breaks ties
# by order of appearance, so the ranks within a group are distinct.
df.groupby('key').transform(lambda x: x.rank(ascending=True,method='first')) # rank within group

Unnamed: 0,data1,data2
0,1.0,2.0
1,1.0,1.0
2,2.0,1.0
3,1.0,2.0
4,2.0,4.0
5,3.0,5.0
6,4.0,2.0
7,3.0,3.0
8,2.0,1.0
9,5.0,3.0


In [43]:
# Group-wise z-score of data1: subtract the group's mean, then divide by the
# group's standard deviation, both broadcast to row level with transform.
normalized = (
    df['data1']
    .sub(df.groupby('key')['data1'].transform('mean'))
    .div(df.groupby('key')['data1'].transform('std'))
)
normalized

0   -0.832050
1   -1.371989
2   -0.277350
3   -0.707107
4   -0.342997
5    0.000000
6    0.342997
7    1.109400
8    0.707107
9    1.371989
Name: data1, dtype: float64

In [44]:
def normalize(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation."""
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

# Run normalize on data1 within each key; the result is indexed like df (one value per row).
df.groupby('key')['data1'].transform(normalize)

0   -0.832050
1   -1.371989
2   -0.277350
3   -0.707107
4   -0.342997
5    0.000000
6    0.342997
7    1.109400
8    0.707107
9    1.371989
Name: data1, dtype: float64

In [45]:
# Same computation through apply; in this session the output is identical to the transform version.
# NOTE(review): how apply indexes its result may differ between pandas versions — confirm if the index matters.
df.groupby('key')['data1'].apply(normalize)

0   -0.832050
1   -1.371989
2   -0.277350
3   -0.707107
4   -0.342997
5    0.000000
6    0.342997
7    1.109400
8    0.707107
9    1.371989
Name: data1, dtype: float64

## apply

In [26]:
# apply: run a custom function on each group's sub-frame.
def norm_data1_by_data2_max(x):
    """Scale a group's ``data1`` by that group's largest ``data2`` value.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` whose ``data1`` column has been divided by
        ``x['data2'].max()``. The frame passed in is left untouched.
    """
    # Work on a copy: a function handed to groupby.apply should not mutate the
    # group it receives, otherwise the caller's data can change underneath it.
    scaled = x.copy()
    scaled['data1'] = scaled['data1'] / scaled['data2'].max()
    # Gotcha: statements run in order, so a later line that reads
    # scaled['data1'] (e.g. to rescale data2) would see the already-scaled
    # values, not the original data1.
    return scaled

# Call norm_data1_by_data2_max once per key group and stitch the returned frames back together.
df.groupby('key').apply(norm_data1_by_data2_max)

Unnamed: 0,key,data1,data2
0,A,0.0,5
1,A,0.4,3
2,A,1.4,5
3,B,1.0,3
4,B,2.666667,2
5,C,0.111111,0
6,C,0.444444,7
7,C,0.555556,9
8,C,0.666667,3
9,C,1.0,4


In [28]:
# Regroup the keys before aggregating: index labels are translated through the
# mapping, so B and C are pooled into a single 'B+C' group.
mapping = {'A': 'A', 'B': 'B+C', 'C': 'B+C'}
df2 = df.set_index('key')
df2.groupby(mapping).max()

Unnamed: 0,data1,data2
A,7,5
B+C,9,9


# 例子


In [30]:
# Preview the planets table (the rendered output elides the middle rows).
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [34]:
# Label every discovery year with its decade, e.g. 2006 -> '2000s'.
decade = ((planets['year'] // 10 * 10).astype(str) + 's').rename('decade')
# Total 'number' per (method, decade), with decades spread across the columns;
# combinations that never occur are filled with 0.
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
