In [2]:
import numpy as np
import pandas as pd

In [3]:
# Toy data set: seven animals, each with a species, a size class (S/M/L),
# a weight and an "adult" flag. Only the last two rows are adults.
df = pd.DataFrame({
    'animal': ['cat', 'dog', 'cat', 'fish', 'dog', 'cat', 'cat'],
    'size': ['S', 'S', 'M', 'M', 'M', 'L', 'L'],
    'weight': [8, 10, 11, 1, 20, 12, 12],
    'adult': [False, False, False, False, False, True, True],
})
df

Unnamed: 0,animal,size,weight,adult
0,cat,S,8,False
1,dog,S,10,False
2,cat,M,11,False
3,fish,M,1,False
4,dog,M,20,False
5,cat,L,12,True
6,cat,L,12,True


In [14]:
# For every animal, hand its rows to the lambda and keep only the 'size'
# column; the displayed result is keyed by animal and then by row label.
(
    df.groupby('animal')
      .apply(lambda group: group['size'])
)

animal   
cat     0    S
        2    M
        5    L
        6    L
dog     1    S
        4    M
fish    3    M
Name: size, dtype: object

#### list the size of the animals with the highest weight

In [15]:
# Per animal: idxmax() yields the row label of the largest weight in the
# group, and that label is used to look up the matching 'size' value.
df.groupby('animal').apply(
    lambda group: group.loc[group['weight'].idxmax(), 'size']
)

animal
cat     L
dog     M
fish    M
dtype: object

#### using get_group

In [16]:
# Group by the column name itself (a scalar key) rather than a one-element
# list, so that a group is addressed by a plain label such as 'cat'.
# Recent pandas releases deprecate passing a bare label to get_group() when
# the grouper was given as a length-1 list (a 1-tuple is expected instead);
# a scalar key keeps this lookup valid across versions.
gb = df.groupby('animal')
gb.get_group('cat')

Unnamed: 0,animal,size,weight,adult
0,cat,S,8,False
2,cat,M,11,False
5,cat,L,12,True
6,cat,L,12,True


In [27]:
# Prefer a list of column names for the grouping keys; a tuple is not recommended.
gb = df.groupby(by=['animal', 'size'])

In [28]:
# With several grouping keys, the group name must be passed as a tuple
# holding one value per key, in the same order as the keys.
gb.get_group(name=('cat', 'L'))

Unnamed: 0,animal,size,weight,adult
5,cat,L,12,True
6,cat,L,12,True


#### apply to different items in a group

In [31]:
def GrowUp(x):
    """Collapse one group of animals into a single "grown-up" summary row.

    Each row's weight is scaled by a factor chosen from its size class
    (S -> 1.5, M -> 1.25, L -> 1.0). Rows whose size is none of these are
    left out of the total but still count towards the divisor, which is the
    number of rows in the group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must provide 'size' and 'weight' columns.

    Returns
    -------
    pandas.Series
        Indexed by 'size', 'weight', 'adult': the constant 'L', the scaled
        weight total divided by the row count (NaN for an empty group), and
        the constant True.
    """
    growth_factor = {'S': 1.5, 'M': 1.25, 'L': 1.0}

    if len(x) == 0:
        # No rows to average: report a missing value instead of dividing by zero.
        avg_weight = float('nan')
    else:
        # Keep only rows with a recognised size, then scale and add them in
        # one vectorised pass. skipna=False lets a missing weight surface as
        # NaN rather than being silently dropped from the total.
        known = x[x['size'].isin(list(growth_factor))]
        scaled = known['weight'] * known['size'].map(growth_factor)
        avg_weight = scaled.sum(skipna=False) / len(x)

    return pd.Series(['L', avg_weight, True], index=['size', 'weight', 'adult'])

In [32]:
# Run GrowUp once per group of gb and stack the returned Series into a frame
# with one row per group.
# NOTE(review): GrowUp reads the 'size' column, so it needs the grouping
# columns to be present in each group it receives - confirm this still holds
# on the pandas version in use.
expected_df = gb.apply(func=GrowUp)
expected_df

  stacked_values = np.vstack(map(np.asarray, values))


Unnamed: 0_level_0,Unnamed: 1_level_0,size,weight,adult
animal,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cat,L,L,12.0,True
cat,M,L,13.75,True
cat,S,L,12.0,True
dog,M,L,25.0,True
dog,S,L,15.0,True
fish,M,L,1.25,True


#### expanding apply

In [44]:
# Ten values 0.01, 0.02, ..., 0.10 built by dividing the integers 1..10 by 100.
s = pd.Series(np.arange(1, 11) / 100)
s

0    0.01
1    0.02
2    0.03
3    0.04
4    0.05
5    0.06
6    0.07
7    0.08
8    0.09
9    0.10
dtype: float64

In [54]:
def cumRet(x,y):
    """Combine a running total ``x`` with the next value ``y`` by addition.

    NOTE(review): the name hints at a compounded return (x * (1 + y)), but
    the function adds; confirm which one is intended before changing it.
    """
    total = x + y
    return total

In [55]:
import functools
def red(x):
    """Fold cumRet over the items of ``x`` from left to right, starting at 1.

    Returns the starting value 1 unchanged when ``x`` has no items.
    """
    accumulated = 1
    for item in x:
        accumulated = cumRet(accumulated, item)
    return accumulated

In [56]:
# Expanding window: red is evaluated on the values from the start of s up to
# each position in turn; raw=True passes each window as a plain array.
s.expanding().apply(func=red, raw=True)

0    1.01
1    1.03
2    1.06
3    1.10
4    1.15
5    1.21
6    1.28
7    1.36
8    1.45
9    1.55
dtype: float64

### 排序 sort groups by aggregated data

In [18]:
# Second toy frame (replaces the earlier df): three codes, each appearing
# twice, with a numeric 'data' value and an alternating boolean 'flag'.
df = pd.DataFrame({
    'code': ['foo', 'bar', 'baz', 'foo', 'bar', 'baz'],
    'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],
    'flag': [False, True, False, True, False, True],
})
df

Unnamed: 0,code,data,flag
0,foo,0.16,False
1,bar,-0.21,True
2,baz,0.33,False
3,foo,0.45,True
4,bar,-0.59,False
5,baz,0.62,True


In [31]:
# Total of 'data' for each distinct 'code'.
df.groupby('code')['data'].sum()

code
bar   -0.80
baz    0.95
foo    0.61
Name: data, dtype: float64

In [21]:
# Reusable grouping of df by its 'code' column.
code_groups = df.groupby(by='code')

In [25]:
# Broadcast each group's 'data' total back onto its rows (transform keeps the
# original row labels), then order the rows by that total, ascending.
# The aggregation is named by the string 'sum' rather than the Python builtin:
# recent pandas versions warn when the builtin is passed, because the
# callable will later be used as-is instead of being mapped to the grouped sum.
agg_n_sort_order = code_groups[['data']].transform('sum').sort_values(by='data')
agg_n_sort_order

Unnamed: 0,data
1,-0.8
4,-0.8
0,0.61
3,0.61
2,0.95
5,0.95


In [34]:
# Selecting a list of columns from the grouping yields another groupby object
# (restricted to 'data'), not a frame - hence the object repr shown below.
code_groups[['data']]

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000262CDCFFF60>