# 数据聚合与分组运算

In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

## GroupBy技术
+ split-apply-combine（拆分——应用——合并）过程
+ 分组键可以由多种形式，且类型不必相同：


1. 列表或数组，其长度与待分组的轴一样
2. 表示DataFrame某个列名的值
3. 字典或Series，给出待分组轴上的值与分组名之间的关系
4. 函数，用于处理轴索引或索引中的各个标签

In [3]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,1.007189,0.886429,a,one
1,-1.296221,-2.001637,a,two
2,0.274992,-0.371843,b,one
3,0.228913,1.669025,b,two
4,1.352917,-0.43857,a,one


In [8]:
df['data1'].mean()

0.31355791920072618

In [10]:
df['data2'].sum()

-0.2565949325197644

In [11]:
df.sum()

data1            1.56779
data2          -0.256595
key1               aabba
key2     onetwoonetwoone
dtype: object

In [12]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x018E41D0>

+ 变量`grouped`是一个*GroubBy*对象，还没有进行任何计算，但已有分组计算所需要的信息。
+ *GroupBy*对象的`mean`方法可以计算分组平均值。

In [13]:
grouped.mean() # 返回Series
grouped.sum()

key1
a    0.354628
b    0.251952
Name: data1, dtype: float64

key1
a    1.063885
b    0.503905
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means # 具有层次化索引的Series

key1  key2
a     one     0.880536
      two     0.478943
b     one    -0.519439
      two    -0.555730
Name: data1, dtype: float64

In [6]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.880536,0.478943
b,-0.519439,-0.55573


+ 上述例子，分组键为Series。实际上可以为任意长度合适的数组。

In [7]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

California  2005    0.478943
            2006   -0.519439
Ohio        2005   -0.380219
            2006    1.965781
Name: data1, dtype: float64

In [8]:
df
df.groupby('key1').mean()
df.groupby(['key1', 'key2']).mean()

Unnamed: 0,data1,data2,key1,key2
0,-0.204708,1.393406,a,one
1,0.478943,0.092908,a,two
2,-0.519439,0.281746,b,one
3,-0.55573,0.769023,b,two
4,1.965781,1.246435,a,one


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.746672,0.910916
b,-0.537585,0.525384


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.880536,1.31992
a,two,0.478943,0.092908
b,one,-0.519439,0.281746
b,two,-0.55573,0.769023


In [9]:
df.groupby(['key1','key2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.761073,2.639841
a,two,0.478943,0.092908
b,one,-0.519439,0.281746
b,two,-0.55573,0.769023


In [10]:
df.groupby('key1').idxmax()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4.0,0.0
b,2.0,3.0


+ 还可以将列名（可以是字符串、数字或其他Python对象）用作分组键。
+ *GroupBy*的`size()`方法可以返回一个含有分组大小的Series

In [11]:
df.groupby('key1').size()

key1
a    3
b    2
dtype: int64

### 对分组进行迭代
+ `GroupBy`对象支持迭代，可以产生一组二元元组，由分组名和数据块组成。

In [12]:
for (name, group) in df.groupby('key1'):
    print("Group %s:" % name)
    print(group)

Group a:
      data1     data2 key1 key2
0 -0.204708  1.393406    a  one
1  0.478943  0.092908    a  two
4  1.965781  1.246435    a  one
Group b:
      data1     data2 key1 key2
2 -0.519439  0.281746    b  one
3 -0.555730  0.769023    b  two


+ 对于多重键的情况，元组的第一个元素是由键值组成的元组
+ 可以对上述数据片段做任何操作，也可以转化为字典。

In [13]:
for ((k1, k2), group) in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
      data1     data2 key1 key2
0 -0.204708  1.393406    a  one
4  1.965781  1.246435    a  one
('a', 'two')
      data1     data2 key1 key2
1  0.478943  0.092908    a  two
('b', 'one')
      data1     data2 key1 key2
2 -0.519439  0.281746    b  one
('b', 'two')
     data1     data2 key1 key2
3 -0.55573  0.769023    b  two


In [14]:
ll = list(df.groupby('key1'))
ll

[('a',       data1     data2 key1 key2
  0  1.007189  0.886429    a  one
  1 -1.296221 -2.001637    a  two
  4  1.352917 -0.438570    a  one), ('b',       data1     data2 key1 key2
  2  0.274992 -0.371843    b  one
  3  0.228913  1.669025    b  two)]

In [16]:
ll[0][0]
ll[0][1]

'a'

Unnamed: 0,data1,data2,key1,key2
0,1.007189,0.886429,a,one
1,-1.296221,-2.001637,a,two
4,1.352917,-0.43857,a,one


In [16]:
pieces = dict(list(df.groupby('key1')))
pieces

{'a':       data1     data2 key1 key2
 0 -0.204708  1.393406    a  one
 1  0.478943  0.092908    a  two
 4  1.965781  1.246435    a  one, 'b':       data1     data2 key1 key2
 2 -0.519439  0.281746    b  one
 3 -0.555730  0.769023    b  two}

In [17]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,-0.519439,0.281746,b,one
3,-0.55573,0.769023,b,two


+ `groupby`默认是在axis=0上进行分组，通过设置也可以在其他任何轴上进行的分组。

In [18]:
df
df.dtypes

Unnamed: 0,data1,data2,key1,key2
0,-0.204708,1.393406,a,one
1,0.478943,0.092908,a,two
2,-0.519439,0.281746,b,one
3,-0.55573,0.769023,b,two
4,1.965781,1.246435,a,one


data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [20]:
grouped = df.groupby(df.dtypes,axis=1)
list(grouped)

[(dtype('float64'),       data1     data2
  0  1.007189  0.886429
  1 -1.296221 -2.001637
  2  0.274992 -0.371843
  3  0.228913  1.669025
  4  1.352917 -0.438570), (dtype('O'),   key1 key2
  0    a  one
  1    a  two
  2    b  one
  3    b  two
  4    a  one)]

In [20]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.204708  1.393406
1  0.478943  0.092908
2 -0.519439  0.281746
3 -0.555730  0.769023
4  1.965781  1.246435
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 选取一个或一组列
由DataFrame产生的GroupBy对象，如果用一个或一组字符串列名对其进行索引，就能实现选取部分列进行聚合。

```Python
df.groupby('key1')['data1']

df.groupby('key1')[['data2']]
```

```Python
df['data1'].groupby(df['key1'])

df[['data2']].groupby(df['key1'])
```

In [21]:
df.groupby(['key1', 'key2'])['data2']

<pandas.core.groupby.SeriesGroupBy object at 0x0161CDF0>

In [22]:
df.groupby(['key1', 'key2'])[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x018E49B0>

In [22]:
# 只计算data2列的平均值，并以DataFrame形式返回结果
ss = df.groupby(['key1', 'key2'])[['data2']]
ss
ss.mean()

<pandas.core.groupby.DataFrameGroupBy object at 0x07867ED0>

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,1.31992
a,two,0.092908
b,one,0.281746
b,two,0.769023


In [23]:
# 只计算data2列的平均值，并以Series形式返回结果
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     1.319920
      two     0.092908
b     one     0.281746
      two     0.769023
Name: data2, dtype: float64

In [24]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()

<pandas.core.groupby.SeriesGroupBy object at 0x07867B10>

key1  key2
a     one     1.319920
      two     0.092908
b     one     0.281746
      two     0.769023
Name: data2, dtype: float64

### 通过字典或Series进行分组

In [24]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,-0.539741,0.476985,3.248944,-1.021228,-0.577087
Steve,0.124121,0.302614,0.523772,0.00094,1.34381
Wes,-0.713544,,,-1.860761,-0.860757
Jim,0.560145,-1.265934,0.119827,-1.063512,0.332883
Travis,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703


In [25]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [42]:
by_column = people.groupby(mapping, axis=1)
d=dict(list(by_column))
d['blue']
d['red']

Unnamed: 0,c,d
Joe,3.248944,-1.021228
Steve,0.523772,0.00094
Wes,,-1.860761
Jim,0.119827,-1.063512
Travis,-1.541996,-0.970736


Unnamed: 0,a,b,e
Joe,-0.539741,0.476985,-0.577087
Steve,0.124121,0.302614,1.34381
Wes,-0.713544,,-0.860757
Jim,0.560145,-1.265934,0.332883
Travis,-2.359419,-0.199543,-1.30703


In [33]:
d.get('orange',None)

In [34]:
by_column.sum()

Unnamed: 0,blue,red
Joe,2.227716,-0.639844
Steve,0.524712,1.770545
Wes,-1.860761,-1.574301
Jim,-0.943685,-0.372906
Travis,-2.512731,-3.865992


In [27]:
arr = np.array(['red', 'red', 'blue', 'blue', 'red'])

In [28]:
people.groupby(arr, axis=1).sum()

Unnamed: 0,blue,red
Joe,0.503905,1.063885
Steve,1.297183,-1.553778
Wes,-1.021228,-1.116829
Jim,0.524712,1.770545
Travis,-4.230992,-2.405455


In [30]:
map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 通过函数进行分组
+ 字典或Series，定义了一种分组映射关系。
+ Python函数定义映射关系更有创意、抽象。
+ 函数和字典可以混用

In [36]:
# 根据人名长度进行分组
list(people.groupby(len))

[(3,             a         b         c         d         e
  Joe -0.539741  0.476985  3.248944 -1.021228 -0.577087
  Wes -0.713544       NaN       NaN -1.860761 -0.860757
  Jim  0.560145 -1.265934  0.119827 -1.063512  0.332883),
 (5,               a         b         c        d        e
  Steve  0.124121  0.302614  0.523772  0.00094  1.34381),
 (6,                a         b         c         d        e
  Travis -2.359419 -0.199543 -1.541996 -0.970736 -1.30703)]

In [37]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.69314,-0.788949,3.368771,-3.945501,-1.104962
5,0.124121,0.302614,0.523772,0.00094,1.34381
6,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703


In [32]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.539741,-1.296221,0.274992,-1.021228,-0.577087
3,two,0.124121,0.302614,0.523772,0.00094,1.34381
5,one,0.886429,-2.001637,-0.371843,1.669025,-0.43857
6,two,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757


### 根据索引级别进行分组

In [39]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.28635,0.377984,-0.753887,0.331286,1.349742
1,0.069877,0.246674,-0.011862,1.004812,1.327195
2,-0.919262,-1.549106,0.022185,0.758363,-0.660524
3,0.86258,-0.010032,0.050009,0.670216,0.852965


In [40]:
hier_df.groupby(level='cty', axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 数据聚合

+ 许多常见的聚合运算，经过优化的groupby的方法

| 函数名 | 说明 |
|------|------|
|`count`|分组中非Na值的数量|
|`sum`|分组中非Na值的和|
|`mean`|分组中非Na值的平均值|
|`median`|分组中非Na值的算术中位数|
|`std`，`var`|无偏（分母为$n-1$）标准差和方差，即样本标准差和方差|
|`min`、`max`|分组中非Na值的最小值、最大值|
|`prod`|分组中非Na值的积|
|`first`、`last`|分组中第一个、最后一个非Na值|

+ 还可以使用自己定义的聚合运算，以及分组对象上已经定义好的任何方法

In [43]:
df
grouped = df.groupby('key1')
# 样本分位数
grouped['data1'].quantile(0.9)

Unnamed: 0,data1,data2,key1,key2
0,1.007189,0.886429,a,one
1,-1.296221,-2.001637,a,two
2,0.274992,-0.371843,b,one
3,0.228913,1.669025,b,two
4,1.352917,-0.43857,a,one


key1
a    1.283771
b    0.270384
Name: data1, dtype: float64

In [45]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.649138,2.888067
b,0.046079,2.040868


In [46]:
grouped.mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.354628,-0.517926
b,0.251952,0.648591


### 面像列的多函数应用

In [47]:
tips = pd.read_csv('examples/tips.csv')
# Add tip percentage of total bill
tips
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.tail()
tips.head(6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.0,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.0,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204
243,18.78,3.0,Female,No,Thur,Dinner,2,0.159744


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624


In [48]:
grouped = tips.groupby(['day', 'smoker'])

In [49]:
grouped_pct = grouped['tip_pct']
grouped_pct.mean()
# 将函数名以字符串形式传入
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [41]:
grouped_pct.mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [42]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


+ 如果传入一个由**(name, function)**元组组成的列表，则各元组的第一个元素会被作为DataFrame的列名。

In [50]:
grouped_pct.agg(['mean', np.std])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [43]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [44]:
functions = ['count', 'mean', 'max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [45]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [46]:
# 自定义名称的元组列表
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [52]:
# 不同的列应用不同的函数，向agg传入从列名映射到函数的字典
grouped.agg({'tip' : np.max, 'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [53]:
grouped.agg({'tip_pct' : [('小','min'), ('大','max'), 'mean', 'std'],
             'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,小,大,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### 以无索引形式返回聚合数据
+ `as_index=False`以禁用索引

In [48]:
tips.groupby(['day', 'smoker'], as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


In [49]:
tips.groupby(['day', 'smoker'], as_index=True).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


## Apply
+ *GroupBy*对象的`apply`方法，会将待处理的对象拆分为多个片段，然后对各片段调用传入的函数，最后将各片段组合到一起，即：**split--apply--combine**

In [54]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column, ascending=False)[-n:]
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
210,30.06,2.0,Male,Yes,Sat,Dinner,3,0.066534
187,30.46,2.0,Male,Yes,Sun,Dinner,5,0.06566
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
57,26.41,1.5,Female,No,Sat,Dinner,2,0.056797
102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
237,32.83,1.17,Male,Yes,Sat,Dinner,2,0.035638


In [55]:
top(tips,n=10,column='total_bill')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
126,8.52,1.48,Male,No,Thur,Lunch,2,0.173709
135,8.51,1.25,Female,No,Thur,Lunch,2,0.146886
145,8.35,1.5,Female,No,Thur,Lunch,2,0.179641
218,7.74,1.44,Male,Yes,Sat,Dinner,2,0.186047
195,7.56,1.44,Male,No,Thur,Lunch,2,0.190476
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
111,7.25,1.0,Female,No,Sat,Dinner,1,0.137931
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
92,5.75,1.0,Female,Yes,Fri,Dinner,2,0.173913
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733


In [56]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,130,19.08,1.5,Male,No,Thur,Lunch,2,0.078616
No,146,18.64,1.36,Female,No,Thur,Lunch,3,0.072961
No,48,28.55,2.05,Male,No,Sun,Dinner,3,0.071804
No,0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
No,57,26.41,1.5,Female,No,Sat,Dinner,2,0.056797
Yes,240,27.18,2.0,Female,Yes,Sat,Dinner,2,0.073584
Yes,210,30.06,2.0,Male,Yes,Sat,Dinner,3,0.066534
Yes,187,30.46,2.0,Male,Yes,Sun,Dinner,5,0.06566
Yes,102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
Yes,237,32.83,1.17,Male,Yes,Sat,Dinner,2,0.035638


In [52]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,99,12.46,1.5,Male,No,Fri,Dinner,2,0.120385
No,Sat,111,7.25,1.0,Female,No,Sat,Dinner,1,0.137931
No,Sun,6,8.77,2.0,Male,No,Sun,Dinner,2,0.22805
No,Thur,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
Yes,Fri,92,5.75,1.0,Female,Yes,Fri,Dinner,2,0.173913
Yes,Sat,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,Sun,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
Yes,Thur,196,10.34,2.0,Male,Yes,Thur,Lunch,2,0.193424


In [53]:
result = tips.groupby('smoker')['tip_pct'].describe()
result
rr = result.unstack('smoker')
rr

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [54]:
rr.unstack(1)

smoker,No,Yes
count,151.0,93.0
mean,0.159328,0.163196
std,0.03991,0.085119
min,0.056797,0.035638
25%,0.136906,0.106771
50%,0.155625,0.153846
75%,0.185014,0.195059
max,0.29199,0.710345


当调用describe之类的方法时，实际上只是应用了下面代码的快捷方式：
```Python
f = lambda x: x.describe()
grouped.apply(f)
```

In [55]:
f = lambda x: x['tip_pct'].describe()
tips.groupby(['smoker']).apply(f)

tip_pct,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


### 禁止分组键
默认情况下，分组键会跟原始对象的索引键共同构成结果对象的层次化索引。将`group_keys=False`传入`groupby`可禁止该效果。

In [56]:
tips.groupby('smoker').apply(top,n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,130,19.08,1.5,Male,No,Thur,Lunch,2,0.078616
No,146,18.64,1.36,Female,No,Thur,Lunch,3,0.072961
No,48,28.55,2.05,Male,No,Sun,Dinner,3,0.071804
No,0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
No,57,26.41,1.5,Female,No,Sat,Dinner,2,0.056797
Yes,240,27.18,2.0,Female,Yes,Sat,Dinner,2,0.073584
Yes,210,30.06,2.0,Male,Yes,Sat,Dinner,3,0.066534
Yes,187,30.46,2.0,Male,Yes,Sun,Dinner,5,0.06566
Yes,102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
Yes,237,32.83,1.17,Male,Yes,Sat,Dinner,2,0.035638


In [57]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
130,19.08,1.5,Male,No,Thur,Lunch,2,0.078616
146,18.64,1.36,Female,No,Thur,Lunch,3,0.072961
48,28.55,2.05,Male,No,Sun,Dinner,3,0.071804
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
57,26.41,1.5,Female,No,Sat,Dinner,2,0.056797
240,27.18,2.0,Female,Yes,Sat,Dinner,2,0.073584
210,30.06,2.0,Male,Yes,Sat,Dinner,3,0.066534
187,30.46,2.0,Male,Yes,Sun,Dinner,5,0.06566
102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
237,32.83,1.17,Male,Yes,Sat,Dinner,2,0.035638


### Quantile and Bucket Analysis（分位数和桶分析）
+ 根据指定面元或样本分位数将数据拆分为多块的工具，譬如`cut`和`qcut`。
+ `pandas.cut`: Return indices of half-open bins to which each value of x belongs.
+ `pandas.qcut`:Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles.
+ 将上述函数与groupby结合起来进行分位数分析或桶分析。

In [58]:
pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3)

[(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], (6.533, 9.7], (0.19, 3.367]]
Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] < (6.533, 9.7]]

In [59]:
pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]),
...        3, labels=["good", "medium", "bad"])

[good, good, good, medium, bad, good]
Categories (3, object): [good < medium < bad]

In [60]:
pd.qcut(range(5), 4)

[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]

In [61]:
pd.qcut(range(5), 4, labels=False)

array([0, 0, 1, 2, 3], dtype=int64)

In [62]:
pd.cut(range(5), 4)

[(-0.004, 1.0], (-0.004, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.004, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]

In [63]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})
frame

Unnamed: 0,data1,data2
0,-0.919262,1.165148
1,-1.549106,-0.621249
2,0.022185,-0.799318
3,0.758363,0.777233
4,-0.660524,-0.612905
5,0.862580,0.316447
6,-0.010032,0.838295
7,0.050009,-1.034423
8,0.670216,0.434304
9,0.852965,-2.213133


In [64]:
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]

0     (-1.23, 0.489]
1    (-2.956, -1.23]
2     (-1.23, 0.489]
3     (0.489, 2.208]
4     (-1.23, 0.489]
5     (0.489, 2.208]
6     (-1.23, 0.489]
7     (-1.23, 0.489]
8     (0.489, 2.208]
9     (0.489, 2.208]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

In [65]:
# 对分组求最小、最大、计数以及均值
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats)

data1                 
(-2.956, -1.23]  count     95.000000
                 max        1.670835
                 mean      -0.039521
                 min       -3.399312
(-1.23, 0.489]   count    598.000000
                 max        3.260383
                 mean      -0.002051
                 min       -2.989741
(0.489, 2.208]   count    297.000000
                 max        2.954439
                 mean       0.081822
                 min       -3.745356
(2.208, 3.928]   count     10.000000
                 max        1.765640
                 mean       0.024750
                 min       -1.929776
Name: data2, dtype: float64

In [66]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.956, -1.23]",95.0,1.670835,-0.039521,-3.399312
"(-1.23, 0.489]",598.0,3.260383,-0.002051,-2.989741
"(0.489, 2.208]",297.0,2.954439,0.081822,-3.745356
"(2.208, 3.928]",10.0,1.76564,0.02475,-1.929776


In [68]:
# Return quantile numbers
grouping = pd.qcut(frame.data1, 10, labels=False) # labels=False, return only integer indicators of the bins
grouping
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

0      1
1      0
2      5
3      7
4      2
5      8
6      5
7      5
8      7
9      8
      ..
990    4
991    0
992    0
993    1
994    6
995    3
996    6
997    4
998    3
999    0
Name: data1, Length: 1000, dtype: int64

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,1.670835,-0.049902,-3.399312
1,100.0,2.628441,0.030989,-1.950098
2,100.0,2.527939,-0.067179,-2.925113
3,100.0,3.260383,0.065713,-2.315555
4,100.0,2.074345,-0.111653,-2.047939
5,100.0,2.18481,0.05213,-2.989741
6,100.0,2.458842,-0.021489,-2.223506
7,100.0,2.954439,-0.026459,-3.05699
8,100.0,2.735527,0.103406,-3.745356
9,100.0,2.37702,0.220122,-2.064111
