In [1]:
import pandas as pd

# 分组聚合操作

## 分组聚合简介

- 在 SQL 中我们经常使用 GROUP BY 将某个字段，按不同的取值进行分组，在 pandas 中也有 groupby 函数；

- 分组之后，每组都会有至少1条数据，将这些数据进一步处理返回单个值的过程就是聚合。

- 比如：分组之后计算算术平均值，或者分组之后计算频数，都属于聚合。

基本格式：

- 方式1：
- df.groupby(列标签, ...).列标签.聚合函数()  按指定列分组，并对分组数据的相应列进行相应的 聚合操作
- 方式2：
- df.groupby(列标签, ...).agg({'列标签': '聚合', ...})
- df.groupby(列标签, ...).列标签.agg(聚合...)  按指定列分组，并对分组数据的相应列进行相应的聚合操作
- 方式3：
- df.groupby(列标签, ...).aggregate({'列标签': '聚合', ...})
- df.groupby(列标签, ...).列标签.aggregate(聚合...)  按指定列分组，并对分组数据的相应列进行相应的聚合操作

注意：

- 1）方式1 只能使用 pandas 内置的聚合方法，并且只能进行一种聚合

- 2）方式2 和 方式3 除了能够使用 pandas 内置的聚合方法，还可以使用其他聚合方法，并且可以进行多种聚合

## pandas 内置的聚合方法

- pandas方法  Numpy函数  说明
- count  np.count_nonzero  频数统计(不包含NaN值)
- size    频数统计(包含NaN值)
- mean  np.mean  求平均值
- std  np.std  标准差
- min  np.min  最小值
- quantile()  np.percentile()  分位数
- max  np.max  求最大值
- sum  np.sum  求和
- var  np.var  方差
- describe    计数、平均值、标准差，最小值、分位数、最大值
- first    返回第一行
- last    返回最后一行
- nth    返回第N行(Python从0开始计数)

## 加载 gapminder.tsv 数据集

In [2]:
gapminder = pd.read_csv('./data/gapminder.tsv',sep='\t')
gapminder.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


### 计算每年预期寿命的平均值

In [3]:
gapminder.groupby('year')['lifeExp'].mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [4]:
gapminder.groupby('year').lifeExp.mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [5]:
gapminder.groupby('year').agg({'lifeExp':'mean'})

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


In [6]:
import numpy as np

In [7]:
gapminder.groupby('year').agg({'lifeExp':np.mean})

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


In [8]:
gapminder.groupby('year').aggregate({'lifeExp':'mean'})

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


In [9]:
gapminder.groupby('year').aggregate({'lifeExp':np.mean})

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


### 统计每年预期寿命的最小值、最大值和平均值

In [10]:
gapminder.groupby('year')['lifeExp'].agg(['min','max','mean'])

Unnamed: 0_level_0,min,max,mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,28.801,72.67,49.05762
1957,30.332,73.47,51.507401
1962,31.997,73.68,53.609249
1967,34.02,74.16,55.67829
1972,35.4,74.72,57.647386
1977,31.22,76.11,59.570157
1982,38.445,77.11,61.533197
1987,39.906,78.67,63.212613
1992,23.599,79.36,64.160338
1997,36.087,80.69,65.014676


In [11]:
gapminder.groupby('year').lifeExp.agg(['min','max','mean'])

Unnamed: 0_level_0,min,max,mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,28.801,72.67,49.05762
1957,30.332,73.47,51.507401
1962,31.997,73.68,53.609249
1967,34.02,74.16,55.67829
1972,35.4,74.72,57.647386
1977,31.22,76.11,59.570157
1982,38.445,77.11,61.533197
1987,39.906,78.67,63.212613
1992,23.599,79.36,64.160338
1997,36.087,80.69,65.014676


In [12]:
gapminder.groupby('year')['lifeExp'].aggregate(['min','max','mean'])

Unnamed: 0_level_0,min,max,mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,28.801,72.67,49.05762
1957,30.332,73.47,51.507401
1962,31.997,73.68,53.609249
1967,34.02,74.16,55.67829
1972,35.4,74.72,57.647386
1977,31.22,76.11,59.570157
1982,38.445,77.11,61.533197
1987,39.906,78.67,63.212613
1992,23.599,79.36,64.160338
1997,36.087,80.69,65.014676


In [13]:
gapminder.groupby('year').lifeExp.aggregate(['min','max','mean'])

Unnamed: 0_level_0,min,max,mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,28.801,72.67,49.05762
1957,30.332,73.47,51.507401
1962,31.997,73.68,53.609249
1967,34.02,74.16,55.67829
1972,35.4,74.72,57.647386
1977,31.22,76.11,59.570157
1982,38.445,77.11,61.533197
1987,39.906,78.67,63.212613
1992,23.599,79.36,64.160338
1997,36.087,80.69,65.014676


### 统计每年的人均寿命和GDP的最大值

In [14]:
ret = gapminder.groupby('year').agg({'lifeExp':'mean','gdpPercap':'max'})
ret

Unnamed: 0_level_0,lifeExp,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1952,49.05762,108382.3529
1957,51.507401,113523.1329
1962,53.609249,95458.11176
1967,55.67829,80894.88326
1972,57.647386,109347.867
1977,59.570157,59265.47714
1982,61.533197,33693.17525
1987,63.212613,31540.9748
1992,64.160338,34932.91959
1997,65.014676,41283.16433


In [15]:
ret.rename(columns={'lifeExp':'人均寿命','gdpPercap':'GDP的最大值'})

Unnamed: 0_level_0,人均寿命,GDP的最大值
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1952,49.05762,108382.3529
1957,51.507401,113523.1329
1962,53.609249,95458.11176
1967,55.67829,80894.88326
1972,57.647386,109347.867
1977,59.570157,59265.47714
1982,61.533197,33693.17525
1987,63.212613,31540.9748
1992,64.160338,34932.91959
1997,65.014676,41283.16433


### 示例：计算每年预期寿命的平均值(自定义聚合函数)

In [16]:
def my_mean(values):
    """Return the arithmetic mean of ``values`` using a plain Python loop.

    Hand-written counterpart of the built-in mean, used to show that a
    user-defined function can be passed to ``agg``.

    Parameters
    ----------
    values : sized iterable of numbers
        The values of one group (any object supporting ``len`` and iteration).

    Returns
    -------
    The sum of all items divided by ``len(values)``.

    Raises
    ------
    ZeroDivisionError
        If ``values`` is empty.
    """
    # Number of entries in the group.
    count = len(values)
    # Accumulate left to right, starting from integer zero.
    total = 0
    for item in values:
        total = total + item
    return total / count

In [17]:
gapminder.groupby('year')['lifeExp'].agg(my_mean)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [18]:
# The string 'mean' selects pandas' built-in mean for the lifeExp column.
# The dict form is not limited to built-in names: a callable can be given as the
# value instead (e.g. {'lifeExp': my_mean}), just as np.mean is passed earlier.
gapminder.groupby('year').agg({'lifeExp':'mean'})

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


### 统计每年的平均预期寿命与总体平均预期寿命的差值(自定义聚合函数)

In [19]:
def diff_lifeExp(values, global_mean):
    """Return how far the mean of ``values`` lies from ``global_mean``.

    Parameters
    ----------
    values : object with a ``.mean()`` method
        The values of one group (a pandas Series when called through ``agg``).
    global_mean : number
        Reference value subtracted from the group mean.

    Returns
    -------
    ``values.mean() - global_mean``; negative when the group mean is below
    the reference value.
    """
    group_mean = values.mean()
    return group_mean - global_mean

In [20]:
# 计算所有年龄的平均值
global_mean = gapminder.lifeExp.mean()
global_mean

59.474439366197174

In [21]:
# Keyword arguments given to agg after the function are forwarded to it, so
# diff_lifeExp is called with each year's lifeExp values plus global_mean.
gapminder.groupby('year').lifeExp.agg(diff_lifeExp,global_mean = global_mean)

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64

# transform 转换

- transform 转换，需要把 DataFrame 中的值传递给一个函数， 而后由该函数"转换"数据
- aggregate(聚合) 返回单个聚合值，但 transform 不会减少数据量

## transform 功能演示

### 按年分组，并计算组内每个国家的预期寿命与该组平均预期寿命的差值

In [22]:
def lifeExp_diff(x):
    """Center one group's values on the group mean.

    Parameters
    ----------
    x : object supporting ``.mean()`` and subtraction
        The values of one group (a pandas Series when used with ``transform``).

    Returns
    -------
    ``x`` with its own mean subtracted from every element; the result has as
    many elements as ``x``, which is what ``transform`` requires.
    """
    group_mean = x.mean()
    return x - group_mean

In [23]:
gapminder.groupby('year')['lifeExp'].transform(lifeExp_diff)

0      -20.256620
1      -21.175401
2      -21.612249
3      -21.658290
4      -21.559386
          ...    
1699    -0.861613
1700    -3.783338
1701   -18.205676
1702   -25.705923
1703   -23.520423
Name: lifeExp, Length: 1704, dtype: float64

In [24]:
gapminder.groupby('year').lifeExp.transform(lifeExp_diff)

0      -20.256620
1      -21.175401
2      -21.612249
3      -21.658290
4      -21.559386
          ...    
1699    -0.861613
1700    -3.783338
1701   -18.205676
1702   -25.705923
1703   -23.520423
Name: lifeExp, Length: 1704, dtype: float64

## transform 分组填充缺失值

- 之前介绍了填充缺失值的各种方法，对于某些数据集，可以使用列的平均值来填充缺失值。某些情况下，可以考虑将列进行分组，分组之后取平均再填充缺失值

### 加载 tips.csv 数据集，并从其中随机取出 10 条数据

In [25]:
tips_10 = pd.read_csv('./data/tips.csv').sample(10,random_state=42)
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


### 构建缺失值

In [26]:
# Blank out column position 0 (total_bill) for the rows at positions 1, 3, 5 and 7,
# creating missing values that the following cells fill in per group.
tips_10.iloc[[1,3,5,7],0] = np.nan
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


### 分组查看缺失情况

In [27]:
tips_10.groupby('sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,2,3,3,3,3,3
Male,4,7,7,7,7,7


- total_bill 列中，Female 性别的有 1 个缺失（3 条记录中有 2 个非空值）， Male 性别的有 3 个缺失（7 条记录中有 4 个非空值）

### 定义函数，按性别分组填充缺失值


In [28]:
def fill_na_mean(x):
    """Fill the missing entries of ``x`` with the mean of ``x``.

    Parameters
    ----------
    x : object supporting ``.mean()`` and ``.fillna()``
        The values of one group (a pandas Series when used with ``transform``).

    Returns
    -------
    The result of ``x.fillna(x.mean())``: same length as ``x``, with missing
    entries replaced by the group's mean.
    """
    # The mean is computed first and then used as the fill value.
    return x.fillna(x.mean())

In [29]:
total_bill_group_mean = tips_10.groupby('sex').total_bill.transform(fill_na_mean)
total_bill_group_mean

24     19.8200
6      21.8975
153    24.5500
211    21.8975
198    13.0000
176    21.8975
192    28.4400
124    14.1900
9      14.7800
101    15.3800
Name: total_bill, dtype: float64

### 将计算的结果赋值新列

In [30]:
tips_10['fill_total_bill'] = total_bill_group_mean
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,fill_total_bill
24,19.82,3.18,Male,No,Sat,Dinner,2,19.82
6,,2.0,Male,No,Sun,Dinner,2,21.8975
153,24.55,2.0,Male,No,Sun,Dinner,4,24.55
211,,5.16,Male,Yes,Sat,Dinner,4,21.8975
198,13.0,2.0,Female,Yes,Thur,Lunch,2,13.0
176,,2.0,Male,Yes,Sun,Dinner,2,21.8975
192,28.44,2.56,Male,Yes,Thur,Lunch,2,28.44
124,,2.52,Female,No,Thur,Lunch,2,14.19
9,14.78,3.23,Male,No,Sun,Dinner,2,14.78
101,15.38,3.0,Female,Yes,Fri,Dinner,2,15.38


In [31]:
tips_10.groupby('sex').total_bill.mean()

sex
Female    14.1900
Male      21.8975
Name: total_bill, dtype: float64

## transform 练习

- 需求：使用weight_loss.csv 数据集，找到减肥比赛赢家
- 注：weight_loss.csv 数据集中，包含了Bob、Amy两个人从1月到4月每周的减肥记录

### 加载weight_loss.csv数据集

In [32]:
weight_loss = pd.read_csv('./data/weight_loss.csv')
weight_loss

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190
8,Bob,Feb,Week 1,283
9,Amy,Feb,Week 1,190


### 定义函数计算每人每周减肥比例并测试

In [33]:
def find_perc_loss(s):
    """Return the fractional weight loss of each entry relative to the first one.

    The result is signed: positive means weight was lost compared with the
    first entry, negative means weight was gained. Taking the absolute value
    would report a gain as if it were a loss and could crown the wrong winner.

    Parameters
    ----------
    s : pandas.Series
        Weights of one person within one month, in chronological order; the
        first element is the baseline.

    Returns
    -------
    pandas.Series
        ``(s.iloc[0] - s) / s.iloc[0]``, aligned with ``s``.
    """
    start_weight = s.iloc[0]
    return (start_weight - s) / start_weight

In [34]:
# 查找Bob1月份的数据
bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
bob_jan

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
2,Bob,Jan,Week 2,288
4,Bob,Jan,Week 3,283
6,Bob,Jan,Week 4,283


In [35]:
# 测试计算减肥比例的方法
find_perc_loss(bob_jan['Weight'])

0    0.000000
2    0.010309
4    0.027491
6    0.027491
Name: Weight, dtype: float64

### 计算每人每周的减肥比例

In [36]:
pcnt_loss = weight_loss.groupby(['Name','Month'])['Weight'].transform(find_perc_loss)
pcnt_loss

0     0.000000
1     0.000000
2     0.010309
3     0.040609
4     0.027491
5     0.040609
6     0.027491
7     0.035533
8     0.000000
9     0.000000
10    0.028269
11    0.031579
12    0.053004
13    0.068421
14    0.053004
15    0.089474
16    0.000000
17    0.000000
18    0.011194
19    0.000000
20    0.011194
21    0.017341
22    0.026119
23    0.017341
24    0.000000
25    0.000000
26    0.011494
27    0.035294
28    0.030651
29    0.035294
30    0.042146
31    0.052941
Name: Weight, dtype: float64

### 增加每周减肥比例列

In [37]:
weight_loss['Perc Weight Loss'] = pcnt_loss
weight_loss.head()

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
0,Bob,Jan,Week 1,291,0.0
1,Amy,Jan,Week 1,197,0.0
2,Bob,Jan,Week 2,288,0.010309
3,Amy,Jan,Week 2,189,0.040609
4,Bob,Jan,Week 3,283,0.027491


### 查找每个月最后一周的数据 用来比较减肥效果

In [38]:
week4 = weight_loss.query('Week == "Week 4"')
week4

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
6,Bob,Jan,Week 4,283,0.027491
7,Amy,Jan,Week 4,190,0.035533
14,Bob,Feb,Week 4,268,0.053004
15,Amy,Feb,Week 4,173,0.089474
22,Bob,Mar,Week 4,261,0.026119
23,Amy,Mar,Week 4,170,0.017341
30,Bob,Apr,Week 4,250,0.042146
31,Amy,Apr,Week 4,161,0.052941


### 在第四周数据基础上，找到 Bob 和 Amy的减肥数据

In [39]:
week4_Bob = week4.query('Name == "Bob"')[['Month','Perc Weight Loss']]
week4_Bob

Unnamed: 0,Month,Perc Weight Loss
6,Jan,0.027491
14,Feb,0.053004
22,Mar,0.026119
30,Apr,0.042146


In [40]:
week4_Amy = week4.query('Name == "Amy"')[['Month','Perc Weight Loss']]
week4_Amy

Unnamed: 0,Month,Perc Weight Loss
7,Jan,0.035533
15,Feb,0.089474
23,Mar,0.017341
31,Apr,0.052941


### 比较Bob 和 Amy的减肥效果，Amy的减肥效果更明显

In [41]:
week4_Bob.set_index('Month') - week4_Amy.set_index('Month')

Unnamed: 0_level_0,Perc Weight Loss
Month,Unnamed: 1_level_1
Jan,-0.008042
Feb,-0.03647
Mar,0.008778
Apr,-0.010796


In [42]:
week4_Amy.set_index('Month') - week4_Bob.set_index('Month')

Unnamed: 0_level_0,Perc Weight Loss
Month,Unnamed: 1_level_1
Jan,0.008042
Feb,0.03647
Mar,-0.008778
Apr,0.010796


# 分组过滤
- 使用 groupby 方法还可以过滤数据，调用 filter 方法，传入一个返回布尔值的函数，返回 False 的数据会被过滤掉

## 使用 tips.csv 用餐数据集，加载数据并统计不同用餐人数的数量

In [43]:
tips = pd.read_csv('./data/tips.csv')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [44]:
# 统计不同人数的数量
tips['size'].value_counts()

2    156
3     38
4     37
5      5
1      4
6      4
Name: size, dtype: int64

- 结果显示：人数为1、5和6人的数据比较少，考虑将这部分数据过滤掉

In [45]:
# Keep only the rows whose party size has more than 30 non-null records;
# a group for which the lambda returns False is dropped entirely.
tips_filtered = tips.groupby('size').filter(lambda x: x['size'].count() > 30)
tips_filtered

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## 查看结果

In [46]:
tips_filtered['size'].value_counts()

2    156
3     38
4     37
Name: size, dtype: int64

# DataFrameGroupBy 对象

## 分组操作

### 准备数据，加载 tips.csv 数据集，随机取出其中的 10 条数据

In [47]:
tips_10 = pd.read_csv('./data/tips.csv').sample(10,random_state=42)
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


### 调用 groupby 方法，创建分组对象

In [48]:
sex_groups = tips_10.groupby('sex')
sex_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022CB6D8EE50>

- 注意：sex_groups 是一个DataFrameGroupBy对象，如果想查看计算过的分组，可以借助groups属性实现

In [49]:
sex_groups.groups

{'Female': [198, 124, 101], 'Male': [24, 6, 153, 211, 176, 192, 9]}

- 结果说明：上面返回的结果是每个分组对应的 DataFrame 行索引（行标签），也就是这些数据在原始数据中的行号

### 在 DataFrameGroupBy 对象基础上，直接就可以进行 aggregate、transform 等计算

In [50]:
# numeric_only=True restricts the mean to the numeric columns. The frame also
# holds string columns (sex, smoker, day, time); without the flag older pandas
# dropped them with a warning and newer pandas raises instead.
sex_groups.mean(numeric_only=True)

  sex_groups.mean()


Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,13.62,2.506667,2.0
Male,20.02,2.875714,2.571429


- 结果说明：上面结果直接计算了按 sex 分组后，所有列的平均值，但只返回了数值列的结果，非数值列不会计算平均值

### 通过 get_group 方法选择分组

In [51]:
sex_groups.get_group('Female')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
198,13.0,2.0,Female,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [52]:
sex_groups.get_group('Male')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2


## 遍历分组

- 通过 DataFrameGroupBy 对象，可以遍历所有分组，相比于在 groupby 之后使用aggregate、transform和filter，有时候使用 for 循环解决问题更简单

In [53]:
for sex_group in sex_groups:
    print(type(sex_group))
    print(sex_group)

<class 'tuple'>
('Female',      total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2)
<class 'tuple'>
('Male',      total_bill   tip   sex smoker   day    time  size
24        19.82  3.18  Male     No   Sat  Dinner     2
6          8.77  2.00  Male     No   Sun  Dinner     2
153       24.55  2.00  Male     No   Sun  Dinner     4
211       25.89  5.16  Male    Yes   Sat  Dinner     4
176       17.89  2.00  Male    Yes   Sun  Dinner     2
192       28.44  2.56  Male    Yes  Thur   Lunch     2
9         14.78  3.23  Male     No   Sun  Dinner     2)


- 注意：DataFrameGroupBy 对象不支持按位置下标（如 sex_groups[0]）取出某个分组，会报错；方括号只能用来选择列（如 ['lifeExp']）

In [54]:
#sex_groups[0]

In [55]:
# Each item yielded by the groupby object is a (group key, group DataFrame) pair;
# unpack it directly instead of indexing the tuple.
for group_key, group_frame in sex_groups:
    print(group_key)
    print(type(group_key))
    print(group_frame)
    print(type(group_frame))

Female
<class 'str'>
     total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2
<class 'pandas.core.frame.DataFrame'>
Male
<class 'str'>
     total_bill   tip   sex smoker   day    time  size
24        19.82  3.18  Male     No   Sat  Dinner     2
6          8.77  2.00  Male     No   Sun  Dinner     2
153       24.55  2.00  Male     No   Sun  Dinner     4
211       25.89  5.16  Male    Yes   Sat  Dinner     4
176       17.89  2.00  Male    Yes   Sun  Dinner     2
192       28.44  2.56  Male    Yes  Thur   Lunch     2
9         14.78  3.23  Male     No   Sun  Dinner     2
<class 'pandas.core.frame.DataFrame'>


## 多个分组

- 前面使用的 groupby 语句只包含一个变量，可以在 groupby 中添加多个变量

### 比如上面用到的 tips.csv 数据集，可以使用groupby按性别和用餐时间分别计算小费数据的平均值

In [56]:
# Group by two keys and average the numeric columns only. numeric_only=True is
# passed explicitly because the frame still contains string columns (smoker, day),
# which newer pandas refuses to average instead of silently dropping.
group_avg = tips_10.groupby(['sex','time']).mean(numeric_only=True)
group_avg

  group_avg = tips_10.groupby(['sex','time']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size
sex,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Dinner,15.38,3.0,2.0
Female,Lunch,12.74,2.26,2.0
Male,Dinner,18.616667,2.928333,2.666667
Male,Lunch,28.44,2.56,2.0


### 分别查看分组之后结果的列标签和行标签

In [57]:
# 查看列标签
group_avg.columns

Index(['total_bill', 'tip', 'size'], dtype='object')

In [58]:
# 查看行标签
group_avg.index

MultiIndex([('Female', 'Dinner'),
            ('Female',  'Lunch'),
            (  'Male', 'Dinner'),
            (  'Male',  'Lunch')],
           names=['sex', 'time'])

- 可以看到，多个分组之后返回的是MultiIndex，如果想得到一个普通的DataFrame，可以在结果上调用reset_index 方法

In [59]:
group_avg.reset_index()

Unnamed: 0,sex,time,total_bill,tip,size
0,Female,Dinner,15.38,3.0,2.0
1,Female,Lunch,12.74,2.26,2.0
2,Male,Dinner,18.616667,2.928333,2.666667
3,Male,Lunch,28.44,2.56,2.0


### 也可以在分组的时候设置 as_index=False 参数（默认是 True），效果与调用 reset_index() 一样

In [60]:
# as_index=False: the grouping columns stay ordinary columns of the result
# instead of becoming its row index.
# numeric_only=True: average the numeric columns only; the remaining string
# columns (smoker, day) cannot be averaged and newer pandas raises on them.
tips.groupby(['sex','time'],as_index=False).mean(numeric_only=True)

  tips.groupby(['sex','time'],as_index=False).mean()


Unnamed: 0,sex,time,total_bill,tip,size
0,Female,Dinner,19.213077,3.002115,2.461538
1,Female,Lunch,16.339143,2.582857,2.457143
2,Male,Dinner,21.461452,3.144839,2.701613
3,Male,Lunch,18.048485,2.882121,2.363636


# 总结
- 分组是数据分析中常见的操作，有助于从不同角度观察数据
- 分组之后可以得到 DataFrameGroupBy 对象，该对象可以进行聚合、转换、过滤操作
- 分组之后的数据处理可以使用已有的内置函数，也可以使用自定义函数
- 分组不但可以对单个字段进行分组，也可以对多个字段进行分组，多个字段分组之后可以得到MultiIndex数据，可以通过 reset_index 方法将数据变成普通的 DataFrame