# 第六课 数据分析工具Pandas高阶
## 第四节 分组操作(2) -- 自定义分组及聚合操作

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path to the 2016 happiness CSV file (relative path)
filepath = './datasets/2016_happiness.csv'

* 读取文件

In [3]:
# Load only the four columns this lesson works with.
data = pd.read_csv(
    filepath,
    usecols=['Country', 'Region', 'Happiness Rank', 'Happiness Score'],
)

In [4]:
# Preview the first rows of the loaded data
data.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score
0,Denmark,Western Europe,1,7.526
1,Switzerland,Western Europe,2,7.509
2,Iceland,Western Europe,3,7.501
3,Norway,Western Europe,4,7.498
4,Finland,Western Europe,5,7.413


* 自定义分组

In [5]:
# Custom grouping rule
def get_score_group(score):
    """Map a single happiness score to a coarse bucket label.

    Parameters
    ----------
    score : float
        One happiness score value.

    Returns
    -------
    str or None
        'low' when score <= 4, 'middle' when 4 < score <= 6, 'high' when
        score > 6, and None when the score is NaN (missing).
    """
    # NaN fails every ordered comparison, so without this guard a missing
    # score would fall through both checks and be mislabelled as 'high'.
    # `score != score` is true only for NaN and needs no extra import.
    if score != score:
        return None
    if score <= 4:
        return 'low'
    if score <= 6:
        return 'middle'
    return 'high'

In [6]:
# Method 1: pass the custom function itself as the grouping key.
# A function key is applied to each index label, so the scores are moved
# into the index first.
data2 = data.set_index(keys='Happiness Score')
data2.groupby(by=get_score_group).size()

high      47
low       21
middle    89
dtype: int64

In [8]:
# Method 2: build an explicit grouping column by hand.
# NOTE: this adds the 'score group' column to `data` in place; the later
# cell that groups by 'score group' relies on this cell having been run.
data['score group'] = data['Happiness Score'].apply(get_score_group)
data.tail()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,score group
152,Benin,Sub-Saharan Africa,153,3.484,low
153,Afghanistan,Southern Asia,154,3.36,low
154,Togo,Sub-Saharan Africa,155,3.303,low
155,Syria,Middle East and Northern Africa,156,3.069,low
156,Burundi,Sub-Saharan Africa,157,2.905,low


In [9]:
# Count the rows in each score group, using the 'score group' column
data.groupby('score group').size()

score group
high      47
low       21
middle    89
dtype: int64

* 自定义聚合操作

In [10]:
# Built-in aggregation: per-region maximum of every other column.
# NOTE: for the string columns (Country, score group) the maximum is the
# lexicographically last value ('middle' sorts after 'high'), so those two
# columns are not meaningful in this result.
data.groupby('Region').max()

Unnamed: 0_level_0,Country,Happiness Rank,Happiness Score,score group
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia and New Zealand,New Zealand,9,7.334,high
Central and Eastern Europe,Uzbekistan,129,6.596,middle
Eastern Asia,Taiwan,101,6.379,middle
Latin America and Caribbean,Venezuela,136,7.087,middle
Middle East and Northern Africa,Yemen,156,7.267,middle
North America,United States,13,7.404,high
Southeastern Asia,Vietnam,140,6.739,middle
Southern Asia,Sri Lanka,154,5.196,middle
Sub-Saharan Africa,Zimbabwe,157,5.648,middle
Western Europe,United Kingdom,99,7.526,middle


In [11]:
# The same per-region maximum, expressed through agg().
# The aggregation is given by its string name: passing the NumPy function
# object (np.max) to a groupby agg() is deprecated since pandas 2.1, and the
# string form selects the same built-in groupby max.
data.groupby('Region').agg('max')

Unnamed: 0_level_0,Country,Happiness Rank,Happiness Score,score group
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia and New Zealand,New Zealand,9,7.334,high
Central and Eastern Europe,Uzbekistan,129,6.596,middle
Eastern Asia,Taiwan,101,6.379,middle
Latin America and Caribbean,Venezuela,136,7.087,middle
Middle East and Northern Africa,Yemen,156,7.267,middle
North America,United States,13,7.404,high
Southeastern Asia,Vietnam,140,6.739,middle
Southern Asia,Sri Lanka,154,5.196,middle
Sub-Saharan Africa,Zimbabwe,157,5.648,middle
Western Europe,United Kingdom,99,7.526,middle


In [12]:
# Pass a list of aggregations to get one result column per aggregation.
# String names are used instead of np.max / np.min / np.mean: passing NumPy
# function objects to a groupby agg() is deprecated since pandas 2.1, and the
# resulting column labels then depended on the function's __name__
# ('amax'/'amin' with older NumPy). With strings the columns are always
# labelled 'max', 'min' and 'mean'.
data.groupby('Region')['Happiness Score'].agg(['max', 'min', 'mean'])

Unnamed: 0_level_0,amax,amin,mean
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia and New Zealand,7.334,7.313,7.3235
Central and Eastern Europe,6.596,4.217,5.37069
Eastern Asia,6.379,4.907,5.624167
Latin America and Caribbean,7.087,4.028,6.10175
Middle East and Northern Africa,7.267,3.069,5.386053
North America,7.404,7.104,7.254
Southeastern Asia,6.739,3.907,5.338889
Southern Asia,5.196,3.36,4.563286
Sub-Saharan Africa,5.648,2.905,4.136421
Western Europe,7.526,5.033,6.685667


In [13]:
# Use a dict to choose a different aggregation for each column.
# Aggregations are named by string rather than np.mean / np.max, because
# passing NumPy function objects to a groupby agg() is deprecated since
# pandas 2.1.
data.groupby('Region').agg({'Happiness Score': 'mean', 'Happiness Rank': 'max'})

Unnamed: 0_level_0,Happiness Rank,Happiness Score
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia and New Zealand,9,7.3235
Central and Eastern Europe,129,5.37069
Eastern Asia,101,5.624167
Latin America and Caribbean,136,6.10175
Middle East and Northern Africa,156,5.386053
North America,13,7.254
Southeastern Asia,140,5.338889
Southern Asia,154,4.563286
Sub-Saharan Africa,157,4.136421
Western Europe,99,6.685667


In [14]:
# Pass a user-defined function as the aggregation
def max_min_diff(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` must provide ``max()`` and ``min()`` methods.
    """
    largest, smallest = x.max(), x.min()
    return largest - smallest

# Per-region difference between the largest and smallest Happiness Rank
data.groupby('Region')['Happiness Rank'].agg(max_min_diff)

Region
Australia and New Zealand            1
Central and Eastern Europe         102
Eastern Asia                        67
Latin America and Caribbean        122
Middle East and Northern Africa    145
North America                        7
Southeastern Asia                  118
Southern Asia                       70
Sub-Saharan Africa                  91
Western Europe                      98
Name: Happiness Rank, dtype: int64