# pandas实现groupby分组统计

类似SQL:
select city, max(temperature) from city_weather group by city;

groupby: 先对数据分组, 然后在每个分组上应用聚合函数、转换函数

本次示例:
1. 分组使用聚合函数做数据统计
2. 遍历groupby的结果理解执行流程
3. 实例分组探索天气数据

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
# from pylab import mpl
import matplotlib.pyplot as plt
# 加上这一句, 能在jupyter notebook展示matplot图表
%matplotlib inline

plt.rcParams["font.sans-serif"]=["SimHei"] # Default font: fixes plots not being able to display Chinese text
plt.rcParams["axes.unicode_minus"]=False  # Fixes the minus sign '-' being shown as a box when saving figures

# mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# mpl.rcParams['axes.unicode_minus'] = False

In [2]:
# Demo frame: two categorical key columns (A, B) and two columns of
# random standard-normal values (C, D), eight rows each.
df = pd.DataFrame(
    dict(
        A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.800575,0.926776
1,bar,one,-0.725976,0.675453
2,foo,two,0.511159,0.33586
3,bar,three,1.340017,0.290663
4,foo,two,0.478787,-0.651279
5,bar,two,-0.861409,0.017083
6,foo,one,-1.001919,0.861418
7,foo,three,0.708341,0.576321


## 一. 分组使用聚合函数做数据统计
**1.单个列groupby, 查询所有数据列的统计**

In [3]:
# Sum only the numeric columns (C, D) per value of A. Without
# numeric_only=True, pandas 2.x also "sums" (concatenates) the string column B.
df.groupby('A').sum(numeric_only=True)

  df.groupby('A').sum()


Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.247368,0.983199
foo,-1.104206,2.049096


**2.多个列groupby, 查询所有数据列的统计**

In [4]:
# Group by both key columns; each (A, B) pair becomes one row of a two-level index.
df.groupby(['A', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.725976,0.675453
bar,three,1.340017,0.290663
bar,two,-0.861409,0.017083
foo,one,-1.401247,0.894097
foo,three,0.708341,0.576321
foo,two,0.494973,-0.157709


我们看到('A', 'B')成对变成了二级索引

In [5]:
# as_index=False keeps A and B as ordinary columns instead of index levels.
df.groupby(['A', 'B'], as_index=False).mean()

Unnamed: 0,A,B,C,D
0,bar,one,-0.725976,0.675453
1,bar,three,1.340017,0.290663
2,bar,two,-0.861409,0.017083
3,foo,one,-1.401247,0.894097
4,foo,three,0.708341,0.576321
5,foo,two,0.494973,-0.157709


**3.同时查看多种数据统计**

In [6]:
# Drop the string column B so every remaining data column can be aggregated.
exclude_b_df = df.loc[:, df.columns != 'B']
# Use string aliases rather than np.sum/np.mean/np.std: passing NumPy
# callables to agg is deprecated, and 'std' keeps pandas' sample std (ddof=1).
exclude_b_df.groupby('A').agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-0.247368,-0.082456,1.233757,0.983199,0.327733,0.330747
foo,-1.104206,-0.220841,1.117399,2.049096,0.409819,0.638418


我们看到: 列变成了多级索引

**4.查看单列的结果数据统计**

In [7]:
# Approach 1: select column C before aggregating (pre-filtering, better performance).
# String aliases replace the deprecated np.sum/np.mean/np.std callables.
exclude_b_df.groupby('A')['C'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.247368,-0.082456,1.233757
foo,-1.104206,-0.220841,1.117399


In [8]:
# Approach 2: aggregate every column first, then pick C from the result.
# String aliases replace the deprecated np.sum/np.mean/np.std callables.
exclude_b_df.groupby('A').agg(['sum', 'mean', 'std'])['C']

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.247368,-0.082456,1.233757
foo,-1.104206,-0.220841,1.117399


**5.不同列使用不同的聚合函数**

In [9]:
# Per-column aggregations: sum for C, mean and sample std for D.
# String aliases replace the deprecated np.sum/np.mean/np.std callables.
exclude_b_df.groupby('A').agg({'C': 'sum', 'D': ['mean', 'std']})

Unnamed: 0_level_0,C,D,D
Unnamed: 0_level_1,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,-0.247368,0.327733,0.330747
foo,-1.104206,0.409819,0.638418


## 二.遍历groupby的结果理解执行流程
for 循环可以直接遍历每个group

**1.遍历单个聚合的分组**

In [10]:
# Keep the grouped object itself (no aggregation applied yet) so it can be iterated below.
g = exclude_b_df.groupby('A')
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002A335F81450>

In [11]:
# Iterating a groupby yields (group key, sub-DataFrame) pairs.
for name, group in g:
    # Single print call: the key, the group's rows, then a blank separator line.
    print(name, group, '', sep='\n')

bar
     A         C         D
1  bar -0.725976  0.675453
3  bar  1.340017  0.290663
5  bar -0.861409  0.017083

foo
     A         C         D
0  foo -1.800575  0.926776
2  foo  0.511159  0.335860
4  foo  0.478787 -0.651279
6  foo -1.001919  0.861418
7  foo  0.708341  0.576321



**可以获取单个分组的数据**

In [12]:
# Fetch the rows belonging to a single group by its key.
g.get_group('bar')

Unnamed: 0,A,C,D
1,bar,-0.725976,0.675453
3,bar,1.340017,0.290663
5,bar,-0.861409,0.017083


**2.遍历多个聚合的分组**

In [13]:
# Grouped object keyed on two columns; reused by the cells below.
g1 = df.groupby(['A', 'B'])

In [14]:
# With two grouping columns, each key is an (A, B) tuple.
for name, group in g1:
    # Single print call: the key, the group's rows, then a blank separator line.
    print(name, group, '', sep='\n')

('bar', 'one')
     A    B         C         D
1  bar  one -0.725976  0.675453

('bar', 'three')
     A      B         C         D
3  bar  three  1.340017  0.290663

('bar', 'two')
     A    B         C         D
5  bar  two -0.861409  0.017083

('foo', 'one')
     A    B         C         D
0  foo  one -1.800575  0.926776
6  foo  one -1.001919  0.861418

('foo', 'three')
     A      B         C         D
7  foo  three  0.708341  0.576321

('foo', 'two')
     A    B         C         D
2  foo  two  0.511159  0.335860
4  foo  two  0.478787 -0.651279



可以看到, name是一个包含2个元素的tuple, 分别对应分组列A和B的取值

In [15]:
# For a multi-column groupby the group key is a tuple, one value per grouping column.
g1.get_group(('foo', 'one'))

Unnamed: 0,A,B,C,D
0,foo,one,-1.800575,0.926776
6,foo,one,-1.001919,0.861418


**可以直接查询group后的某几列, 生成Series或者子DataFrame**

In [16]:
# Selecting one column from the grouped object gives a SeriesGroupBy (see the repr below).
g1['C']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002A335F81960>

In [17]:
# Each group of the column-selected groupby is printed with its type.
for name, group in g1['C']:
    # Single print call: key, values, the object's type, then a blank separator line.
    print(name, group, type(group), '', sep='\n')

('bar', 'one')
1   -0.725976
Name: C, dtype: float64
<class 'pandas.core.series.Series'>

('bar', 'three')
3    1.340017
Name: C, dtype: float64
<class 'pandas.core.series.Series'>

('bar', 'two')
5   -0.861409
Name: C, dtype: float64
<class 'pandas.core.series.Series'>

('foo', 'one')
0   -1.800575
6   -1.001919
Name: C, dtype: float64
<class 'pandas.core.series.Series'>

('foo', 'three')
7    0.708341
Name: C, dtype: float64
<class 'pandas.core.series.Series'>

('foo', 'two')
2    0.511159
4    0.478787
Name: C, dtype: float64
<class 'pandas.core.series.Series'>



其实所有的聚合统计, 都是在DataFrame和Series上进行的

## 三. 实例分组探索天气数据

In [18]:
# Reading .xlsx needs the optional openpyxl package (see the ImportError below).
# NOTE(review): 'weater' in the file name may be a typo of 'weather' - confirm against the data directory.
df_weather = pd.read_excel('./data/weather/weater_beijing.xlsx')
# Strip the degree sign, treat empty and missing readings as '0', then cast to int32.
# Direct column assignment is used because `df.loc[:, col] = ...` writes into the
# existing object column (dtype would stay object), and a chained
# `df[col].fillna(..., inplace=True)` may operate on a temporary copy.
df_weather['最高温'] = (
    df_weather['最高温']
    .str.replace('°', '')
    .replace('', '0')
    .fillna('0')
    .astype('int32')
)

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [None]:
df_weather

In [None]:
# Strip the degree sign, treat empty and missing readings as '0', then cast to int32.
# Direct column assignment is used because `df.loc[:, col] = ...` writes into the
# existing object column, so the int32 dtype would not take effect.
df_weather['最低温'] = (
    df_weather['最低温']
    .str.replace('°', '')
    .replace('', '0')
    .fillna('0')
    .astype('int32')
)

In [None]:
df_weather

In [None]:
# Missing readings get the placeholder '未统计'; every entry is then split on
# spaces into a list of parts.
df_weather['空气质量指数'] = df_weather['空气质量指数'].fillna('未统计').str.split(' ')

# Module-level accumulators filled by split_air, one entry per call.
# Despite the names, air_quality receives the integer value and
# air_quality_index receives the text part.
air_quality = []
air_quality_index = []


def split_air(d):
    """Append one parsed entry to the module-level accumulator lists.

    Args:
        d: Sequence of string parts. With more than one part, the first is
            converted with int() and the second is kept as text. With a
            single part there is no number: -1 is recorded and the part
            itself is kept as text.

    Returns:
        None. Results are appended to ``air_quality`` and
        ``air_quality_index``.
    """
    has_number = len(d) > 1
    air_quality.append(int(d[0]) if has_number else -1)
    air_quality_index.append(d[1] if has_number else d[0])


# apply is used only for split_air's side effect of filling the two lists.
df_weather['空气质量指数'].apply(split_air)
# Direct assignment replaces the column so the numeric values get an integer
# dtype; `df.loc[:, col] = ...` would write into the existing object column.
df_weather['空气质量指数'] = air_quality
df_weather['空气质量'] = air_quality_index

In [None]:
df_weather

In [None]:
# Add a month column from the first 7 characters of the date string
# (presumably 'YYYY-MM' - confirm against the data's date format).
df_weather['月份'] = df_weather['日期'].str.slice(0, 7)

In [None]:
df_weather

**1.查看每个月的最高温**

In [None]:
# Highest daily maximum temperature within each month, as a Series indexed by month.
data = df_weather['最高温'].groupby(df_weather['月份']).max()
data

In [None]:
type(data)

In [None]:
data.plot()

**2.查看每个月的最高温、最低温、平均空气质量指数**

In [None]:
# Per month: highest max temperature, lowest min temperature, mean air-quality index.
# String aliases replace the deprecated np.max/np.min/np.mean callables.
data1 = df_weather.groupby('月份').agg({'最高温': 'max', '最低温': 'min', '空气质量指数': 'mean'})
data1

In [None]:
data1.plot()