In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Build a small demo DataFrame from column names and data: two categorical key
# columns plus two numeric columns filled with standard-normal samples.
# A dedicated, seeded generator keeps the values identical across
# "Restart Kernel -> Run All", so the notebook is reproducible.
rng = np.random.RandomState(42)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.randn(5),
    'data2': rng.randn(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.223888,0.11447
1,a,two,0.62446,-1.265352
2,b,one,0.371178,1.426521
3,b,two,-1.663985,0.810567
4,a,one,-0.570321,-0.005611


In [3]:
# Group by key1 so that the mean of the data1 column can be computed per group.
# Approach: select the data1 Series and call groupby on it, passing the key1
# column as the grouping key.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000002AC8FF30FD0>

In [4]:
# `grouped` is a GroupBy object: nothing has actually been computed yet, it only
# holds intermediate data about the group key df['key1'].
# In short, the object has all the information needed to run an operation on
# each group; calling mean() triggers the per-group computation.
grouped.mean()

key1
a   -0.056583
b   -0.646404
Name: data1, dtype: float64

数据根据分组键进行了聚合，产生了一个新的 Series，其索引为 key1 列中的唯一值。

In [6]:
# Pass several arrays at once to group by more than one key; the result is a
# Series with a hierarchical (two-level) index made of the key1/key2 pairs.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.397105
      two     0.624460
b     one     0.371178
      two    -1.663985
Name: data1, dtype: float64

In [7]:
# Pivot the innermost index level (key2) into columns, leaving key1 as the rows.
means.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.397105,0.62446
b,0.371178,-1.663985


In [8]:
# Group keys do not have to come from the DataFrame itself: any arrays of the
# right length can be used.
years = np.array([2005, 2005, 2006, 2005, 2006])
states = np.array([
    'Ohio',
    'California',
    'California',
    'Ohio',
    'Ohio',
])

In [9]:
# Mean of data1 for every (state, year) combination given by the external arrays.
(
    df['data1']
    .groupby(by=[states, years])
    .mean()
)

California  2005    0.624460
            2006    0.371178
Ohio        2005   -0.943936
            2006   -0.570321
Name: data1, dtype: float64

还可以将列名（可以是字符串、数字等）作为分组键

In [10]:
# Group by the column name 'key1' and average the numeric columns.
# numeric_only=True drops the non-numeric 'key2' column explicitly: pandas >= 2.0
# no longer discards such nuisance columns silently and raises a TypeError when
# asked to average strings.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.056583,-0.385498
b,-0.646404,1.118544


In [11]:
# Group by both key columns (given by name) and average the remaining columns.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.397105,0.054429
a,two,0.62446,-1.265352
b,one,0.371178,1.426521
b,two,-1.663985,0.810567


groupby 进行聚合时会排除非数值列（“麻烦列”，例如按 key1 分组时的 key2）；默认情况下所有数值列都会被聚合。注意：自 pandas 2.0 起不再自动排除麻烦列，需要显式传入 numeric_only=True。

In [12]:
# size() returns the number of rows that fall into each group.
df.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [14]:
# Iterating over a GroupBy object yields (group key, sub-DataFrame) pairs.
for key, frame in df.groupby('key1'):
    print(f"{key} \n {frame}")

a 
   key1 key2     data1     data2
0    a  one -0.223888  0.114470
1    a  two  0.624460 -1.265352
4    a  one -0.570321 -0.005611
b 
   key1 key2     data1     data2
2    b  one  0.371178  1.426521
3    b  two -1.663985  0.810567


In [15]:
# With several keys, the first element of each yielded pair is itself a tuple
# holding one value per key.
for keys, frame in df.groupby(['key1', 'key2']):
    k1, k2 = keys
    print(f"{k1} {k2} \n {frame}")

a one 
   key1 key2     data1     data2
0    a  one -0.223888  0.114470
4    a  one -0.570321 -0.005611
a two 
   key1 key2    data1     data2
1    a  two  0.62446 -1.265352
b one 
   key1 key2     data1     data2
2    b  one  0.371178  1.426521
b two 
   key1 key2     data1     data2
3    b  two -1.663985  0.810567


可以对数据片段做任何操作，比如做成字典

In [17]:
# Collect the groups into a dict keyed by group name.
pieces = {key: frame for key, frame in df.groupby('key1')}
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.223888  0.114470
 1    a  two  0.624460 -1.265352
 4    a  one -0.570321 -0.005611, 'b':   key1 key2     data1     data2
 2    b  one  0.371178  1.426521
 3    b  two -1.663985  0.810567}

In [18]:
# Inspect what kind of object is stored for group 'b' in the pieces dict.
type(pieces['b'])

pandas.core.frame.DataFrame

In [20]:
# Materialise the GroupBy iterator as a list of (key, sub-DataFrame) tuples.
[(key, frame) for key, frame in df.groupby('key1')]

[('a',   key1 key2     data1     data2
  0    a  one -0.223888  0.114470
  1    a  two  0.624460 -1.265352
  4    a  one -0.570321 -0.005611), ('b',   key1 key2     data1     data2
  2    b  one  0.371178  1.426521
  3    b  two -1.663985  0.810567)]

In [21]:
# Grouping a whole DataFrame returns a lazy DataFrameGroupBy object.
df.groupby(by='key1')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000002AC92BAA2E8>

In [22]:
# Without unpacking, each iteration item is the raw (key, sub-DataFrame) tuple.
for key_and_frame in df.groupby(by='key1'):
    print(key_and_frame)

('a',   key1 key2     data1     data2
0    a  one -0.223888  0.114470
1    a  two  0.624460 -1.265352
4    a  one -0.570321 -0.005611)
('b',   key1 key2     data1     data2
2    b  one  0.371178  1.426521
3    b  two -1.663985  0.810567)


In [24]:
# Show the dtype of every column of df (a Series indexed by column name).
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [28]:
# Split the columns of df into one sub-frame per dtype.
# `groupby(..., axis=1)` is deprecated since pandas 2.1 and rejected by later
# releases, so group the transposed frame by dtype instead. Each group's row
# labels are the original column names; selecting them from df (rather than
# transposing the group back) keeps the original column dtypes intact.
[(dtype, df[group.index]) for dtype, group in df.T.groupby(df.dtypes)]

[(dtype('float64'),       data1     data2
  0 -0.223888  0.114470
  1  0.624460 -1.265352
  2  0.371178  1.426521
  3 -1.663985  0.810567
  4 -0.570321 -0.005611), (dtype('O'),   key1 key2
  0    a  one
  1    a  two
  2    b  one
  3    b  two
  4    a  one)]

In [29]:
# Selecting with a list of column names keeps the result a DataFrame
# (here with the single column data2).
df.groupby(by=['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.054429
a,two,-1.265352
b,one,1.426521
b,two,0.810567


In [34]:
# Selecting with a single column name yields a grouped Series instead,
# so the aggregate is a Series named data2.
df.groupby(by=['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.054429
      two    -1.265352
b     one     1.426521
      two     0.810567
Name: data2, dtype: float64

In [35]:
# 5x5 frame of standard-normal samples, rows labelled by person and columns a-e.
# A seeded generator is used so the values are the same on every
# "Restart Kernel -> Run All".
people = pd.DataFrame(
    np.random.RandomState(7).randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)
people

Unnamed: 0,a,b,c,d,e
Joe,-1.530118,-0.355276,-1.780145,-0.060734,0.870525
Steve,1.228132,1.974597,1.269202,0.691717,0.099211
Wes,-0.556833,-1.178751,0.856106,0.12982,0.329898
Jim,-1.141639,-0.01951,-0.939034,-0.369156,-0.496966
Travis,1.149363,1.781469,-0.463331,0.280201,-0.407344


In [38]:
# Blank out the values at row position 2 and column positions 1 and 2
# (row 'Wes', columns 'b' and 'c') to introduce missing data.
people.iloc[[2], [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.530118,-0.355276,-1.780145,-0.060734,0.870525
Steve,1.228132,1.974597,1.269202,0.691717,0.099211
Wes,-0.556833,,,0.12982,0.329898
Jim,-1.141639,-0.01951,-0.939034,-0.369156,-0.496966
Travis,1.149363,1.781469,-0.463331,0.280201,-0.407344


In [40]:
# Map column labels to group names. The 'f' entry has no matching column in
# people, so it does not contribute a group.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# Column-wise grouping: `groupby(..., axis=1)` is deprecated since pandas 2.1
# and rejected by later releases, so group the transposed frame (columns become
# rows) and transpose the aggregate back.
# NOTE: by_column therefore groups people.T; transpose its aggregates to get
# one row per person again.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,-1.840878,-1.014868
Steve,1.96092,3.30194
Wes,0.12982,-0.226935
Jim,-1.30819,-1.658116
Travis,-0.18313,2.523488


series作为分组键，pandas会检查series以确保其索引跟分组轴是对齐的

In [41]:
# The same label -> group mapping, expressed as a Series indexed by column label.
map_series = pd.Series(data=mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [43]:
# Sum the columns of people per colour group defined by map_series.
# `groupby(..., axis=1)` is deprecated since pandas 2.1 and rejected by later
# releases, so group the transposed frame (whose index holds the column labels
# that map_series is aligned on) and transpose the result back.
people.T.groupby(map_series).sum().T

Unnamed: 0,blue,red
Joe,-1.840878,-1.014868
Steve,1.96092,3.30194
Wes,0.12982,-0.226935
Jim,-1.30819,-1.658116
Travis,-0.18313,2.523488


In [44]:
# A function used as the group key is called once per index label; here rows
# are grouped by the length of each person's name.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,-3.22859,-0.374786,-2.719178,-0.30007,0.703458
5,1.228132,1.974597,1.269202,0.691717,0.099211
6,1.149363,1.781469,-0.463331,0.280201,-0.407344


In [45]:
# Functions can be mixed with arrays/lists as keys: group by name length and by
# the per-row labels in key_list, then take the column-wise minimum per group.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.530118,-0.355276,-1.780145,-0.060734,0.329898
3,two,-1.141639,-0.01951,-0.939034,-0.369156,-0.496966
5,one,1.228132,1.974597,1.269202,0.691717,0.099211
6,two,1.149363,1.781469,-0.463331,0.280201,-0.407344
