In [1]:
# GroupBy技术
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Demo frame: two string key columns plus two columns of random floats.
df = DataFrame({
    'key1': list('aabba'),
    'key2': 'one two one two one'.split(),
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
# Split data1 into groups according to the key1 value on the same row.
grouped = df['data1'].groupby(df['key1'])
print(df)
# Per-group mean of data1 (displayed as the cell result).
grouped.mean()

      data1     data2 key1 key2
0 -0.391481  0.411457    a  one
1 -0.212856 -0.161720    a  two
2  0.132626 -0.540404    b  one
3  0.460141 -0.059319    b  two
4 -1.968553 -0.752753    a  one


key1
a   -0.857630
b    0.296383
Name: data1, dtype: float64

In [3]:
# Group data1 by both key1 and key2; the result is a Series with a
# two-level (key1, key2) index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -1.180017
      two    -0.212856
b     one     0.132626
      two     0.460141
Name: data1, dtype: float64

In [4]:
means.unstack() # pivot the inner index level (key2) into columns

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.180017,-0.212856
b,0.132626,0.460141


In [5]:
# The group keys do not have to be columns of df: any arrays of the same
# length as the data can be used.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
# Conceptually, grouping by these arrays is like treating them as two extra
# columns of df, as sketched below (the numbers are illustrative only and
# are not the values from this run):
#   data1      data2      key1  key2  states      years
# 0 -0.127927   0.026962  a     one   Ohio        2005
# 1 -1.424594  -0.800712  a     two   California  2005
# 2  1.619073  -0.165311  b     one   California  2006
# 3 -0.996192  -0.367086  b     two   Ohio        2005
# 4  0.020317  -1.238209  a     one   Ohio        2006

California  2005   -0.212856
            2006    0.132626
Ohio        2005    0.034330
            2006   -1.968553
Name: data1, dtype: float64

In [6]:
# Group by the key1 column and average the numeric columns of each group.
# key2 holds strings, so it cannot be averaged: pandas >= 2.0 raises a
# TypeError instead of silently dropping such columns, hence the explicit
# numeric_only=True.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.85763,-0.167672
b,0.296383,-0.299861


In [7]:
# Group by both key columns (passed by name) and average the remaining
# columns; the result is indexed by (key1, key2).
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.180017,-0.170648
a,two,-0.212856,-0.16172
b,one,0.132626,-0.540404
b,two,0.460141,-0.059319


In [8]:
df.groupby(['key1', 'key2']).size() # number of rows per group, like SQL GROUP BY followed by COUNT

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [9]:
# 对分组进行迭代

In [10]:
# Iterating over a single-key groupby yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    # One print call; sep='\n' puts each piece on its own line.
    print(name, '---', group, '***', sep='\n')

a
---
      data1     data2 key1 key2
0 -0.391481  0.411457    a  one
1 -0.212856 -0.161720    a  two
4 -1.968553 -0.752753    a  one
***
b
---
      data1     data2 key1 key2
2  0.132626 -0.540404    b  one
3  0.460141 -0.059319    b  two
***


In [11]:
# With several group keys, each iteration yields a tuple of key values
# together with the matching sub-frame.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Header line is "<k1> <k2>"; sep='\n' puts each piece on its own line.
    print(f'{k1} {k2}', '---', group, '***', sep='\n')

a one
---
      data1     data2 key1 key2
0 -0.391481  0.411457    a  one
4 -1.968553 -0.752753    a  one
***
a two
---
      data1    data2 key1 key2
1 -0.212856 -0.16172    a  two
***
b one
---
      data1     data2 key1 key2
2  0.132626 -0.540404    b  one
***
b two
---
      data1     data2 key1 key2
3  0.460141 -0.059319    b  two
***


In [12]:
# Materialise the groups as a {group key: sub-frame} dict, then walk it.
pieces = {key: frame for key, frame in df.groupby('key1')}
for k, v in pieces.items():
    print(k, '---', v, '***', sep='\n')

a
---
      data1     data2 key1 key2
0 -0.391481  0.411457    a  one
1 -0.212856 -0.161720    a  two
4 -1.968553 -0.752753    a  one
***
b
---
      data1     data2 key1 key2
2  0.132626 -0.540404    b  one
3  0.460141 -0.059319    b  two
***


In [13]:
# Group the *columns* of df by their dtype (groupby works on rows by default).
# groupby(..., axis=1) is deprecated since pandas 2.1; the documented
# replacement is to group the transposed frame, whose rows are df's columns.
grouped = df.T.groupby(df.dtypes)
for k, block in grouped:
    # block is a slice of df.T, so its index holds the column labels of this
    # dtype group. Select those columns from df itself so the values keep
    # their original dtypes (a transposed mixed-type frame is all object).
    v = df[block.index]
    print(k)
    print('---')
    print(v)
    print('***')

float64
---
      data1     data2
0 -0.391481  0.411457
1 -0.212856 -0.161720
2  0.132626 -0.540404
3  0.460141 -0.059319
4 -1.968553 -0.752753
***
object
---
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
***


In [14]:
# 选取一个或一组列

In [15]:
print(df.groupby('key1')['data1']) # equivalent to df['data1'].groupby(df['key1'])
print(df.groupby('key1')[['data2']]) # equivalent to df[['data2']].groupby(df['key1'])

<pandas.core.groupby.SeriesGroupBy object at 0x00000202F1D641D0>
<pandas.core.groupby.DataFrameGroupBy object at 0x00000202F1D642B0>


In [16]:
# Selecting with a list ([['data2']]) keeps the grouped object a
# DataFrameGroupBy, so the mean comes back as a one-column DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.170648
a,two,-0.16172
b,one,-0.540404
b,two,-0.059319


In [17]:
# Indexing with 'data2' rather than ['data2'] selects a single column, so
# the aggregated result below is a Series instead of a DataFrame.
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped.mean()

key1  key2
a     one    -0.170648
      two    -0.161720
b     one    -0.540404
      two    -0.059319
Name: data2, dtype: float64

In [18]:
# 通过字典或Series进行分组

In [19]:
# 5x5 frame of random floats with string row labels.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Insert missing values in columns 'b' and 'c' of the third row ('Wes').
# .loc is label-based, and integer slice bounds such as 2:3 are rejected on
# this string index, so turn the positional slice into labels first.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.439304,1.368053,0.088032,0.411676,1.506363
Steve,0.354057,-0.473297,0.518869,0.274243,0.556453
Wes,2.73992,,,0.81598,1.181493
Jim,-1.903019,-0.24409,1.958362,-1.311261,-0.153897
Travis,-0.228594,0.121615,0.855896,0.663112,0.183528


In [20]:
# Column label -> colour group. 'f' is deliberately included although the
# people frame only has columns 'a' through 'e'.
mapping = dict(
    a='red',
    b='red',
    c='blue',
    d='blue',
    e='red',
    f='orange',
)

In [21]:
# For every row, sum the columns that share a colour in `mapping`.
# groupby(..., axis=1) is deprecated since pandas 2.1; the documented
# replacement is to group the transposed frame (whose rows are the original
# columns) and transpose the aggregated result back.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,0.499707,2.435112
Steve,0.793112,0.437214
Wes,0.81598,3.921412
Jim,0.647101,-2.301006
Travis,1.519009,0.076549


In [22]:
# The same column -> colour mapping expressed as a Series works as a group key.
map_series = Series(mapping)
# Count the non-missing values per colour group in each row. The transposed
# frame is grouped (and the result transposed back) because
# groupby(..., axis=1) is deprecated since pandas 2.1.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [23]:
# 通过函数进行分组

In [24]:
people.groupby(len).sum() # group rows by the length of their index label, then sum each group

Unnamed: 0,a,b,c,d,e
3,0.397596,1.123963,2.046394,-0.083606,2.533959
5,0.354057,-0.473297,0.518869,0.274243,0.556453
6,-0.228594,0.121615,0.855896,0.663112,0.183528


In [25]:
# Functions can be mixed with lists/arrays as group keys.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
# key_list behaves like an extra column; combined with len (applied to the
# index label) the group keys line up as sketched below (the numbers are
# illustrative only and are not the values from this run):
#                  a         b         c         d         e
# Joe(3)     one,  0.254889 -0.812035  2.765460  1.113513  0.646795
# Steve(5)   one,  1.507490  0.463545 -1.396887  0.728163  1.078788
# Wes(3)     one, -2.099479       NaN       NaN  0.438572  0.134136
# Jim(3)     two,  0.167685  1.772127  1.372546 -0.758560 -1.241066
# Travis(6)  two, -0.834662 -0.933228  1.026441 -0.074524 -0.830303

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.439304,1.368053,0.088032,0.411676,1.181493
3,two,-1.903019,-0.24409,1.958362,-1.311261,-0.153897
5,one,0.354057,-0.473297,0.518869,0.274243,0.556453
6,two,-0.228594,0.121615,0.855896,0.663112,0.183528


In [26]:
# 根据索引级别分组

In [27]:
# Two-level column index: country code ('cty') on the outer level and
# tenor on the inner level.
columns = pd.MultiIndex.from_arrays(
    arrays=[
        ['US'] * 3 + ['JP'] * 2,
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
# 4 rows of random floats under the hierarchical columns.
hier_df = DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.616084,-0.572021,-0.325839,-1.56154,-0.136162
1,-0.785219,-0.711985,-1.047321,1.197329,-0.433373
2,-0.613726,-0.548456,-1.568695,-0.743757,1.47907
3,0.018869,-0.353071,1.056401,-0.513008,0.291997


In [28]:
# Count the non-missing values per country for each row by grouping on the
# 'cty' level of the column index. groupby(..., axis=1) is deprecated since
# pandas 2.1, so group the transposed frame and transpose the result back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
