In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Build the small demo frame used by every groupby example below:
#   k1, k2             -> categorical key columns to group on
#   dataset1, dataset2 -> standard-normal noise to aggregate
#
# The random values are drawn from a seeded Generator so that a fresh
# "Restart Kernel -> Run All" reproduces exactly the same numbers.
# NOTE: outputs saved from earlier, unseeded runs will not match these values
# until the notebook is re-executed.
SEED = 0
rng = np.random.default_rng(SEED)

dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1': rng.standard_normal(5),
                    'dataset2': rng.standard_normal(5)})
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.813528,1.501523
1,X,beta,0.877812,-1.62514
2,Y,alpha,0.026969,-0.197223
3,Y,beta,-1.221948,0.735248
4,Z,alpha,-0.84062,0.641373


In [3]:
# Group the 'dataset1' values using the labels in column 'k1' as the key.
# This is the explicit, long-hand spelling: the value Series is selected
# first and the key Series is handed to groupby() directly.
group1 = dframe['dataset1'].groupby(by=dframe['k1'])

# Displaying the object shows only its repr; nothing is computed until an
# aggregation is requested.
group1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x11d0e3010>

In [4]:
# Compute the mean of 'dataset1' within each k1 group.

group1.mean()

k1
X    0.84567
Y   -0.59749
Z   -0.84062
Name: dataset1, dtype: float64

In [17]:
# The same per-group mean can also be written by grouping the whole frame
# on a column name and then selecting the column to aggregate.
(
    dframe
    .groupby('k1')['dataset1']
    .mean()
)

k1
X    0.84567
Y   -0.59749
Z   -0.84062
Name: dataset1, dtype: float64

In [6]:
# Grouping keys do not have to be columns of the frame: here two separate
# arrays supply one label per row of dframe.
cities = np.array(['NY', 'LA', 'LA', 'NY', 'NY'])
month = np.array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'])

# Group 'dataset1' by both arrays at once and average each (city, month) group.
(
    dframe['dataset1']
    .groupby([cities, month])
    .mean()
)

LA  FEB    0.877812
    JAN    0.026969
NY  FEB   -1.221948
    JAN   -0.013546
Name: dataset1, dtype: float64

In [7]:
# Display the full frame again for reference.
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.813528,1.501523
1,X,beta,0.877812,-1.62514
2,Y,alpha,0.026969,-0.197223
3,Y,beta,-1.221948,0.735248
4,Z,alpha,-0.84062,0.641373


In [18]:
# groupby() also accepts a list of column names; the result is indexed by
# every (k1, k2) combination present in the data.
(
    dframe
    .groupby(['k1', 'k2'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.813528,1.501523
X,beta,0.877812,-1.62514
Y,alpha,0.026969,-0.197223
Y,beta,-1.221948,0.735248
Z,alpha,-0.84062,0.641373


In [21]:
# The aggregated columns can be restricted after grouping.
# Selecting with a list (double brackets) keeps the result a DataFrame
# rather than a Series.
dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,1.501523
X,beta,-1.62514
Y,alpha,-0.197223
Y,beta,0.735248
Z,alpha,0.641373


In [22]:
# Combining groupby() with size() is handy too: it returns the number of
# rows in each group.
(
    dframe
    .groupby(['k1'])
    .size()
)

k1
X    2
Y    2
Z    1
dtype: int64

In [28]:
# A GroupBy object is iterable: each step yields the group key together
# with the sub-frame holding that group's rows.
for name, group in dframe.groupby('k1'):
    # One print call; sep='\n' puts the header, the frame and the trailing
    # blank-line marker on separate lines.
    print(f'This is the {name} group', group, '\n', sep='\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha  0.813528  1.501523
1  X   beta  0.877812 -1.625140


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  0.026969 -0.197223
3  Y   beta -1.221948  0.735248


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha  -0.84062  0.641373




In [30]:
# The same iteration works with several keys: the key of each group is then
# a tuple with one entry per grouping column.
for keys, group in dframe.groupby(['k1', 'k2']):
    k1, k2 = keys
    print(f'Key1 = {k1} Key2 = {k2}', group, '\n', sep='\n')

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha  0.813528  1.501523


Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta  0.877812  -1.62514


Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha  0.026969 -0.197223


Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta -1.221948  0.735248


Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha  -0.84062  0.641373




In [32]:
# Fetch the rows of one specific group by its key.
gr = dframe.groupby(by='k1')

# 'X' is one of the k1 labels; the result is the sub-frame for that label.
gr.get_group('X')

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.813528,1.501523
1,X,beta,0.877812,-1.62514


In [34]:
# Materialise the GroupBy as a list of (key, sub-frame) tuples.
[(key, frame) for key, frame in dframe.groupby('k1')]

[('X',
    k1     k2  dataset1  dataset2
  0  X  alpha  0.813528  1.501523
  1  X   beta  0.877812 -1.625140),
 ('Y',
    k1     k2  dataset1  dataset2
  2  Y  alpha  0.026969 -0.197223
  3  Y   beta -1.221948  0.735248),
 ('Z',
    k1     k2  dataset1  dataset2
  4  Z  alpha  -0.84062  0.641373)]

In [37]:
# A little background knowledge first:
# dict() builds a dictionary from a list of (key, value) tuples.
key_value_pairs = [('key1', 1), ('key2', 2), ('key3', 3)]

dict(key_value_pairs)

{'key1': 1, 'key2': 2, 'key3': 3}

In [41]:
# The groups can likewise be turned into a list of (key, sub-frame) tuples
# and that list into a dictionary keyed by group label.
groups_as_pairs = list(dframe.groupby('k1'))
group_dict = dict(groups_as_pairs)

# Look up the sub-frame for the 'X' group.
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.813528,1.501523
1,X,beta,0.877812,-1.62514
