In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:

dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.67805,-1.715139,X,alpha
1,-0.791755,0.596357,X,beta
2,0.101378,-1.684937,Y,alpha
3,-3.256951,0.099847,Y,beta
4,-0.447513,-0.983055,Z,alpha


In [3]:
# k1をキーとして、データをグループにまとめます。
group1 = dframe['dataset1'].groupby(dframe['k1'])
group1

<pandas.core.groupby.SeriesGroupBy object at 0x000002117C23DBA8>

In [4]:

#グループごとの平均値を計算
group1.mean()

k1
X    0.443147
Y   -1.577786
Z   -0.447513
Name: dataset1, dtype: float64

In [5]:
# キーは変えられます。
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

In [6]:
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.67805,-1.715139,X,alpha
1,-0.791755,0.596357,X,beta
2,0.101378,-1.684937,Y,alpha
3,-3.256951,0.099847,Y,beta
4,-0.447513,-0.983055,Z,alpha


In [7]:
#  それぞれでグループ化します。
#  元々無いキーを外から使ってグループ化できる
dframe['dataset1'].groupby([cities,month]).mean()

LA  FEB   -0.791755
    JAN    0.101378
NY  FEB   -3.256951
    JAN    0.615268
Name: dataset1, dtype: float64

In [8]:
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.67805,-1.715139,X,alpha
1,-0.791755,0.596357,X,beta
2,0.101378,-1.684937,Y,alpha
3,-3.256951,0.099847,Y,beta
4,-0.447513,-0.983055,Z,alpha


In [9]:
dframe.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.443147,-0.559391
Y,-1.577786,-0.792545
Z,-0.447513,-0.983055


In [10]:
# 複数の列名にも対応しています。
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,1.67805,-1.715139
X,beta,-0.791755,0.596357
Y,alpha,0.101378,-1.684937
Y,beta,-3.256951,0.099847
Z,alpha,-0.447513,-0.983055


In [12]:
# 出力の列を限定することもできます。
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-1.715139
X,beta,0.596357
Y,alpha,-1.684937
Y,beta,0.099847
Z,alpha,-0.983055


In [13]:
# size()と一緒に使うのも便利です。
dframe.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [14]:

# イテレート（繰り返し処理）ができます。
for name, group in dframe.groupby('k1'):
    print('This is the {} group'.format(name))
    print(group)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0  1.678050 -1.715139  X  alpha
1 -0.791755  0.596357  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.101378 -1.684937  Y  alpha
3 -3.256951  0.099847  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -0.447513 -0.983055  Z  alpha




In [15]:
# 複数のキーでも同じ事ができます。
for (k1,k2) , group in dframe.groupby(['k1','k2']):
    print('Key1 = {} Key2 = {}'.format(k1,k2))
    print(group)
    print('\n')

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0   1.67805 -1.715139  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1 -0.791755  0.596357  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2  0.101378 -1.684937  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3 -3.256951  0.099847  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4 -0.447513 -0.983055  Z  alpha




In [16]:
gr = dframe.groupby('k1')
gr.get_group('X')

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.67805,-1.715139,X,alpha
1,-0.791755,0.596357,X,beta


In [17]:
# リストを作ってそれを辞書にするこもできます。
group_dict = dict(list(dframe.groupby('k1')))
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.67805,-1.715139,X,alpha
1,-0.791755,0.596357,X,beta


In [18]:
# 列方向（axis = 1）についても同じような事ができます。
# ちょっと複雑ですが、
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  1.678050 -1.715139
 1 -0.791755  0.596357
 2  0.101378 -1.684937
 3 -3.256951  0.099847
 4 -0.447513 -0.983055, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [20]:
# まとめ　グループ化 groupby

# dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
#                     'k2':['alpha','beta','alpha','beta','alpha'],
#                     'dataset1':np.random.randn(5),
#                     'dataset2':np.random.randn(5)})
# group1 = dframe['dataset1'].groupby(dframe['k1'])
# group1.mean()

# cities = np.array(['NY','LA','LA','NY','NY'])
# month = np.array(['JAN','FEB','JAN','FEB','JAN'])
# dframe['dataset1'].groupby([cities,month]).mean()


# dframe.groupby('k1').mean()
# dframe.groupby(['k1','k2']).mean()
# dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]
# dataset2_group.mean()
# dframe.groupby(['k1']).size()


In [21]:
animals = DataFrame(np.arange(16).reshape(4, 4),
                   columns=['W', 'X', 'Y', 'Z'],
                   index=['Dog', 'Cat', 'Bird', 'Mouse'])

In [23]:
animals

Unnamed: 0,W,X,Y,Z
Dog,0,1,2,3
Cat,4,5,6,7
Bird,8,9,10,11
Mouse,12,13,14,15


In [27]:
# 一部をNaNにします。
animals.loc[1:2, ['W', 'Y']] = np.nan 
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [28]:
# お行儀のよさをデータ化します。
behavior_map = {'W': 'bad', 'X': 'good', 'Y': 'bad','Z': 'good'}

In [29]:
# mapを使ってグループ化します。
animal_col = animals.groupby(behavior_map, axis=1)
animal_col.sum()

Unnamed: 0,bad,good
Dog,2.0,4.0
Cat,0.0,12.0
Bird,18.0,20.0
Mouse,26.0,28.0


In [32]:
# Seriesを作ります。
# 辞書型から簡単にSeriesができる
behav_series = Series(behavior_map)
behav_series

W     bad
X    good
Y     bad
Z    good
dtype: object

In [33]:
# このSeriesを使ってグループ化します。
animals.groupby(behav_series, axis=1).count()

Unnamed: 0,bad,good
Dog,2,2
Cat,0,2
Bird,2,2
Mouse,2,2


In [34]:
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [35]:
# 関数を渡す事もできます。
animals.groupby(len).sum()

Unnamed: 0,W,X,Y,Z
3,0.0,6,2.0,10
4,8.0,9,10.0,11
5,12.0,13,14.0,15


In [36]:
# 関数と、キーを混ぜることもできます。
keys = ['A', 'B', 'A', 'B']
# indexの長さと、別のキーを使ってグループ化
animals.groupby([len, keys]).max()

Unnamed: 0,Unnamed: 1,W,X,Y,Z
3,A,0.0,1,2.0,3
3,B,,5,,7
4,A,8.0,9,10.0,11
5,B,12.0,13,14.0,15


In [37]:
# まとめ　前回とほぼ同じ

# groupby には関数も使える

In [38]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [39]:
# サンプルデータは、次のURLからダウンロードできます。
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/'

# 保存したデータを読み込みます。
dframe_wine = pd.read_csv('winequality-red.csv',sep=';')

FileNotFoundError: File b'winequality-red.csv' does not exist

In [40]:
# ここをみて勉強すればいい
# http://www.tsjshg.info/udemy/Lec44.html
    
# http://www.tsjshg.info/udemy/Lec45.html

In [41]:

from io import StringIO 

data ="""Sample  Animal   Intelligence
1   Dog   Dumb
2 Dog Dumb
3   Cat Smart
4 Cat    Smart
5 Dog Smart
6 Cat Smart"""
dframe = pd.read_table(StringIO(data),sep='\s+')

In [42]:
dframe

Unnamed: 0,Sample,Animal,Intelligence
0,1,Dog,Dumb
1,2,Dog,Dumb
2,3,Cat,Smart
3,4,Cat,Smart
4,5,Dog,Smart
5,6,Cat,Smart


In [44]:

# クロス集計表が作れます。
pd.crosstab(dframe.Animal,dframe.Intelligence)

Intelligence,Dumb,Smart
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1
Cat,0,3
Dog,2,1


In [45]:

# クロス集計表が作れます。
pd.crosstab(dframe.Animal,dframe.Intelligence)

Intelligence,Dumb,Smart
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1
Cat,0,3
Dog,2,1
