## 10.1 GroupBy 機制

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the small example frame used throughout this section: two string
# key columns plus two columns of standard-normal noise.
# A seeded, local generator keeps the values identical on every
# Restart & Run All and leaves NumPy's global random state untouched.
rng = np.random.default_rng(12345)
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.standard_normal(5),
                   'data2': rng.standard_normal(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.031661,-0.004566
1,a,two,0.143277,2.12595
2,b,one,-0.66908,-0.495781
3,b,two,2.822098,1.347192
4,a,one,1.635402,1.599679


In [5]:
# Group data1 by the labels in key1, then compute the mean of each group
grouped = df['data1'].groupby(df['key1'])
grouped.mean()

key1
a    0.663384
b    0.735482
Name: data1, dtype: float64

In [6]:
# The group key can be a list of several arrays (here the key1 and key2 columns)
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.544489
      two     0.901173
b     one     0.742818
      two     0.728147
Name: data1, dtype: float64

In [7]:
# Unstack the two-level result so that the key2 labels become the columns
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.544489,0.901173
b,0.742818,0.728147


In [9]:
# The keys do not have to come from the DataFrame: here they are two NumPy
# arrays (not Series), each with one entry per row of df
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()

California  2005    0.901173
            2006    0.742818
Ohio        2005    0.690598
            2006    0.435930
Name: data1, dtype: float64

In [11]:
# Pass a column name as the group key.
# key2 holds strings, so it cannot be averaged; numeric_only=True restricts
# the mean to data1/data2 explicitly instead of relying on older pandas
# silently dropping the column (newer pandas raises a TypeError instead).
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.663384,0.171832
b,0.735482,-0.479382


In [12]:
# Several column names can be passed as a list of group keys
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.544489,0.012059
a,two,0.901173,0.491377
b,one,0.742818,-0.225516
b,two,0.728147,-0.733247


In [13]:
# Use groupby to compute the size (number of rows) of each group
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 疊代分組

In [4]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
for name, group in df.groupby('key1'):
    # Single print call; sep='\n' puts key, rows and separator on their own lines.
    print(name, group, '---', sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.031661 -0.004566
1    a  two  0.143277  2.125950
4    a  one  1.635402  1.599679
---
b
  key1 key2     data1     data2
2    b  one -0.669080 -0.495781
3    b  two  2.822098  1.347192
---


In [5]:
# With several keys, the group key is a tuple that can be unpacked in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Single print call; sep='\n' puts key tuple, rows and separator on their own lines.
    print((k1, k2), group, '---', sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.031661 -0.004566
4    a  one  1.635402  1.599679
---
('a', 'two')
  key1 key2     data1    data2
1    a  two  0.143277  2.12595
---
('b', 'one')
  key1 key2    data1     data2
2    b  one -0.66908 -0.495781
---
('b', 'two')
  key1 key2     data1     data2
3    b  two  2.822098  1.347192
---


In [12]:
# Collect the groups into a {key1 value: sub-DataFrame} dict so that a
# single group can be looked up by its key.
piece = {key: frame for key, frame in df.groupby('key1')}
piece['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.66908,-0.495781
3,b,two,2.822098,1.347192


In [13]:
# Grouping along the columns: first inspect the dtype of each column
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [15]:
# Group the columns (axis=1) of df by their dtype.
# NOTE(review): the axis=1 keyword of groupby is reported as deprecated in
# recent pandas releases — confirm against the installed version before re-running.
grouped = df.groupby(df.dtypes, axis=1)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015B70153670>

In [16]:
# Each iteration yields a dtype and the sub-DataFrame of columns having that dtype.
for dtype, group in grouped:
    # Single print call; sep='\n' puts dtype, columns and separator on their own lines.
    print(dtype, group, '---', sep='\n')

float64
      data1     data2
0 -0.031661 -0.004566
1  0.143277  2.125950
2 -0.669080 -0.495781
3  2.822098  1.347192
4  1.635402  1.599679
---
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
---


### 選取一個或多個欄

In [17]:
# Show df again for reference
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.031661,-0.004566
1,a,two,0.143277,2.12595
2,b,one,-0.66908,-0.495781
3,b,two,2.822098,1.347192
4,a,one,1.635402,1.599679


In [19]:
# Group mean of the data2 column, returned as a DataFrame <-- index the GroupBy with a list/array of column names
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.797557
a,two,2.12595
b,one,-0.495781
b,two,1.347192


In [21]:
# Group mean of the data2 column, returned as a Series <-- index the GroupBy with a single (scalar) column name

df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.797557
      two     2.125950
b     one    -0.495781
      two     1.347192
Name: data2, dtype: float64

### 用Dict和Series進行分組

In [22]:
# Example frame for grouping with dicts, Series and functions: five people
# (row labels) by five columns of standard-normal noise.
# A seeded, local generator keeps the values identical on every
# Restart & Run All and leaves NumPy's global random state untouched.
rng = np.random.default_rng(22)
people = pd.DataFrame(rng.standard_normal((5, 5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
Steve,1.571365,-0.028639,-3.004358,2.240465,-0.740359
Wes,1.243278,0.224609,-1.677511,-0.315819,-0.230073
Jim,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
Travis,0.853071,-0.701867,0.337946,0.629953,1.018118


In [26]:
people.iloc[2:3, [1,2]] = np.nan # insert a few NA values (row 'Wes', columns 'b' and 'c')

In [27]:
# Display people to check where the NA values ended up
people

Unnamed: 0,a,b,c,d,e
Joe,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
Steve,1.571365,-0.028639,-3.004358,2.240465,-0.740359
Wes,1.243278,,,-0.315819,-0.230073
Jim,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
Travis,0.853071,-0.701867,0.337946,0.629953,1.018118


In [28]:
# Lookup table for grouping: column name -> group label
# ('f' does not match any of the columns a-e of people)
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [29]:
# Group the columns (axis=1) of people by the labels in the mapping dict.
# NOTE(review): the axis=1 keyword of groupby is reported as deprecated in
# recent pandas releases — confirm against the installed version before re-running.
by_column = people.groupby(mapping, axis=1)
by_column

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015B70178190>

In [30]:
# Sum the columns that share a group label, separately for every row
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.489837,-1.084086
Steve,-0.763892,0.802367
Wes,-0.315819,1.013205
Jim,1.711946,-2.59969
Travis,0.967898,1.169323


In [31]:
# Group using a Series: build one from the mapping dict
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [32]:
# Count the non-NA values per colour group for every person.
# groupby(..., axis=1) is deprecated in newer pandas, so the columns are
# grouped by transposing, grouping the rows, and transposing back; the
# result has the same layout (people as rows, group labels as columns).
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 用函式分組

In [33]:
# Show people again for reference
people

Unnamed: 0,a,b,c,d,e
Joe,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
Steve,1.571365,-0.028639,-3.004358,2.240465,-0.740359
Wes,1.243278,,,-0.315819,-0.230073
Jim,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
Travis,0.853071,-0.701867,0.337946,0.629953,1.018118


In [34]:
# Group by the length of each name: len is applied to the index labels
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.82035,-1.644678,-0.495711,1.402001,-1.846243
5,1.571365,-0.028639,-3.004358,2.240465,-0.740359
6,0.853071,-0.701867,0.337946,0.629953,1.018118


In [35]:
# Functions can be mixed with arrays, dicts or Series as group keys
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()    # grouping runs over the row index (axis 0)

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
3,two,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
5,one,1.571365,-0.028639,-3.004358,2.240465,-0.740359
6,two,0.853071,-0.701867,0.337946,0.629953,1.018118


### 用索引層級分組

In [36]:
# Two-level column index with named levels 'city' and 'tenor'
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['city', 'tenor'])
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['city', 'tenor'])

In [37]:
# Frame with hierarchical columns (levels 'city' and 'tenor') filled with
# standard-normal noise. A seeded, local generator keeps the values
# identical on every Restart & Run All without touching NumPy's global
# random state.
rng = np.random.default_rng(38)
hier_df = pd.DataFrame(rng.standard_normal((4, 5)), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.309312,1.548185,-0.176203,-2.057458,-1.774331
1,-0.722857,-0.756337,0.056873,-1.470106,0.085368
2,-0.115956,-0.963551,0.564273,1.462182,-0.206158
3,0.061872,-1.010317,-0.583031,-0.939967,-0.262382


In [38]:
# Count the non-NA values per 'city' level of the column index, row by row.
# groupby(..., axis=1) is deprecated in newer pandas, so the columns are
# grouped by transposing, grouping on the 'city' index level, and
# transposing back; the result keeps rows 0-3 with one column per city.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


---

## 10.2 資料聚合

#### 聚合（aggregation）指的是任何能從陣列產生純量值（scalar）的資料轉換，例如 `mean`、`count`、`min`、`sum`。