## 10.1 GroupBy 機制

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.567809,-0.132201
1,a,two,0.091732,-1.047888
2,b,one,-0.109994,0.40899
3,b,two,1.288401,1.114073
4,a,one,0.234972,0.168156


In [5]:
# 以 key1 的標籤做分組，計算 data1 的分組平均值
grouped = df['data1'].groupby(df['key1'])
grouped.mean()

key1
a    0.663384
b    0.735482
Name: data1, dtype: float64

In [6]:
# key為多個陣列組成的 list
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.544489
      two     0.901173
b     one     0.742818
      two     0.728147
Name: data1, dtype: float64

In [7]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.544489,0.901173
b,0.742818,0.728147


In [9]:
# key為 Series
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()

California  2005    0.901173
            2006    0.742818
Ohio        2005    0.690598
            2006    0.435930
Name: data1, dtype: float64

In [11]:
# 輸入欄位名稱當作分組 key
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.663384,0.171832
b,0.735482,-0.479382


In [12]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.544489,0.012059
a,two,0.901173,0.491377
b,one,0.742818,-0.225516
b,two,0.728147,-0.733247


In [13]:
# 利用 groupby 計算各分組的大小
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 疊代分組

In [4]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)
    print('---')

a
  key1 key2     data1     data2
0    a  one -0.031661 -0.004566
1    a  two  0.143277  2.125950
4    a  one  1.635402  1.599679
---
b
  key1 key2     data1     data2
2    b  one -0.669080 -0.495781
3    b  two  2.822098  1.347192
---


In [5]:
for (k1 ,k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
    print('---')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.031661 -0.004566
4    a  one  1.635402  1.599679
---
('a', 'two')
  key1 key2     data1    data2
1    a  two  0.143277  2.12595
---
('b', 'one')
  key1 key2    data1     data2
2    b  one -0.66908 -0.495781
---
('b', 'two')
  key1 key2     data1     data2
3    b  two  2.822098  1.347192
---


In [12]:
# list(df.groupby('key1'))
piece = dict(list(df.groupby('key1')))
piece['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.66908,-0.495781
3,b,two,2.822098,1.347192


In [13]:
# 指定做欄分組
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [15]:
grouped = df.groupby(df.dtypes, axis=1)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015B70153670>

In [16]:
for dtype, group in grouped:
    print(dtype)
    print(group)
    print('---')

float64
      data1     data2
0 -0.031661 -0.004566
1  0.143277  2.125950
2 -0.669080 -0.495781
3  2.822098  1.347192
4  1.635402  1.599679
---
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
---


### 選取一個或多個欄

In [17]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.031661,-0.004566
1,a,two,0.143277,2.12595
2,b,one,-0.66908,-0.495781
3,b,two,2.822098,1.347192
4,a,one,1.635402,1.599679


In [19]:
# 計算 data2 欄的分組平均，結果以 DataFrame回傳 <-- 參數傳遞 list/array
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.797557
a,two,2.12595
b,one,-0.495781
b,two,1.347192


In [21]:
# 計算 data2 欄的分組平均，結果以 Series 回傳 <-- 參數傳遞 常數欄位名稱

df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.797557
      two     2.125950
b     one    -0.495781
      two     1.347192
Name: data2, dtype: float64

### 用Dict和Series進行分組

In [22]:
people = pd.DataFrame(np.random.randn(5,5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
Steve,1.571365,-0.028639,-3.004358,2.240465,-0.740359
Wes,1.243278,0.224609,-1.677511,-0.315819,-0.230073
Jim,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
Travis,0.853071,-0.701867,0.337946,0.629953,1.018118


In [26]:
people.iloc[2:3, [1,2]] = np.nan # 加入幾個 NA值

In [27]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
Steve,1.571365,-0.028639,-3.004358,2.240465,-0.740359
Wes,1.243278,,,-0.315819,-0.230073
Jim,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
Travis,0.853071,-0.701867,0.337946,0.629953,1.018118


In [28]:
# 分組對應表
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [29]:
by_column = people.groupby(mapping, axis=1)
by_column

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015B70178190>

In [30]:
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.489837,-1.084086
Steve,-0.763892,0.802367
Wes,-0.315819,1.013205
Jim,1.711946,-2.59969
Travis,0.967898,1.169323


In [31]:
# 用Series分組
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [32]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 用函式分組

In [33]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
Steve,1.571365,-0.028639,-3.004358,2.240465,-0.740359
Wes,1.243278,,,-0.315819,-0.230073
Jim,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
Travis,0.853071,-0.701867,0.337946,0.629953,1.018118


In [34]:
# 以名字長度做分組
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.82035,-1.644678,-0.495711,1.402001,-1.846243
5,1.571365,-0.028639,-3.004358,2.240465,-0.740359
6,0.853071,-0.701867,0.337946,0.629953,1.018118


In [35]:
# 函式混搭 array/dict.Series
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()    # axis: 'Axis' = 0

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.745578,-0.413657,-0.075199,-0.414638,-1.416007
3,two,-1.168506,-1.231021,-0.420512,2.132458,-0.200163
5,one,1.571365,-0.028639,-3.004358,2.240465,-0.740359
6,two,0.853071,-0.701867,0.337946,0.629953,1.018118


### 用索引層級分組

In [36]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['city', 'tenor'])
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['city', 'tenor'])

In [37]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.309312,1.548185,-0.176203,-2.057458,-1.774331
1,-0.722857,-0.756337,0.056873,-1.470106,0.085368
2,-0.115956,-0.963551,0.564273,1.462182,-0.206158
3,0.061872,-1.010317,-0.583031,-0.939967,-0.262382


In [38]:
hier_df.groupby(level='city', axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


---

## 10.2 資料聚合

#### 任何對資料矩陣做轉換，得到一些常數的動作，就稱為聚合。

In [3]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.567809,-0.132201
1,a,two,0.091732,-1.047888
2,b,one,-0.109994,0.40899
3,b,two,1.288401,1.114073
4,a,one,0.234972,0.168156


In [6]:
grouped = df.groupby('key1')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022CD8D2AF40>

In [8]:
grouped['data1'].quantile(0.9)   # 計算欄位的百分位數

key1
a    0.501242
b    1.148561
Name: data1, dtype: float64

In [31]:
# 使用自己的聚合函式
def peak_to_peak(arr):
    return arr.max() - arr.min()

grouped.agg(peak_to_peak)  # 當作參數傳遞給 agg/aggregate

  results[key] = self.aggregate(func)


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.476077,1.216044
b,1.398394,0.705083


In [38]:
# ############################
# 怎麼改???
# ############################

df2 = pd.DataFrame({'key1': [1, 1, 2, 2, 1],
                    'key2': ['one', 'two', 'one', 'two', 'one'],
                    'data1': np.random.randn(5),
                    'data2': np.random.randn(5)})

def peak_to_peak(arr):
    return arr.max() - arr.min()

df2.groupby('key1').key1.agg(peak_to_peak)

key1
1    0
2    0
Name: key1, dtype: int64

In [15]:
from pandas import *
d = {"series": Series(['1','2','1','1','4','4','5'])}
dfex = DataFrame(d)
def get_count(values):
    return len(values)
dfex.groupby("series").series.agg(get_count)


series
1    3
2    1
4    2
5    1
Name: series, dtype: int64


In [40]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.298171,0.24425,0.091732,0.163352,0.234972,0.401391,0.567809,3.0,-0.337311,0.633438,-1.047888,-0.590045,-0.132201,0.017978,0.168156
b,2.0,0.589203,0.988814,-0.109994,0.239605,0.589203,0.938802,1.288401,2.0,0.761532,0.498569,0.40899,0.585261,0.761532,0.937803,1.114073


### 欄方向的多功能應用

In [41]:
tips = pd.read_csv('examples/tips.csv')

In [46]:
# tips.keys()
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [49]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [51]:
# 先進行分組
grouped = tips.groupby(['day', 'smoker'])

In [52]:
grouped_pct = grouped['tip_pct']

In [53]:
# 傳入一個函式名稱字串
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [54]:
# 傳入一串函式名稱
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [55]:
# 傳遞(name, function)tuple
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [56]:
# 指定函式 list，對不同的欄套用不同函式
function = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(function)
result

  result = grouped['tip_pct', 'total_bill'].agg(function)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [61]:
# 同理
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [62]:
# 傳入 tuple 組成的 list來自定名稱
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)

  grouped['tip_pct', 'total_bill'].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [65]:
# 將不同函式，套用到一或多個欄位上
grouped.agg({'tip': np.max, 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [66]:
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'],
             'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


### 回傳聚合資料時不要列索引

In [67]:
tips.groupby(['day', 'smoker'], as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


In [73]:
tips.groupby(['day', 'smoker']).mean().reset_index

<bound method DataFrame.reset_index of              total_bill       tip      size   tip_pct
day  smoker                                          
Fri  No       18.420000  2.812500  2.250000  0.151650
     Yes      16.813333  2.714000  2.066667  0.174783
Sat  No       19.661778  3.102889  2.555556  0.158048
     Yes      21.276667  2.875476  2.476190  0.147906
Sun  No       20.506667  3.167895  2.929825  0.160113
     Yes      24.120000  3.516842  2.578947  0.187250
Thur No       17.113111  2.673778  2.488889  0.160298
     Yes      19.190588  3.030000  2.352941  0.163863>

---

## 10.3 Allpy: 分裂-套用-合併