# Group by: split-apply-combine

- **Splitting** the data into groups based on some criteria.
- **Applying** a function to each group independency.
- **Combining** the results into a data structure.

## Splitting an object into groups

In [141]:
import pandas as pd
import numpy as np
from scipy import stats

In [42]:
df = pd.DataFrame(
    [
        ("bird", "Falconiformes", 389.0),
        ("bird", "Psittaciformes", 24.0),
        ("mammal", "Carnivora", 80.2),
        ("mammal", "Primates", np.nan),
        ("mammal", "Carnivora", 58),
    ],
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
    columns=("class", "order", "max_speed"),
)

df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [43]:
grouped = df.groupby("class")
grouped.sum()

Unnamed: 0_level_0,max_speed
class,Unnamed: 1_level_1
bird,413.0
mammal,138.2


In [47]:
grouped = df.groupby("order", axis="columns")
grouped.all()

order
falcon
parrot
lion
monkey
leopard


In [48]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.98378,-0.079171
1,bar,one,-1.292325,-0.910414
2,foo,two,-0.303005,1.259531
3,bar,three,-3.519748,-0.660475
4,foo,two,0.552424,0.321106
5,bar,two,-1.242636,0.525919
6,foo,one,-0.163662,-0.100302
7,foo,three,-0.904814,1.270391


In [49]:
df2 = df.set_index(['A', 'B'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,-0.98378,-0.079171
bar,one,-1.292325,-0.910414
foo,two,-0.303005,1.259531
bar,three,-3.519748,-0.660475
foo,two,0.552424,0.321106
bar,two,-1.242636,0.525919
foo,one,-0.163662,-0.100302
foo,three,-0.904814,1.270391


In [50]:
def get_letter_type(letter):
    if letter.lower() in 'ab':
        return 'vowel'
    else:
        return 'consonant'

In [51]:
grouped = df.groupby(get_letter_type, axis=1)
grouped.first()

Unnamed: 0,consonant,vowel
0,-0.98378,foo
1,-1.292325,bar
2,-0.303005,foo
3,-3.519748,bar
4,0.552424,foo
5,-1.242636,bar
6,-0.163662,foo
7,-0.904814,foo


In [57]:
df.groupby(by='A').groups

{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

### GroupBy with MultiIndex

In [60]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
s = pd.Series(np.random.randn(8), index=index)
s

first  second
bar    one       1.803866
       two      -0.061058
baz    one      -3.024288
       two      -1.105323
foo    one       0.462183
       two       0.046003
qux    one       0.054685
       two      -1.276720
dtype: float64

In [61]:
s.groupby(level=0).sum()

first
bar    1.742808
baz   -4.129611
foo    0.508186
qux   -1.222035
dtype: float64

In [62]:
s.groupby(level=1).sum()

second
one   -0.703553
two   -2.397097
dtype: float64

### Grouping DataFrame with Index levels and columns

A DataFrame may be grouped by a combination of columns and index levels by specifying the column names as string and the index levels as `pd.Grouper` objects.

In [63]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [64]:
df.groupby([pd.Grouper(level=1), "A"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [65]:
df.groupby(["second", "A"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [68]:
type(df.groupby(["second", "A"]))

pandas.core.groupby.generic.DataFrameGroupBy

## Iterating through groups

With the GroupBy object in hand, iterating through the grouped data is very natural and functions similarly to `itertools.groupby()`:

In [70]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.332407,-1.242745
1,bar,one,-1.258371,0.439743
2,foo,two,-0.418051,-0.14389
3,bar,three,1.701916,-0.920111
4,foo,two,0.570717,-0.014003
5,bar,two,1.87374,2.392586
6,foo,one,-0.50405,2.449194
7,foo,three,1.571275,0.932744


In [77]:
# grouped type: pandas.core.groupby.generic.DataFrameGroupBy object
grouped = df.groupby("A")

# call DataFrameGroupBy.__iter__()
for name, group in grouped:
    print(name)
    print(group)

bar
     A      B         C         D
1  bar    one -1.258371  0.439743
3  bar  three  1.701916 -0.920111
5  bar    two  1.873740  2.392586
foo
     A      B         C         D
0  foo    one -1.332407 -1.242745
2  foo    two -0.418051 -0.143890
4  foo    two  0.570717 -0.014003
6  foo    one -0.504050  2.449194
7  foo  three  1.571275  0.932744


In [83]:
for name, group in df.groupby(["A", "B"]):
    print(name)
    print(group)

('bar', 'one')
     A    B         C         D
1  bar  one -1.258371  0.439743
('bar', 'three')
     A      B         C         D
3  bar  three  1.701916 -0.920111
('bar', 'two')
     A    B        C         D
5  bar  two  1.87374  2.392586
('foo', 'one')
     A    B         C         D
0  foo  one -1.332407 -1.242745
6  foo  one -0.504050  2.449194
('foo', 'three')
     A      B         C         D
7  foo  three  1.571275  0.932744
('foo', 'two')
     A    B         C         D
2  foo  two -0.418051 -0.143890
4  foo  two  0.570717 -0.014003


## Selecting a group

A single group can be selected using `get_group()`:

In [84]:
grouped.get_group("bar")

Unnamed: 0,B,C,D
1,one,-1.258371,0.439743
3,three,1.701916,-0.920111
5,two,1.87374,2.392586


## Aggregation

In [85]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.991551,0.059717
1,bar,one,-2.195903,0.179069
2,foo,two,0.072263,-0.509086
3,bar,three,1.197655,-0.128177
4,foo,two,0.147867,-0.238085
5,bar,two,0.577249,1.184701
6,foo,one,-0.706565,1.675288
7,foo,three,0.505577,-1.295288


In [96]:
grouped = df.groupby(["A"])
# 每个分组取头 2 个
grouped.head(2)

Unnamed: 0,A,B,C,D
0,foo,one,-0.991551,0.059717
1,bar,one,-2.195903,0.179069
2,foo,two,0.072263,-0.509086
3,bar,three,1.197655,-0.128177


In [97]:
grouped.aggregate(np.sum)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.420999,1.235593
foo,-0.972409,-0.307455


In [98]:
df.groupby(["A", "B"]).aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-2.195903,0.179069
bar,three,1.197655,-0.128177
bar,two,0.577249,1.184701
foo,one,-1.698116,1.735005
foo,three,0.505577,-1.295288
foo,two,0.22013,-0.747171


As you can see, the result of the aggregation will have the group names as the new index along the grouped axis. In the case of multiple keys, the result is a MultiIndex by default, though this can be change by using the `as_index` option:

In [101]:
df.groupby(["A", "B"], as_index=False).aggregate(np.sum)

Unnamed: 0,A,B,C,D
0,bar,one,-2.195903,0.179069
1,bar,three,1.197655,-0.128177
2,bar,two,0.577249,1.184701
3,foo,one,-1.698116,1.735005
4,foo,three,0.505577,-1.295288
5,foo,two,0.22013,-0.747171


Note that you could use the `reset_index` DataFrame function to achieve the same result as the column names are stored in the resulting `MultiIndex`:

In [102]:
df.groupby(["A", "B"]).aggregate(np.sum).reset_index()

Unnamed: 0,A,B,C,D
0,bar,one,-2.195903,0.179069
1,bar,three,1.197655,-0.128177
2,bar,two,0.577249,1.184701
3,foo,one,-1.698116,1.735005
4,foo,three,0.505577,-1.295288
5,foo,two,0.22013,-0.747171


The `size` method returns a Series whose index are the group names and whose values are the sizes of each group.

In [103]:
df.groupby("A").size()

A
bar    3
foo    5
dtype: int64

Another aggregation example is to compute the number of unique values of each group.

In [104]:
ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]]
df4 = pd.DataFrame(ll, columns=["A", "B"])
df4

Unnamed: 0,A,B
0,foo,1
1,foo,2
2,foo,2
3,bar,1
4,bar,1


In [106]:
# df4.groupby("A")["B"] type: pandas.core.groupby.generic.SeriesGroupBy object
df4.groupby("A")["B"].nunique()

A
bar    1
foo    2
Name: B, dtype: int64

In [113]:
df4.groupby(["A"]).agg(lambda ser: print(f'=== {ser} *** type is {type(ser)}'))

=== 3    1
4    1
Name: B, dtype: int64 *** type is <class 'pandas.core.series.Series'>
=== 0    1
1    2
2    2
Name: B, dtype: int64 *** type is <class 'pandas.core.series.Series'>


Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
bar,
foo,


### Applying multiple functions at once

With grouped `Serise` you can also pass a list or dict of functions to do aggregation with, outputing a DataFrame.

In [115]:
df.groupby("A")["C"].agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.420999,-0.140333,1.807001
foo,-0.972409,-0.194482,0.627691


In [118]:
df.groupby("A").agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-0.420999,-0.140333,1.807001,1.235593,0.411864,0.686701
foo,-0.972409,-0.194482,0.627691,-0.307455,-0.061491,1.093718


You can rename in a similar manner:

In [121]:
df.groupby("A").agg([np.sum, np.mean, np.std]).rename(columns={"C": "=C=", "sum": "foo", "mean": "bar", "std": "baz"})

Unnamed: 0_level_0,=C=,=C=,=C=,D,D,D
Unnamed: 0_level_1,foo,bar,baz,foo,bar,baz
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-0.420999,-0.140333,1.807001,1.235593,0.411864,0.686701
foo,-0.972409,-0.194482,0.627691,-0.307455,-0.061491,1.093718


### Named aggregation

To support column-specific aggregation **with control over the output column names**, pandas accepts the special syntax in `GroupBy.agg()`, known as "named aggregation", where

- The keywords are the output column names
- The values are tuples whose first element is the column to select and the second element is the aggregation to that column. Pandas provides the `pandas.NamedAgg` namedtuple with the fields `['column', 'aggfunc']` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias.

In [122]:
animals = pd.DataFrame(
    {
        "kind": ["cat", "dog", "cat", "dog"],
        "height": [9.1, 6.0, 9.5, 34.0],
        "weight": [7.9, 7.5, 9.9, 198.0],
    }
)
animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [124]:
animals.groupby("kind").agg(
    min_height=pd.NamedAgg(column="height", aggfunc="min"),
    max_height=pd.NamedAgg(column="height", aggfunc="max"),
    average_weight=pd.NamedAgg(column="height", aggfunc=np.mean)
)

Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,9.3
dog,6.0,34.0,20.0


`pandas.NamedAgg` is just a `namedtuple`. Plain tuples are allowed as well.

In [128]:
animals.groupby("kind").agg(
    min_height=("height", "min"),
    max_height=("height", "max"),
    average_weight=("weight", np.mean)
)

Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


If your desired output column names are not valid Python keywords, construct a dictionary and unpack the keyword arguments

In [127]:
animals.groupby("kind").agg(
    **{
        "total weight": ("weight", np.sum)
    }
)

Unnamed: 0_level_0,total weight
kind,Unnamed: 1_level_1
cat,17.8
dog,205.5


Named aggregation is also valid for `Series` groupby aggregations. In this case there's no column selection, so the values are just the functions.

In [129]:
animals.groupby("kind").height.agg(
    min_height="min",
    max_height=np.max
)

Unnamed: 0_level_0,min_height,max_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.1,9.5
dog,6.0,34.0


### Applying different functions to DataFrame columns

In [131]:
df.groupby("A").agg({"C": np.sum, "D": lambda x: np.std(x, ddof=1)})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.420999,0.686701
foo,-0.972409,1.093718


In [132]:
df.groupby("A").agg(
    C_sum=("C", np.sum),
    D_std=("D", lambda x: np.std(x, ddof=1))
)

Unnamed: 0_level_0,C_sum,D_std
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.420999,0.686701
foo,-0.972409,1.093718


## Transformation

The `transform` method returns an object that is indexed the same (same size) as the on being grouped.

In [146]:
index = pd.date_range("10/1/1999", periods=1100)
# 0.5: mean or median (平局值或中位数, 在正态分布中, 平均值=中位数)
#   2: Standard Deviations (标准偏差)
# 创建一个均值等于 0.5, 标准偏差等于 2 的正态分布数据 1100 个.
ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
ts

1999-10-01   -0.620169
1999-10-02    2.307450
1999-10-03   -0.066246
1999-10-04   -0.613228
1999-10-05    1.021251
                ...   
2002-09-30   -2.216744
2002-10-01    2.992653
2002-10-02    0.666766
2002-10-03   -0.680169
2002-10-04    3.216459
Freq: D, Length: 1100, dtype: float64

In [155]:
ts_rolling = ts.rolling(window=100, min_periods=100).mean().dropna()
ts_rolling

2000-01-08    0.701720
2000-01-09    0.725692
2000-01-10    0.707895
2000-01-11    0.731020
2000-01-12    0.743738
                ...   
2002-09-30    0.611630
2002-10-01    0.649017
2002-10-02    0.664893
2002-10-03    0.673797
2002-10-04    0.684567
Freq: D, Length: 1001, dtype: float64