In [3]:
import pandas as pd
import numpy as np

# Group by: split-apply-combine

By “group by” we are referring to a process involving one or more of the following steps:

- Splitting the data into groups based on some criteria.

- Applying a function to each group independently.

- Combining the results into a data structure.

Out of these, the split step is the most straightforward. In fact, in many situations we may wish to split the data set into groups and do something with those groups. In the apply step, we might wish to do one of the following:

- Aggregation: compute a summary statistic (or statistics) for each group. Some examples:

    - Compute group sums or means.

    - Compute group sizes / counts.

- Transformation: perform some group-specific computations and return a like-indexed object. Some examples:

    - Standardize data (zscore) within a group.

    - Filling NAs within groups with a value derived from each group.

-   Filtration: discard some groups, according to a group-wise computation that evaluates True or False. Some examples:

    - Discard data that belongs to groups with only a few members.

    - Filter out data based on the group sum or mean.

Many of these operations are defined on GroupBy objects. These operations are similar to the aggregating API, window API, and resample API.

It is possible that a given operation does not fall into one of these categories or is some combination of them. In such a case, it may be possible to compute the operation using GroupBy’s apply method. This method will examine the results of the apply step and try to return a sensibly combined result if it doesn’t fit into either of the above two categories.

Note

An operation that is split into multiple steps using built-in GroupBy operations will be more efficient than using the apply method with a user-defined Python function.

Since the set of object instance methods on pandas data structures are generally rich and expressive, we often simply want to invoke, say, a DataFrame function on each group. The name GroupBy should be quite familiar to those who have used a SQL-based tool (or itertools), in which you can write code like:

SELECT Column1, Column2, mean(Column3), sum(Column4)
FROM SomeTable
GROUP BY Column1, Column2
We aim to make operations like this natural and easy to express using pandas. We’ll address each area of GroupBy functionality then provide some non-trivial examples / use cases.

See the cookbook for some advanced strategies: https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook-grouping



# Splitting an object into groups

In [47]:
speeds = pd.DataFrame(
    [
        ("bird", "Falconiformes", 389.0),
        ("bird", "Psittaciformes", 24.0),
        ("mammal", "Carnivora", 80.2),
        ("mammal", "Primates", np.nan),
        ("mammal", "Carnivora", 58),
    ],
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
    columns=("class", "order", "max_speed"),
)

In [48]:
speeds

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [5]:
speeds.groupby('class').mean('max_speed')

Unnamed: 0_level_0,max_speed
class,Unnamed: 1_level_1
bird,206.5
mammal,69.1


In [6]:
grouped = speeds.groupby("class")

grouped = speeds.groupby("order", axis="columns")

grouped = speeds.groupby(["class", "order"])

In [8]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)

df

Unnamed: 0,A,B,C,D
0,foo,one,0.238672,-1.131748
1,bar,one,2.453083,1.48217
2,foo,two,0.328392,-0.272814
3,bar,three,1.824528,0.445265
4,foo,two,-0.733556,-0.462202
5,bar,two,0.190358,0.382747
6,foo,one,-0.116737,0.561222
7,foo,three,0.372826,1.764403


In [9]:
grouped = df.groupby("A")

grouped = df.groupby(["A", "B"])

In [10]:
df2 = df.set_index(["A", "B"])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,0.238672,-1.131748
bar,one,2.453083,1.48217
foo,two,0.328392,-0.272814
bar,three,1.824528,0.445265
foo,two,-0.733556,-0.462202
bar,two,0.190358,0.382747
foo,one,-0.116737,0.561222
foo,three,0.372826,1.764403


In [11]:
df2 = df.set_index(["A", "B"])

grouped = df2.groupby(level=df2.index.names.difference(["B"]))

grouped.sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,4.467969,2.310182
foo,0.089598,0.45886


In [12]:
lst = [1, 2, 3, 1, 2, 3]

s = pd.Series([1, 2, 3, 10, 20, 30], lst)

grouped = s.groupby(level=0)

grouped.first()

1    1
2    2
3    3
dtype: int64

In [13]:
grouped.last()


1    10
2    20
3    30
dtype: int64

In [14]:
grouped.sum()


1    11
2    22
3    33
dtype: int64

# GroupBy sorting

In [15]:
df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]})

df2.groupby(["X"]).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


In [16]:
df2.groupby(["X"], sort=False).sum()


Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [17]:
df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]})

df3.groupby(["X"]).get_group("A")

Unnamed: 0,X,Y
0,A,1
2,A,3


In [18]:
df3.groupby(["X"]).get_group("B")


Unnamed: 0,X,Y
1,B,4
3,B,2


## GroupBy dropna

In [19]:
df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]

df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])

df_dropna

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,2


In [20]:
df_dropna.groupby(by=["b"], dropna=True).sum()


Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5


In [21]:
df_dropna.groupby(by=["b"], dropna=False).sum()


Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5
,1,4


## GroupBy object attributes

In [22]:
df.groupby("A").groups


{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [23]:
df.groupby(get_letter_type, axis=1).groups


NameError: name 'get_letter_type' is not defined

## Grouping DataFrame with Index levels and columns

In [1]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]

In [4]:
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])

df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [5]:
df.groupby([pd.Grouper(level=1), "A"]).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [6]:
help(pd.Grouper)

Help on class Grouper in module pandas.core.groupby.grouper:

class Grouper(builtins.object)
 |  Grouper(*args, **kwargs)
 |  
 |  A Grouper allows the user to specify a groupby instruction for an object.
 |  
 |  This specification will select a column via the key parameter, or if the
 |  level and/or axis parameters are given, a level of the index of the target
 |  object.
 |  
 |  If `axis` and/or `level` are passed as keywords to both `Grouper` and
 |  `groupby`, the values passed to `Grouper` take precedence.
 |  
 |  Parameters
 |  ----------
 |  key : str, defaults to None
 |      Groupby key, which selects the grouping column of the target.
 |  level : name/number, defaults to None
 |      The level for the target index.
 |  freq : str / frequency object, defaults to None
 |      This will groupby the specified frequency if the target selection
 |      (via key or level) is a datetime-like object. For full specification
 |      of available frequencies, please see `here
 |     

In [7]:
df.groupby([pd.Grouper(level=1), "A"]).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [8]:
df.groupby([pd.Grouper(level="second"), "A"]).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [9]:
df.groupby(["second", "A"]).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


## DataFrame column selection in GroupBy

In [10]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)

In [11]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.447018,-0.324695
1,bar,one,-1.180643,0.049256
2,foo,two,-2.410386,0.017462
3,bar,three,0.31804,0.855184
4,foo,two,1.218355,0.388534
5,bar,two,0.309101,0.87841
6,foo,one,1.70826,0.006584
7,foo,three,-0.000797,0.508436


In [12]:
grouped = df.groupby(["A"])

grouped_C = grouped["C"]

grouped_D = grouped["D"]

In [13]:
df["C"].groupby(df["A"])


<pandas.core.groupby.generic.SeriesGroupBy object at 0x10f978880>

# Iterating through groups

In [14]:
grouped = df.groupby('A')

for name, group in grouped:
    print(name)
    print(group)

bar
     A      B         C         D
1  bar    one -1.180643  0.049256
3  bar  three  0.318040  0.855184
5  bar    two  0.309101  0.878410
foo
     A      B         C         D
0  foo    one  0.447018 -0.324695
2  foo    two -2.410386  0.017462
4  foo    two  1.218355  0.388534
6  foo    one  1.708260  0.006584
7  foo  three -0.000797  0.508436


In [15]:
for name, group in df.groupby(['A', 'B']):
    print(name)
    print(group)

('bar', 'one')
     A    B         C         D
1  bar  one -1.180643  0.049256
('bar', 'three')
     A      B        C         D
3  bar  three  0.31804  0.855184
('bar', 'two')
     A    B         C        D
5  bar  two  0.309101  0.87841
('foo', 'one')
     A    B         C         D
0  foo  one  0.447018 -0.324695
6  foo  one  1.708260  0.006584
('foo', 'three')
     A      B         C         D
7  foo  three -0.000797  0.508436
('foo', 'two')
     A    B         C         D
2  foo  two -2.410386  0.017462
4  foo  two  1.218355  0.388534


# Selecting a group

In [16]:
grouped.get_group("bar")


Unnamed: 0,A,B,C,D
1,bar,one,-1.180643,0.049256
3,bar,three,0.31804,0.855184
5,bar,two,0.309101,0.87841


In [17]:
df.groupby(["A", "B"]).get_group(("bar", "one"))


Unnamed: 0,A,B,C,D
1,bar,one,-1.180643,0.049256


# Aggregation

In [18]:
animals = pd.DataFrame(
    {
        "kind": ["cat", "dog", "cat", "dog"],
        "height": [9.1, 6.0, 9.5, 34.0],
        "weight": [7.9, 7.5, 9.9, 198.0],
    }
)


animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [19]:
animals.groupby("kind").sum()


Unnamed: 0_level_0,height,weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,18.6,17.8
dog,40.0,205.5


In [20]:
animals.groupby("kind", as_index=False).sum()

Unnamed: 0,kind,height,weight
0,cat,18.6,17.8
1,dog,40.0,205.5


## Built-in aggregation methods

In [22]:
df.groupby("A")[["C", "D"]].max()


Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.31804,0.87841
foo,1.70826,0.508436


In [23]:
df.groupby(["A", "B"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.180643,0.049256
bar,three,0.31804,0.855184
bar,two,0.309101,0.87841
foo,one,1.077639,-0.159056
foo,three,-0.000797,0.508436
foo,two,-0.596016,0.202998


In [24]:
grouped = df.groupby(["A", "B"])

grouped.size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [25]:
grouped.describe()


Unnamed: 0_level_0,Unnamed: 1_level_0,C,C,C,C,C,C,C,C,D,D,D,D,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
bar,one,1.0,-1.180643,,-1.180643,-1.180643,-1.180643,-1.180643,-1.180643,1.0,0.049256,,0.049256,0.049256,0.049256,0.049256,0.049256
bar,three,1.0,0.31804,,0.31804,0.31804,0.31804,0.31804,0.31804,1.0,0.855184,,0.855184,0.855184,0.855184,0.855184,0.855184
bar,two,1.0,0.309101,,0.309101,0.309101,0.309101,0.309101,0.309101,1.0,0.87841,,0.87841,0.87841,0.87841,0.87841,0.87841
foo,one,2.0,1.077639,0.891833,0.447018,0.762328,1.077639,1.392949,1.70826,2.0,-0.159056,0.23425,-0.324695,-0.241875,-0.159056,-0.076236,0.006584
foo,three,1.0,-0.000797,,-0.000797,-0.000797,-0.000797,-0.000797,-0.000797,1.0,0.508436,,0.508436,0.508436,0.508436,0.508436,0.508436
foo,two,2.0,-0.596016,2.565907,-2.410386,-1.503201,-0.596016,0.311169,1.218355,2.0,0.202998,0.262388,0.017462,0.11023,0.202998,0.295766,0.388534


In [26]:
ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]]

df4 = pd.DataFrame(ll, columns=["A", "B"])

df4

Unnamed: 0,A,B
0,foo,1
1,foo,2
2,foo,2
3,bar,1
4,bar,1


In [27]:
df4.groupby("A")["B"].nunique()


A
bar    1
foo    2
Name: B, dtype: int64

## The aggregate() method

In [28]:
grouped = df.groupby("A")

grouped[["C", "D"]].aggregate("sum")

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.553502,1.782849
foo,0.962449,0.596321


In [29]:
grouped = df.groupby(["A", "B"])

grouped.agg("sum")

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.180643,0.049256
bar,three,0.31804,0.855184
bar,two,0.309101,0.87841
foo,one,2.155278,-0.318111
foo,three,-0.000797,0.508436
foo,two,-1.192032,0.405996


In [30]:
grouped = df.groupby(["A", "B"], as_index=False)

grouped.agg("sum")

Unnamed: 0,A,B,C,D
0,bar,one,-1.180643,0.049256
1,bar,three,0.31804,0.855184
2,bar,two,0.309101,0.87841
3,foo,one,2.155278,-0.318111
4,foo,three,-0.000797,0.508436
5,foo,two,-1.192032,0.405996


In [31]:
df.groupby(["A", "B"]).agg("sum").reset_index()


Unnamed: 0,A,B,C,D
0,bar,one,-1.180643,0.049256
1,bar,three,0.31804,0.855184
2,bar,two,0.309101,0.87841
3,foo,one,2.155278,-0.318111
4,foo,three,-0.000797,0.508436
5,foo,two,-1.192032,0.405996


## Aggregation with User-Defined Functions

In [32]:
animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [35]:
animals.groupby("kind")[["height"]].agg(lambda x: set(x))


Unnamed: 0_level_0,height
kind,Unnamed: 1_level_1
cat,"{9.1, 9.5}"
dog,"{34.0, 6.0}"


In [36]:
animals.groupby("kind")[["height"]].agg(lambda x: x.astype(int).sum())


Unnamed: 0_level_0,height
kind,Unnamed: 1_level_1
cat,18
dog,40


## Applying multiple functions at once

In [37]:
grouped = df.groupby("A")

grouped["C"].agg(["sum", "mean", "std"])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.553502,-0.184501,0.862696
foo,0.962449,0.19249,1.598989


In [38]:
grouped[["C", "D"]].agg(["sum", "mean", "std"])


Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-0.553502,-0.184501,0.862696,1.782849,0.594283,0.47215
foo,0.962449,0.19249,1.598989,0.596321,0.119264,0.333212


In [39]:
(
    grouped["C"]
    .agg(["sum", "mean", "std"])
    .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"})
)


Unnamed: 0_level_0,foo,bar,baz
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.553502,-0.184501,0.862696
foo,0.962449,0.19249,1.598989


In [41]:
grouped["C"].agg(["sum", "mean", "std"]).rename(columns={"sum": "foo", "mean": "bar", "std": "baz"})

Unnamed: 0_level_0,foo,bar,baz
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.553502,-0.184501,0.862696
foo,0.962449,0.19249,1.598989


## Named aggregation

In [42]:
animals


Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [43]:
animals.groupby("kind").agg(
    min_height=pd.NamedAgg(column="height", aggfunc="min"),
    max_height=pd.NamedAgg(column="height", aggfunc="max"),
    average_weight=pd.NamedAgg(column="weight", aggfunc="mean"),
)

Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


## Applying different functions to DataFrame columns

In [44]:
grouped.agg({"C": "sum", "D": lambda x: np.std(x, ddof=1)})


Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.553502,0.47215
foo,0.962449,0.333212


In [45]:
grouped.agg({"C": "sum", "D": "std"})


Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.553502,0.47215
foo,0.962449,0.333212


# Transformation

In [46]:
grouped = speeds.groupby("class")["max_speed"]

grouped.cumsum()

NameError: name 'speeds' is not defined

## The transform() method

In [49]:
speeds

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [52]:
grouped = speeds.groupby('class')[['max_speed']]

In [53]:
grouped.transform('sum')

Unnamed: 0,max_speed
falcon,413.0
parrot,413.0
lion,138.2
monkey,138.2
leopard,138.2


In [54]:
index = pd.date_range("10/1/1999", periods=1100)

ts = pd.Series(np.random.normal(0.5, 2, 1100), index)

ts = ts.rolling(window=100, min_periods=100).mean().dropna()

ts.head()

2000-01-08    0.491325
2000-01-09    0.505392
2000-01-10    0.452310
2000-01-11    0.461060
2000-01-12    0.474567
Freq: D, dtype: float64

In [55]:
transformed = ts.groupby(lambda x: x.year).transform(
    lambda x: (x - x.mean()) / x.std()
)

In [56]:
grouped = ts.groupby(lambda x: x.year)

grouped.mean()

2000    0.419609
2001    0.486071
2002    0.384980
dtype: float64

In [57]:
grouped_trans = transformed.groupby(lambda x: x.year)

grouped_trans.mean()

2000    1.948302e-17
2001   -4.139763e-16
2002   -1.867740e-16
dtype: float64

In [58]:
ts.groupby(lambda x: x.year).transform(lambda x: x.max() - x.min())


2000-01-08    1.049566
2000-01-09    1.049566
2000-01-10    1.049566
2000-01-11    1.049566
2000-01-12    1.049566
                ...   
2002-09-30    0.600880
2002-10-01    0.600880
2002-10-02    0.600880
2002-10-03    0.600880
2002-10-04    0.600880
Freq: D, Length: 1001, dtype: float64

Difference between transform and aggregate : 
    https://stackoverflow.com/questions/40957932/transform-vs-aggregate-in-pandas

In [60]:
df_re = pd.DataFrame({"A": [1] * 10 + [5] * 10, "B": np.arange(20)})
df_re

Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
5,1,5
6,1,6
7,1,7
8,1,8
9,1,9


In [61]:
df_re.groupby("A").rolling(4).B.mean()


A    
1  0      NaN
   1      NaN
   2      NaN
   3      1.5
   4      2.5
   5      3.5
   6      4.5
   7      5.5
   8      6.5
   9      7.5
5  10     NaN
   11     NaN
   12     NaN
   13    11.5
   14    12.5
   15    13.5
   16    14.5
   17    15.5
   18    16.5
   19    17.5
Name: B, dtype: float64

In [62]:
df_re.groupby("A").expanding().sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0.0
1,1,1.0
1,2,3.0
1,3,6.0
1,4,10.0
1,5,15.0
1,6,21.0
1,7,28.0
1,8,36.0
1,9,45.0


# Filtration

In [64]:
sf = pd.Series([1, 1, 2, 3, 3, 3])

sf.groupby(sf).filter(lambda x: x.sum() > 2)

sf

0    1
1    1
2    2
3    3
4    3
5    3
dtype: int64

In [65]:
dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")})

dff.groupby("B").filter(lambda x: len(x) > 2)

Unnamed: 0,A,B
2,2,b
3,3,b
4,4,b
5,5,b


# Flexible apply

In [66]:
grouped = df.groupby('A')['C']

def f(group):
    return pd.DataFrame({'original': group,
                         'demeaned': group - group.mean()})

In [67]:
grouped.apply(f)


Unnamed: 0_level_0,Unnamed: 1_level_0,original,demeaned
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,1,-1.180643,-0.996142
bar,3,0.31804,0.502541
bar,5,0.309101,0.493601
foo,0,0.447018,0.254528
foo,2,-2.410386,-2.602876
foo,4,1.218355,1.025865
foo,6,1.70826,1.51577
foo,7,-0.000797,-0.193287
