In [2]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so that Restart Kernel -> Run All rebuilds the
# same random columns on every run. Outputs saved before the seed was added
# will differ until the notebook is re-executed.
SEED = 12345
np.random.seed(SEED)

# Small demo frame: two grouping keys that each contain a missing value
# (None in key1, a missing entry in the nullable-integer key2) plus two
# columns of standard-normal draws.
df = pd.DataFrame({'key1': ['a', 'a', None, 'b', 'b', 'a', None],
                   'key2': pd.Series([1, 2, 1, 2, 1, None, 1], dtype='Int64'),
                   'data1': np.random.standard_normal(7),
                   'data2': np.random.standard_normal(7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.133139,0.666999
1,a,2.0,-0.193855,0.285153
2,,1.0,0.086425,-0.873976
3,b,2.0,1.186621,0.135153
4,b,1.0,0.228648,-0.090498
5,a,,0.910269,-1.913772
6,,1.0,1.39616,-0.132621


In [3]:
# Group the data1 values by the key1 label found on the same row.
# The cell displays the SeriesGroupBy object itself; aggregation is done by
# calling a method on it afterwards.
key1_labels = df['key1']
grouped = df['data1'].groupby(key1_labels)
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000200CEE907C0>

In [4]:
# Mean of data1 within each key1 group held by `grouped`.
grouped.mean()

key1
a    0.194425
b    0.707634
Name: data1, dtype: float64

In [5]:
# Passing a list of key Series groups on every (key1, key2) combination; the
# aggregated result is a Series indexed by both keys.
group_keys = [df['key1'], df['key2']]
means = df['data1'].groupby(group_keys).mean()
means

key1  key2
a     1      -0.133139
      2      -0.193855
b     1       0.228648
      2       1.186621
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns, giving a key1-by-key2 table.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.133139,-0.193855
b,0.228648,1.186621


In [7]:
# Group keys do not have to be columns of the frame: here two standalone
# arrays, one entry per row of df, are used as the keys.
states = np.array(['OH', 'CA', 'CA', 'OH', 'OH', 'CA', 'OH'])
years = np.array([2005, 2005, 2006, 2005, 2006, 2006, 2005])
(
    df['data1']
    .groupby([states, years])
    .mean()
)

CA  2005   -0.193855
    2006    0.498347
OH  2005    0.816547
    2006    0.228648
Name: data1, dtype: float64

In [8]:
# Group by the key1 column (referenced by name) and average the other columns.
(
    df
    .groupby('key1')
    .mean()
)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,0.194425,-0.32054
b,1.5,0.707634,0.022328


In [9]:
# Average the data columns for each (key1, key2) pair.
(
    df
    .groupby(['key1', 'key2'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.133139,0.666999
a,2,-0.193855,0.285153
b,1,0.228648,-0.090498
b,2,1.186621,0.135153


In [10]:
# Number of rows in each (key1, key2) group.
(
    df
    .groupby(['key1', 'key2'])
    .size()
)

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [11]:
# Number of rows per key1 label; rows whose key1 is missing are left out
# because dropna is not overridden here.
(
    df
    .groupby('key1')
    .size()
)

key1
a    3
b    2
dtype: int64

In [12]:
# dropna=False keeps the rows with a missing key1 as their own NaN group.
(
    df
    .groupby('key1', dropna=False)
    .size()
)

key1
a      3
b      2
NaN    2
dtype: int64

In [13]:
# Same as above with two keys: missing values in either key form their own groups.
(
    df
    .groupby(['key1', 'key2'], dropna=False)
    .size()
)

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [14]:
# Count the non-null values in each column per key1 group (unlike size(),
# which counts rows).
(
    df
    .groupby('key1')
    .count()
)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [15]:
# key1 holds strings, so restrict the mean to the numeric columns.
(
    df
    .groupby('key2')
    .mean(numeric_only=True)
)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.394523,-0.107524
2,0.496383,0.210153


In [16]:
# Iterating a GroupBy yields (group label, sub-frame) pairs; print the label
# followed by the rows that belong to it.
for name, group in df.groupby('key1'):
    print(f"Group name: {name}", group, sep="\n")

Group name: a
  key1  key2     data1     data2
0    a     1 -0.133139  0.666999
1    a     2 -0.193855  0.285153
5    a  <NA>  0.910269 -1.913772
Group name: b
  key1  key2     data1     data2
3    b     2  1.186621  0.135153
4    b     1  0.228648 -0.090498


In [17]:
# With several keys the group label is a tuple; unpack it before printing.
for key_pair, group in df.groupby(['key1', 'key2']):
    k1, k2 = key_pair
    print(f"Group keys: {k1}, {k2}", group, sep="\n")

Group keys: a, 1
  key1  key2     data1     data2
0    a     1 -0.133139  0.666999
Group keys: a, 2
  key1  key2     data1     data2
1    a     2 -0.193855  0.285153
Group keys: b, 1
  key1  key2     data1     data2
4    b     1  0.228648 -0.090498
Group keys: b, 2
  key1  key2     data1     data2
3    b     2  1.186621  0.135153


In [18]:
# Collect the groups into a plain dict: key1 label -> sub-frame for that label.
pieces = {}
for key1_label, sub_frame in df.groupby('key1'):
    pieces[key1_label] = sub_frame
pieces['b']

Unnamed: 0,key1,key2,data1,data2
3,b,2,1.186621,0.135153
4,b,1,0.228648,-0.090498


In [20]:
# Transpose so the original columns become rows, then group those rows with a
# label -> bucket mapping: 'key' for the key columns, 'data' for the data columns.
column_kinds = {'key1': 'key', 'key2': 'key',
                'data1': 'data', 'data2': 'data'}
grouped = df.T.groupby(column_kinds)

In [21]:
# Each group is a slice of the transposed frame; transpose it back so it
# prints in the original row/column orientation.
for group_key, group_values in grouped:
    print(f"Group key: {group_key}", group_values.T, sep="\n")

Group key: data
      data1     data2
0 -0.133139  0.666999
1 -0.193855  0.285153
2  0.086425 -0.873976
3  1.186621  0.135153
4  0.228648 -0.090498
5  0.910269 -1.913772
6   1.39616 -0.132621
Group key: key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


In [22]:
# Selecting a single column by name after groupby aggregates only that column
# and returns a Series.
(
    df
    .groupby(['key1', 'key2'])
    ['data1']
    .mean()
)

key1  key2
a     1      -0.133139
      2      -0.193855
b     1       0.228648
      2       1.186621
Name: data1, dtype: float64

In [23]:
# Selecting with a list of names (even a single one) keeps the result a DataFrame.
(
    df
    .groupby(['key1', 'key2'])
    [['data2']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.666999
a,2,0.285153
b,1,-0.090498
b,2,0.135153


In [24]:
# List-style selection of data1: a one-column DataFrame rather than a Series.
(
    df
    .groupby(['key1', 'key2'])
    [['data1']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,1,-0.133139
a,2,-0.193855
b,1,0.228648
b,2,1.186621


In [25]:
# Indexing the grouped frame with one column name gives a SeriesGroupBy,
# which the cell displays as-is.
by_both_keys = df.groupby(['key1', 'key2'])
s_grouped = by_both_keys['data1']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000200D0B58DF0>

In [26]:
# Mean of data1 for each (key1, key2) group held by `s_grouped`.
s_grouped.mean()

key1  key2
a     1      -0.133139
      2      -0.193855
b     1       0.228648
      2       1.186621
Name: data1, dtype: float64

In [27]:
# 5x5 frame of standard-normal draws: one row per person, columns 'a'..'e'.
person_names = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']
column_labels = ['a', 'b', 'c', 'd', 'e']
people = pd.DataFrame(
    np.random.standard_normal((5, 5)),
    index=person_names,
    columns=column_labels,
)
people

Unnamed: 0,a,b,c,d,e
Joe,-1.52422,0.189091,-0.18063,-1.914643,0.274668
Steve,0.939271,0.579804,0.129022,0.384349,-0.015112
Wes,-0.148566,1.002955,-0.895535,-1.148097,0.855359
Jim,0.606105,1.223429,-1.0124,0.933283,1.025983
Travis,2.005258,1.020309,-0.939736,-0.384855,0.789687


In [28]:
# Blank out the values at row position 2 ('Wes'), column positions 1 and 2
# ('b' and 'c'), so the later aggregations have missing data to deal with.
# Note: this mutates `people` in place.
people.iloc[2:3, 1:3] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.52422,0.189091,-0.18063,-1.914643,0.274668
Steve,0.939271,0.579804,0.129022,0.384349,-0.015112
Wes,-0.148566,,,-1.148097,0.855359
Jim,0.606105,1.223429,-1.0124,0.933283,1.025983
Travis,2.005258,1.020309,-0.939736,-0.384855,0.789687


In [29]:
# Column label -> colour bucket. 'f' has no matching column in `people`;
# it is kept to show that unused mapping keys are fine.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}
# Group the columns (rows of the transpose) by bucket, average each bucket,
# then transpose back so people are rows again.
by_column = people.T.groupby(mapping)
by_column.mean().T

Unnamed: 0,blue,red
Joe,-1.047637,-0.353487
Steve,0.256685,0.501321
Wes,-1.148097,0.353397
Jim,-0.039559,0.951839
Travis,-0.662296,1.271751


In [30]:
# The same label -> colour correspondence as a Series (index = column label).
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [34]:
# A Series works as a grouping key just like the dict: count the non-null
# values per colour bucket for each person.
(
    people.T
    .groupby(map_series)
    .count()
    .T
)

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [36]:
# A function used as the key is called on each index label; here rows are
# grouped by the length of the person's name and summed.
(
    people
    .groupby(len)
    .sum()
)

Unnamed: 0,a,b,c,d,e
3,-1.066681,1.41252,-1.19303,-2.129457,2.156011
5,0.939271,0.579804,0.129022,0.384349,-0.015112
6,2.005258,1.020309,-0.939736,-0.384855,0.789687


In [None]:
# Functions and plain lists can be mixed as keys: group by name length and by
# a per-row label from key_list, then take the column-wise minimum.
key_list = ['one'] * 3 + ['two'] * 2
(
    people
    .groupby([len, key_list])
    .min()
)

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.52422,0.189091,-0.18063,-1.914643,0.274668
3,two,0.606105,1.223429,-1.0124,0.933283,1.025983
5,one,0.939271,0.579804,0.129022,0.384349,-0.015112
6,two,2.005258,1.020309,-0.939736,-0.384855,0.789687


In [38]:
# Two-level column index: country code ('cty') on the outer level and
# 'tenor' on the inner level.
country_codes = ['US', 'US', 'US', 'JP', 'JP']
tenor_values = [1, 3, 5, 1, 3]
columns = pd.MultiIndex.from_arrays(
    [country_codes, tenor_values],
    names=['cty', 'tenor'],
)
# 4 rows of standard-normal draws under those hierarchical columns.
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.852831,-0.869551,-0.164822,-0.501094,1.367144
1,1.24708,-1.038045,0.051321,-3.206072,-2.211248
2,0.212684,1.151735,1.178121,-0.100634,0.207192
3,1.080754,1.377285,0.643311,1.596826,-1.362825


In [39]:
# Group by the 'cty' level of the column index (via the transpose) and count
# the values per country for every row.
(
    hier_df.T
    .groupby(level='cty')
    .count()
    .T
)

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
