In [2]:
import numpy as np
import pandas as pd

In [3]:
# Demo frame: two label columns (key1, key2) and two float columns
# (data1, data2) holding standard-normal draws.
# The draws come from a seeded Generator so that Restart & Run All yields the
# same numbers every time; the previous unseeded np.random.randn calls
# produced a different frame on each run.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
})

In [4]:
# Bare expression as the last line of the cell: renders the full 5-row frame.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.397701,-1.583054
1,a,two,1.381144,-0.579219
2,b,one,-0.874732,-0.640418
3,b,two,-0.926008,0.065863
4,a,one,-0.096632,0.370211


In [5]:
# Split the data1 values into groups using the key1 labels as the grouping
# key (passed as a Series). The aggregation itself happens in a later cell.
grouped = df.loc[:, 'data1'].groupby(by=df.loc[:, 'key1'])

In [6]:
# Average of data1 within each key1 group; the result is indexed by key1.
grouped.aggregate('mean')

key1
a    0.295604
b   -0.900370
Name: data1, dtype: float64

In [7]:
# Group data1 by two keys at once; the mean comes back as a Series with a
# two-level (key1, key2) index.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)

In [8]:
# Show the per-(key1, key2) means computed in the previous cell.
means

key1  key2
a     one    -0.247166
      two     1.381144
b     one    -0.874732
      two    -0.926008
Name: data1, dtype: float64

In [9]:
# Pivot the innermost index level (key2) into columns; level=-1 is the
# default that a bare unstack() uses.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.247166,1.381144
b,-0.874732,-0.926008


In [10]:
# Pivot the key1 level into columns instead, leaving key2 as the row index.
means.unstack(level='key1')

key1,a,b
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-0.247166,-0.874732
two,1.381144,-0.926008


In [11]:
# Two parallel label arrays, one entry per row of df. They are written as
# (state, year) pairs so the row-wise pairing is visible, then split into
# one array per field.
states, years = (
    np.array(values)
    for values in zip(
        ('Ohio', 2005),
        ('California', 2005),
        ('California', 2006),
        ('Ohio', 2005),
        ('Ohio', 2006),
    )
)

In [12]:
# Grouping keys do not have to be columns of df: any arrays of matching
# length work. Here data1 is averaged per (state, year) pair.
(
    df['data1']
    .groupby(by=[states, years])
    .mean()
)

California  2005    1.381144
            2006   -0.874732
Ohio        2005   -0.661855
            2006   -0.096632
Name: data1, dtype: float64

# Iterating Over Groups

In [13]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby(by='key1'):
    # A single print call emits the key, the sub-frame and the blank spacer
    # lines, each separated by a newline.
    print(name, group, '\n', sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.397701 -1.583054
1    a  two  1.381144 -0.579219
4    a  one -0.096632  0.370211


b
  key1 key2     data1     data2
2    b  one -0.874732 -0.640418
3    b  two -0.926008  0.065863




In [14]:
# With several grouping keys, the first element of each pair is a tuple of
# key values, unpacked here into k1 and k2.
for key_pair, group in df.groupby(by=['key1', 'key2']):
    k1, k2 = key_pair
    print((k1, k2), group, '\n', sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.397701 -1.583054
4    a  one -0.096632  0.370211


('a', 'two')
  key1 key2     data1     data2
1    a  two  1.381144 -0.579219


('b', 'one')
  key1 key2     data1     data2
2    b  one -0.874732 -0.640418


('b', 'two')
  key1 key2     data1     data2
3    b  two -0.926008  0.065863




In [15]:
# Materialise the groups as a plain dict: key1 value -> sub-frame of its rows.
pieces = {label: frame for label, frame in df.groupby('key1')}


In [17]:
# Show the dict built above (one sub-frame per key1 value).
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.397701 -1.583054
 1    a  two  1.381144 -0.579219
 4    a  one -0.096632  0.370211, 'b':   key1 key2     data1     data2
 2    b  one -0.874732 -0.640418
 3    b  two -0.926008  0.065863}

In [18]:
# Group the *columns* of df (axis=1) by their dtype; df.dtypes supplies one
# dtype per column name. This rebinds `grouped`, which earlier held the
# data1-by-key1 grouping.
# NOTE(review): the `axis=1` argument of groupby is reported as deprecated
# since pandas 2.1 — confirm against the installed pandas version before
# re-running.
grouped = df.groupby(df.dtypes, axis=1)

In [19]:

# Walk the dtype groups: each iteration yields the dtype and the sub-frame
# holding the columns of that dtype.
for dtype, group in grouped:
    print(dtype, group, sep='\n')

float64
      data1     data2
0 -0.397701 -1.583054
1  1.381144 -0.579219
2 -0.874732 -0.640418
3 -0.926008  0.065863
4 -0.096632  0.370211
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# Selecting a Column or Subset of Columns

In [20]:
# Index the GroupBy with a list of column names to aggregate only those
# columns; the list form returns a DataFrame.
(
    df
    .groupby(by=['key1', 'key2'])[['data1', 'data2']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.247166,-0.606421
a,two,1.381144,-0.579219
b,one,-0.874732,-0.640418
b,two,-0.926008,0.065863


# Grouping with Dicts and Series

In [21]:
# 5x5 frame of standard-normal values: one row per person, columns 'a'..'e'.
# The values come from a seeded Generator so a fresh kernel reproduces the
# same frame; the previous unseeded np.random.randn call did not.
people = pd.DataFrame(
    np.random.default_rng(seed=43).standard_normal((5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)

In [22]:
# Introduce missing values: blank out the second and third columns
# (positions 1 and 2) of the third row (position 2). Mutates people in place.
people.iloc[2, 1:3] = np.nan

In [23]:
# Show the frame after the NaN assignment in the previous cell.
people

Unnamed: 0,a,b,c,d,e
joe,1.271053,-0.531363,-0.736703,-0.406707,1.043436
Steve,-0.08776,-0.912908,0.884993,-0.603766,-0.000656
Wes,-2.576796,,,0.612529,1.466158
Jim,-1.077427,0.135338,-1.946526,0.480994,0.730758
Travis,-0.071989,-1.382827,-0.663555,2.149269,-0.117246


In [24]:
# Column label -> colour group, used as a grouping key for the columns.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',  # not one of the labels 'a'..'e'
}

In [25]:
# Group the columns of people (axis=1) by the colour each column label maps
# to in `mapping`.
# NOTE(review): the `axis=1` argument of groupby is reported as deprecated
# since pandas 2.1 — confirm against the installed pandas version before
# re-running.
by_column = people.groupby(mapping, axis=1)

In [26]:
# Per-row mean of the columns in each colour group.
by_column.mean()

Unnamed: 0,blue,red
joe,-0.571705,0.594376
Steve,0.140614,-0.333775
Wes,0.612529,-0.555319
Jim,-0.732766,-0.070443
Travis,0.742857,-0.52402


In [27]:
# Per-row sum of the columns in each colour group.
by_column.sum()

Unnamed: 0,blue,red
joe,-1.14341,1.783127
Steve,0.281228,-1.001324
Wes,0.612529,-1.110639
Jim,-1.465532,-0.21133
Travis,1.485713,-1.572061


In [28]:
# Same column -> colour mapping as a Series (index: column labels,
# values: colours), to show that a Series also works as a grouping key.
map_series = pd.Series(data=mapping)

In [29]:
# Show people again for reference before grouping its columns by the Series.
people

Unnamed: 0,a,b,c,d,e
joe,1.271053,-0.531363,-0.736703,-0.406707,1.043436
Steve,-0.08776,-0.912908,0.884993,-0.603766,-0.000656
Wes,-2.576796,,,0.612529,1.466158
Jim,-1.077427,0.135338,-1.946526,0.480994,0.730758
Travis,-0.071989,-1.382827,-0.663555,2.149269,-0.117246


In [30]:
# Count the non-null values per colour group for every person.
# `groupby(..., axis=1)` is deprecated as of pandas 2.1, so the frame is
# transposed, grouped along its rows with the same column -> colour Series,
# and transposed back. The result has the same layout as before: people as
# rows, colours as columns.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# Grouping with Functions

In [31]:
# A function used as the key is called once per index label; here rows are
# grouped by the length of each person's name and then summed.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,-2.38317,-0.396025,-2.68323,0.686817,3.240351
5,-0.08776,-0.912908,0.884993,-0.603766,-0.000656
6,-0.071989,-1.382827,-0.663555,2.149269,-0.117246


Mixing functions with arrays, dicts, or Series is not a problem, as everything gets converted to arrays internally:

In [36]:
# One label per row of people: the first three rows are tagged 'one' and
# the last two 'two'.
key_list = ['one'] * 3 + ['two'] * 2

In [37]:
# Combine a function key (name length) with a plain list key and take the
# minimum of every column per (length, label) group.
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-2.576796,-0.531363,-0.736703,-0.406707,1.043436
3,two,-1.077427,0.135338,-1.946526,0.480994,0.730758
5,one,-0.08776,-0.912908,0.884993,-0.603766,-0.000656
6,two,-0.071989,-1.382827,-0.663555,2.149269,-0.117246


# Grouping By Index Levels

In [55]:
# Two-level column index: the outer level 'city' holds the 'US'/'JP' labels
# and the inner level 'tenor' holds the integers.
columns = pd.MultiIndex.from_arrays(
    arrays=[
        ['US'] * 3 + ['JP'] * 2,
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)

In [56]:
# Show the MultiIndex built in the previous cell.
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['city', 'tenor'])

In [57]:
# 4x5 frame of standard-normal values whose columns carry the two-level
# (city, tenor) index. The values come from a seeded Generator so a fresh
# kernel reproduces the same frame; the previous unseeded np.random.randn
# call did not.
hier_df = pd.DataFrame(
    np.random.default_rng(seed=44).standard_normal((4, 5)),
    columns=columns,
)

In [58]:
# Show the frame with hierarchical (city, tenor) columns.
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.826465,0.73149,0.551604,-0.106827,-1.221299
1,-0.89286,0.03832,0.117894,1.555868,-0.075431
2,0.816902,2.402333,0.898175,0.421816,-2.549478
3,-0.462638,-0.305823,-0.48574,-0.117367,-0.700853


In [59]:
# Count the non-null values per city for every row by grouping on the 'city'
# level of the column index.
# `groupby(..., axis=1)` is deprecated as of pandas 2.1, so the frame is
# transposed (the column levels become row index levels), grouped by level,
# and transposed back. The result keeps the original rows, with one column
# per city.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
