In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global legacy RNG before the first random draw so that
# Restart & Run All regenerates the same numbers every time. The later cells
# that call np.random.randn draw from this same seeded stream.
np.random.seed(12345)

# Small demo frame: two categorical key columns and two random numeric columns.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.347742,0.196026
1,a,two,0.87969,-1.221406
2,b,one,-1.004264,0.197204
3,b,two,0.269155,1.322084
4,a,one,-1.199331,-1.878806


In [3]:
# Split the data1 column into groups, using the key1 column (passed as a
# Series) as the grouping key.
grouped = df['data1'].groupby(by=df['key1'])

In [4]:
# Display the groupby object itself; its repr shows only the object type,
# not any grouped values.
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000000007FE9B70>

In [5]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a    0.009367
b   -0.367554
Name: data1, dtype: float64

In [6]:
# Group data1 by the (key1, key2) pair and average each group; the result is
# a Series with a two-level index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.425794
      two     0.879690
b     one    -1.004264
      two     0.269155
Name: data1, dtype: float64

In [7]:
# Pivot the inner index level (key2) into columns, leaving key1 as the rows.
means.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.425794,0.87969
b,-1.004264,0.269155


In [8]:
# External grouping key: one state label for each of the five rows of df.
states = np.array([
    'Ohio', 'California', 'California', 'Ohio', 'Ohio',
])

In [9]:
# External grouping key: one year for each of the five rows of df.
years = np.array([2005, 2005, 2006, 2005, 2006])

In [10]:
# Grouping keys do not have to be columns of df: any arrays of matching
# length work. Here data1 is averaged per (state, year) pair.
(
    df['data1']
    .groupby([states, years])
    .mean()
)

California  2005    0.879690
            2006   -1.004264
Ohio        2005    0.308449
            2006   -1.199331
Name: data1, dtype: float64

In [11]:
# Same grouping as the (state, year) mean, but without an aggregation: the
# cell output is just the SeriesGroupBy object's repr.
df['data1'].groupby([states, years])

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000000000504E080>

In [12]:
# Show the data1 column on its own for reference.
df['data1']

0    0.347742
1    0.879690
2   -1.004264
3    0.269155
4   -1.199331
Name: data1, dtype: float64

In [13]:
# Re-display the full frame for reference before grouping the whole DataFrame.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.347742,0.196026
1,a,two,0.87969,-1.221406
2,b,one,-1.004264,0.197204
3,b,two,0.269155,1.322084
4,a,one,-1.199331,-1.878806


In [14]:
# Average every numeric column within each key1 group.
# numeric_only=True keeps the string column key2 out of the aggregation:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError when asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.009367,-0.968062
b,-0.367554,0.759644


In [15]:
# Average data1 and data2 within each (key1, key2) group; both string columns
# are consumed as grouping keys, so only numeric columns are aggregated.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.425794,-0.84139
a,two,0.87969,-1.221406
b,one,-1.004264,0.197204
b,two,0.269155,1.322084


In [16]:
# Number of rows in each (key1, key2) group.
df.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [17]:
# Iterating over a groupby yields (group name, sub-frame) pairs; print each
# name followed by the rows that belong to it.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.347742  0.196026
1    a  two  0.879690 -1.221406
4    a  one -1.199331 -1.878806
b
  key1 key2     data1     data2
2    b  one -1.004264  0.197204
3    b  two  0.269155  1.322084


In [18]:
# With several grouping keys the group name is a tuple, unpacked here into
# its key1 and key2 components.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('{} {}'.format(k1, k2))
    print(group)

a one
  key1 key2     data1     data2
0    a  one  0.347742  0.196026
4    a  one -1.199331 -1.878806
a two
  key1 key2    data1     data2
1    a  two  0.87969 -1.221406
b one
  key1 key2     data1     data2
2    b  one -1.004264  0.197204
b two
  key1 key2     data1     data2
3    b  two  0.269155  1.322084


In [19]:
# Materialise the groups as a plain dict mapping each key1 value to its
# sub-frame.
pieces = {key: frame for key, frame in df.groupby('key1')}
pieces

{'a':   key1 key2     data1     data2
 0    a  one  0.347742  0.196026
 1    a  two  0.879690 -1.221406
 4    a  one -1.199331 -1.878806, 'b':   key1 key2     data1     data2
 2    b  one -1.004264  0.197204
 3    b  two  0.269155  1.322084}

In [20]:
# Look up the sub-frame for key1 == 'b' in the dict built from the groupby.
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-1.004264,0.197204
3,b,two,0.269155,1.322084


In [21]:
# Inspect the dtype of every column of df.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [22]:
# Group the *columns* of df (axis=1) by their dtype instead of grouping rows.
# This rebinds `grouped`, which previously held the SeriesGroupBy on data1.
# NOTE(review): grouping with axis=1 is deprecated in recent pandas releases;
# the suggested replacement is to group the transposed frame
# (df.T.groupby(...)) — confirm against the pandas version in use.
grouped = df.groupby(df.dtypes,axis=1)

In [24]:
# Print each group key followed by the sub-frame that belongs to it.
for dtype, group in grouped:
    print(dtype, group, sep='\n')

float64
      data1     data2
0  0.347742  0.196026
1  0.879690 -1.221406
2 -1.004264  0.197204
3  0.269155  1.322084
4 -1.199331 -1.878806
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [25]:
# Re-display the full frame for reference before selecting columns from a groupby.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.347742,0.196026
1,a,two,0.87969,-1.221406
2,b,one,-1.004264,0.197204
3,b,two,0.269155,1.322084
4,a,one,-1.199331,-1.878806


In [26]:
# Selecting with a list (['data2']) keeps the result a DataFrame: the mean of
# data2 per (key1, key2) group, as a one-column frame.
df.groupby(by=['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.84139
a,two,-1.221406
b,one,0.197204
b,two,1.322084


In [27]:
# Selecting a single column name from the grouped frame gives a grouped Series.
s_grouped = df.groupby(by=['key1', 'key2'])['data2']

In [28]:
# Display the grouped object; the output is its SeriesGroupBy repr.
s_grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000000000830CA20>

In [29]:
# Mean of data2 within each (key1, key2) group, returned as a Series.
s_grouped.mean()

key1  key2
a     one    -0.841390
      two    -1.221406
b     one     0.197204
      two     1.322084
Name: data2, dtype: float64

In [30]:
# 5x5 frame of random values: one row per person, columns 'a' through 'e'.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)


In [31]:
# Blank out two cells (row 'Wes', columns 'b' and 'c') so the later
# aggregations have missing values to deal with.
people.loc[['Wes'], ['b', 'c']] = np.nan

In [32]:
# Show the frame after the NaN assignment (row 'Wes', columns 'b' and 'c').
people

Unnamed: 0,a,b,c,d,e
Joe,-0.266088,-0.828786,-1.526767,-0.601302,-1.410668
Steve,-1.364824,-0.226806,0.858722,-1.888315,-0.594687
Wes,0.809241,,,1.004874,0.94335
Jim,-1.892202,1.462586,-0.22523,1.088184,-0.306218
Travis,-0.342564,-1.1412,0.064962,0.709534,0.188302


In [33]:
# Column label -> color group. 'f' has no matching column in `people`.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [35]:
# Group the columns of `people` (axis=1) by the color each column label maps
# to. Only 'blue' and 'red' appear in the results; the 'f' -> 'orange' entry
# matches no column.
# NOTE(review): grouping with axis=1 is deprecated in recent pandas releases;
# the suggested replacement is to group the transposed frame
# (people.T.groupby(mapping)) and transpose the aggregated result back —
# confirm against the pandas version in use.
by_column = people.groupby(mapping, axis=1)

In [36]:
# Sum the columns belonging to each color group, separately for every row.
by_column.sum()

Unnamed: 0,blue,red
Joe,-2.128068,-2.505542
Steve,-1.029592,-2.186316
Wes,1.004874,1.752591
Jim,0.862953,-0.735834
Travis,0.774496,-1.295462


In [37]:
# The same column -> color mapping, held as a Series instead of a dict.
map_series = pd.Series(data=mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [38]:
# Count the non-missing values per color group for every person.
# groupby(..., axis=1) is deprecated in recent pandas releases, so the frame
# is transposed, its rows (the original columns) are grouped by color, and the
# counts are transposed back: people as rows, colors as columns.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [39]:
# A function passed as the key is called on each index label; here rows are
# grouped by the length of the person's name and summed.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,-1.349048,0.633801,-1.751997,1.491756,-0.773537
5,-1.364824,-0.226806,0.858722,-1.888315,-0.594687
6,-0.342564,-1.1412,0.064962,0.709534,0.188302


In [40]:
# One label for each of the five rows of `people`: three 'one' then two 'two'.
key_list = ['one'] * 3 + ['two'] * 2

In [41]:
# Functions and plain lists can be mixed as grouping keys: group by
# (name length, key_list label) and take the minimum of each column.
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.266088,-0.828786,-1.526767,-0.601302,-1.410668
3,two,-1.892202,1.462586,-0.22523,1.088184,-0.306218
5,one,-1.364824,-0.226806,0.858722,-1.888315,-0.594687
6,two,-0.342564,-1.1412,0.064962,0.709534,0.188302


In [42]:
# Two-level column index: country code on the outer level, tenor on the inner.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],  # cty
        [1, 3, 5, 1, 3],                 # tenor
    ],
    names=['cty', 'tenor'],
)
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['cty', 'tenor'])

In [43]:
# 4x5 frame of random values under the two-level (cty, tenor) column index.
hier_df = pd.DataFrame(
    data=np.random.randn(4, 5),
    columns=columns,
)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.073254,0.579428,0.189527,-0.630344,-2.22506
1,-0.041515,-0.250912,-0.025977,0.694967,-0.861105
2,0.051152,1.205632,0.898432,0.131788,0.139635
3,-0.085738,-0.153977,-1.089015,1.138468,-0.535947


In [44]:
# Count the non-missing values per country for every row.
# groupby(..., axis=1) is deprecated in recent pandas releases, so the frame
# is transposed, grouped on the 'cty' level of what is now the row index, and
# the counts are transposed back: original rows as rows, countries as columns.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
