In [1]:
import numpy as np
import pandas as pd

In [2]:
# Demo frame: two categorical key columns plus two columns of standard-normal
# noise.  The values come from a locally seeded Generator so that a
# Restart & Run All reproduces exactly the same numbers every time.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.817436,-0.74412
1,a,two,2.247099,0.418606
2,b,one,-0.081961,1.230678
3,b,two,-0.844096,-1.220409
4,a,one,-1.016743,2.130587


# Groupby on Series

In [3]:
# Group the `data1` Series by the values of a second Series (`key1`);
# the grouping Series is passed directly rather than by column name.
g1 = df['data1'].groupby(df['key1'])
# Number of `data1` entries that fall into each `key1` group.
g1.count()

key1
a    3
b    2
Name: data1, dtype: int64

In [4]:
# Mean of `data1` within each `key1` group.
g1.mean()

key1
a    0.137640
b   -0.463029
Name: data1, dtype: float64

# Groupby on DataFrame

In [5]:
# Group the whole frame by the `key1` column, referenced here by name
# (the author's alternative spelling passed the Series: df.groupby(df['key1'])).
g2 = df.groupby('key1')
# Per-column entry counts for each `key1` group.
g2.count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


In [6]:
# Per-group mean of the numeric columns only.  `key2` holds strings, and
# recent pandas raises TypeError instead of silently dropping such columns,
# so the restriction to numeric data has to be explicit.
g2.mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.13764,0.601691
b,-0.463029,0.005135


In [7]:
# Group by two columns at once; each distinct (key1, key2) pair is a group.
g3 = df.groupby(['key1', 'key2'])
# Per-column entry counts for each (key1, key2) pair.
g3.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,2,2
a,two,1,1
b,one,1,1
b,two,1,1


In [8]:
# Mean of each data column for every (key1, key2) pair.
g3.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.917089,0.693234
a,two,2.247099,0.418606
b,one,-0.081961,1.230678
b,two,-0.844096,-1.220409


# Iterating over a Groupby with a for statement

In [9]:
# Walk the grouped Series: every step yields the group label followed by
# the slice of `data1` that belongs to it.
for key, group in g1:
    print(key, group, sep='\n')

a
0   -0.817436
1    2.247099
4   -1.016743
Name: data1, dtype: float64
b
2   -0.081961
3   -0.844096
Name: data1, dtype: float64


In [10]:
# Walk the grouped frame: every step yields the `key1` label followed by
# the sub-frame holding that group's rows.
for key, group in g2:
    print(key, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.817436 -0.744120
1    a  two  2.247099  0.418606
4    a  one -1.016743  2.130587
b
  key1 key2     data1     data2
2    b  one -0.081961  1.230678
3    b  two -0.844096 -1.220409


In [11]:
# With two grouping columns each label is a (key1, key2) tuple, unpacked
# in the loop header; the label line is followed by the group's rows.
for (key1, key2), group in g3:
    print(f'{key1} {key2}', group, sep='\n')

a one
  key1 key2     data1     data2
0    a  one -0.817436 -0.744120
4    a  one -1.016743  2.130587
a two
  key1 key2     data1     data2
1    a  two  2.247099  0.418606
b one
  key1 key2     data1     data2
2    b  one -0.081961  1.230678
b two
  key1 key2     data1     data2
3    b  two -0.844096 -1.220409


# Groupby with an external grouping (dict or Series)

In [12]:
# 5x5 frame of standard-normal noise with named rows and lettered columns.
# A dedicated seeded Generator makes the values reproducible on every
# Restart & Run All, independent of any other random draws in the notebook.
df2 = pd.DataFrame(
    np.random.default_rng(seed=1).standard_normal((5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
df2

Unnamed: 0,a,b,c,d,e
Joe,-0.429439,-0.890479,-0.49628,-0.132745,0.43589
Steve,0.664086,-0.904074,-0.2991,1.276674,-0.441658
Wes,-0.371314,0.794432,0.124739,0.370282,0.165954
Jim,1.024425,0.174757,0.075045,-0.728534,1.186281
Travis,0.268618,1.172176,-1.26661,-0.573503,0.595167


In [13]:
# Column label -> colour lookup, used as an external grouping key.
# 'f' matches no column of df2 (whose columns are 'a' through 'e').
other_group = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')
other_group

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [14]:
# Group df2's columns (axis=1) by the colour each column label maps to in
# `other_group`, then show every colour together with its sub-frame.
# NOTE(review): the `axis` argument of DataFrame.groupby is, as far as I know,
# deprecated in recent pandas releases — confirm against the version in use.
g4 = df2.groupby(other_group, axis=1)
for key, group in g4:
    print(key)
    print(group)

blue
               c         d
Joe    -0.496280 -0.132745
Steve  -0.299100  1.276674
Wes     0.124739  0.370282
Jim     0.075045 -0.728534
Travis -1.266610 -0.573503
red
               a         b         e
Joe    -0.429439 -0.890479  0.435890
Steve   0.664086 -0.904074 -0.441658
Wes    -0.371314  0.794432  0.165954
Jim     1.024425  0.174757  1.186281
Travis  0.268618  1.172176  0.595167


In [15]:
# Number of entries per row in each colour group.
g4.count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,2,3
Jim,2,3
Travis,2,3


In [16]:
# Row-wise mean across the columns of each colour group.
g4.mean()

Unnamed: 0,blue,red
Joe,-0.314513,-0.294676
Steve,0.488787,-0.227215
Wes,0.24751,0.196357
Jim,-0.326745,0.795155
Travis,-0.920057,0.678654


In [17]:
# Row-wise median across the columns of each colour group.
g4.median()

Unnamed: 0,blue,red
Joe,-0.314513,-0.429439
Steve,0.488787,-0.441658
Wes,0.24751,0.165954
Jim,-0.326745,1.024425
Travis,-0.920057,0.595167


In [18]:
# Row-wise standard deviation across the columns of each colour group.
g4.std()

Unnamed: 0,blue,red
Joe,0.257058,0.673375
Steve,1.114241,0.805773
Wes,0.173625,0.583467
Jim,0.568216,0.543341
Travis,0.490101,0.457528


In [19]:
# Row-wise variance across the columns of each colour group.
g4.var()

Unnamed: 0,blue,red
Joe,0.066079,0.453434
Steve,1.241533,0.64927
Wes,0.030146,0.340434
Jim,0.322869,0.295219
Travis,0.240199,0.209332


In [20]:
# Smallest value per row within each colour group.
g4.min()

Unnamed: 0,blue,red
Joe,-0.49628,-0.890479
Steve,-0.2991,-0.904074
Wes,0.124739,-0.371314
Jim,-0.728534,0.174757
Travis,-1.26661,0.268618


In [21]:
# Largest value per row within each colour group.
g4.max()

Unnamed: 0,blue,red
Joe,-0.132745,0.43589
Steve,1.276674,0.664086
Wes,0.370282,0.794432
Jim,0.075045,1.186281
Travis,-0.573503,1.172176


In [22]:
# Product of the values per row within each colour group.
g4.prod()

Unnamed: 0,blue,red
Joe,0.065879,0.166687
Steve,-0.381854,0.265164
Wes,0.046188,-0.048954
Jim,-0.054672,0.212375
Travis,0.726405,0.187399


In [23]:
# Value from the first column of each colour group, per row.
g4.first()

Unnamed: 0,blue,red
Joe,-0.49628,-0.429439
Steve,-0.2991,0.664086
Wes,0.124739,-0.371314
Jim,0.075045,1.024425
Travis,-1.26661,0.268618


In [24]:
# Value from the last column of each colour group, per row.
g4.last()

Unnamed: 0,blue,red
Joe,-0.132745,0.43589
Steve,1.276674,-0.441658
Wes,0.370282,0.165954
Jim,-0.728534,1.186281
Travis,-0.573503,0.595167


# User-defined functions on Groupby

In [25]:
def diff_max_min(arr):
    """Return the spread of ``arr``: its largest value minus its smallest."""
    largest = arr.max()
    smallest = arr.min()
    return largest - smallest
# Apply the custom reducer to the numeric columns only: `key2` holds strings,
# and subtracting two strings inside diff_max_min raises TypeError.
g2[['data1', 'data2']].agg(diff_max_min)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.263842,2.874707
b,0.762135,2.451087


In [26]:
# Show the members of each `key1` group once more (this repeats the earlier
# loop over g2): label first, then the rows belonging to it.
for key, group in g2:
    print(key, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.817436 -0.744120
1    a  two  2.247099  0.418606
4    a  one -1.016743  2.130587
b
  key1 key2     data1     data2
2    b  one -0.081961  1.230678
3    b  two -0.844096 -1.220409
