# Groupby on DataFrames

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                   'k2':['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                   'dataset1':np.random.randn(5),
                   'dataset2':np.random.randn(5)})
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.604692,0.540031,X,alpha
1,0.669353,0.144727,X,beta
2,0.470103,-0.230465,Y,alpha
3,-0.155475,0.240348,Y,beta
4,-0.446995,0.427376,Z,alpha


In [3]:
group1 = dframe['dataset1'].groupby(dframe['k1']) 
group1

<pandas.core.groupby.SeriesGroupBy object at 0x0000000009AC3630>

In [4]:
group1.mean() 

k1
X    0.637023
Y    0.157314
Z   -0.446995
Name: dataset1, dtype: float64

In [5]:
    arr1 = np.array(['Qom', 'Bam', 'Qom', 'Bam', 'Qom'])
    dframe['dataset1'].groupby(arr1).mean()

Bam    0.256939
Qom    0.209267
Name: dataset1, dtype: float64

In [6]:
arr2 = np.array(['AA', 'AA', 'BB', 'BB', 'BB'])
dframe['dataset1'].groupby([arr1, arr2]).mean()

Bam  AA    0.669353
     BB   -0.155475
Qom  AA    0.604692
     BB    0.011554
Name: dataset1, dtype: float64

In [7]:
dframe.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.637023,0.342379
Y,0.157314,0.004942
Z,-0.446995,0.427376


In [8]:
dframe.groupby(['k1', 'k2']).mean() 

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.604692,0.540031
X,beta,0.669353,0.144727
Y,alpha,0.470103,-0.230465
Y,beta,-0.155475,0.240348
Z,alpha,-0.446995,0.427376


In [9]:
for name, group in dframe.groupby('k1'):
    print("This is the %s group" %name)
    print(group)
    print("\n")

This is the X group
   dataset1  dataset2 k1     k2
0  0.604692  0.540031  X  alpha
1  0.669353  0.144727  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.470103 -0.230465  Y  alpha
3 -0.155475  0.240348  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -0.446995  0.427376  Z  alpha




In [13]:
for name, group in dframe.groupby(['k1', 'k2']):
    print("key1 = %s, key2 = %s" %(name[0], name[1]))
    print(group, "\n")    

key1 = X, key2 = alpha
(   dataset1  dataset2 k1     k2
0  0.604692  0.540031  X  alpha, '\n')
key1 = X, key2 = beta
(   dataset1  dataset2 k1    k2
1  0.669353  0.144727  X  beta, '\n')
key1 = Y, key2 = alpha
(   dataset1  dataset2 k1     k2
2  0.470103 -0.230465  Y  alpha, '\n')
key1 = Y, key2 = beta
(   dataset1  dataset2 k1    k2
3 -0.155475  0.240348  Y  beta, '\n')
key1 = Z, key2 = alpha
(   dataset1  dataset2 k1     k2
4 -0.446995  0.427376  Z  alpha, '\n')


In [14]:
for (k1, k2), group in dframe.groupby(['k1', 'k2']):
    print("key1 = %s, key2 = %s" %(k1, k2))
    print(group, "\n")  

key1 = X, key2 = alpha
(   dataset1  dataset2 k1     k2
0  0.604692  0.540031  X  alpha, '\n')
key1 = X, key2 = beta
(   dataset1  dataset2 k1    k2
1  0.669353  0.144727  X  beta, '\n')
key1 = Y, key2 = alpha
(   dataset1  dataset2 k1     k2
2  0.470103 -0.230465  Y  alpha, '\n')
key1 = Y, key2 = beta
(   dataset1  dataset2 k1    k2
3 -0.155475  0.240348  Y  beta, '\n')
key1 = Z, key2 = alpha
(   dataset1  dataset2 k1     k2
4 -0.446995  0.427376  Z  alpha, '\n')


In [15]:
list(dframe.groupby('k1'))

[('X',    dataset1  dataset2 k1     k2
  0  0.604692  0.540031  X  alpha
  1  0.669353  0.144727  X   beta), ('Y',    dataset1  dataset2 k1     k2
  2  0.470103 -0.230465  Y  alpha
  3 -0.155475  0.240348  Y   beta), ('Z',    dataset1  dataset2 k1     k2
  4 -0.446995  0.427376  Z  alpha)]

In [16]:
group_dict = dict(list(dframe.groupby('k1')))
group_dict

{'X':    dataset1  dataset2 k1     k2
 0  0.604692  0.540031  X  alpha
 1  0.669353  0.144727  X   beta, 'Y':    dataset1  dataset2 k1     k2
 2  0.470103 -0.230465  Y  alpha
 3 -0.155475  0.240348  Y   beta, 'Z':    dataset1  dataset2 k1     k2
 4 -0.446995  0.427376  Z  alpha}

In [17]:
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.604692,0.540031,X,alpha
1,0.669353,0.144727,X,beta


In [18]:
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes, axis = 1)))
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  0.604692  0.540031
 1  0.669353  0.144727
 2  0.470103 -0.230465
 3 -0.155475  0.240348
 4 -0.446995  0.427376, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [19]:
group_dict_axis1[np.dtype('float64')]

Unnamed: 0,dataset1,dataset2
0,0.604692,0.540031
1,0.669353,0.144727
2,0.470103,-0.230465
3,-0.155475,0.240348
4,-0.446995,0.427376


In [20]:
dataset2_group = dframe.groupby(['k1', 'k2'])[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,0.540031
X,beta,0.144727
Y,alpha,-0.230465
Y,beta,0.240348
Z,alpha,0.427376


# Groupby on Dict and Series

In [21]:
animals = DataFrame(np.arange(16).reshape(4,4), columns = ['W','X','Y','Z'],
                   index = ['Dog', 'Cat', 'Bird', 'Mouse'])
animals

Unnamed: 0,W,X,Y,Z
Dog,0,1,2,3
Cat,4,5,6,7
Bird,8,9,10,11
Mouse,12,13,14,15


In [22]:
animals.ix[[1, 3], ['W', 'Y']] = np.nan
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,,13,,15


In [23]:
behavior_map = {"W":"good", "X":"bad", "Y":"good", "Z":"bad"}
result = animals.groupby(behavior_map, axis = 1)
result.sum()

Unnamed: 0,bad,good
Dog,4.0,2.0
Cat,12.0,
Bird,20.0,18.0
Mouse,28.0,


In [24]:
behavior_series = Series(behavior_map)
animals.groupby(behavior_series, axis = 1).count()

Unnamed: 0,bad,good
Dog,2,2
Cat,2,0
Bird,2,2
Mouse,2,0


In [25]:
animals.groupby(len).sum()

Unnamed: 0,W,X,Y,Z
3,0.0,6,2.0,10
4,8.0,9,10.0,11
5,,13,,15


In [26]:
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,,13,,15
