In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Seed NumPy's global RNG so the random columns come out the same on every run.
np.random.seed(100)

# Five-row demo frame: two label columns (k1, k2) used as grouping keys and
# two columns of random draws (dataset1, dataset2) to aggregate.
df = DataFrame(
    data={
        'k1': ['X', 'X', 'Y', 'Y', 'Z'],
        'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
        'dataset1': np.random.randn(5),
        'dataset2': np.random.randn(5),
    },
    columns=['k1', 'k2', 'dataset1', 'dataset2'],
)

In [3]:
# Display the whole frame (it only has five rows) to inspect the generated data
df

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.749765,0.514219
1,X,beta,0.34268,0.22118
2,Y,alpha,1.153036,-1.070043
3,Y,beta,-0.252436,-0.189496
4,Z,alpha,0.981321,0.255001


In [4]:
# Select the dataset1 column, then group its values using the k1 column as the key.
# Nothing is computed yet; group1 is a GroupBy object to iterate over or aggregate.
group1 = df['dataset1'].groupby(by=df['k1'])

In [5]:
# Each item yielded by the GroupBy is a (key, values) pair
for key, values in group1:
    print(key, values)

X 0   -1.749765
1    0.342680
Name: dataset1, dtype: float64
Y 2    1.153036
3   -0.252436
Name: dataset1, dtype: float64
Z 4    0.981321
Name: dataset1, dtype: float64


In [6]:
# Average dataset1 within each k1 group; the result is indexed by the k1 values
group1.mean()

k1
X   -0.703543
Y    0.450300
Z    0.981321
Name: dataset1, dtype: float64

In [7]:
# Two label arrays, one entry per row of df, to use as grouping keys
cities, month = (
    np.array(['NY', 'LA', 'LA', 'NY', 'NY']),
    np.array(['Oct', 'Jun', 'Jan', 'Feb', 'Sep']),
)

In [8]:
# Average dataset1 for every (city, month) pair. The key arrays are not columns
# of df; the result is a Series with a two-level (city, month) index.
(
    df['dataset1']
    .groupby([cities, month])
    .mean()
)

LA  Jan    1.153036
    Jun    0.342680
NY  Feb   -0.252436
    Oct   -1.749765
    Sep    0.981321
Name: dataset1, dtype: float64

In [9]:
# Pass a column name as the group key. numeric_only=True restricts the mean to
# the numeric columns (dataset1, dataset2); without it, pandas >= 2.0 raises a
# TypeError when it tries to average the string column k2.
df.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.703543,0.367699
Y,0.4503,-0.62977
Z,0.981321,0.255001


In [10]:
# A list of column names groups by every distinct (k1, k2) combination;
# the result carries a two-level row index built from those keys.
df.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-1.749765,0.514219
X,beta,0.34268,0.22118
Y,alpha,1.153036,-1.070043
Y,beta,-0.252436,-0.189496
Z,alpha,0.981321,0.255001


In [11]:
# size() counts the rows that fall into each group
df.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [12]:
# Iterating a grouped frame yields (key, sub-frame) pairs.
# A single print with sep='\n' emits the header, the sub-frame and the spacer
# string on separate lines.
for group_name, group in df.groupby('k1'):
    print("This is the %s group" % group_name, group, '\n', sep='\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha -1.749765  0.514219
1  X   beta  0.342680  0.221180


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  1.153036 -1.070043
3  Y   beta -0.252436 -0.189496


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha  0.981321  0.255001




In [13]:
# With several group keys, each iteration yields the key tuple and its sub-frame
for key_pair, subframe in df.groupby(['k1', 'k2']):
    # key_pair is the (k1, k2) tuple, which %-formatting consumes directly
    print("Key1 = %s Key2 = %s" % key_pair)
    print(subframe)
    print('\n')

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha -1.749765  0.514219


Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta   0.34268   0.22118


Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha  1.153036 -1.070043


Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta -0.252436 -0.189496


Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha  0.981321  0.255001




In [14]:
# Materialise the grouped frame as a list of (key, sub-frame) tuples
group_list = list(df.groupby('k1'))
# Show the second entry: the 'Y' key together with its rows
group_list[1]

('Y',   k1     k2  dataset1  dataset2
 2  Y  alpha  1.153036 -1.070043
 3  Y   beta -0.252436 -0.189496)

In [15]:
# Map each k1 value to its sub-frame so a group can be looked up by key
group_dict = {key: frame for key, frame in df.groupby('k1')}

# Show the group with X
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.749765,0.514219
1,X,beta,0.34268,0.22118


In [16]:
# Columns can be grouped as well as rows; here they are split by dtype.
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1, so the columns of
# each dtype are selected explicitly instead.
# Keys are ordered by dtype name to keep the displayed dict deterministic.
group_dict_axis1 = {
    dtype: df[[col for col, col_dtype in df.dtypes.items() if col_dtype == dtype]]
    for dtype in sorted(set(df.dtypes), key=str)
}

In [17]:
# Show the dtype -> sub-frame mapping built in the previous cell
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -1.749765  0.514219
 1  0.342680  0.221180
 2  1.153036 -1.070043
 3 -0.252436 -0.189496
 4  0.981321  0.255001, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [18]:
# Group by both key columns, then keep only dataset2. The double brackets select
# a one-column frame, so the aggregate below is a DataFrame rather than a Series.
dataset2_group = df.groupby(by=['k1', 'k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,0.514219
X,beta,0.22118
Y,alpha,-1.070043
Y,beta,-0.189496
Z,alpha,0.255001
