## Groupby Operations in Pandas

### The Split - Apply - Combine method

http://pandas.pydata.org/pandas-docs/stable/groupby.html

* Split :  groupby()
* Apply : aggregate, transform, filter
* Combine : combine results

In [30]:
%matplotlib inline

import matplotlib

matplotlib.style.use('fivethirtyeight')

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Load the Iris measurements from the CSV file (relative path) into a DataFrame
irisdf = pd.read_csv('iris.csv')

# Preview the first five rows (five is head's default)
irisdf.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# List the column labels of the loaded frame
irisdf.columns

Index([u'sepal_length', u'sepal_width', u'petal_length', u'petal_width',
       u'species'],
      dtype='object')

In [5]:
# How many species are there in the dataset?
# Collapsing the column into a set leaves only the distinct labels.

set(irisdf['species'])

{'setosa', 'versicolor', 'virginica'}

## Split Operation using GROUPBY

In [6]:
# Let us group by species so we can get interesting information.
# On its own the call only returns a GroupBy object (see the repr below).

irisdf.groupby('species')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000007D004A8>

In [7]:
# Keep the grouped object in a variable, then inspect which row positions
# fall into each species group

irisgroup = irisdf.groupby('species')

irisgroup.indices

{'setosa': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], dtype=int64),
 'versicolor': array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64),
 'virginica': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
        139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], dtype=int64)}

In [8]:
# Group labels only. list() keeps the display a plain list on Python 3 too,
# where dict.keys() returns a view object rather than a list.
list(irisgroup.indices.keys())

['setosa', 'versicolor', 'virginica']

In [9]:
# Row-position arrays only. list() keeps the display a plain list on Python 3
# too, where dict.values() returns a view object rather than a list.
list(irisgroup.indices.values())

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], dtype=int64),
 array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64),
 array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
        139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], dtype=int64)]

In [14]:
# OK, let's iterate over it then!
# Iterating a GroupBy object yields one (group label, sub-frame) pair per group.
# print is called as a function so the cell runs on Python 3 as well as Python 2.

for k, v in irisgroup:
    print('\n\n----New Group Starts-----\n\n')
    print(k)  # group label, e.g. the species name
    print(v)  # DataFrame holding only that group's rows
    print('\n\n----New Group Ends-----\n\n')
    
    



----New Group Starts-----


setosa
    sepal_length  sepal_width  petal_length  petal_width species
0            5.1          3.5           1.4          0.2  setosa
1            4.9          3.0           1.4          0.2  setosa
2            4.7          3.2           1.3          0.2  setosa
3            4.6          3.1           1.5          0.2  setosa
4            5.0          3.6           1.4          0.2  setosa
5            5.4          3.9           1.7          0.4  setosa
6            4.6          3.4           1.4          0.3  setosa
7            5.0          3.4           1.5          0.2  setosa
8            4.4          2.9           1.4          0.2  setosa
9            4.9          3.1           1.5          0.1  setosa
10           5.4          3.7           1.5          0.2  setosa
11           4.8          3.4           1.6          0.2  setosa
12           4.8          3.0           1.4          0.1  setosa
13           4.3          3.0           1.1          

In [19]:
# Another way to look at a groupby object:
# materialise it as a list of (group label, sub-frame) tuples

list(irisgroup)

[('setosa',     sepal_length  sepal_width  petal_length  petal_width species
  0            5.1          3.5           1.4          0.2  setosa
  1            4.9          3.0           1.4          0.2  setosa
  2            4.7          3.2           1.3          0.2  setosa
  3            4.6          3.1           1.5          0.2  setosa
  4            5.0          3.6           1.4          0.2  setosa
  5            5.4          3.9           1.7          0.4  setosa
  6            4.6          3.4           1.4          0.3  setosa
  7            5.0          3.4           1.5          0.2  setosa
  8            4.4          2.9           1.4          0.2  setosa
  9            4.9          3.1           1.5          0.1  setosa
  10           5.4          3.7           1.5          0.2  setosa
  11           4.8          3.4           1.6          0.2  setosa
  12           4.8          3.0           1.4          0.1  setosa
  13           4.3          3.0           1.1       

In [26]:
# Mapping from each species label to the index labels of the rows in that group
irisgroup.groups

{'setosa': [0L,
  1L,
  2L,
  3L,
  4L,
  5L,
  6L,
  7L,
  8L,
  9L,
  10L,
  11L,
  12L,
  13L,
  14L,
  15L,
  16L,
  17L,
  18L,
  19L,
  20L,
  21L,
  22L,
  23L,
  24L,
  25L,
  26L,
  27L,
  28L,
  29L,
  30L,
  31L,
  32L,
  33L,
  34L,
  35L,
  36L,
  37L,
  38L,
  39L,
  40L,
  41L,
  42L,
  43L,
  44L,
  45L,
  46L,
  47L,
  48L,
  49L],
 'versicolor': [50L,
  51L,
  52L,
  53L,
  54L,
  55L,
  56L,
  57L,
  58L,
  59L,
  60L,
  61L,
  62L,
  63L,
  64L,
  65L,
  66L,
  67L,
  68L,
  69L,
  70L,
  71L,
  72L,
  73L,
  74L,
  75L,
  76L,
  77L,
  78L,
  79L,
  80L,
  81L,
  82L,
  83L,
  84L,
  85L,
  86L,
  87L,
  88L,
  89L,
  90L,
  91L,
  92L,
  93L,
  94L,
  95L,
  96L,
  97L,
  98L,
  99L],
 'virginica': [100L,
  101L,
  102L,
  103L,
  104L,
  105L,
  106L,
  107L,
  108L,
  109L,
  110L,
  111L,
  112L,
  113L,
  114L,
  115L,
  116L,
  117L,
  118L,
  119L,
  120L,
  121L,
  122L,
  123L,
  124L,
  125L,
  126L,
  127L,
  128L,
  129L,
  130L,
  131L,
  132L,
  133L,

In [28]:
# Pull out the rows belonging to a single group by its label
irisgroup.get_group('setosa')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


## Apply - Combine Operations

### Aggregation Techniques:  summary statistics

In [15]:
# Selecting one column from the grouped frame gives a SeriesGroupBy object
irisgroup['sepal_length']

<pandas.core.groupby.SeriesGroupBy object at 0x0000000007DB9400>

In [20]:
# Average sepal length per species
irisgroup['sepal_length'].mean()

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [21]:
# Select several columns with a list (double brackets). The bare-tuple form
# irisgroup['a', 'b'] is deprecated and rejected by newer pandas releases.
irisgroup[['sepal_length', 'petal_width']].mean()

Unnamed: 0_level_0,sepal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1
setosa,5.006,0.244
versicolor,5.936,1.326
virginica,6.588,2.026


In [22]:
# Let's transpose this so the species become the columns!
# Columns are selected with a list; the bare-tuple form irisgroup['a', 'b']
# is deprecated and rejected by newer pandas releases.

irisgroup[['sepal_length', 'petal_width']].mean().T

species,setosa,versicolor,virginica
sepal_length,5.006,5.936,6.588
petal_width,0.244,1.326,2.026


In [31]:
# aggregate() reduces every non-key column of each group with the given function
irisgroup.aggregate(np.mean)

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.418,1.464,0.244
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [32]:
# agg is the short spelling of aggregate; here it totals each column per species
irisgroup.agg(np.sum)

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,250.3,170.9,73.2,12.2
versicolor,296.8,138.5,213.0,66.3
virginica,329.4,148.7,277.6,101.3


In [33]:
# A list of functions yields one sub-column per function under every original column
irisgroup.agg([np.sum, np.mean])

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_width,sepal_width,petal_length,petal_length,petal_width,petal_width
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean,sum,mean
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
setosa,250.3,5.006,170.9,3.418,73.2,1.464,12.2,0.244
versicolor,296.8,5.936,138.5,2.77,213.0,4.26,66.3,1.326
virginica,329.4,6.588,148.7,2.974,277.6,5.552,101.3,2.026


In [34]:
# The resulting columns form a two-level MultiIndex: (measurement, statistic)
irisgroup.agg([np.sum, np.mean]).columns

MultiIndex(levels=[[u'sepal_length', u'sepal_width', u'petal_length', u'petal_width'], [u'sum', u'mean']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]])

In [35]:
# Unstacking the species row level turns the table into one long Series
# keyed by (measurement, statistic, species)
irisgroup.agg([np.sum, np.mean]).unstack()

                    species   
sepal_length  sum   setosa        250.300
                    versicolor    296.800
                    virginica     329.400
              mean  setosa          5.006
                    versicolor      5.936
                    virginica       6.588
sepal_width   sum   setosa        170.900
                    versicolor    138.500
                    virginica     148.700
              mean  setosa          3.418
                    versicolor      2.770
                    virginica       2.974
petal_length  sum   setosa         73.200
                    versicolor    213.000
                    virginica     277.600
              mean  setosa          1.464
                    versicolor      4.260
                    virginica       5.552
petal_width   sum   setosa         12.200
                    versicolor     66.300
                    virginica     101.300
              mean  setosa          0.244
                    versicolor      1.326
   

In [36]:
# Three statistics per column; sub-column labels are taken from the function
# names (np.min / np.max appear as amin / amax in the recorded output below)
irisgroup.agg([np.min, np.max, np.mean])

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width
Unnamed: 0_level_1,amin,amax,mean,amin,amax,mean,amin,amax,mean,amin,amax,mean
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
setosa,4.3,5.8,5.006,2.3,4.4,3.418,1.0,1.9,1.464,0.1,0.6,0.244
versicolor,4.9,7.0,5.936,2.0,3.4,2.77,3.0,5.1,4.26,1.0,1.8,1.326
virginica,4.9,7.9,6.588,2.2,3.8,2.974,4.5,6.9,5.552,1.4,2.5,2.026


In [38]:
# Give each statistic a custom column name by pairing a label with a function.
# A list of (label, function) tuples replaces the {label: function} dict: the
# dict form on a single column is deprecated and later rejected by pandas, and
# a list also pins the column order (chosen here to match the table below).
irisgroup['sepal_length'].agg([
        ('minimum', np.min),
        ('maximum', np.max),
        ('mean', np.mean),
    ])

Unnamed: 0_level_0,minimum,maximum,mean
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,4.3,5.8,5.006
versicolor,4.9,7.0,5.936
virginica,4.9,7.9,6.588


In [39]:
# Same idea with a custom reducer added: 'maxrange' is the spread (max - min)
# of sepal_length within each species.
# A list of (label, function) tuples replaces the {label: function} dict: the
# dict form on a single column is deprecated and later rejected by pandas, and
# a list also pins the column order (chosen here to match the table below).
irisgroup['sepal_length'].agg([
        ('maxrange', lambda x: np.max(x) - np.min(x)),
        ('minimum', np.min),
        ('maximum', np.max),
        ('mean', np.mean),
    ])

Unnamed: 0_level_0,maxrange,minimum,maximum,mean
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,1.5,4.3,5.8,5.006
versicolor,2.1,4.9,7.0,5.936
virginica,3.0,4.9,7.9,6.588


## Apply - Combine Operations

### Transformation Techniques
http://pandas.pydata.org/pandas-docs/stable/groupby.html#transformation

Important points to remember from the documentation:

* The transform method returns an object that is indexed the same (same size) as the one being grouped
* Thus, the passed transform function should return a result that is the same size as the group chunk



In [56]:
def zscore(x):
    """Standardise x: subtract its mean, then divide by its standard deviation."""
    centred = x - x.mean()
    return centred / x.std()

# Apply zscore within each species group; the result keeps the original row index
irisgroup.transform(zscore)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.266674,0.215209,-0.368852,-0.410411
1,-0.300718,-1.097043,-0.368852,-0.410411
2,-0.868111,-0.572142,-0.945184,-0.410411
3,-1.151807,-0.834592,0.207479,-0.410411
4,-0.017022,0.477660,-0.368852,-0.410411
5,1.117763,1.265011,1.360143,1.455095
6,-1.151807,-0.047241,-0.368852,0.522342
7,-0.017022,-0.047241,0.207479,-0.410411
8,-1.719199,-1.359493,-0.368852,-0.410411
9,-0.300718,-0.834592,0.207479,-1.343165


## Apply - Combine Operations

### Filter Techniques

http://pandas.pydata.org/pandas-docs/stable/groupby.html#filtration

Important points to remember from the documentation:

* The filter method returns a subset of the original object
* The argument of filter must be a function that, applied to the group as a whole, returns True or False
* For dataframes with multiple columns, filters should explicitly specify a column as the filter criterion

In [41]:
# Per-species summary statistics (count, mean, std, min, quartiles, max) for each column
irisgroup.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
setosa,count,50.0,50.0,50.0,50.0
setosa,mean,5.006,3.418,1.464,0.244
setosa,std,0.35249,0.381024,0.173511,0.10721
setosa,min,4.3,2.3,1.0,0.1
setosa,25%,4.8,3.125,1.4,0.2
setosa,50%,5.0,3.4,1.5,0.2
setosa,75%,5.2,3.675,1.575,0.3
setosa,max,5.8,4.4,1.9,0.6
versicolor,count,50.0,50.0,50.0,50.0
versicolor,mean,5.936,2.77,4.26,1.326


In [51]:
def maxfilter(x):
    """Return True when the group's largest sepal_length is greater than 6."""
    longest_sepal = np.max(x['sepal_length'])
    return bool(longest_sepal > 6)
    
# Keep only the rows of groups for which maxfilter returns True
filtered = irisgroup.filter(maxfilter)

# Preview the first five surviving rows
filtered.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [49]:
# filter() hands back an ordinary DataFrame rather than a GroupBy object
type(filtered)

pandas.core.frame.DataFrame

In [50]:
# Re-group the filtered frame to see which species survived the filter
filtered.groupby('species').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,petal_length,petal_width,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
versicolor,count,50.0,50.0,50.0,50.0
versicolor,mean,4.26,1.326,5.936,2.77
versicolor,std,0.469911,0.197753,0.516171,0.313798
versicolor,min,3.0,1.0,4.9,2.0
versicolor,25%,4.0,1.2,5.6,2.525
versicolor,50%,4.35,1.3,5.9,2.8
versicolor,75%,4.6,1.5,6.3,3.0
versicolor,max,5.1,1.8,7.0,3.4
virginica,count,50.0,50.0,50.0,50.0
virginica,mean,5.552,2.026,6.588,2.974
