# Aggregation and Grouping

We'll use the planets dataset here.

In [2]:
import seaborn as sns
# Load the 'planets' example dataset through seaborn, then check its
# (rows, columns) shape.
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [3]:
# Preview the first rows to see which columns are available.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


### 1. Simple Aggregation in Pandas

In [4]:
import numpy as np
import pandas as pd

# Seeded generator so the example values are the same on every run.
rng = np.random.RandomState(seed=42)
ser = pd.Series(data=rng.rand(5))
print(ser)

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64


In [5]:
# Total and average of the Series, one value per line.
print(ser.sum(), ser.mean(), sep='\n')

2.811925491708157
0.5623850983416314


In [11]:
# Two columns of five random draws each; 'A' is drawn before 'B'.
df = pd.DataFrame(data={'A': rng.rand(5), 'B': rng.rand(5)})
print(df)
# Aggregating along the index gives one mean per column ...
print(df.mean(axis='index'))
# ... while aggregating along the columns gives one mean per row.
print(df.mean(axis='columns'))

          A         B
0  0.662522  0.969585
1  0.311711  0.775133
2  0.520068  0.939499
3  0.546710  0.894827
4  0.184854  0.597900
A    0.445173
B    0.835389
dtype: float64
0    0.816053
1    0.543422
2    0.729783
3    0.720769
4    0.391377
dtype: float64


In [9]:
# Common aggregates: drop the rows that contain missing values, then
# summarize each remaining numeric column (count, mean, std, min,
# quartiles, max) in a single call.
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


### 2. GroupBy : Split, Apply, Combine

##### -Split
Breaking up and grouping a DF depending on the value of the specified key
##### -Apply
Computing some function, usually an aggregate, transformation, or filtering, within the individual groups
##### -Combine
Merges the results of these operations into an output array

It's important to realize that the *intermediate* splits do not need to be explicitly instantiated.
<br> The user need not think about how the computation is done under the hood, but rather think about the operation as a whole.

In [12]:
# Small example frame: a repeating group label plus one integer column.
df = pd.DataFrame(
    data={
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data': range(6),
    },
    columns=['key', 'data'],
)
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [14]:
# The most basic split-apply-combine operation uses the groupby() method
# of a DataFrame, passing the name of the desired key column.
df.groupby('key')

# What is returned is not a set of DataFrames, but a DataFrameGroupBy object.

<pandas.core.groupby.DataFrameGroupBy object at 0x7f3be043eb38>

In [15]:
# Applying an aggregate to the GroupBy object yields one row per key.
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


##### - the GroupBy Object

In [17]:
# Column indexing: selecting a column from a DataFrameGroupBy
# gives back a SeriesGroupBy.
by_method = planets.groupby('method')
print(by_method)
print(by_method['orbital_period'])

<pandas.core.groupby.DataFrameGroupBy object at 0x7f3be03700b8>
<pandas.core.groupby.SeriesGroupBy object at 0x7f3be0370278>


In [25]:
# Iteration over groups:
# each step yields the group key together with that group's rows
# (a Series or DataFrame).
for method_name, method_rows in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method_name, method_rows.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [30]:
# Dispatch methods: describe() is called on the grouped 'year' column, and
# unstack() reshapes the per-method statistics so the printed result is
# indexed by (statistic, method).
print(planets.groupby('method')['year'].describe().unstack())

       method                       
count  Astrometry                          2.000000
       Eclipse Timing Variations           9.000000
       Imaging                            38.000000
       Microlensing                       23.000000
       Orbital Brightness Modulation       3.000000
       Pulsar Timing                       5.000000
       Pulsation Timing Variations         1.000000
       Radial Velocity                   553.000000
       Transit                           397.000000
       Transit Timing Variations           4.000000
mean   Astrometry                       2011.500000
       Eclipse Timing Variations        2010.000000
       Imaging                          2009.131579
       Microlensing                     2009.782609
       Orbital Brightness Modulation    2011.666667
       Pulsar Timing                    1998.400000
       Pulsation Timing Variations      2007.000000
       Radial Velocity                  2007.518987
       Transit             

### 3. Other Methods

In [32]:
# We'll use this DataFrame: a group label, a running counter, and six
# seeded random integers drawn from 0 to 9.
rng = np.random.RandomState(seed=0)
df = pd.DataFrame(
    data={
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data1': range(6),
        'data2': rng.randint(low=0, high=10, size=6),
    },
    columns=['key', 'data1', 'data2'],
)
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


##### - Aggregate
Aggregate function allows for even more flexibility.
<br>It can take a string, a function, or a list thereof, and compute them at once.

In [33]:
# aggregate() accepts a string name ('min'), a function (np.median, the
# builtin max), or a list mixing them; every listed operation is computed
# for each data column of each group.
df.groupby('key').aggregate(['min', np.median, max])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [34]:
# A dict maps column name -> operation, so each column gets its own aggregate.
df.groupby('key').aggregate({'data1' : 'min', 'data2' : 'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


##### - Filter
**The filter function should return a Boolean value specifying whether the group passes the filtering.**

In [36]:
def filter_func(x):
    """Tell whether the standard deviation of the group's ``data2`` column exceeds 4."""
    spread = x['data2'].std()
    return spread > 4
# Show the data next to the per-group standard deviations.
print(df)
print(df.groupby('key').std())

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641


In [38]:
# filter() keeps the rows of every group for which filter_func returns True.
print(df.groupby('key').filter(filter_func))

  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9


##### - Transform
While aggregation must return a reduced version of the data, transformation can return some transformed version of the full data to recombine.

In [72]:
# Center the data by subtracting the group-wise mean:
# each value becomes (value - mean of its own group), column by column.
df.groupby('key').transform(lambda x: x - x.mean())

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


##### - Apply
Apply an arbitrary function to the group results.
The function should take a DataFrame, and return either a Pandas object or a scalar.

In [95]:
# normalize the first column by the sum of the second
def norm_by_data2(x):
    """Return a copy of group ``x`` with ``data1`` divided by the sum of ``data2``.

    Parameters
    ----------
    x : DataFrame
        The rows of one group; must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    DataFrame
        A new frame with the same rows and columns as ``x`` in which
        ``data1`` has been divided by ``x['data2'].sum()``.
    """
    # assign() builds a new frame, so the frame handed in by the caller is
    # never modified in place.
    return x.assign(data1=x['data1'] / x['data2'].sum())

print(df)
# group_keys=False keeps the original row labels on the result; recent
# pandas versions otherwise prepend the group key to the index of what
# apply() returns.
print(df.groupby('key', group_keys=False).apply(norm_by_data2))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
  key     data1  data2
0   A  0.000000      5
1   B  0.142857      0
2   C  0.166667      3
3   A  0.375000      3
4   B  0.571429      7
5   C  0.416667      9


### 4. Specifying the split key

##### - A list, array, series, or index providing the grouping keys

In [96]:
# One label per row: rows that share a label are summed together.
L = [0, 1, 0, 1, 2, 0]
print(df)
# numeric_only=True leaves the string column 'key' out of the sum; recent
# pandas versions no longer drop non-numeric columns automatically.
print(df.groupby(L).sum(numeric_only=True))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   data1  data2
0      7     17
1      4      3
2      4      7


In [97]:
# Group by the 'key' column passed as a Series rather than by its name.
print(df)
print(df.groupby(df['key']).sum())

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
     data1  data2
key              
A        3      8
B        5      7
C        7     12


##### - A dictionary or series mapping index to group

In [98]:
# Move the labels into the index so a dict can map index values to groups.
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2)
print(df2.groupby(mapping).sum())

     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
           data1  data2
consonant     12     19
vowel          3      8


##### - Any Python function

In [100]:
# Any of the preceding key choices can be combined to group on a multi-index.
# Here str.lower is applied to each index label and paired with the
# label -> group dict, giving (lowercase label, mapped group) keys.

df2.groupby([str.lower, mapping]).mean()

Unnamed: 0,Unnamed: 1,data1,data2
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


##### - Grouping Example

In [104]:
# Reminder of the columns available before building the grouped summary.
planets.head(5)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [101]:
# Put all we learned together: total 'number' per method and decade.

# Floor each year to the start of its decade and label it, e.g. 2006 -> '2000s'.
decade = ((10 * (planets['year'] // 10)).astype(str) + 's').rename('decade')

# Sum within each (method, decade) pair, pivot the decades into columns,
# and show 0 where a method has no entry for a decade.
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


In [111]:
# What if we don't unstack?
# The result stays a Series indexed by (method, decade) pairs, and only the
# combinations that occur in the data are listed.
planets.groupby(['method', decade])['number'].sum().fillna(0)

method                         year 
Astrometry                     2010s      2
Eclipse Timing Variations      2000s      5
                               2010s     10
Imaging                        2000s     29
                               2010s     21
Microlensing                   2000s     12
                               2010s     15
Orbital Brightness Modulation  2010s      5
Pulsar Timing                  1990s      9
                               2000s      1
                               2010s      1
Pulsation Timing Variations    2000s      1
Radial Velocity                1980s      1
                               1990s     52
                               2000s    475
                               2010s    424
Transit                        2000s     64
                               2010s    712
Transit Timing Variations      2010s      9
Name: number, dtype: int64