# Advanced Aggregate Functions

* aggregate
* filter
* transform
* apply

In [52]:
import pandas as pd
import numpy as np

In [53]:
# Seed a dedicated generator so that Restart & Run All rebuilds the same frame.
# The previous unseeded np.random.rand call produced new values on every run,
# so the tables recorded in this notebook could never be reproduced (and will
# differ from a fresh run until the notebook is re-executed).
SEED = 42
rng = np.random.default_rng(SEED)

# Six rows in three groups (two rows per group); both variables are uniform
# draws scaled to the range [0, 100).
df = pd.DataFrame(
    {
        'groups': ['A', 'B', 'C', 'A', 'B', 'C'],
        'variable1': rng.random(6) * 100,
        'variable2': rng.random(6) * 100,
    }
)

In [54]:
# Display the full six-row frame so the grouped results below can be checked by eye.
df

Unnamed: 0,groups,variable1,variable2
0,A,86.939861,98.791215
1,B,29.580588,81.170852
2,C,22.509174,81.424642
3,A,70.040478,22.088352
4,B,50.672852,47.466167
5,C,33.434801,23.994841


In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) for each group,
# transposed so that every group is a column and every statistic is a row.
(
    df
    .groupby('groups')
    .describe()
    .T
)

Unnamed: 0,groups,A,B,C
variable1,count,2.0,2.0,2.0
variable1,mean,78.49017,40.12672,27.971988
variable1,std,11.949668,14.914483,7.725585
variable1,min,70.040478,29.580588,22.509174
variable1,25%,74.265324,34.853654,25.240581
variable1,50%,78.49017,40.12672,27.971988
variable1,75%,82.715015,45.399786,30.703394
variable1,max,86.939861,50.672852,33.434801
variable2,count,2.0,2.0,2.0
variable2,mean,60.439784,64.31851,52.709742


### Aggregate

In [170]:
# Per-group std, median and max of every numeric column, transposed so the
# groups become columns. All three reducers are requested by name: mixing the
# np.median callable with string names was inconsistent, and pandas >= 2.1
# emits a FutureWarning for it because it plans to stop substituting its own
# groupby median for the NumPy callable. The row label stays 'median'.
df.groupby('groups').aggregate(['std', 'median', 'max']).T

Unnamed: 0,groups,A,B,C
variable1,std,11.949668,14.914483,7.725585
variable1,median,78.49017,40.12672,27.971988
variable1,max,86.939861,50.672852,33.434801
variable2,std,54.237114,23.832811,40.609002
variable2,median,60.439784,64.31851,52.709742
variable2,max,98.791215,81.170852,81.424642


In [59]:
# A dict maps each column to its own reducer: the per-group minimum of
# variable1 and the per-group median of variable2. 'median' is passed by name
# (like 'min') instead of the np.median callable, which pandas >= 2.1 flags
# with a FutureWarning when given to a groupby aggregation.
df.groupby('groups').aggregate({'variable1': 'min', 'variable2': 'median'})

Unnamed: 0_level_0,variable1,variable2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,70.040478,60.439784
B,29.580588,64.31851
C,22.509174,52.709742


### Filter

In [158]:
def filter_func(x, threshold=12):
    """Tell whether a group's ``variable1`` values are spread out enough to keep.

    Written as the predicate for ``DataFrameGroupBy.filter``, which calls it
    once per group and keeps the rows of the groups for which it returns True.
    Extra keyword arguments given to ``filter`` are forwarded, so a different
    cut-off can be used with ``.filter(filter_func, threshold=20)``.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group; must contain a ``variable1`` column.
    threshold : float, default 12
        Cut-off for the standard deviation. The default reproduces the value
        that was previously hard-coded.

    Returns
    -------
    bool
        True when the standard deviation of ``variable1`` is strictly greater
        than ``threshold``. A group whose standard deviation is NaN (for
        example a single-row group) compares as False and is dropped.
    """
    return x['variable1'].std() > threshold

In [159]:
# Standard deviation of each variable within each group; the variable1 column
# is the quantity that filter_func compares against its cut-off.
(
    df
    .groupby('groups')
    .std()
)

Unnamed: 0_level_0,variable1,variable2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,11.949668,54.237114
B,14.914483,23.832811
C,7.725585,40.609002


In [160]:
# Keep only the rows that belong to groups for which filter_func returns True;
# the result has the original row index, not one row per group.
(
    df
    .groupby('groups')
    .filter(filter_func)
)

Unnamed: 0,groups,variable1,variable2
1,B,29.580588,81.170852
4,B,50.672852,47.466167


### Transform

In [161]:
# Show the unmodified frame again as the starting point for the transform example.
df

Unnamed: 0,groups,variable1,variable2
0,A,86.939861,98.791215
1,B,29.580588,81.170852
2,C,22.509174,81.424642
3,A,70.040478,22.088352
4,B,50.672852,47.466167
5,C,33.434801,23.994841


In [162]:
# Numeric columns only, selected by label. The previous positional slice
# (iloc[:, 1:3]) relied on the column order of df and would silently pick up
# the wrong columns if that order ever changed.
df_a = df[['variable1', 'variable2']]

In [163]:
# Standardize every column: subtract the column mean and divide by the column
# standard deviation. The result keeps the shape and index of df_a.
# NOTE(review): this is applied to the whole frame, not per group - confirm
# that an ungrouped transform is what this section intends to demonstrate.
df_a.transform(
    lambda col: (col - col.mean()) / col.std()
)

Unnamed: 0,variable1,variable2
0,1.503719,1.217562
1,-0.761492,0.676278
2,-1.040754,0.684075
3,0.836335,-1.138689
4,0.071476,-0.359103
5,-0.609283,-1.080123


### Apply

In [164]:
# Display the numeric-only subset that the apply examples below operate on.
df_a

Unnamed: 0,variable1,variable2
0,86.939861,98.791215
1,29.580588,81.170852
2,22.509174,81.424642
3,70.040478,22.088352
4,50.672852,47.466167
5,33.434801,23.994841


In [165]:
# Column-wise total of each variable, with the reducer requested by name.
df_a.apply('sum')

variable1    293.177754
variable2    354.936070
dtype: float64

In [166]:
# Column-wise median, requested by name so it matches the string-based
# reducer used for 'std' in this section and dispatches to pandas' own median.
# Unlike np.median, pandas' median skips missing values instead of returning
# NaN for a column that contains one; for the NaN-free data here the result is
# the same.
df_a.apply('median')

variable1    42.053826
variable2    64.318510
dtype: float64

In [167]:
# Column-wise standard deviation; the string 'std' names the reducer to apply
# to each column of df_a.
df_a.apply('std')

variable1    25.321822
variable2    32.552930
dtype: float64