# Advanced Aggregate Functions

* aggregate
* filter
* transform
* apply

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Build a six-row demo frame: three group labels (each appearing twice) and
# two numeric columns of uniform random values scaled by 100.
# No random seed is set, so the numbers differ on every fresh kernel run.
df = pd.DataFrame({
    'groups': list('ABC') * 2,
    'variable1': np.random.rand(6) * 100,
    'variable2': np.random.rand(6) * 100,
})

In [4]:
# Show the freshly built demo frame.
df

Unnamed: 0,groups,variable1,variable2
0,A,69.957312,59.883921
1,B,30.574944,75.518725
2,C,11.47543,77.912175
3,A,3.48855,35.982094
4,B,77.45111,79.598955
5,C,25.861779,44.95861


In [5]:
# Summary statistics for every numeric column within each group; transposed
# so the groups become the columns of the displayed table.
df.groupby('groups').describe().transpose()

Unnamed: 0,groups,A,B,C
variable1,count,2.0,2.0,2.0
variable1,mean,36.722931,54.013027,18.668604
variable1,std,47.000512,33.146454,10.172685
variable1,min,3.48855,30.574944,11.47543
variable1,25%,20.10574,42.293986,15.072017
variable1,50%,36.722931,54.013027,18.668604
variable1,75%,53.340121,65.732068,22.265192
variable1,max,69.957312,77.45111,25.861779
variable2,count,2.0,2.0,2.0
variable2,mean,47.933007,77.55884,61.435393


### Aggregate

In [6]:
# Passing a list applies every listed aggregation to each numeric column.
# The string alias 'median' is used instead of the np.median callable:
# handing NumPy reducers to groupby aggregation is deprecated in recent
# pandas releases, and the alias yields the same result and row label.
df.groupby('groups').aggregate(['std', 'median', 'max']).T

Unnamed: 0,groups,A,B,C
variable1,std,47.000512,33.146454,10.172685
variable1,median,36.722931,54.013027,18.668604
variable1,max,69.957312,77.45111,25.861779
variable2,std,16.901144,2.885158,23.301689
variable2,median,47.933007,77.55884,61.435393
variable2,max,59.883921,79.598955,77.912175


In [7]:
# Passing a dict selects a different aggregation per column.
# 'median' is given as a string alias rather than the np.median callable,
# which recent pandas releases deprecate inside groupby aggregation.
df.groupby('groups').aggregate({'variable1': 'min', 'variable2': 'median'})

Unnamed: 0_level_0,variable1,variable2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3.48855,47.933007
B,30.574944,77.55884
C,11.47543,61.435393


### Filter

In [8]:
def filter_func(x, threshold=12):
    """Decide whether a group should be kept, based on the spread of ``variable1``.

    Intended as the predicate for ``groupby(...).filter``, which calls it once
    per group with that group's sub-frame.

    Parameters
    ----------
    x : pandas.DataFrame
        Data for a single group; must contain a ``'variable1'`` column.
    threshold : float, default 12
        Cut-off for the standard deviation. The default reproduces the
        original hard-coded value; another value can be supplied as a keyword,
        e.g. ``.filter(filter_func, threshold=20)``.

    Returns
    -------
    bool-like
        True when the standard deviation of ``variable1`` is strictly greater
        than ``threshold``.
    """
    return x['variable1'].std() > threshold

In [9]:
# Per-group standard deviations, shown so the variable1 row can be compared
# against the cut-off used inside filter_func.
df.groupby('groups').std().transpose()

groups,A,B,C
variable1,47.000512,33.146454,10.172685
variable2,16.901144,2.885158,23.301689


In [10]:
# Keep only the rows belonging to groups for which filter_func returns True;
# the original row index is preserved in the result.
df.groupby('groups').filter(filter_func)

Unnamed: 0,groups,variable1,variable2
0,A,69.957312,59.883921
1,B,30.574944,75.518725
3,A,3.48855,35.982094
4,B,77.45111,79.598955


### Transform

In [11]:
# Show the full frame again as a reference point for the transform examples.
df

Unnamed: 0,groups,variable1,variable2
0,A,69.957312,59.883921
1,B,30.574944,75.518725
2,C,11.47543,77.912175
3,A,3.48855,35.982094
4,B,77.45111,79.598955
5,C,25.861779,44.95861


In [12]:
# Keep just the two numeric columns (the label slice is inclusive on both
# ends), dropping the 'groups' key, then display the result.
df_a = df.loc[:, 'variable1':'variable2']
df_a

Unnamed: 0,variable1,variable2
0,69.957312,59.883921
1,30.574944,75.518725
2,11.47543,77.912175
3,3.48855,35.982094
4,77.45111,79.598955
5,25.861779,44.95861


In [13]:
# Standardise each column: subtract the column mean, then divide by the
# column standard deviation. The result has the same shape as df_a.
df_a.transform(
    lambda col: (col - col.mean()) / col.std()
)

Unnamed: 0,variable1,variable2
0,1.096897,-0.130873
1,-0.193026,0.712855
2,-0.818609,0.842016
3,-1.08021,-1.420728
4,1.342348,0.933043
5,-0.347401,-0.936313


### Apply

In [14]:
# Show the numeric-only frame that the apply examples operate on.
df_a

Unnamed: 0,variable1,variable2
0,69.957312,59.883921
1,30.574944,75.518725
2,11.47543,77.912175
3,3.48855,35.982094
4,77.45111,79.598955
5,25.861779,44.95861


In [15]:
# Reduce every column to its total; axis=0 (the default) hands each column
# to the function in turn.
df_a.apply(np.sum, axis=0)

variable1    218.809124
variable2    373.854481
dtype: float64

In [16]:
# Column-wise median, computed by passing a NumPy reducer to apply.
df_a.apply(np.median, axis=0)

variable1    28.218362
variable2    67.701323
dtype: float64

In [41]:
# apply also accepts the name of a DataFrame method as a string; here the
# column-wise standard deviation.
df_a.apply('std', axis=0)

variable1    30.530778
variable2    18.530631
dtype: float64

In [68]:
def my_func(value, threshold=25):
    """Report whether ``value`` is strictly greater than ``threshold``.

    Written to be used element-wise, e.g. ``series.apply(my_func)``, to build
    a boolean mask.

    Parameters
    ----------
    value : number
        The scalar to test.
    threshold : number, default 25
        Cut-off to compare against. The default reproduces the original
        hard-coded value.

    Returns
    -------
    bool
        True if ``value > threshold``, otherwise False.
    """
    # bool() replaces the redundant `True if ... else False` ternary while
    # still guaranteeing a plain Python bool is returned.
    return bool(value > threshold)

In [69]:
# Show df_a once more before filtering its rows with my_func.
df_a

Unnamed: 0,variable1,variable2
0,69.957312,59.883921
1,30.574944,75.518725
2,11.47543,77.912175
3,3.48855,35.982094
4,77.45111,79.598955
5,25.861779,44.95861


In [70]:
# Apply my_func to every variable1 value to build a boolean mask, then keep
# only the rows where the mask is True.
df_a.loc[df_a['variable1'].apply(my_func)]

Unnamed: 0,variable1,variable2
0,69.957312,59.883921
1,30.574944,75.518725
4,77.45111,79.598955
5,25.861779,44.95861
