# Notes from PB Python's group-aggregate

[Comprehensive Guide to Grouping and Aggregating with Pandas](https://pbpython.com/groupby-agg.html)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mode, skew

# Titanic dataset

In [2]:
# Load the Titanic passenger table via seaborn and preview its first rows.
df = sns.load_dataset('titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Aggregation

>"Aggregation: takes multiple individual values and returns a summary"

In [3]:
# One column (a Series) reduced by several functions in a single call;
# the result is a Series indexed by the function names.
df.loc[:, 'fare'].aggregate(['sum', 'mean'])

sum     28693.949300
mean       32.204208
Name: fare, dtype: float64

In [4]:
# Several columns at once: every listed function is applied to each
# selected column, giving one row per function.
df.loc[:, ['fare', 'age']].aggregate(['sum', 'mean'])

Unnamed: 0,fare,age
sum,28693.9493,21205.17
mean,32.204208,29.699118


**Named aggregations**

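You can apply a different set of agg functions to different columns. There are three ways of doing this:
- List: what we did above (the same functions are applied to every selected column)
- Dict (recommended): map each column name to its own list of functions
- Tuple: one agg per tuple; assign a name to the output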

In [5]:
# Dict form: each key is a column, each value is the list of functions to
# run on that column. Function/column pairs that were not requested show
# up as NaN in the result.
df.aggregate({
    'fare': ['sum', 'mean'],
    'sex': ['count'],
})

Unnamed: 0,fare,sex
sum,28693.9493,
mean,32.204208,
count,,891.0


In [6]:
# Tuple form (named aggregation): one aggregation per (column, function)
# tuple, with the keyword giving the name of that output.
# The call below is intentionally left commented out as a sketch:
# df.agg(x=('fare', max), y=('fare', 'min'), z=('C', np.mean))
# NOTE(review): the Titanic frame previewed earlier has no 'C' column, so
# 'C' must be replaced with a real column (e.g. 'age') before uncommenting.

# Groupby

### Basic math

In [7]:
# Total and average fare for each port of embarkation, rounded to 2 d.p.
agg_func_math = {
    'fare': ['sum', 'mean'],
}
(
    df
    .groupby(by=['embark_town'])
    .aggregate(agg_func_math)
    .round(decimals=2)
)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,sum,mean
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,10072.3,59.95
Queenstown,1022.25,13.28
Southampton,17439.4,27.08


In [8]:
# 'describe' can be used as an aggregation too: it expands into the full
# set of summary statistics for fare within each port.
# Note: this rebinds agg_func_math from the previous cell.
agg_func_math = {
    'fare': ['describe'],
}
(
    df
    .groupby(by=['embark_town'])
    .aggregate(agg_func_math)
)

Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max
embark_town,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Cherbourg,168.0,59.954144,83.912994,4.0125,13.69795,29.7,78.500025,512.3292
Queenstown,77.0,13.27603,14.188047,6.75,7.75,7.75,15.5,90.0
Southampton,644.0,27.079812,35.887993,0.0,8.05,13.0,27.9,263.0


### Counting

In [9]:
# Quick reminder of the columns available before the counting examples.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [11]:
# Three ways to count a column:
#   count   -> non-missing values only
#   nunique -> number of distinct values
#   size    -> every row, missing or not
agg_func_count = {
    'embark_town': ['count', 'nunique', 'size'],
}
# Grouped variant, kept for reference:
# df.groupby(by=['deck']).agg(agg_func_count)
df.aggregate(agg_func_count)

Unnamed: 0,embark_town
count,889
nunique,3
size,891


### First & Last

In [12]:
# Sort fares from highest to lowest first, so that within each port
# 'first' picks the highest fare and 'last' picks the lowest one.
agg_func_dict = {
    'fare': ['first', 'last'],
}
(
    df
    .sort_values(by='fare', ascending=False)
    .groupby(by=['embark_town'])
    .aggregate(agg_func_dict)
)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,first,last
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,512.3292,4.0125
Queenstown,90.0,6.75
Southampton,263.0,0.0


In [13]:
# idxmin / idxmax return the row label (index value) of the lowest and
# highest fare in each group rather than the fare itself.
# Note: this rebinds agg_func_dict from the previous cell.
agg_func_dict = {
    'fare': ['idxmin', 'idxmax'],
}
(
    df
    .groupby(by=['embark_town'])
    .aggregate(agg_func_dict)
)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,idxmin,idxmax
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,378,258
Queenstown,143,245
Southampton,179,27


In [14]:
# Look up the two Cherbourg row labels reported by idxmin / idxmax above.
df.loc[[378, 258], :]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True


In [15]:
# Full rows holding the highest fare within each passenger class:
# idxmax yields one row label per class, and .loc fetches those rows.
df.loc[
    df.groupby('class')['fare'].idxmax()
]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
72,0,2,male,21.0,0,0,73.5,S,Second,man,True,,Southampton,no,True
159,0,3,male,,8,2,69.55,S,Third,man,True,,Southampton,no,False


### Other libraries

In [16]:
# Functions from other libraries can be passed to .agg() as callables.
# skew and mode are scipy.stats functions, imported with the notebook's other
# dependencies in the first code cell rather than here, so that later cells
# that also use `mode` do not silently depend on this cell having run.
#
# Both scipy's mode and pd.Series.mode are named "mode", so the result has
# two columns with that label: the first (scipy) shows the modal value with
# its count, the second (pandas) shows the modal value only.
agg_func_stats = {'fare': [skew, mode, pd.Series.mode]}

df.groupby(by='embark_town').agg(agg_func_stats)

Unnamed: 0_level_0,fare,fare,fare
Unnamed: 0_level_1,skew,mode,mode
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cherbourg,3.305112,"([7.2292], [15])",7.2292
Queenstown,4.265111,"([7.75], [30])",7.75
Southampton,3.640276,"([8.05], [43])",8.05


### Working with text

In [17]:
# Aggregations also work on non-numeric columns: number of distinct decks,
# the most common deck (scipy's mode, imported in an earlier cell), and the
# set of all deck values seen in each class (the builtin set keeps NaN too).
agg_func_text = {
    'deck': ['nunique', mode, set],
}
(
    df
    .groupby(by=['class'])
    .aggregate(agg_func_text)
)

Unnamed: 0_level_0,deck,deck,deck
Unnamed: 0_level_1,nunique,mode,set
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,5,"([C], [59])","{nan, D, C, A, E, B}"
Second,3,"([F], [8])","{nan, F, D, E}"
Third,3,"([F], [5])","{nan, F, G, E}"


# Custom functions

In [18]:
from functools import partial

In [20]:
# Option 1: freeze q=0.25 on Series.quantile with functools.partial.
q_25 = partial(pd.Series.quantile, q=.25)
# The __name__ set here is what appears as the column label in the
# aggregated output.
setattr(q_25, '__name__', '%25')

In [21]:
def percentile_25(x):
    """Return the 0.25 quantile (first quartile) of ``x``.

    Option 2 for a custom aggregation: a plain named function. ``x`` must
    provide a ``quantile`` method (here it is used with pandas Series).
    """
    first_quartile = x.quantile(0.25)
    return first_quartile

In [22]:
# Option 3: a lambda bound to a variable, then given an explicit __name__
# so that it gets a readable label instead of an auto-generated "<lambda>".
lambda_25 = lambda x: x.quantile(0.25)
setattr(lambda_25, '__name__', 'lambda_25%')

In [23]:
# Run all four 25th-percentile variants side by side. The column labels
# come from each callable's name; the anonymous inline lambda has none of
# its own and is labelled automatically (<lambda_0> in the output).
agg_func = {
    'fare': [
        q_25,
        percentile_25,
        lambda_25,
        lambda x: x.quantile(.25),
    ],
}
(
    df
    .groupby(by=['embark_town'])
    .aggregate(agg_func)
    .round(decimals=2)
)

Unnamed: 0_level_0,fare,fare,fare,fare
Unnamed: 0_level_1,%25,percentile_25,lambda_25%,<lambda_0>
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Cherbourg,13.7,13.7,13.7,13.7
Queenstown,7.75,7.75,7.75,7.75
Southampton,8.05,8.05,8.05,8.05
