# Notes from PB Python's group-aggregate

[Comprehensive Guide to Grouping and Aggregating with Pandas](https://pbpython.com/groupby-agg.html)

In [23]:
import numpy as np
import pandas as pd
import seaborn as sns

# Titanic dataset

In [4]:
# Load seaborn's Titanic sample dataset into `df` and preview the first five rows.
df = sns.load_dataset(name='titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Aggregation

>"Aggregation: takes multiple individual values and returns a summary"

In [10]:
# Several reducers on one column (a Series): the result is a Series indexed by reducer name.
df.loc[:, 'fare'].aggregate(['sum', 'mean'])

sum     28693.949300
mean       32.204208
Name: fare, dtype: float64

In [11]:
# The same reducers on several columns: one output row per reducer, one column per input column.
df.loc[:, ['fare', 'age']].aggregate(['sum', 'mean'])

Unnamed: 0,fare,age
sum,28693.9493,21205.17
mean,32.204208,29.699118


**Named aggregations**

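You can apply a different set of agg functions to each column. There are three ways of doing this:
- List: what we did above (the same functions are applied to every selected column)
- Dict (recommended): map each column name to the list of agg functions to apply to it
- Tuple: one agg per tuple; assign a name to each output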

In [20]:
# Dict form: each key is a column, each value lists the reducers to run on that column.
# Reducers not requested for a column show up as NaN in the result.
column_aggs = {
    'fare': ['sum', 'mean'],
    'sex': ['count'],
}
df.aggregate(column_aggs)

Unnamed: 0,fare,sex
count,,891.0
mean,32.204208,
sum,28693.9493,


In [27]:
# Tuple form (named aggregation): each keyword names an output row and maps it
# to a (column, reducer) pair.
#
# NOTE: DataFrame.agg only accepts keyword-only named aggregation from
# pandas 1.1 onward. On older versions this call fails with
# "TypeError: aggregate() missing 1 required positional argument: 'func'".
#
# Every tuple must reference a column that exists in `df`; the Titanic frame
# has no column 'C', so the third aggregate uses 'age'. Reducers are spelled as
# strings throughout so all three tuples follow the same convention.
df.agg(x=('fare', 'max'), y=('fare', 'min'), z=('age', 'mean'))

TypeError: aggregate() missing 1 required positional argument: 'func'

# Groupby

In [11]:
# Sum and mean of `fare` for each embarkation town, rounded to 2 decimals.
agg_func_math = {'fare': ['sum', 'mean']}
(
    df.groupby(by=['embark_town'])
    .agg(agg_func_math)
    .round(decimals=2)
)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,sum,mean
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,10072.3,59.95
Queenstown,1022.25,13.28
Southampton,17439.4,27.08


In [13]:
# 'describe' used as a reducer: it expands into the full set of summary
# statistics for `fare` within each embarkation town.
agg_func_describe = {'fare': ['describe']}
(
    df.groupby(by=['embark_town'])
    .agg(agg_func_describe)
    .round(decimals=2)
)

Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max
embark_town,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Cherbourg,168.0,59.95,83.91,4.01,13.7,29.7,78.5,512.33
Queenstown,77.0,13.28,14.19,6.75,7.75,7.75,15.5,90.0
Southampton,644.0,27.08,35.89,0.0,8.05,13.0,27.9,263.0


In [14]:
# Three counting reducers applied to `embark_town` within each deck.
# In the recorded output `count` and `size` differ for deck B (45 vs 47).
agg_func_count = {'embark_town': ['count', 'nunique', 'size']}
(
    df.groupby(by=['deck'])
    .agg(agg_func_count)
)

Unnamed: 0_level_0,embark_town,embark_town,embark_town
Unnamed: 0_level_1,count,nunique,size
deck,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,15,2,15
B,45,2,47
C,59,3,59
D,33,2,33
E,32,3,32
F,13,3,13
G,4,1,4
