In [1]:
import pandas as pd

In [2]:
# Read just the five listed columns from the CSV; any other columns in the
# file are skipped at parse time.
titanic = pd.read_csv(
    "titanic.csv",
    usecols=["survived", "pclass", "sex", "age", "fare"],
)

In [3]:
# Preview the first five rows (5 is also the default for head()).
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [4]:
# Mean of each remaining column, computed separately for every value of `sex`.
(
    titanic
    .groupby("sex")
    .mean()
)

Unnamed: 0_level_0,survived,pclass,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.742038,2.159236,27.915709,44.479818
male,0.188908,2.389948,30.726645,25.523893


In [5]:
# Column totals, computed separately for every value of `sex`.
(
    titanic
    .groupby("sex")
    .sum()
)

Unnamed: 0_level_0,survived,pclass,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,233,678,7286.0,13966.6628
male,109,1379,13919.17,14727.2865


In [6]:
# Passing a list to .agg() applies every listed function to every column,
# which yields two-level (column, function) column labels.
(
    titanic
    .groupby("sex")
    .agg(["mean", "sum", "min", "max"])
)

Unnamed: 0_level_0,survived,survived,survived,survived,pclass,pclass,pclass,pclass,age,age,age,age,fare,fare,fare,fare
Unnamed: 0_level_1,mean,sum,min,max,mean,sum,min,max,mean,sum,min,max,mean,sum,min,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
female,0.742038,233,0,1,2.159236,678,1,3,27.915709,7286.0,0.75,63.0,44.479818,13966.6628,6.75,512.3292
male,0.188908,109,0,1,2.389948,1379,1,3,30.726645,13919.17,0.42,80.0,25.523893,14727.2865,0.0,512.3292


As shown below, we can chain the `.agg()` method onto the `groupby` call and pass in a dictionary whose keys are the columns we want to aggregate and whose values are the aggregate function (or list of functions) to apply to each of those columns.

In [9]:
# A dict maps each column to the function (or list of functions) to apply
# to that column only.
(
    titanic
    .groupby("sex")
    .agg(
        {
            "survived": ["sum", "mean"],
            "pclass": "mean",
            "age": ["mean", "median"],
            "fare": "max",
        }
    )
)

Unnamed: 0_level_0,survived,survived,pclass,age,age,fare
Unnamed: 0_level_1,sum,mean,mean,mean,median,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,233,0.742038,2.159236,27.915709,27.0,512.3292
male,109,0.188908,2.389948,30.726645,29.0,512.3292


We can relabel the results of our aggregate functions within the `.agg()` method as follows. We pass each new column name as a keyword argument whose value is a tuple: the first element is the column we are aggregating, and the second is the aggregate function.

In [10]:
# Named aggregation: the keyword is the output column name, and the tuple is
# (source column, aggregate function).
(
    titanic
    .groupby("sex")
    .agg(survival_rate=("survived", "mean"))
)

Unnamed: 0_level_0,survival_rate
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


If we want to avoid the multi-indexed column results that we saw above with the dictionary example, we can use this relabeling technique for each aggregate we need, as shown below.

In [11]:
# Several named aggregates at once; each keyword becomes one flat
# (single-level) output column.
(
    titanic
    .groupby("sex")
    .agg(
        survived_total=("survived", "sum"),
        survival_rate=("survived", "mean"),
        mean_age=("age", "mean"),
    )
)

Unnamed: 0_level_0,survived_total,survival_rate,mean_age
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,233,0.742038,27.915709
male,109,0.188908,30.726645
