In [1]:
import pandas as pd

In [2]:
# Read only the five columns this walkthrough uses.
titanic = pd.read_csv(
    'titanic.csv',
    usecols=['survived', 'pclass', 'sex', 'age', 'fare'],
)

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [4]:
# Average of every remaining column within each sex group.
titanic.groupby(by='sex').mean()

Unnamed: 0_level_0,survived,pclass,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.742038,2.159236,27.915709,44.479818
male,0.188908,2.389948,30.726645,25.523893


Converting the groupby object with the built-in `list` function gives a list of (group name, DataFrame) pairs. We can use that to extract just the female group and store it in a variable.

In [5]:
# list() turns the groupby object into (group name, DataFrame) pairs.
# Look the female frame up by its name rather than by position, so the
# result does not depend on 'female' happening to sort first.
female_group = dict(list(titanic.groupby('sex')))['female']
female_group

Unnamed: 0,survived,pclass,sex,age,fare
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
8,1,3,female,27.0,11.1333
9,1,2,female,14.0,30.0708
...,...,...,...,...,...
880,1,2,female,25.0,26.0000
882,0,3,female,22.0,10.5167
885,0,3,female,39.0,29.1250
887,1,1,female,19.0,30.0000


In [6]:
# female_group still contains the string-valued `sex` column, so restrict the
# mean to numeric columns; without this pandas warns (and newer versions raise).
female_group.mean(numeric_only=True)

  female_group.mean()


survived     0.742038
pclass       2.159236
age         27.915709
fare        44.479818
dtype: float64

Alternatively, we can write a user-defined function that calculates the mean of a group, and then pass that function to the groupby object's `.apply()` method, which calls it once for each group, as seen below.

In [7]:
def group_mean(group):
    """Return the column-wise mean of the numeric columns of a group.

    Parameters
    ----------
    group : pandas.DataFrame
        One group's rows (for example, a frame handed over by
        ``groupby(...).apply``). It may contain non-numeric columns.

    Returns
    -------
    pandas.Series
        Mean of each numeric column, indexed by column name.
    """
    # numeric_only=True skips string columns such as `sex`; averaging them
    # triggers a warning in older pandas and a TypeError in pandas >= 2.0.
    return group.mean(numeric_only=True)

In [8]:
# Sanity check: call the helper directly on the female-only frame.
group_mean(group=female_group)

  return group.mean()


survived     0.742038
pclass       2.159236
age         27.915709
fare        44.479818
dtype: float64

In [9]:
# Run the helper once per sex group and combine the results into one table.
titanic.groupby(by='sex').apply(group_mean)

  return group.mean()


Unnamed: 0_level_0,survived,pclass,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.742038,2.159236,27.915709,44.479818
male,0.188908,2.389948,30.726645,25.523893


That is a long way around to get the mean, which is already available as a built-in method, but the same pattern can be used to do much more complicated things with more specific user-defined functions.

In [10]:
# The five oldest passengers in the whole dataset.
titanic.nlargest(n=5, columns='age')

Unnamed: 0,survived,pclass,sex,age,fare
630,1,1,male,80.0,30.0
851,0,3,male,74.0,7.775
96,0,1,male,71.0,34.6542
493,0,1,male,71.0,49.5042
116,0,3,male,70.5,7.75


In [11]:
# Demonstration: a DataFrameGroupBy object has no .nlargest() method.
# Catch the expected error and show its message so that
# "Restart Kernel -> Run All" can continue past this cell.
try:
    titanic.groupby('sex').nlargest(5, 'age')
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'DataFrameGroupBy' object has no attribute 'nlargest'

As we can see above, the `.nlargest()` method lets us identify the five oldest passengers, but it is not available on a groupby object. To get the five oldest passengers within each group, we need to define a function that we can use with `.apply()`.

In [12]:
def five_oldest_surv(group):
    """Return the five oldest surviving passengers in ``group``.

    Keeps only the rows whose ``survived`` value equals 1, then returns
    the (up to) five rows with the largest ``age``, oldest first.
    """
    survivors = group.loc[group['survived'] == 1]
    return survivors.nlargest(n=5, columns='age')

In [13]:
# Apply the helper to each sex group; the result is indexed by group name
# and then by the original row label.
titanic.groupby(by='sex').apply(five_oldest_surv)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,pclass,sex,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,275,1,1,female,63.0,77.9583
female,483,1,3,female,63.0,9.5875
female,829,1,1,female,62.0,80.0
female,366,1,1,female,60.0,75.25
female,11,1,1,female,58.0,26.55
male,630,1,1,male,80.0,30.0
male,570,1,2,male,62.0,10.5
male,587,1,1,male,60.0,79.2
male,647,1,1,male,56.0,35.5
male,449,1,1,male,52.0,30.5
