In [1]:
import pandas as pd

In [2]:
# Load the Titanic passenger data; the relative path means 'titanic.csv' is
# looked up in the notebook's current working directory.
titanic = pd.read_csv('titanic.csv')

In [3]:
# Preview the first 20 rows of the freshly loaded data.
titanic.head(n=20)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,
8,1,3,female,27.0,0,2,11.1333,S,
9,1,2,female,14.0,1,0,30.0708,C,


In [4]:
# Show each column's dtype and non-null count to spot where data is missing.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


From the `info()` output above, the `age` column has only 714 non-null values out of 891 entries, so 177 ages are missing. The simplest way to handle this is to compute the mean age of all passengers and use it to fill the NaN values.

In [5]:
# Overall mean of the age column, computed over the non-missing entries
# (Series.mean skips NaN by default).
mean_age = titanic['age'].mean()
mean_age

29.69911764705882

In [6]:
# Preview what filling every missing age with the overall mean would look like.
# fillna returns a new Series here and nothing is assigned, so `titanic` itself
# is left unchanged.
titanic['age'].fillna(value=mean_age)

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

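A single overall mean is a coarse estimate, because it ignores differences between groups of passengers (note that the cell above only displays the filled values; `titanic` itself is unchanged). Instead, we can use `groupby` to compute the mean age for each combination of sex and passenger class, and fill each missing age with the mean of that passenger's own group, as shown below.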

In [7]:
# Mean age for every (sex, pclass) combination.
(
    titanic
    .groupby(['sex', 'pclass'])['age']
    .mean()
)

sex     pclass
female  1         34.611765
        2         28.722973
        3         21.750000
male    1         41.281386
        2         30.740707
        3         26.507589
Name: age, dtype: float64

In [9]:
# Broadcast each (sex, pclass) group's mean age back onto the rows of that
# group, so the new column lines up row-for-row with `age`.
titanic['group_mean_age'] = (
    titanic
    .groupby(['sex', 'pclass'])['age']
    .transform('mean')
)

In [10]:
# Preview again to see the `group_mean_age` column next to `age`.
titanic.head(n=20)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,group_mean_age
0,0,3,male,22.0,1,0,7.25,S,,26.507589
1,1,1,female,38.0,1,0,71.2833,C,C,34.611765
2,1,3,female,26.0,0,0,7.925,S,,21.75
3,1,1,female,35.0,1,0,53.1,S,C,34.611765
4,0,3,male,35.0,0,0,8.05,S,,26.507589
5,0,3,male,,0,0,8.4583,Q,,26.507589
6,0,1,male,54.0,0,0,51.8625,S,E,41.281386
7,0,3,male,2.0,3,1,21.075,S,,26.507589
8,1,3,female,27.0,0,2,11.1333,S,,21.75
9,1,2,female,14.0,1,0,30.0708,C,,28.722973


In [11]:
# Fill each missing age with the mean age of that passenger's (sex, pclass)
# group; rows that already have an age are left as they are.
#
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on `titanic.age`: an in-place method on a column
# selected from the frame is chained assignment, which recent pandas versions
# warn about and which does not update `titanic` when Copy-on-Write is enabled.
titanic['age'] = titanic['age'].fillna(titanic['group_mean_age'])

In [12]:
# Preview once more: rows whose age was NaN should now show their group mean.
titanic.head(n=20)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,group_mean_age
0,0,3,male,22.0,1,0,7.25,S,,26.507589
1,1,1,female,38.0,1,0,71.2833,C,C,34.611765
2,1,3,female,26.0,0,0,7.925,S,,21.75
3,1,1,female,35.0,1,0,53.1,S,C,34.611765
4,0,3,male,35.0,0,0,8.05,S,,26.507589
5,0,3,male,26.507589,0,0,8.4583,Q,,26.507589
6,0,1,male,54.0,0,0,51.8625,S,E,41.281386
7,0,3,male,2.0,3,1,21.075,S,,26.507589
8,1,3,female,27.0,0,2,11.1333,S,,21.75
9,1,2,female,14.0,1,0,30.0708,C,,28.722973


In [13]:
# Re-check the non-null counts to verify that `age` no longer has missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   survived        891 non-null    int64  
 1   pclass          891 non-null    int64  
 2   sex             891 non-null    object 
 3   age             891 non-null    float64
 4   sibsp           891 non-null    int64  
 5   parch           891 non-null    int64  
 6   fare            891 non-null    float64
 7   embarked        889 non-null    object 
 8   deck            203 non-null    object 
 9   group_mean_age  891 non-null    float64
dtypes: float64(3), int64(4), object(3)
memory usage: 69.7+ KB
