# Group families
This notebook tries to infer the family relationships between passengers.

In [None]:
import pandas as pd
import dataset as ds

# Fetch both splits through the project's dataset helpers.
X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Build a single passenger table out of the two splits. The index is moved
# into a regular column for the outer merge and then restored as the index
# (presumably it is the passenger ID named by ds.ID_COLUMN_NAME -- TODO confirm).
train_passengers = X_dataset.reset_index()
test_passengers = X_testset.reset_index()
df = pd.merge(train_passengers, test_passengers, how='outer')
df = df.set_index(ds.ID_COLUMN_NAME)
df.shape

In [None]:
# Reference list of the passenger attribute names, kept only as a reminder
# while working through the notebook.
attributes = [
    'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]

Men's names contain last name, title and first name.

In [None]:
# Expected layout: "<LastName>, <Title> <FirstName>". The title is the first
# whitespace-free token after the comma; the rest of the string is the first name.
men_name_pattern = r'(?P<LastName>.+),\s(?P<Title>\S+)\s(?P<FirstName>.*?)$'

# Male passengers whose SibSp is non-zero.
male_with_sibsp_mask = (df['SibSp'] != 0) & (df['Sex'] == 'male')
men_with_sibsp_df = df.loc[male_with_sibsp_mask]

# One column per named group, joined back onto the passenger rows by index.
men_name_parts_df = men_with_sibsp_df['Name'].str.extract(men_name_pattern)
men_df = men_with_sibsp_df.join(men_name_parts_df)

print('{} men with SibSp'.format(len(men_df)))
men_df.head()

Married women's names are the same as their husbands' names (apart from the title), followed by their own (maiden) name between parentheses at the end.

In [None]:
# Same layout as the men's names, optionally followed by " (<ProperName>)".
women_name_pattern = r'(?P<LastName>.+),\s(?P<Title>\S+)\s(?P<FirstName>.*?)(\s\((?P<ProperName>.*)\))?$'

# Female passengers whose SibSp is non-zero.
female_with_sibsp_mask = (df['SibSp'] != 0) & (df['Sex'] == 'female')
women_with_sibsp_df = df.loc[female_with_sibsp_mask]

# The optional parenthesised wrapper is an unnamed capture group; it comes out
# of str.extract as column 3 and is discarded so only the named groups remain.
women_name_parts_df = women_with_sibsp_df['Name'].str.extract(women_name_pattern)
women_name_parts_df = women_name_parts_df.drop(columns=3)
women_df = women_with_sibsp_df.join(women_name_parts_df)

print('{} women with SibSp'.format(len(women_df)))
women_df.head()

Match male and female passengers with the same first and last names: most likely they are married.

In [None]:
# Pair men and women sharing both last and first name. The outer join also
# keeps the passengers that have no counterpart on the other side.
name_keys = ['LastName', 'FirstName']
pairs_df = pd.merge(
    men_df,
    women_df,
    how='outer',
    on=name_keys,
    suffixes=('_men', '_women'),
)

print(pairs_df.shape)

Drop any row that is missing either the man's name or the woman's name, i.e. keep only the complete pairs.

In [None]:
# A row is a real pair only when both the man's and the woman's name are present.
required_names = ['Name_men', 'Name_women']
pairs_df = pairs_df.dropna(axis='rows', how='any', subset=required_names)
print(pairs_df.shape)

These 92 pairs should definitely be married couples. One way to double-check is to verify that both partners have the same cabin.

In [None]:
# Only pairs where both cabins are known can be compared.
cabin_columns = ['Cabin_men', 'Cabin_women']
pairs_with_cabins_df = pairs_df.dropna(axis='rows', subset=cabin_columns, how='any')

# Show the pairs whose partners are registered in different cabins.
pairs_with_cabins_df.loc[
    pairs_with_cabins_df['Cabin_men'] != pairs_with_cabins_df['Cabin_women']
]

Fair enough: the only couple not sharing the cabin is the captain and his wife.
But many other couples have no Cabin data. We can check the port of embarkation or the class instead.

In [None]:
# Pairs for which the class is known on both sides.
pclass_columns = ['Pclass_men', 'Pclass_women']
pairs_with_pclass_df = pairs_df.dropna(axis='rows', subset=pclass_columns, how='any')
print(len(pairs_with_pclass_df))

# Show the pairs whose partners travel in different classes.
pairs_with_pclass_df.loc[
    pairs_with_pclass_df['Pclass_men'] != pairs_with_pclass_df['Pclass_women']
]

In [None]:
# Pairs for which the Embarked value is known on both sides.
embarked_columns = ['Embarked_men', 'Embarked_women']
pairs_with_embarked_df = pairs_df.dropna(axis='rows', subset=embarked_columns, how='any')
print(len(pairs_with_embarked_df))

# Show the pairs whose partners have different Embarked values.
pairs_with_embarked_df.loc[
    pairs_with_embarked_df['Embarked_men'] != pairs_with_embarked_df['Embarked_women']
]

All 92 pairs have at least the class and the port of embarkation in common. I think we can be sure enough now.

In [None]:
import numpy as np

# One square passenger-by-passenger matrix per relationship kind, all starting
# out filled with zeros.
num_passengers = len(df)
relation_shape = (num_passengers, num_passengers)
married_with, parent_of, sibling_of = (np.zeros(relation_shape) for _ in range(3))

# Group the passengers by last name

In [None]:
# The last name is everything up to the (last) comma of the full name.
# expand=False makes str.extract return a Series for this single-group
# pattern, so a plain column is assigned instead of relying on a one-column
# DataFrame being coerced into a column.
df['LastName'] = df['Name'].str.extract(r'(?P<LastName>.*),', expand=False)

# Same table with the index moved back into a regular column.
df2 = df.reset_index()

# Number of distinct last names among the passengers.
df2['LastName'].nunique()

In [None]:
# Every title appearing in the passenger names, in alphabetical order.
# The notes record what was observed about the holders of each title.
all_titles = [
    'Capt',      # the captain apparently had his wife and daughter on board
    'Col',       # 1 of the 4 colonels travels with a wife (29 years younger!)
    'Countess',  # single Countess, travelling alone
    'Don',       # single Don, travelling alone
    'Dona',      # single Dona, travelling alone
    'Dr',        # 3 of the 8 doctors travel with a wife
    'Jonkheer',  # single Jonkheer, travelling alone
    'Lady',      # the only Lady is married to the only Sir
    'Major',     # no Major travels with a partner
    'Master',
    'Miss',
    'Mlle',
    'Mme',
    'Mr',
    'Mrs',
    'Ms',
    'Rev',       # 2 of the 8 reverends travel with a wife
    'Sir',
]