# Group families
This notebook tries to infer the family relationships between passengers.

In [None]:
import pandas as pd
import dataset as ds
import preprocessing as pp

# Load the training features/labels and the test features.
X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Outer-merge both splits into one frame keyed by the ID column, run the
# project's preprocessing steps on it, then turn the ID back into a column.
df = (
    X_dataset.reset_index()
    .merge(X_testset.reset_index(), how='outer')
    .set_index(ds.ID_COLUMN_NAME)
    .pipe(pp.manual_fixes)
    .pipe(pp.format_name)
    .reset_index()
)
df.shape

Match male and female passengers with the same first and last names: most likely they are married.

In [None]:
# Load the supplementary passenger data and display the sum of its 'Survived' column.
# NOTE(review): the path below is absolute and machine-specific; consider
# deriving it from a configurable data directory so the notebook runs elsewhere.
edf = pd.read_csv('/home/tom/kaggle/titanic/data/extra_data.csv')
edf['Survived'].sum()

In [None]:
# Show every passenger whose last name contains 'Ford'.
ford_mask = df['LastName'].str.contains('Ford')
df.loc[ford_mask]

In [None]:
from collections import Counter

# Histogram of last-name group sizes: key = number of passengers sharing a
# last name, value = how many last names have that many passengers.
num_same_last_name = Counter(
    len(same_name_df) for _, same_name_df in df.groupby('LastName')
)
num_same_last_name

In [None]:
# Pair each man with every woman that has the same first and last name.
# Columns other than the join keys get an '_m' (man) or '_w' (woman) suffix.
men_df = df.loc[df['Sex'] == 'male']
women_df = df.loc[df['Sex'] == 'female']
pairs_df = pd.merge(
    men_df,
    women_df,
    how='inner',
    on=['LastName', 'FirstName'],
    suffixes=('_m', '_w'),
)

print(pairs_df.shape)

These 96 pairs should definitely be married couples. One way to double-check is to verify that they share the same cabin (when it is recorded).

In [None]:
# Pairs whose two cabins are both recorded but are not equal.
known_cabins_df = pairs_df.dropna(axis='rows', subset=['Cabin_m', 'Cabin_w'], how='any')
known_cabins_df.loc[known_cabins_df['Cabin_m'] != known_cabins_df['Cabin_w']]

The only couple with a cabin indicated that does not share the cabin is the captain and his wife.
But many other couples do not have the Cabin data. We can check the embark port or the class, maybe.

In [None]:
# Count the pairs with both classes recorded, then list those that differ.
known_class_df = pairs_df.dropna(axis='rows', subset=['Pclass_m', 'Pclass_w'], how='any')
print(len(known_class_df))
known_class_df.loc[known_class_df['Pclass_m'] != known_class_df['Pclass_w']]

In [None]:
# Count the pairs with both embark ports recorded, then list those that differ.
known_port_df = pairs_df.dropna(axis='rows', subset=['Embarked_m', 'Embarked_w'], how='any')
print(len(known_port_df))
known_port_df.loc[known_port_df['Embarked_m'] != known_port_df['Embarked_w']]

All 96 pairs have at least the class and the embarkation port in common. I think we can be confident enough now.
Do they have the same number of children/parents?

In [None]:
# Pairs in which the two partners report a different Parch value.
parch_differs = pairs_df['Parch_m'].ne(pairs_df['Parch_w'])
pairs_df.loc[parch_differs]

In [None]:
import numpy as np
from tqdm import tqdm
from prettytable import PrettyTable

class FamilyMatched(Exception):
    """Raised by the matching routine when a family has been fully resolved."""


class FamilyMatchingError(Exception):
    """Raised when the relationships of a family could not be resolved."""


class FamilyMatchingErrorNotEnoughRelatedPeople(FamilyMatchingError):
    """Raised when fewer than two people in the group report relationships."""


# One square matrix per relationship type, with one row/column per passenger.
# The code in this notebook addresses the cells as [PassengerId - 1, PassengerId - 1].
num_passengers = len(df)
relationship_shape = (num_passengers, num_passengers)
married_with = np.zeros(relationship_shape)
parent_of = np.zeros(relationship_shape)
sibling_of = np.zeros(relationship_shape)

# Copy of the passengers with two extra counters initialised to the reported
# SibSp / Parch values.
tdf = df.assign(
    UnmatchedSibSp=df['SibSp'],
    UnmatchedParch=df['Parch'],
)


def _print_family(last_name, father_id, mother_id, children_ids):
    """Print a table with the members of one family.

    Args:
        last_name: Family name shown in the heading line.
        father_id: PassengerId of the father, or None to leave him out.
        mother_id: PassengerId of the mother, or None to leave her out.
        children_ids: Sized iterable with the PassengerIds of the children.

    Each listed person is looked up in the notebook-level ``tdf`` frame.
    """
    person_columns = ['Name', 'SibSp', 'Parch', 'Age']

    print()
    print('Family {}'.format(last_name))

    table = PrettyTable()
    table.field_names = ['Role'] + person_columns

    members = [('Father', father_id), ('Mother', mother_id)]
    members.extend(zip(['Child'] * len(children_ids), children_ids))

    for role, person_id in members:
        if person_id is not None:
            # First row of `tdf` carrying this PassengerId.
            person_values = tdf.loc[tdf['PassengerId'] == person_id][person_columns].values[0]
            table.add_row([role] + list(person_values))

    print(table)


def match_family(fdf):
    """Try to resolve the relationships inside one group of same-last-name passengers.

    The outcome is always reported through an exception; the function never
    returns normally.

    Side effects (on notebook-level arrays, indexed by ``PassengerId - 1``):
        * ``married_with`` is incremented symmetrically for the couple found.
        * ``parent_of`` is incremented for every (parent, child) pair found.

    Args:
        fdf: DataFrame with the passengers sharing one last name. It must have
            the columns 'PassengerId', 'Sex', 'LastName', 'FirstName', 'SibSp'
            and 'Parch' (plus 'Name', 'Age', 'Cabin' and 'Fare' for the
            diagnostic print of groups without a couple).

    Raises:
        FamilyMatched: exactly one couple was found and the number of other
            members reporting two parents equals the couple's Parch.
        FamilyMatchingErrorNotEnoughRelatedPeople: fewer than two members
            report any relationship.
        FamilyMatchingError: every other case (no couple, several candidate
            couples, childless couple, or children that could not be assigned).
    """
    # Keep everyone who reports at least one relative on board. A childless
    # spouse has SibSp > 0 but Parch == 0, so the two conditions must be OR-ed;
    # AND-ing them would drop such couples before they can be matched.
    fdf = fdf.loc[(fdf['SibSp'] > 0) | (fdf['Parch'] > 0)]
    if len(fdf) < 2:
        # No family to gather.
        raise FamilyMatchingErrorNotEnoughRelatedPeople('{} people with relationships'.format(len(fdf)))

    # A man and a woman with the same first and last name are taken as a married couple.
    pair_df = fdf.loc[fdf['Sex'] == 'male'].merge(
        fdf.loc[fdf['Sex'] == 'female'],
        how='inner',
        on=['LastName', 'FirstName'],
        suffixes=('_m', '_w'),
    )

    if len(pair_df) == 0:
        # No married couple: probably a single parent travelling with children.
        # In that case the parent should have no SibSp and many Parch, and the
        # children should have many SibSp and one Parch.
        # TODO: match single-parent families; for now just show the group.
        print(fdf[['Name', 'Age', 'Cabin', 'Fare', 'SibSp', 'Parch']])

    elif len(pair_df) == 1:
        # Exactly one couple. How many children?
        couple = pair_df.iloc[0]
        # Cast to int so the ids are valid array indices whatever dtype the
        # row extraction produced.
        id_m = int(couple['PassengerId_m'])
        id_w = int(couple['PassengerId_w'])
        parch_m = couple['Parch_m']
        parch_w = couple['Parch_w']

        married_with[id_m - 1, id_w - 1] += 1
        married_with[id_w - 1, id_m - 1] += 1

        # Children are only assigned when both spouses report the same,
        # non-zero Parch. Different values probably mean that one of the two
        # has parents on board; that case is not handled yet.
        if parch_m != 0 and parch_w != 0 and parch_m == parch_w:
            num_children = parch_m

            # All the children should report 2 parents.
            is_spouse = fdf['PassengerId'].isin([id_m, id_w])
            candidate_children_df = fdf.loc[(~is_spouse) & (fdf['Parch'] == 2)]

            if len(candidate_children_df) == num_children:
                # Assume these passengers are the children.
                for child_id in candidate_children_df['PassengerId'].values:
                    child_idx = int(child_id) - 1
                    parent_of[id_m - 1, child_idx] += 1
                    parent_of[id_w - 1, child_idx] += 1

                raise FamilyMatched('2 parents and {} children'.format(num_children))

    # Remaining case, len(pair_df) > 1 (multiple possible couples), is not
    # handled yet and falls through with the other unresolved outcomes.
    raise FamilyMatchingError()

        
num_all_families = 0
num_completely_matched_families = 0
no_relationships = 0
other = 0

# Try to match every group of at least two passengers sharing a last name and
# tally the outcome, which match_family reports through the exception type.
for last_name, last_name_df in tqdm(list(tdf.groupby('LastName'))):
    if len(last_name_df) >= 2:
        num_all_families += 1

        try:
            match_family(last_name_df)
        except FamilyMatched:
            num_completely_matched_families += 1
        except FamilyMatchingErrorNotEnoughRelatedPeople:
            # Subclass of FamilyMatchingError, so it has to be caught first.
            no_relationships += 1
        except FamilyMatchingError:
            other += 1

# Summary: the three tallies, their sum, and the number of groups examined.
partial_values = [num_completely_matched_families, no_relationships, other]
tot = sum(partial_values)
rows_to_add = partial_values + [tot, num_all_families]

family_pt = PrettyTable()
family_pt.field_names = [
    'num_completely_matched_families',
    'no_relationships',
    'other',
    'Sum',
    'num_all_families',
]
family_pt.add_row(rows_to_add)
print(family_pt)

print('End!')

In [None]:
# Flag every (man, woman) pair of pairs_df as married, in both directions.
# Cells are addressed with PassengerId - 1.
husbands_wives_ids = pairs_df[['PassengerId_m', 'PassengerId_w']].values
for husband_id, wife_id in husbands_wives_ids:
    husband_idx, wife_idx = int(husband_id - 1), int(wife_id - 1)
    married_with[husband_idx, wife_idx] = married_with[wife_idx, husband_idx] = 1

# Each couple fills two cells, hence the division by two.
np.sum(married_with) / 2

Match married couples with possible children.

In [None]:
# Set 'LastName' to everything that precedes the last comma of 'Name'.
# NOTE(review): 'LastName' is already present on df at this point (earlier
# cells read it), so this overwrites it in place — confirm that is intended.
df['LastName'] = df['Name'].str.extract('(?P<LastName>.*),')
# Start the "unmatched" counters from the reported SibSp / Parch values.
df['UnmatchedSibSp'] = df['SibSp']
df['UnmatchedParch'] = df['Parch']

In [None]:
# For every presumed married couple: mark the spouses as matched siblings/spouses
# and try to attach the children travelling under the same last name.
#
# `df` carries a positional index here (it was reset after preprocessing), so
# passengers are located through the 'PassengerId' column rather than through
# index labels, and the relationship matrices are addressed with
# PassengerId - 1, like in the rest of this notebook.
# NOTE(review): `last_name` comes from pairs_df while df['LastName'] may have
# been re-derived since pairs_df was built — confirm both use the same format.
for _, pair_row in pairs_df.iterrows():
    # pairs_df was built with suffixes ('_m', '_w').
    id_men = int(pair_row['PassengerId_m'])
    id_wom = int(pair_row['PassengerId_w'])

    is_spouse = df['PassengerId'].isin([id_men, id_wom])
    df.loc[is_spouse, 'UnmatchedSibSp'] -= 1

    parch_men = pair_row['Parch_m']
    parch_wom = pair_row['Parch_w']

    age_men = pair_row['Age_m']
    age_wom = pair_row['Age_w']

    last_name = pair_row['LastName']

    if parch_men == 0 and parch_wom == 0:
        continue

    if parch_wom > parch_men:
        # The woman has her parents on board.
        # Use the wife's 'ProperName' when that column exists, else her full name.
        unmarried_name = pair_row.get('ProperName_w', pair_row['Name_w'])
        print(unmarried_name)
        continue

    if parch_men != parch_wom:
        print('parch_men = {}, parch_wom = {}'.format(parch_men, parch_wom))
        continue

    children_df = df.loc[(df['LastName'] == last_name) & (~is_spouse)]

    for _, child_row in children_df.iterrows():
        child_parch = child_row['Parch']
        child_age = child_row['Age']

        if child_parch < 2:
            print('child_parch = {}'.format(child_parch))
            continue

        if child_age >= age_men or child_age >= age_wom:
            print('Age {} vs ({}, {})'.format(child_age, age_men, age_wom))
            continue

        child_id = int(child_row['PassengerId'])
        parent_of[id_men - 1, child_id - 1] = 1
        parent_of[id_wom - 1, child_id - 1] = 1

        # One parent/child link explained for each of the three people.
        is_parent_or_child = df['PassengerId'].isin([id_men, id_wom, child_id])
        df.loc[is_parent_or_child, 'UnmatchedParch'] -= 1

df[['UnmatchedSibSp', 'UnmatchedParch']].sum()

# Group the passengers by last name and cabin
If they have the same last name and share the same cabin, they are probably a family.

In [None]:
df['LastName'] = df['Name'].str.extract('(?P<LastName>.*),')

# Keep only the passengers reporting at least one relative on board.
has_relatives = (df['SibSp'] != 0) | (df['Parch'] != 0)
accompanied_df = df.loc[has_relatives]

# Print the members of every (last name, cabin) group.
cabin_groups = accompanied_df.groupby(['LastName', 'Cabin'])
for (last_name, cabin), last_name_df in cabin_groups:
    print('{} in cabin {}'.format(last_name, cabin))
    print(last_name_df.reset_index()[['Name', 'Age', 'SibSp', 'Parch']])
    print()

In [None]:
# Skeleton of the parent/child matching described in the pseudocode below.
# The inner loop walks the rows of the current last-name group; it must not
# rebind `last_name_df`, and the row variable is the one read in the body.
for last_name, last_name_df in list(df.groupby('LastName')):
    for idx, df_row in last_name_df.iterrows():
        parch = df_row['Parch']
        if parch == 0:
            # No parents nor children.
            continue

        # TODO: implement the candidate search sketched in the string below.

"""
for each passenger
    does it have a parch?
    if 0:
        continue
        
    look for people with the same surname and remove the possible partner
    for all the candidate children:
        if it has parch = 0:
            continue
        
        if younger
            this person is child
        else
            this person is parent
"""

In [None]:
look_for_relationship = set()

# Collect the passengers who are alone in their (last name, cabin) group and
# still report siblings/spouses or parents/children.
for (last_name, cabin), last_name_df in df.reset_index().groupby(['LastName', 'Cabin']):
    if len(last_name_df) != 1:
        continue

    p_id, sibsp, parch = last_name_df[['PassengerId', 'SibSp', 'Parch']].values[0]
    if sibsp != 0 or parch != 0:
        look_for_relationship.add(p_id)
        print(p_id, sibsp, parch)

In [None]:
# Titles considered in this notebook, with notes on whether their holders
# travel with a partner.
all_titles = [
    'Capt', # It seems the captain had his wife and daughter on board
    'Col', # 1 of the 4 Cols travels with a wife (29 years younger!)
    'Countess', # The only Countess is travelling alone
    'Don', # The only Don is travelling alone
    'Dona', # The only Dona is travelling alone
    'Dr', # 3 of the 8 Drs travel with a wife
    'Jonkheer', # The only Jonkheer is travelling alone
    'Lady', # The only Lady is married to the only Sir
    'Major', # None of the Majors travels with a partner
    'Master',
    'Miss',
    'Mlle',
    'Mme',
    'Mr',
    'Mrs',
    'Ms',
    'Rev', # 2 of the 8 Revs travel with a wife
    'Sir',
]