# Group families
This notebook tries to infer the family relationships between passengers.

In [None]:
import pandas as pd
import dataset as ds
import itertools
import numpy as np
from tqdm import tqdm
import data.integration.relationships.mapping as relmap
import json
import data.integration.relationships.manual_fixes as mf

# Load the training passengers (with their labels) and the test passengers
# through the project's dataset helpers.
X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Stack both sets into a single frame. Moving the ID column into the index and
# straight back out leaves it as a regular column over a fresh positional
# row index.
df = (
    pd.merge(X_dataset.reset_index(), X_testset.reset_index(), how='outer')
    .set_index(ds.ID_COLUMN_NAME)
    .reset_index()
)

# Downloaded relationship data; parsed from JSON as-is.
relationship_filepath = '/home/tom/kaggle/titanic/data/relationships_data.json'
with open(relationship_filepath, 'r') as f:
    relationships = json.load(f)

# Insert the relationships in a matrix, indexed by PassengerId on both axes:
# rel_matrix[a][b] is what passenger b is to passenger a (None when nothing
# is known about the pair).
rel_matrix = [[None] * len(df) for _ in range(len(df))]

# Passenger 1 says: "Passenger 2 is my ...".
for passenger_dict in tqdm(relationships,
                          desc='Inserting obvious relationships'):
    psgr1_url = passenger_dict['UrlId']
    # Extract the scalar explicitly: calling int() directly on a one-element
    # Series is deprecated in pandas. .item() raises unless exactly one row
    # matches, so a missing or duplicated UrlId still fails loudly.
    psgr1_id = int(df.loc[df['UrlId'] == psgr1_url, 'PassengerId'].item())

    for psgr2_url, rel_type, rel_desc in passenger_dict['Relationships']:
        psgr2_ids = df.loc[df['UrlId'] == psgr2_url, 'PassengerId']
        if psgr2_ids.empty:
            # Some of the downloaded relationships refer to people not aboard.
            continue

        psgr2_id = int(psgr2_ids.item())

        assert psgr1_id != psgr2_id

        if rel_desc:
            # If the description is available, prefer it!
            coarse_desc = relmap.fine_description_to_coarse_description[rel_desc]
        elif rel_type:
            # Otherwise fall back to the relationship type.
            coarse_desc = relmap.relationship_type_to_coarse_description[rel_type]
        else:
            # In case also the relationship type is invalid (should this ever happen?)
            raise ValueError('No valid relationship between {} and {}'.format(psgr1_url, psgr2_url))

        # In case we already parsed passenger 2 and assigned a relationship with passenger 1,
        # check if it is consistent.
        existing_rel = rel_matrix[psgr1_id][psgr2_id]
        if existing_rel is not None and existing_rel != coarse_desc:
            # The passengers are already related by a relationship, but it does not match the current guess.
            # Which one should we take?

            if coarse_desc == 'knows':
                # If the current relationship is "knows", then it is the most generic and the existing
                # one is to be preferred because it is more specific.
                coarse_desc = existing_rel

            elif existing_rel != 'knows':
                # If neither the current nor the existing relationship is "knows", then we have
                # a proper mismatch! It is only reported: the current guess still overwrites
                # the existing one below.
                print('Mismatching relationships between {} and {}: '
                        '{} vs {}'.format(psgr1_url, psgr2_url, existing_rel, coarse_desc))

        # Store both directions so that the matrix stays consistent.
        reciprocal_coarse_desc = relmap.reciprocals[coarse_desc]
        rel_matrix[psgr1_id][psgr2_id] = coarse_desc
        rel_matrix[psgr2_id][psgr1_id] = reciprocal_coarse_desc

# Count how many cells of the matrix hold a relationship.
num_matches = sum(
    cell is not None
    for row in rel_matrix
    for cell in row
)

# Apply the manual fixes: they unconditionally overwrite whatever relationship
# the matrix holds for the pair, in both directions.
for url_id1, rel_list in tqdm(mf.manual_fixes.items(), desc='Applying manual fixes'):
    if not rel_list:
        # Nothing to fix for this passenger: skip the lookup altogether.
        continue

    # Passenger 1 is the same for the whole list, so look it up only once.
    # .item() extracts the scalar (int() on a one-element Series is deprecated
    # in pandas) and raises unless exactly one row matches.
    psgr1_id = int(df.loc[df['UrlId'] == url_id1, 'PassengerId'].item())

    for coarse_desc, url_id2 in rel_list:
        psgr2_id = int(df.loc[df['UrlId'] == url_id2, 'PassengerId'].item())

        reciprocal_coarse_desc = relmap.reciprocals[coarse_desc]
        rel_matrix[psgr1_id][psgr2_id] = coarse_desc
        rel_matrix[psgr2_id][psgr1_id] = reciprocal_coarse_desc

# From here on rows are addressed by PassengerId, and the age is also
# available in (365-day) years.
df = df.set_index('PassengerId')
df['Age'] = df['AgeInDays'].div(365)

In [None]:
def assess_relationship(psgr1_id, psgr2_id):
    """Guess how two passengers are related and record it in ``rel_matrix``.

    The guess uses the ``LastName``, ``Sex``, ``Age``, ``MaritalStatus`` and
    ``Title`` columns of the global ``df``, looked up by index label. Both
    directions of the global ``rel_matrix`` are written, where
    ``rel_matrix[a][b]`` is what passenger ``b`` is to passenger ``a``.

    Rules, applied in this order (the first that fires wins):

    1. Different last names: nothing is written.
    2. Age gap above 18 years: the older passenger is the parent.
    3. Same gender, or at least one of them younger than 16: siblings.
    4. At least one of them is ``'Single'``: siblings.
    5. The man or the woman carries an unmarried title: siblings.
    6. Otherwise there is no evidence against marriage: spouses.

    Args:
        psgr1_id: Index label of the first passenger in ``df``; also used as
            a position in ``rel_matrix``.
        psgr2_id: Same, for the second passenger.

    Returns:
        None. The outcome is the side effect on ``rel_matrix``.
    """
    columns = ['LastName', 'Sex', 'Age', 'MaritalStatus', 'Title']
    surname_a, sex_a, age_a, status_a, title_a = df.loc[psgr1_id][columns]
    surname_b, sex_b, age_b, status_b, title_b = df.loc[psgr2_id][columns]

    def mark_symmetric(relationship):
        # Sibling and spouse read the same from either side.
        rel_matrix[psgr1_id][psgr2_id] = relationship
        rel_matrix[psgr2_id][psgr1_id] = relationship

    if surname_a != surname_b:
        # Not related in any particular way. This is too difficult to infer.
        return

    if abs(age_a - age_b) > 18:
        # Large age gap, whatever the genders: the older one is a parent.
        if age_a > age_b:
            older_id, younger_id = psgr1_id, psgr2_id
        else:
            older_id, younger_id = psgr2_id, psgr1_id
        rel_matrix[younger_id][older_id] = 'parent'
        rel_matrix[older_id][younger_id] = 'child'
        return

    # About the same age from here on.

    if sex_a == sex_b or age_a < 16 or age_b < 16:
        # Same gender or a very young passenger: they cannot be married.
        mark_symmetric('sibling')
        return

    # Different gender, both grown-ups: siblings or a couple.

    if status_a == 'Single' or status_b == 'Single':
        # If any of the two is single, they cannot be married.
        mark_symmetric('sibling')
        return

    # Do the titles provide any hint? Look at the pair from the point of view
    # of whichever passenger is the man.
    unmarried_woman_titles = ('Miss', 'Mlle')
    unmarried_man_titles = ('Master', 'Jonkh')
    viewpoints = (
        (sex_a, title_a, title_b),
        (sex_b, title_b, title_a),
    )
    for man_sex, man_title, woman_title in viewpoints:
        if man_sex != 'male':
            continue
        if man_title in unmarried_man_titles or woman_title in unmarried_woman_titles:
            # One of them has an unmarried title: siblings.
            mark_symmetric('sibling')
            return

    # We reached the default case.
    # No evidence that they are NOT married: assume they are.
    mark_symmetric('spouse')


In [None]:
# Try to refine the generic "knows" relationships into something more specific.
# Only the cells below the diagonal are scanned, so each pair is assessed once.
for row_idx in tqdm(range(1, len(rel_matrix)), desc='Infer the "knows"'):
    for col_idx, cell in enumerate(rel_matrix[row_idx][:row_idx]):
        if cell == 'knows':
            assess_relationship(row_idx, col_idx)
        

In [None]:
# So far we relied only on explicit relationships. But there are other ways to infer relationships.

# Use the ticket number. Since husband and wife sometimes do not share the same ticket but hold
# contiguous ones, truncate the ticket number so that the least significant digit is disregarded.
# Contiguous tickets that cross over a multiple of ten are missed, but statistically they should
# be only about 10%.
has_ticket_number = df['TicketNumber'].notna()
df_tn = df.loc[has_ticket_number].copy()
df_tn['TruncatedTicketNumber'] = np.floor(df_tn['TicketNumber'] / 10).astype(int)

# Materialise the groups so that tqdm knows how many there are.
ticket_groups = list(df_tn.groupby('TruncatedTicketNumber'))
for _truncated_ticket_nr, gdf in tqdm(ticket_groups,
                                     desc='Check based on truncated ticket number'):

    # Check the passengers in pairs; a group of one simply yields no pair.
    for psgr1_id, psgr2_id in itertools.combinations(gdf.index, 2):
        # Only fill in the gaps: known relationships are left alone.
        if rel_matrix[psgr1_id][psgr2_id] is None:
            assess_relationship(psgr1_id, psgr2_id)


In [None]:
# Sanity check: no row should contain more than one "spouse" (nor more than two "parent").
# It seems that the number of parents always checks out, so only the spouses are inspected here.
attrs = ['UrlId', 'Age']
for row_idx, row in tqdm(enumerate(rel_matrix)):
    spouses_ids = [col_idx for col_idx, cell in enumerate(row) if cell == 'spouse']

    if len(spouses_ids) > 1:
        # Something is wrong! Print the passenger, then every alleged spouse.
        url_id, age = df.loc[row_idx][attrs]
        print('{}: {}'.format(url_id, age))
        for spouse_id in spouses_ids:
            sp_url_id, sp_age = df.loc[spouse_id][attrs]
            print('  {}: {}'.format(sp_url_id, sp_age))

In [None]:
# People to ignore in the next check, because they are actually ok.
# Each URL is listed exactly once (the literal used to repeat a few of them,
# which a set silently collapses anyway).
ignore = {
    '/titanic-victim/alfrida-konstantia-brogren-andersson.html',
    '/titanic-victim/ida-augusta-margareta-andersson.html',
    '/titanic-victim/mary-bourke.html',
    '/titanic-victim/john-bourke.html',
    '/titanic-victim/anna-sigrid-maria-danbom.html',
    '/titanic-victim/neal-thomas-ford.html',
    '/titanic-victim/anders-andersson.html',
    '/titanic-victim/catherine-bourke.html',
    '/titanic-victim/harry-faunthorpe.html',
    '/titanic-victim/dollina-margaret-ford.html',
    '/titanic-victim/robina-maggie-ford.html',
    '/titanic-victim/edward-watson-ford.html',
    '/titanic-victim/margaret-ann-watson-ford.html',
    '/titanic-victim/george-edward-graham.html',
    '/titanic-victim/charles-melville-hays.html',
    '/titanic-victim/marta-hiltunen.html',
    '/titanic-victim/claus-peter-hansen.html',
    '/titanic-victim/henrik-juul-hansen.html',
    '/titanic-victim/henry-damsgaard-hansen.html',
    '/titanic-survivor/jennie-louise-hansen.html',
    '/titanic-victim/eliza-johnston.html',
    '/titanic-victim/niels-peder-rasmus-jensen.html',
    '/titanic-victim/charles-natsch.html',
    '/titanic-victim/elna-matilda-strom.html',
    '/titanic-victim/charles-rad-thomas.html',
    '/titanic-victim/gertrud-emilia-klasen.html',
    '/titanic-victim/hulda-kristina-eugenia-klasen.html',
    '/titanic-victim/klas-albin-klasen.html',
    '/titanic-victim/william-lahtinen.html',
    '/titanic-victim/anna-amelia-lahtinen.html',
    '/titanic-victim/william-jeffery-ware.html',
    '/titanic-survivor/carla-jensen.html',
    '/titanic-survivor/erna-andersson.html',
    '/titanic-survivor/kornelia-theodosia-andrews.html',
    '/titanic-survivor/harriette-rebecca-crosby.html',
    '/titanic-survivor/catherine-elizabeth-crosby.html',
    '/titanic-victim/edward-gifford-crosby.html',
    '/titanic-survivor/orian-davidson.html',
    '/titanic-survivor/elizabeth-agnes-mary-davies.html',
    '/titanic-survivor/elsie-doling.html',
    '/titanic-survivor/ada-doling.html',
    '/titanic-survivor/anna-hamalainen.html',
    '/titanic-survivor/viljo-hamalainen.html',
    '/titanic-survivor/helene-baxter.html',
    '/titanic-survivor/clara-jennings-hays.html',
    '/titanic-survivor/lyyli-karoliina-silven.html',
    '/titanic-survivor/thamine-thelma-thomas.html',
    '/titanic-survivor/elizabeth-anne-wilkinson.html',
    '/titanic-survivor/selini-celiney-yazbeck.html',
}

In [None]:
# Cross-check the relationships we found against the ones of the Kaggle dataset.
# For every passenger that is not explicitly ignored, the number of siblings
# plus spouses found in rel_matrix is compared against SibSp, and the number of
# parents plus children against Parch. A mismatch is printed only when the
# Kaggle count is positive.

num_sibsp_matches = 0
num_parch_matches = 0
# Ignored passengers are skipped and can never be counted as a match, so the
# totals are reported against the passengers that were actually checked.
num_checked = 0
for psgr1_id, row in tqdm(enumerate(rel_matrix)):
    url_id, sibsp, parch = df.loc[psgr1_id][['UrlId', 'SibSp', 'Parch']]

    if url_id in ignore:
        continue
    num_checked += 1

    # Bucket the URLs of the related passengers by kind of relationship;
    # anything that is not a family relationship is only counted.
    sp, sib, par, ch = [], [], [], []
    buckets = {'spouse': sp, 'sibling': sib, 'parent': par, 'child': ch}
    num_oth = 0
    for psgr2_id, cell in enumerate(row):
        if cell is None:
            continue

        if cell in buckets:
            buckets[cell].append(df.loc[psgr2_id]['UrlId'])
        else:
            num_oth += 1

    num_sib = len(sib)
    num_sp = len(sp)
    num_par = len(par)
    num_ch = len(ch)

    if sibsp == num_sib + num_sp:
        num_sibsp_matches += 1

    if parch == num_par + num_ch:
        num_parch_matches += 1

    # Report the mismatches, SibSp first and then Parch, in the same layout.
    reports = (
        ('SibSp', sibsp, ('sibling', sib), ('spouse', sp)),
        ('Parch', parch, ('parent', par), ('child', ch)),
    )
    for column, kaggle_count, first, second in reports:
        num_first = len(first[1])
        num_second = len(second[1])
        if kaggle_count > 0 and kaggle_count != num_first + num_second:
            print('{}'.format(url_id))
            print('  {}: {} vs {} + {}'.format(column, kaggle_count, num_first, num_second))
            for label, url_ids in (first, second):
                for u_id in url_ids:
                    print('    {} - {}'.format(label, u_id))
            print('  Others: {}'.format(num_oth))

print('SibSp matches: {}/{}'.format(num_sibsp_matches, num_checked))
print('Parch matches: {}/{}'.format(num_parch_matches, num_checked))