# Group families
This notebook tries to infer the family relationships between passengers.

In [None]:
import pandas as pd
import dataset as ds

# Load both Kaggle splits. The training loader returns a (features, labels)
# pair, the test loader returns the features only.
X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Stack the two feature tables into a single frame. With no explicit key, the
# outer merge joins on every column the two frames have in common.
# The set_index/reset_index round trip leaves the id column as the first
# regular column, on top of a fresh positional index.
df = (
    X_dataset.reset_index()
    .merge(X_testset.reset_index(), how='outer')
    .set_index(ds.ID_COLUMN_NAME)
    .reset_index()
)

In [None]:
import itertools
import json

# Load the passenger relationships from disk.
# The loops further down treat the file as a list of dicts, each with a
# 'UrlId' entry and a 'Relationships' entry holding
# (other passenger's UrlId, relationship type, relationship description) triples.
# NOTE(review): this absolute path only exists on the original author's
# machine -- point it at your own copy of the file before running.
relationship_filepath = '/home/tom/kaggle/titanic/data/relationships_data.json'

# Read the file as UTF-8 explicitly, so that the result does not depend on the
# platform's default text encoding.
with open(relationship_filepath, 'r', encoding='utf-8') as f:
    relationships = json.load(f)

In [None]:
import numpy as np
from tqdm import tqdm
import data.integration.relationships.mapping as relmap

# Insert the relationships in a matrix.
#
# rel_matrix[a][b] is the coarse description of what passenger `b` is to
# passenger `a` ("passenger b is my ..."), or None when nothing is known.
# Rows and columns are addressed directly by PassengerId.
# NOTE(review): this assumes every PassengerId lies in range(len(df)) -- confirm.
rel_matrix = [[None] * len(df) for _ in range(len(df))]

# Index the passengers by UrlId once, instead of scanning the whole frame for
# every single lookup. Rows without a UrlId can never be referenced and are
# therefore left out.
psgr_ids_by_url = {}
_rows_with_url = df.loc[df['UrlId'].notna(), ['UrlId', 'PassengerId']]
for _url_id, _psgr_id in zip(_rows_with_url['UrlId'], _rows_with_url['PassengerId']):
    psgr_ids_by_url.setdefault(_url_id, []).append(_psgr_id)


def _passenger_id_for_url(url_id):
    """Return the PassengerId of the row of `df` carrying `url_id`.

    Returns:
        The id as a plain int, or None when no row has this UrlId.

    Raises:
        ValueError: if more than one row shares this UrlId.
    """
    matching_ids = psgr_ids_by_url.get(url_id, [])
    if not matching_ids:
        return None
    if len(matching_ids) > 1:
        raise ValueError('UrlId {!r} matches {} passengers'.format(url_id, len(matching_ids)))
    return int(matching_ids[0])


# Passenger 1 says: "Passenger 2 is my ...".
for passenger_dict in tqdm(relationships):
    psgr1_url = passenger_dict['UrlId']
    psgr1_id = _passenger_id_for_url(psgr1_url)
    if psgr1_id is None:
        # The owner of the relationship list must be one of our passengers.
        raise ValueError('No passenger found with UrlId {!r}'.format(psgr1_url))

    for psgr2_url, rel_type, rel_desc in passenger_dict['Relationships']:
        psgr2_id = _passenger_id_for_url(psgr2_url)
        if psgr2_id is None:
            # Some of the downloaded relationships refer to people not aboard.
            continue

        # Nobody should be listed as related to themselves.
        assert psgr1_id != psgr2_id

        if rel_desc:
            # If the description is available, prefer it!
            coarse_desc = relmap.fine_description_to_coarse_description[rel_desc]
        elif rel_type:
            # Otherwise fall back to the relationship type.
            coarse_desc = relmap.relationship_type_to_coarse_description[rel_type]
        else:
            # In case also the relationship type is invalid (should this ever happen?)
            raise ValueError('No valid relationship between {} and {}'.format(psgr1_url, psgr2_url))

        # In case we parsed already passenger 2 and assigned a relationship with passenger 1,
        # check if it is consistent.
        existing_rel = rel_matrix[psgr1_id][psgr2_id]
        if existing_rel is not None and existing_rel != coarse_desc:
            # The passengers are already related by a relationship, but it does not match
            # the current guess. Which one should we take?

            if coarse_desc == 'knows':
                # "knows" is the most generic relationship: keep the existing, more
                # specific one.
                coarse_desc = existing_rel

            elif existing_rel != 'knows':
                # Neither the current nor the existing relationship is "knows": we have a
                # proper mismatch! It is only reported; the current description still
                # overwrites the existing one below.
                print('Mismatching relationships between {} and {}: '
                        '{} vs {}'.format(psgr1_url, psgr2_url, existing_rel, coarse_desc))

            # Remaining case: the existing relationship is "knows" and the current one is
            # more specific, so the current one simply replaces it below.

        # Store the relationship in both directions.
        reciprocal_coarse_desc = relmap.reciprocals[coarse_desc]
        rel_matrix[psgr1_id][psgr2_id] = coarse_desc
        rel_matrix[psgr2_id][psgr1_id] = reciprocal_coarse_desc

# Every related pair fills two cells (one per direction): halve the number of
# filled cells to get the number of pairs.
num_matches = sum(cell is not None for row in rel_matrix for cell in row)

print(num_matches / 2)

In [None]:
# Index the passengers by PassengerId, so that a row can be fetched with
# df.loc[<passenger id>].
# The guard keeps the cell re-runnable: once the column has been moved into
# the index, a second set_index('PassengerId') would raise a KeyError.
if df.index.name != 'PassengerId':
    df = df.set_index('PassengerId')

# Age in years, derived from the age in days using 365-day years.
df['Age'] = df['AgeInDays'] / 365

In [None]:
# Show the distinct values found in the 'Title' column.
df.loc[:, 'Title'].unique()

In [None]:
# Number of "knows" pairs that do / do not share the last name.
same_surname = 0
diff_surname = 0

# Never updated below; kept so that it remains available for ad-hoc inspection.
num_knows = 0


def _link_symmetric(matrix, first_id, second_id, relationship):
    """Store the same `relationship` in both directions between two passengers."""
    matrix[first_id][second_id] = relationship
    matrix[second_id][first_id] = relationship


def _link_parent_child(matrix, parent_id, child_id):
    """Store that `parent_id` is the parent of `child_id`, plus the reciprocal."""
    # matrix[a][b] describes what passenger b is to passenger a.
    matrix[child_id][parent_id] = 'parent'
    matrix[parent_id][child_id] = 'child'


def _link_by_age_gap(matrix, id1, age1, id2, age2):
    """Link two passengers as parent/child if more than 18 years apart, else as siblings.

    When either age is NaN the gap comparison is False, so the pair is stored
    as siblings.
    """
    if abs(age1 - age2) > 18:
        # Large age gap: the older one is the parent.
        parent_id, child_id = (id1, id2) if age1 > age2 else (id2, id1)
        _link_parent_child(matrix, parent_id, child_id)
    else:
        # About the same age: siblings.
        _link_symmetric(matrix, id1, id2, 'sibling')


attributes = ['LastName', 'Sex', 'Age', 'MaritalStatus', 'Title']

# Titles taken as evidence that the bearer is not married.
# NOTE(review): these only match if they are spelled exactly like the values of
# df['Title'] (e.g. 'Jonkh' vs. 'Jonkheer') -- check against df['Title'].unique().
single_woman_titles = ['Miss', 'Mlle']
single_man_titles = ['Master', 'Jonkh']

# Try to infer the "knows".
# Only the lower triangle of the matrix is visited, so each pair is examined
# once; the helpers update both directions.
for row_idx in tqdm(range(1, len(rel_matrix))):
    for col_idx in range(row_idx):
        if rel_matrix[row_idx][col_idx] != 'knows':
            continue

        psgr1 = df.loc[row_idx]
        psgr2 = df.loc[col_idx]

        last_name1, gender1, age1, marital_status1, title1 = psgr1[attributes]
        last_name2, gender2, age2, marital_status2, title2 = psgr2[attributes]

        if last_name1 != last_name2:
            # Not related in any particular way: leave the pair as "knows".
            diff_surname += 1
            continue

        # Same surname: still very likely they are related.
        same_surname += 1

        if gender1 == gender2 or age1 < 16 or age2 < 16:
            # Same gender (they cannot be married), or one of them is very young:
            # they must be either siblings or parent-child.
            _link_by_age_gap(rel_matrix, row_idx, age1, col_idx, age2)
            continue

        # Different gender and neither is known to be under 16: they could be either
        # siblings or a couple. Generally, it is more likely that they are a couple, so
        # try to exclude it by looking for clear evidence they are not. If we cannot
        # prove that they are NOT a couple, we will assume they are.
        # NOTE(review): the age gap is not looked at on this path.

        # If any of the two is single, they cannot be married.
        any_single = marital_status1 == 'Single' or marital_status2 == 'Single'

        # Do the titles provide any hint? The man's title is checked against the
        # unmarried male titles, the other passenger's against the female ones.
        unmarried_title = (
            (gender1 == 'male'
             and (title1 in single_man_titles or title2 in single_woman_titles))
            or (gender2 == 'male'
                and (title2 in single_man_titles or title1 in single_woman_titles))
        )

        if any_single or unmarried_title:
            _link_symmetric(rel_matrix, row_idx, col_idx, 'sibling')
        else:
            # Default case. No evidence that they are NOT married: assume they are.
            _link_symmetric(rel_matrix, row_idx, col_idx, 'spouse')

In [None]:
# Check that each row has at most one "spouse" and print the offenders.
# Only spouses are checked here. Author's note: the number of parents seems to
# always check out. Spouses is an issue, though.
attrs = ['UrlId', 'Age']

# tqdm wraps the list itself (not the enumerate generator), so that it knows
# the total and can show actual progress.
for row_idx, row in enumerate(tqdm(rel_matrix)):
    spouses_ids = [
        col_idx
        for col_idx, cell in enumerate(row)
        if cell == 'spouse'
    ]

    if len(spouses_ids) <= 1:
        continue

    # Something is wrong! Print the passenger, followed by all the alleged spouses.
    url_id, age = df.loc[row_idx][attrs]
    print('{}: {}'.format(url_id, age))
    for spouse_id in spouses_ids:
        sp_url_id, sp_age = df.loc[spouse_id][attrs]
        print('  {}: {}'.format(sp_url_id, sp_age))

In [None]:
# Cross-check the relationships we found against the ones of the Kaggle dataset:
# for each passenger, 'SibSp' is compared with the number of siblings plus spouses
# in the matrix, and 'Parch' with the number of parents plus children.

num_sibsp_matches = 0
num_parch_matches = 0

# tqdm wraps the list itself (not the enumerate generator), so that it knows
# the total and can show actual progress.
for psgr1_id, row in enumerate(tqdm(rel_matrix)):
    sibsp, parch = df.loc[psgr1_id][['SibSp', 'Parch']]

    # Count the relatives of each kind listed in this passenger's row.
    num_sp = row.count('spouse')
    num_sib = row.count('sibling')
    num_par = row.count('parent')
    num_ch = row.count('child')

    if sibsp == num_sib + num_sp:
        num_sibsp_matches += 1

    if parch == num_par + num_ch:
        num_parch_matches += 1

print('SibSp matches: {}/{}'.format(num_sibsp_matches, len(rel_matrix)))
print('Parch matches: {}/{}'.format(num_parch_matches, len(rel_matrix)))