In [None]:
import pandas as pd
import dataset as ds

X_dataset, y_dataset = ds.load_training_set()
X_testset = ds.load_test_set()

# Combine the training and test features into one frame indexed by the ID
# column, so later cells can derive features for both sets at once.
# NOTE(review): with no `on=` the outer merge joins on every shared column;
# presumably the intent is simply to stack the two sets row-wise -- confirm.
df = (
    X_dataset.reset_index()
    .merge(X_testset.reset_index(), how='outer')
    .set_index(ds.ID_COLUMN_NAME)
)
df.index

In [None]:
# Preview the first rows of the merged train + test frame.
df.head()

# Titles
In this section we try to extract as much information as possible from the name.

In [None]:
import preprocessing as pp

# pp.get_title_list is a project helper (not shown here); set() reduces its
# result to the distinct titles so they can be reviewed at a glance.
title_list = pp.get_title_list(df)
set(title_list)

In [None]:
from enum import Enum

# Map each title to (married, status, English) where
# * married = (married, single, any)
# * status = (simple, noble, special)
# * English = (True, False) whether the title sounds English or not

class Married(Enum):
    """Marital status suggested by a title (first element of a mapping tuple)."""
    MARRIED = 'married'
    SINGLE = 'single'
    ANY = 'any'  # used below for titles that do not settle the question


class Status(Enum):
    """Social standing suggested by a title (second element of a mapping tuple)."""
    SIMPLE = 'simple'    # everyday titles such as Mr, Mrs, Miss
    NOBLE = 'noble'      # e.g. Countess, Lady, Sir
    SPECIAL = 'special'  # e.g. Capt, Col, Dr, Major, Rev in the mapping below


# title -> (Married, Status, sounds English?)
# The trailing comments record what was observed in the data for the rarer titles.
# NOTE(review): 'Don' is SPECIAL while its counterpart 'Dona' is NOBLE -- confirm
# that this asymmetry is intended.
title_mapping = {
    'Capt': (Married.MARRIED, Status.SPECIAL, True), # It seems the captain had wife and daughter onboard
    'Col': (Married.ANY, Status.SPECIAL, True), # 1/4 Cols travels with a wife (29 years younger!)
    'Countess': (Married.ANY, Status.NOBLE, True), # The only Countess is travelling alone
    'Don': (Married.SINGLE, Status.SPECIAL, False), # The only Don is travelling alone
    'Dona': (Married.MARRIED, Status.NOBLE, False), # The only Dona is travelling alone
    'Dr': (Married.ANY, Status.SPECIAL, True), # 3/8 Drs travels with a wife
    'Jonkheer': (Married.ANY, Status.NOBLE, False), # The only Jonkheer is travelling alone
    'Lady': (Married.ANY, Status.NOBLE, True), # The only Lady is married to the only Sir
    'Major': (Married.SINGLE, Status.SPECIAL, True), # None of the Majors travels with a partner
    'Master': (Married.ANY, Status.SIMPLE, True),
    'Miss': (Married.SINGLE, Status.SIMPLE, True),
    'Mlle': (Married.SINGLE, Status.SIMPLE, False),
    'Mme': (Married.MARRIED, Status.SIMPLE, False),
    'Mr': (Married.ANY, Status.SIMPLE, True),
    'Mrs': (Married.MARRIED, Status.SIMPLE, True),
    'Ms': (Married.ANY, Status.SIMPLE, True), # 'Ms' is neutral about marital status, so it cannot imply MARRIED
    'Rev': (Married.ANY, Status.SPECIAL, True), # 2/8 Revs travel with a wife
    'Sir': (Married.MARRIED, Status.NOBLE, True), # The only Sir is married with the only Lady
}

In [None]:
# Add the 'Title' column via the project helper, then expand each title into
# the three components of its title_mapping tuple. The enum members are
# stored by their string value; the English flag is stored as the bool itself.
df = pp.add_title_column(df).assign(
    TitleMarried=lambda frame: frame['Title'].apply(lambda title: title_mapping[title][0].value),
    TitleStatus=lambda frame: frame['Title'].apply(lambda title: title_mapping[title][1].value),
    TitleEnglish=lambda frame: frame['Title'].apply(lambda title: title_mapping[title][2]),
)

# Print, for every title whose status is not SIMPLE, how many passengers carry
# it and their 'Name', 'SibSp' and 'Parch' columns.
# The mapping values are (married, status, English) tuples; unpacking them by
# name makes sure the status -- not the English flag -- is what gets compared.
for title, (_married, status, _english) in title_mapping.items():
    if status is Status.SIMPLE:
        continue

    # Single .loc call selects rows and columns together (no chained indexing).
    ddf = df.loc[df['Title'] == title, ['Name', 'SibSp', 'Parch']]
    print('{}: {}'.format(title, len(ddf)))
    print(ddf)
    print('===')


In [None]:
# Surnames to look up in the passenger list.
names_to_check = [
    'Minahan',
    'Frauenthal',
    'Crosby',
    'Dodge',
    'Astor',
    'Carter',
    'Lahtinen',
]

# Lower-case the 'Name' column once, outside the loop, so the search is
# case-insensitive without redoing the conversion for every surname.
lowercase_names = df['Name'].str.lower()

for name in (n.lower() for n in names_to_check):
    print(name)
    # regex=False: the surnames are literal text, not patterns.
    matches = lowercase_names.str.contains(name, regex=False)
    print(df.loc[matches, ['Name', 'Age', 'SibSp', 'Parch']])
    print('=====')

In [None]:
# Bucket passengers by last name: take the text before the first comma of
# 'Name', lower-case it, then count how many passengers share each value.
last_names_df = df.assign(
    last_name=[full_name.partition(',')[0].lower() for full_name in df['Name']]
)
last_names_df.reset_index().groupby('last_name')['PassengerId'].count()

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Attach the survival labels to the engineered features. The inner merge keeps
# only the passengers that appear in y_dataset.
train_df = pd.merge(y_dataset, df, how='inner', on='PassengerId')

# Per (TitleStatus, Sex) group: mean, row count and sum of 'Survived'.
title_df = train_df.groupby(['TitleStatus', 'Sex']).agg({'Survived': ['mean', 'count', 'sum']})

# The sum of 'Survived' is used as the number of survivors in each group.
num_survived = title_df['Survived', 'sum']
num_not_survived = title_df['Survived', 'count'] - num_survived

bar_positions = range(len(title_df))
# Flatten the (TitleStatus, Sex) index tuples into readable tick labels.
group_labels = ['{} / {}'.format(status, sex) for status, sex in title_df.index]

plt.figure()

# Stacked horizontal bars: survivors first, non-survivors appended to the right.
plt.barh(bar_positions, num_survived, label='Survived',
         tick_label=group_labels)
plt.barh(bar_positions, num_not_survived, left=num_survived, color='r',
         label='Did not survive')
# The bars are head counts, not probabilities, so label the axis accordingly.
plt.xlabel('Number of passengers (log scale)')
plt.legend()

# NOTE(review): on a log axis the stacked segments are not visually
# proportional to their counts -- confirm that the log scale is wanted.
plt.xscale('log')

plt.show()

In [None]:
# Horizontal bar chart of the mean of 'Survived' for each (TitleStatus, Sex)
# group computed in title_df.
fig, ax = plt.subplots()

survival_rate = title_df['Survived', 'mean']
ax.barh(range(len(title_df)), survival_rate, tick_label=title_df.index)
ax.set_xlabel('Prob of surviving')

plt.show()