#### ML05: ML Project Part 3

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [3]:
# Relative directory prefix for the input data files. The trailing slash matters:
# the file name is appended to this string with plain string formatting.
path = '../data/'

In [8]:
# Reading in the data
df = pd.read_csv('{}titanic_train.csv'.format(path))
# Lower-case every column name so the rest of the notebook can use one spelling
df.columns = [
    x.lower() for x in df.columns
]

# Fill the nan values in embarked with '' to make dealing with them easier.
# fillna returns a new Series, so the result must be assigned back; calling it
# without assignment leaves the column untouched.
df['embarked'] = df['embarked'].fillna('')

sex_codes = {
    'male' : 1,
    'female' : 2,
}

embarked_codes = {
    'S': 1,
    'Q': 2,
    'C': 3,
    '': None   # placeholder for a missing port -> stays missing after encoding
}

# Encoding sex and embarked values.
# The encoded Series is assigned back to the frame rather than using
# df[col].replace(..., inplace=True): an inplace call on a column selection is
# chained assignment, which pandas warns about and which does not update df
# once Copy-on-Write is active.
# Note: Series.map turns any value that is not a key of the dict into NaN.
df['sex'] = df['sex'].map(sex_codes)
df['embarked'] = df['embarked'].map(embarked_codes)

# Family size: the sibsp and parch counts plus one for the passenger
df['family_size'] = df['sibsp'] + df['parch'] + 1

# Lone Travellers
def is_alone(row):
    '''
    Flags a row whose family size is below two.

    row: record that can be indexed with 'family_size'
         (for example a DataFrame row passed in by apply).

    Gives back 0 when family_size is 2 or more and 1 in every other case.
    '''
    return 0 if row['family_size'] >= 2 else 1
    
# 1 for rows with a family size below two, 0 otherwise
df['alone'] = df.apply(is_alone, axis=1)

# Dropping cabin (reassign instead of mutating the frame in place)
df = df.drop(columns='cabin')

# Splitting the name
df['split_name'] = df['name'].str.split()
# The surname is everything before the first comma in the name. Taking the first
# whitespace-separated token minus its last character instead would cut
# multi-word surnames short (e.g. "Vander Planke, ..." would become "Vande").
surnames = [name.split(',')[0].strip() for name in df['name'].values]
df['surname'] = surnames
# Family identifier: surname immediately followed by the family size
df['family_id'] = df['surname'].astype(str) + df['family_size'].astype(str)

# Titles
def title(row):
    '''
    Extracts the title (such as 'Mr' or 'Miss') from a row's name.

    row: record that can be indexed with 'name'
         (for example a DataFrame row passed in by apply).

    Returns the first run of ASCII letters that comes right after a space
    and is immediately followed by a period. Returns "" when the name has
    no such run or when the name is not a string (e.g. a missing value).
    '''
    name = row['name']
    # A missing name arrives as NaN/None; re.search would raise on it.
    if not isinstance(name, str):
        return ""
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Pull the title out of every name
df['title'] = df.apply(title, axis=1)

# Encoding Title
title_codes = {
    'Mr': 1,       # General adult male
    'Mrs': 2,      # General adult female
    'Miss': 3,     # General young female
    'Master': 4,   # General young male
    'Don': 5,      # Noble male
    'Rev': 6,      # Professional
    'Dr': 6,       # Professional
    'Mme': 2,      # General adult female
    'Ms': 2,       # General adult female
    'Major': 6,    # Professional
    'Lady': 7,     # Noble female
    'Sir' : 5,     # Noble male
    'Mlle': 3,     # General young female
    'Col': 6,      # Professional
    'Capt': 6,     # Professional
    'Countess': 7, # Noble female
    'Jonkheer': 5  # Noble male
}

# Assign the encoded Series back to the frame. Calling
# df['title'].replace(..., inplace=True) is chained assignment on a column
# selection, which pandas warns about and which does not update df once
# Copy-on-Write is active.
# Note: Series.map turns any title that is not a key of title_codes into NaN.
df['title'] = df['title'].map(title_codes)

# Inferring age

def infer_age(row):
    '''
    Supplies a stand-in age for rows whose age is missing.

    row: record that can be indexed with 'age' and 'title'
         (for example a DataFrame row passed in by apply).

    A non-null age is handed back unchanged. For a null age the value is
    looked up from the row's encoded title; a title without an entry in
    the table gives None.
    '''
    known_age = row['age']
    if not pd.isnull(known_age):
        return known_age

    # Encoded title -> age to assume
    age_by_title = {
        1: 30,  # Mr
        2: 35,  # Mrs
        3: 21,  # Miss
        4: 4,   # Master
        5: 40,  # Noble male
        6: 50,  # Professional
        7: 40,  # Noble female
    }
    return age_by_title.get(row['title'])

# Overwrite the age column with infer_age's result for every row
df['age'] = df.apply(infer_age, axis=1)

In [9]:
# Display the first rows of the frame
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,family_size,alone,split_name,surname,family_id,title
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,1.0,2,0,"[Braund,, Mr., Owen, Harris]",Braund,Braund2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,3.0,2,0,"[Cumings,, Mrs., John, Bradley, (Florence, Bri...",Cumings,Cumings2,2
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,1.0,1,1,"[Heikkinen,, Miss., Laina]",Heikkinen,Heikkinen1,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,1.0,2,0,"[Futrelle,, Mrs., Jacques, Heath, (Lily, May, ...",Futrelle,Futrelle2,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,1.0,1,1,"[Allen,, Mr., William, Henry]",Allen,Allen1,1


In [None]:
# One Hot Encoding

# Clustering the Family ID