In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw train / test CSV files (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic_train_master.csv', 'titanic_test_master.csv')
)

In [2]:
# Normalise every column label to lower case in both frames.
for frame in (train, test):
    frame.columns = [str.lower(label) for label in frame.columns]

In [3]:
def create_dummies(df, column_names):
    """Append one-hot indicator columns for each listed column.

    For every name in ``column_names`` the indicator columns produced by
    ``pd.get_dummies`` (prefixed with that name) are concatenated to the
    right of the frame. The source columns themselves are kept.

    Parameters:
        df: DataFrame holding the columns to encode.
        column_names: iterable of column labels to encode, in order.

    Returns:
        A DataFrame with the original columns followed by the new indicator
        columns. The input object itself is returned when ``column_names``
        is empty.
    """
    result = df
    for name in column_names:
        indicator_cols = pd.get_dummies(result[name], prefix=name)
        result = pd.concat([result, indicator_cols], axis=1)
    return result

In [4]:
# PCLASS: add one indicator column per passenger class to both frames.
train, test = (create_dummies(frame, ['pclass']) for frame in (train, test))

In [5]:
# Lookup from the honorific found in a passenger name to a coarser title group.
# The groups are listed once and then inverted into the flat `titles` mapping.
_title_groups = {
    "mr":             ["Mr"],
    "mme":            ["Mme", "Ms"],
    "mrs":            ["Mrs"],
    "master":         ["Master"],
    "miss":           ["Mlle", "Miss"],
    "officer":        ["Capt", "Col", "Major", "Dr", "Rev"],
    "royalty_male":   ["Jonkheer", "Don", "Sir"],
    "royalty_female": ["Countess", "Dona", "Lady"],
}
titles = {
    honorific: group
    for group, honorifics in _title_groups.items()
    for honorific in honorifics
}
# Pull the honorific out of each name: the run of letters that follows a space
# and is terminated by a literal dot. The pattern is a raw string so that `\.`
# reaches the regex engine unchanged instead of being treated as an (invalid)
# Python string escape, which emits a warning on current Python versions.
title_pattern = r' ([A-Za-z]+)\.'

# Honorifics that are not keys of `titles` map to NaN in the 'title' column.
extracted_titles = train['name'].str.extract(title_pattern, expand=False)
train['title'] = extracted_titles.map(titles)
extracted_titles = test['name'].str.extract(title_pattern, expand=False)
test['title'] = extracted_titles.map(titles)

# Add one indicator column per title group to both frames.
train, test = create_dummies(train, ['title']), create_dummies(test, ['title'])

In [9]:
# Display test's column labels so they can be compared with train's.
test.columns

Index(['passengerid', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked', 'pclass_1', 'pclass_2',
       'pclass_3', 'title', 'title_master', 'title_miss', 'title_mme',
       'title_mr', 'title_mrs', 'title_officer', 'title_royalty_female'],
      dtype='object')

In [10]:
# Display train's column labels so they can be compared with test's.
train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked', 'pclass_1', 'pclass_2',
       'pclass_3', 'title', 'title_master', 'title_miss', 'title_mme',
       'title_mr', 'title_mrs', 'title_officer', 'title_royalty_female',
       'title_royalty_male'],
      dtype='object')

In [6]:
# Title dummies are built separately for each frame, so a title group that
# never occurs in test (royalty_male in the data inspected above) leaves test
# without that column. Add every title_* column that train has and test lacks,
# filled with 0. Columns test already has are left untouched, so real values
# are never overwritten.
missing_title_cols = [
    col for col in train.columns
    if col.startswith('title_') and col not in test.columns
]
for col in missing_title_cols:
    test[col] = 0

In [12]:
# Display test's column labels again to check that title_royalty_male is now present.
test.columns

Index(['passengerid', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked', 'pclass_1', 'pclass_2',
       'pclass_3', 'title', 'title_master', 'title_miss', 'title_mme',
       'title_mr', 'title_mrs', 'title_officer', 'title_royalty_female',
       'title_royalty_male'],
      dtype='object')

In [7]:
# SEX: add one indicator column per value of 'sex' to both frames.
sex_columns = ['sex']
train, test = (create_dummies(frame, sex_columns) for frame in (train, test))

In [8]:
# AGE
# Rows flagged as title_master with no recorded age get an age of 1, so those
# rows are not lost to the 'missing' group when ages are binned afterwards.
def set_age(row):
    """Return the age to use for one passenger row.

    Parameters:
        row: mapping / Series with at least 'title_master' and 'age'.

    Returns:
        1 when 'title_master' is set and 'age' is null; otherwise the
        row's own 'age' value (which may itself be null).
    """
    master_without_age = row['title_master'] & pd.isnull(row['age'])
    return 1 if master_without_age else row['age']

# Same rule as set_age, applied column-wise instead of row by row:
# where title_master is set and age is missing, age becomes 1; all other
# ages are left exactly as they are.
for frame in (train, test):
    master_without_age = frame['title_master'].astype(bool) & frame['age'].isnull()
    frame.loc[master_without_age, 'age'] = 1

def create_groups_dummies (df, col, cat_name, cut_points, label_names):
    """Bin a numeric column into labelled groups and add indicator columns.

    Missing values in ``col`` are replaced by -0.5, the column is binned with
    ``pd.cut`` into a new categorical column ``cat_name``, and indicator
    columns for ``cat_name`` are appended via ``create_dummies``.

    Parameters:
        df: source DataFrame; it is not modified.
        col: name of the numeric column to bin.
        cat_name: name of the categorical column to create.
        cut_points: bin edges passed to ``pd.cut``.
        label_names: one label per bin (``len(cut_points) - 1`` labels).

    Returns:
        A new DataFrame containing the filled ``col``, the ``cat_name``
        column and the indicator columns.
    """
    # Work on a copy so the caller's frame is not altered as a side effect;
    # callers only ever use the returned frame.
    df = df.copy()
    # -0.5 is the placeholder for missing values; cut_points must contain a
    # bin that covers it for those rows to receive a label.
    df[col] = df[col].fillna(-0.5)
    # NOTE(review): pd.cut bins are right-closed by default, and values outside
    # all bins get no category — confirm the supplied cut_points cover the data.
    df[cat_name] = pd.cut (df[col], cut_points, labels=label_names)
    return create_dummies (df, [cat_name])

# Age bins; the first bin (-1, 0] receives the -0.5 placeholder that
# create_groups_dummies substitutes for missing ages.
age_group_labels = ['missing', 'child', 'teenager', 'young_adult', 'adult', 'senior']
age_cut_points = [-1, 0, 12, 18, 30, 60, 100]

train, test = (
    create_groups_dummies(frame, 'age', 'age_cat', age_cut_points, age_group_labels)
    for frame in (train, test)
)

In [9]:
# SIBSP: number of siblings / spouses aboard, one indicator column per count.
# NOTE(review): a count that occurs in only one of train/test yields a column
# the other frame lacks — confirm the columns line up before modelling.
train, test = create_dummies(train, ['sibsp']), create_dummies(test, ['sibsp'])

In [10]:
# PARCH: number of parents / children aboard, one indicator column per count.
# NOTE(review): a count that occurs in only one of train/test yields a column
# the other frame lacks — confirm the columns line up before modelling.
train, test = (create_dummies(frame, ['parch']) for frame in (train, test))

In [11]:
# FARE: bin fares into labelled groups and add one indicator column per group.
# NOTE(review): pd.cut bins are right-closed by default, so a fare of exactly 0
# falls into the (-1, 0] bin labelled 'missing' — confirm that is intended.
fare_cut_points = [-1, 0, 12, 50, 100, 1000]
fare_group_labels = ['missing', '0-12', '12-50', '50-100', '100+']

train, test = (
    create_groups_dummies(frame, 'fare', 'fare_cat', fare_cut_points, fare_group_labels)
    for frame in (train, test)
)

In [12]:
# CABIN: the first character of the cabin value becomes the cabin type;
# rows without a cabin value are labelled 'UNKNOWN'.
train['cabin_type'] = train['cabin'].str[0].fillna('UNKNOWN')
test['cabin_type'] = test['cabin'].str[0].fillna('UNKNOWN')

# NOTE(review): a cabin letter that occurs in only one of train/test yields a
# column the other frame lacks — confirm the columns line up before modelling.
train, test = (create_dummies(frame, ['cabin_type']) for frame in (train, test))

In [13]:
# EMBARKED
# C = Cherbourg, Q = Queenstown, S = Southampton
# Missing embarkation values are replaced with 'U' (unknown) before encoding.
for frame in (train, test):
    frame['embarked'] = frame['embarked'].fillna('U')

# NOTE(review): if only one frame has missing values, only that frame gets an
# embarked_U column — confirm the columns line up before modelling.
train, test = create_dummies(train, ['embarked']), create_dummies(test, ['embarked'])

In [14]:
# Lower-case the column labels again: dummy columns created since the first
# pass take their suffix from the data values, which may contain upper case.
train.columns = [str.lower(label) for label in train.columns]
test.columns = [str.lower(label) for label in test.columns]

In [15]:
# Write both engineered frames to CSV, leaving the row index out of the files.
for frame, out_path in (
    (train, 'titanic_train_wrangled.csv'),
    (test, 'titanic_test_wrangled.csv'),
):
    frame.to_csv(out_path, index=False)