In [69]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical

In [70]:
X_train = pd.read_csv("input/train.csv")
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
X_test = pd.read_csv("input/test.csv")
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [72]:
X_train.drop('PassengerId', inplace = True, axis = 1)
X_test.drop('PassengerId', inplace = True, axis = 1)

y_train = X_train.iloc[:, 0].values.astype('int32')
X_train.drop('Survived', inplace = True, axis = 1)

In [73]:
X_train.Sex.unique()

array(['male', 'female'], dtype=object)

In [74]:
X_test.Sex.unique()

array(['male', 'female'], dtype=object)

In [75]:
X_train["Sex"] = X_train["Sex"].replace(["male", "female"],[-1, 1])
X_test["Sex"] = X_test["Sex"].replace(["male", "female"],[-1, 1])

In [76]:
X_train["Name"] = X_train["Name"].str.lower()
X_test["Name"] = X_test["Name"].str.lower()

In [77]:
X_train.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [78]:
X_test.Embarked.unique()

array(['Q', 'S', 'C'], dtype=object)

In [79]:
embarked_types = ['Q', 'S', 'C']
X_train = X_train.join(pd.get_dummies(X_train.Embarked, prefix = 'Embarked'))
X_train.drop('Embarked', inplace = True, axis = 1)

X_test = X_test.join(pd.get_dummies(X_test.Embarked, prefix = 'Embarked'))
X_test.drop('Embarked', inplace = True, axis = 1)

In [80]:
X_train['Name'] = X_train['Name'].str.extract('.*, ([A-Za-z]+)\..*', expand = False).str.strip()
X_test['Name'] = X_test['Name'].str.extract('.*, ([A-Za-z]+)\..*', expand = False).str.strip()

In [81]:
X_train.Name.unique()

array(['mr', 'mrs', 'miss', 'master', 'don', 'rev', 'dr', 'mme', 'ms',
       'major', 'lady', 'sir', 'mlle', 'col', 'capt', nan, 'jonkheer'],
      dtype=object)

In [82]:
X_test.Name.unique()

array(['mr', 'mrs', 'miss', 'master', 'ms', 'col', 'rev', 'dr', 'dona'],
      dtype=object)

In [83]:
title_types = ['mr', 'mrs', 'miss', 'master', 'don', 'rev', 'dr', 'mme', 'ms',
       'major', 'lady', 'sir', 'mlle', 'col', 'capt', 'jonkheer']
X_train = X_train.join(pd.get_dummies(X_train.Name, prefix = 'Title'))
X_train.drop('Name', inplace = True, axis = 1)

X_test = X_test.join(pd.get_dummies(X_test.Name, prefix = 'Title'))
X_test.drop('Name', inplace = True, axis = 1)

In [84]:
X_train['Age'] = X_train['Age'].fillna(0.0) / 100.0
X_test['Age'] = X_test['Age'].fillna(0.0) / 100.0

In [85]:
X_train.drop('Ticket', inplace = True, axis = 1)
X_test.drop('Ticket', inplace = True, axis = 1)

In [86]:
X_train['Fare'] = X_train['Fare'].fillna(0.0) / 512.3292
X_test['Fare'] = X_test['Fare'].fillna(0.0) / 512.3292

In [87]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,...,Title_major,Title_master,Title_miss,Title_mlle,Title_mme,Title_mr,Title_mrs,Title_ms,Title_rev,Title_sir
0,3,-1,0.22,1,0,0.014151,,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0.38,1,0,0.139136,C85,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,1,0.26,0,0,0.015469,,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,1,1,0.35,1,0,0.103644,C123,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,3,-1,0.35,0,0,0.015713,,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,-1,0.27,0,0,0.025374,,0,0,1,...,0,0,0,0,0,0,0,0,1,0
887,1,1,0.19,0,0,0.058556,B42,0,0,1,...,0,0,1,0,0,0,0,0,0,0
888,3,1,0.00,1,2,0.045771,,0,0,1,...,0,0,1,0,0,0,0,0,0,0
889,1,-1,0.26,0,0,0.058556,C148,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [88]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Title_col,Title_dona,Title_dr,Title_master,Title_miss,Title_mr,Title_mrs,Title_ms,Title_rev
0,3,-1,0.345,0,0,0.015282,,0,1,0,0,0,0,0,0,1,0,0,0
1,3,1,0.470,1,0,0.013663,,0,0,1,0,0,0,0,0,0,1,0,0
2,2,-1,0.620,0,0,0.018909,,0,1,0,0,0,0,0,0,1,0,0,0
3,3,-1,0.270,0,0,0.016908,,0,0,1,0,0,0,0,0,1,0,0,0
4,3,1,0.220,1,1,0.023984,,0,0,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,-1,0.000,0,0,0.015713,,0,0,1,0,0,0,0,0,1,0,0,0
414,1,1,0.390,0,0,0.212559,C105,1,0,0,0,1,0,0,0,0,0,0,0
415,3,-1,0.385,0,0,0.014151,,0,0,1,0,0,0,0,0,1,0,0,0
416,3,-1,0.000,0,0,0.015713,,0,0,1,0,0,0,0,0,1,0,0,0


In [89]:
X_train.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [90]:
X_test.Cabin.unique()

array([nan, 'B45', 'E31', 'B57 B59 B63 B66', 'B36', 'A21', 'C78', 'D34',
       'D19', 'A9', 'D15', 'C31', 'C23 C25 C27', 'F G63', 'B61', 'C53',
       'D43', 'C130', 'C132', 'C101', 'C55 C57', 'B71', 'C46', 'C116',
       'F', 'A29', 'G6', 'C6', 'C28', 'C51', 'E46', 'C54', 'C97', 'D22',
       'B10', 'F4', 'E45', 'E52', 'D30', 'B58 B60', 'E34', 'C62 C64',
       'A11', 'B11', 'C80', 'F33', 'C85', 'D37', 'C86', 'D21', 'C89',
       'F E46', 'A34', 'D', 'B26', 'C22 C26', 'B69', 'C32', 'B78',
       'F E57', 'F2', 'A18', 'C106', 'B51 B53 B55', 'D10 D12', 'E60',
       'E50', 'E39 E41', 'B52 B54 B56', 'C39', 'B24', 'D28', 'B41', 'C7',
       'D40', 'D38', 'C105'], dtype=object)