## Background
This kernel is intended to use Keras on the classic Titanic survivors dataset.  It is assuming that you are familiar with the titanic survivors data and skips most of the very necessary EDA. <br />
Specifically I want to see if some of the SibSp and Parch feature engineering can be avoided by using a deep learning architecture and still get a decent enough score.

## Load environment

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu")

Using Theano backend.


In [2]:
raw_train = pd.read_csv('train.csv', index_col=0)
raw_train['is_test'] = 0
raw_test = pd.read_csv('test.csv', index_col=0)
raw_test['is_test'] = 1

In [3]:
all_data = pd.concat((raw_train, raw_test), axis=0)

## Functions to preprocess the data

In [4]:
def get_title_last_name(name):
    full_name = name.str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def get_titles_from_names(df):
    df['Title'] = get_title_last_name(df['Name'])
    df = df.drop(['Name'], axis=1)
    return(df)

def get_dummy_cats(df):
    return(pd.get_dummies(df, columns=['Title', 'Pclass', 'Sex', 'Embarked',
                                       'Cabin', 'Cabin_letter']))

def get_cabin_letter(df):    
    df['Cabin'].fillna('Z', inplace=True)
    df['Cabin_letter'] = df['Cabin'].str[0]    
    return(df)

def process_data(df):
    # preprocess titles, cabin, embarked
    df = get_titles_from_names(df)    
    df['Embarked'].fillna('S', inplace=True)
    df = get_cabin_letter(df)
    
    # drop remaining features
    df = df.drop(['Ticket', 'Fare'], axis=1)
    
    # create dummies for categorial features
    df = get_dummy_cats(df)
    
    return(df)

proc_data = process_data(all_data)
proc_train = proc_data[proc_data['is_test'] == 0]
proc_test = proc_data[proc_data['is_test'] == 1]

In [5]:
proc_data.head()

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,0,1,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,38.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,0,1,1.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,35.0,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


## Build Network to predict missing ages

In [6]:
for_age_train = proc_data.drop(['Survived', 'is_test'], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop('Age', axis=1)
y_train_age = for_age_train['Age']

In [7]:
# create model
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=1))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop')

In [8]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=600, verbose=2)



Epoch 1/600
0s - loss: 530.0692
Epoch 2/600
0s - loss: 232.1248
Epoch 3/600
0s - loss: 210.6602
Epoch 4/600
0s - loss: 207.0879
Epoch 5/600
0s - loss: 185.7582
Epoch 6/600
0s - loss: 182.1054
Epoch 7/600
0s - loss: 182.0427
Epoch 8/600
0s - loss: 180.9612
Epoch 9/600
0s - loss: 162.4591
Epoch 10/600
0s - loss: 158.3717
Epoch 11/600
0s - loss: 175.3220
Epoch 12/600
0s - loss: 154.4551
Epoch 13/600
0s - loss: 157.8224
Epoch 14/600
0s - loss: 160.0000
Epoch 15/600
0s - loss: 148.6917
Epoch 16/600
0s - loss: 149.0184
Epoch 17/600
0s - loss: 146.7311
Epoch 18/600
0s - loss: 144.4838
Epoch 19/600
0s - loss: 128.4545
Epoch 20/600
0s - loss: 125.7445
Epoch 21/600
0s - loss: 131.7982
Epoch 22/600
0s - loss: 132.3405
Epoch 23/600
0s - loss: 117.4237
Epoch 24/600
0s - loss: 141.5612
Epoch 25/600
0s - loss: 114.9771
Epoch 26/600
0s - loss: 119.4417
Epoch 27/600
0s - loss: 114.5314
Epoch 28/600
0s - loss: 111.8016
Epoch 29/600
0s - loss: 121.7436
Epoch 30/600
0s - loss: 115.9408
Epoch 31/600
0s - l

0s - loss: 85.1720
Epoch 251/600
0s - loss: 86.3587
Epoch 252/600
0s - loss: 88.2551
Epoch 253/600
0s - loss: 82.6431
Epoch 254/600
0s - loss: 85.5116
Epoch 255/600
0s - loss: 86.4610
Epoch 256/600
0s - loss: 82.9153
Epoch 257/600
0s - loss: 84.5576
Epoch 258/600
0s - loss: 88.3811
Epoch 259/600
0s - loss: 85.5838
Epoch 260/600
0s - loss: 87.3986
Epoch 261/600
0s - loss: 83.8791
Epoch 262/600
0s - loss: 81.4282
Epoch 263/600
0s - loss: 84.5451
Epoch 264/600
0s - loss: 86.9312
Epoch 265/600
0s - loss: 82.7666
Epoch 266/600
0s - loss: 81.1542
Epoch 267/600
0s - loss: 86.4679
Epoch 268/600
0s - loss: 81.6921
Epoch 269/600
0s - loss: 79.6821
Epoch 270/600
0s - loss: 86.4199
Epoch 271/600
0s - loss: 82.5076
Epoch 272/600
0s - loss: 82.7721
Epoch 273/600
0s - loss: 87.6904
Epoch 274/600
0s - loss: 80.1991
Epoch 275/600
0s - loss: 83.6307
Epoch 276/600
0s - loss: 81.5857
Epoch 277/600
0s - loss: 85.1187
Epoch 278/600
0s - loss: 85.0972
Epoch 279/600
0s - loss: 81.1062
Epoch 280/600
0s - loss:

0s - loss: 79.3435
Epoch 500/600
0s - loss: 78.1982
Epoch 501/600
0s - loss: 78.7918
Epoch 502/600
0s - loss: 77.7252
Epoch 503/600
0s - loss: 78.6179
Epoch 504/600
0s - loss: 82.6277
Epoch 505/600
0s - loss: 83.2059
Epoch 506/600
0s - loss: 82.3070
Epoch 507/600
0s - loss: 78.4139
Epoch 508/600
0s - loss: 79.1373
Epoch 509/600
0s - loss: 76.4343
Epoch 510/600
0s - loss: 80.0337
Epoch 511/600
0s - loss: 78.3259
Epoch 512/600
0s - loss: 79.7590
Epoch 513/600
0s - loss: 75.8917
Epoch 514/600
0s - loss: 78.8722
Epoch 515/600
0s - loss: 79.1645
Epoch 516/600
0s - loss: 78.9403
Epoch 517/600
0s - loss: 77.2216
Epoch 518/600
0s - loss: 78.7246
Epoch 519/600
0s - loss: 80.6653
Epoch 520/600
0s - loss: 77.2441
Epoch 521/600
0s - loss: 79.9453
Epoch 522/600
0s - loss: 72.7190
Epoch 523/600
0s - loss: 80.3604
Epoch 524/600
0s - loss: 77.7848
Epoch 525/600
0s - loss: 78.4719
Epoch 526/600
0s - loss: 80.0479
Epoch 527/600
0s - loss: 78.9010
Epoch 528/600
0s - loss: 82.2508
Epoch 529/600
0s - loss:

<keras.callbacks.History at 0x7fd1d8ddfa50>

In [9]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
30,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
32,,0,1,1.0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
33,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
37,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
43,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [10]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


ValueError: shape mismatch: value array of shape (177,1) could not be broadcast to indexing result of shape (177,)

In [11]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(
          ['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
test_data['Age'].loc[test_data['Age'].isnull()] = p

ValueError: shape mismatch: value array of shape (86,1) could not be broadcast to indexing result of shape (86,)

In [12]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Age,Parch,SibSp,Survived,is_test,Title_Capt,Title_Col,Title_Don,Title_Dona,Title_Dr,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
30,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
32,,0,1,1.0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
33,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
37,,0,0,1.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
43,,0,0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [13]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [14]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

In [15]:
# create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0, 15):
    model.add(Dense(units=128, kernel_initializer='normal',
                     bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(.40))

model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.fit(X.values, y.values, epochs=500, verbose=2)

Epoch 1/500
0s - loss: nan - acc: 0.6195
Epoch 2/500
0s - loss: nan - acc: 0.6162
Epoch 3/500
0s - loss: nan - acc: 0.6162
Epoch 4/500
0s - loss: nan - acc: 0.6162
Epoch 5/500
0s - loss: nan - acc: 0.6162
Epoch 6/500
0s - loss: nan - acc: 0.6162
Epoch 7/500
0s - loss: nan - acc: 0.6162
Epoch 8/500
0s - loss: nan - acc: 0.6162
Epoch 9/500
0s - loss: nan - acc: 0.6162
Epoch 10/500
0s - loss: nan - acc: 0.6162
Epoch 11/500
0s - loss: nan - acc: 0.6162
Epoch 12/500
0s - loss: nan - acc: 0.6162
Epoch 13/500
0s - loss: nan - acc: 0.6162
Epoch 14/500
0s - loss: nan - acc: 0.6162
Epoch 15/500
0s - loss: nan - acc: 0.6162
Epoch 16/500
0s - loss: nan - acc: 0.6162
Epoch 17/500
0s - loss: nan - acc: 0.6162
Epoch 18/500
0s - loss: nan - acc: 0.6162
Epoch 19/500
0s - loss: nan - acc: 0.6162
Epoch 20/500
0s - loss: nan - acc: 0.6162
Epoch 21/500
0s - loss: nan - acc: 0.6162
Epoch 22/500
0s - loss: nan - acc: 0.6162
Epoch 23/500
0s - loss: nan - acc: 0.6162
Epoch 24/500
0s - loss: nan - acc: 0.6162
E

0s - loss: nan - acc: 0.6162
Epoch 195/500
0s - loss: nan - acc: 0.6162
Epoch 196/500
0s - loss: nan - acc: 0.6162
Epoch 197/500
0s - loss: nan - acc: 0.6162
Epoch 198/500
0s - loss: nan - acc: 0.6162
Epoch 199/500
0s - loss: nan - acc: 0.6162
Epoch 200/500
0s - loss: nan - acc: 0.6162
Epoch 201/500
0s - loss: nan - acc: 0.6162
Epoch 202/500
0s - loss: nan - acc: 0.6162
Epoch 203/500
0s - loss: nan - acc: 0.6162
Epoch 204/500
0s - loss: nan - acc: 0.6162
Epoch 205/500
0s - loss: nan - acc: 0.6162
Epoch 206/500
0s - loss: nan - acc: 0.6162
Epoch 207/500
0s - loss: nan - acc: 0.6162
Epoch 208/500
0s - loss: nan - acc: 0.6162
Epoch 209/500
0s - loss: nan - acc: 0.6162
Epoch 210/500
0s - loss: nan - acc: 0.6162
Epoch 211/500
0s - loss: nan - acc: 0.6162
Epoch 212/500
0s - loss: nan - acc: 0.6162
Epoch 213/500
0s - loss: nan - acc: 0.6162
Epoch 214/500
0s - loss: nan - acc: 0.6162
Epoch 215/500
0s - loss: nan - acc: 0.6162
Epoch 216/500
0s - loss: nan - acc: 0.6162
Epoch 217/500
0s - loss: 

0s - loss: nan - acc: 0.6162
Epoch 386/500
0s - loss: nan - acc: 0.6162
Epoch 387/500
0s - loss: nan - acc: 0.6162
Epoch 388/500
0s - loss: nan - acc: 0.6162
Epoch 389/500
0s - loss: nan - acc: 0.6162
Epoch 390/500
0s - loss: nan - acc: 0.6162
Epoch 391/500
0s - loss: nan - acc: 0.6162
Epoch 392/500
0s - loss: nan - acc: 0.6162
Epoch 393/500
0s - loss: nan - acc: 0.6162
Epoch 394/500
0s - loss: nan - acc: 0.6162
Epoch 395/500
0s - loss: nan - acc: 0.6162
Epoch 396/500
0s - loss: nan - acc: 0.6162
Epoch 397/500
0s - loss: nan - acc: 0.6162
Epoch 398/500
0s - loss: nan - acc: 0.6162
Epoch 399/500
0s - loss: nan - acc: 0.6162
Epoch 400/500
0s - loss: nan - acc: 0.6162
Epoch 401/500
0s - loss: nan - acc: 0.6162
Epoch 402/500
0s - loss: nan - acc: 0.6162
Epoch 403/500
0s - loss: nan - acc: 0.6162
Epoch 404/500
0s - loss: nan - acc: 0.6162
Epoch 405/500
0s - loss: nan - acc: 0.6162
Epoch 406/500
0s - loss: nan - acc: 0.6162
Epoch 407/500
0s - loss: nan - acc: 0.6162
Epoch 408/500
0s - loss: 

<keras.callbacks.History at 0x7fd1d0c16e90>

In [None]:
test_data.columns

In [None]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)

In [None]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

In [None]:
submission.shape

In [None]:
submission.to_csv('titanic_keras_cs.csv', index=False)