In [433]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np
from statistics import mode
import re
from pycaret.classification import *
from keras.models import Sequential
from keras.layers import Dense

# Load the training and test CSV files from the local data/ folder
X, X_test_full = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [434]:
# Stack the train and test rows so cleaning and encoding are applied to both at once.
# Test rows carry no Survived value, so that column is NaN for them; the original
# row labels of each frame are kept as-is.
full = pd.concat(objs=(X, X_test_full), axis=0)
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [435]:
# Impute missing ports of embarkation with the most frequent value ('S' on this data).
# Series.mode() ignores NaN by default (dropna=True), unlike statistics.mode, which
# would count the missing-value placeholders as ordinary observations.
most_common_port = full["Embarked"].mode().iloc[0]
full["Embarked"] = full["Embarked"].fillna(most_common_port)

In [436]:
 # fill missing cabins with new U type
# 'U' acts as a placeholder cabin code for passengers with no recorded cabin
full['Cabin'] = full['Cabin'].where(full['Cabin'].notna(), 'U')

In [437]:
# Cabin codes begin with letters that appear to identify a section of the ship.
# Reduce every cabin value to the first run of letters found in it.
full['Cabin'] = [re.search("([a-zA-Z]+)", cabin).group() for cabin in full['Cabin']]

In [438]:
# Fill missing ages with the median age of the passenger's class
# (Pclass was picked as the feature most correlated with Age).
full['Age'] = (
    full
    .groupby("Pclass")['Age']
    .transform(lambda class_ages: class_ages.fillna(class_ages.median()))
)

In [439]:
# Fill missing fares with the median fare of the passenger's class
# (Pclass was picked as the feature most correlated with Fare).
full['Fare'] = (
    full
    .groupby("Pclass")['Fare']
    .transform(lambda class_fares: class_fares.fillna(class_fares.median()))
)

In [440]:
# Show the distinct cabin initials, in order of first appearance
pd.unique(full['Cabin']).tolist()

In [441]:
# Extract the salutation from each name: the run of letters that follows a space
# and ends with a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine untouched;
# in a plain string literal it is an invalid escape sequence that Python warns about.
full['Title'] = full['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
full['Title'].unique().tolist()

In [442]:
# Family size = siblings/spouses aboard (SibSp) + parents/children aboard (Parch)
# + 1 for the passenger themselves.
full['familySize'] = full['SibSp'].add(full['Parch']).add(1)

In [443]:
# Remove columns that are no longer needed: Name, SibSp and Parch have been
# replaced by the derived Title and familySize features; Ticket is dropped unused.
full = full.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket'])

In [444]:
# One-hot encode the remaining text columns (e.g. Sex, Cabin, Title) into
# indicator columns; numeric columns pass through unchanged.
full_ohe = pd.get_dummies(data=full)



In [445]:
# Recover the original train and test sets: only training rows carry a Survived label.
# .copy() makes both frames independent of full_ohe, so later column assignments
# on them do not raise SettingWithCopyWarning.
has_label = full_ohe['Survived'].notna()
train = full_ohe[has_label].copy()
test = full_ohe[~has_label].drop(columns=['Survived']).copy()

In [446]:
# Survived became float when train and test were concatenated; cast it back to int.
# The frame is rebuilt with astype rather than assigned through .loc[:, col]:
# recent pandas versions write .loc[:, col] assignments into the existing float
# column, which silently undoes the int8 cast, and the assignment can also
# trigger SettingWithCopyWarning.
train = train.astype({'Survived': np.int8})

In [447]:
# Sanity check on the training frame's dimensions; the column count includes
# the Survived label alongside the model inputs.
train.shape

(891, 38)

In [451]:
# Neural Network: two ReLU hidden layers followed by a single sigmoid unit for
# the binary Survived target, trained with Adam on binary cross-entropy.
# The input width is derived from the training frame (every column except the
# Survived label) instead of being hard-coded, so it stays correct if the
# encoded feature set changes.
# NOTE(review): the inputs passed to fit/predict still include PassengerId, a row
# identifier rather than a real feature; consider dropping it consistently in the
# model definition, fit, evaluate and predict cells.
n_features = train.drop(columns=['Survived']).shape[1]

nn = Sequential()
nn.add(Dense(152, kernel_initializer='uniform', activation='relu', input_dim=n_features))
nn.add(Dense(304, kernel_initializer='uniform', activation='relu'))
nn.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
nn.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

In [452]:
nn.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 152)               5776      
_________________________________________________________________
dense_23 (Dense)             (None, 304)               46512     
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 305       
Total params: 52,593
Trainable params: 52,593
Non-trainable params: 0
_________________________________________________________________


In [453]:
# Train on every labelled row: all columns except Survived are inputs,
# Survived is the target.
nn.fit(
    x=train.drop(columns=['Survived']),
    y=train['Survived'],
    batch_size=64,
    epochs=250,
)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/2

Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 

Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


<keras.callbacks.callbacks.History at 0x1a8485c590>

In [454]:
# Evaluate on the same rows the network was trained on. These numbers measure
# fit to the training data, not performance on unseen passengers, so they are
# labelled as training metrics. `score` is the binary cross-entropy loss.
score, acc = nn.evaluate(train.drop(columns=['Survived']), train['Survived'], batch_size=32)
print('Train loss:', score)
print('Train accuracy:', acc)

Test score: 0.2889716106265215
Test accuracy: 0.872053861618042


In [455]:
# Make predictions on the test set.
# Sequential.predict_classes is not available in newer Keras/TensorFlow releases;
# for a single sigmoid output the equivalent is thresholding the predicted
# probability at 0.5.
# Any existing Survived column is excluded from the inputs, so re-running this
# cell after it has added Survived does not feed the network an extra column.
test_inputs = test.drop(columns=['Survived'], errors='ignore')
survival_prob = nn.predict(test_inputs)
test['Survived'] = (survival_prob > 0.5).astype('int32').ravel()

In [456]:
# Inspect the test frame, which now carries the predicted Survived column
test

Unnamed: 0,PassengerId,Pclass,Age,Fare,familySize,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Survived
0,892,3,34.5,7.8292,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,893,3,47.0,7.0000,2,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,894,2,62.0,9.6875,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,895,3,27.0,8.6625,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,896,3,22.0,12.2875,3,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,24.0,8.0500,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
414,1306,1,39.0,108.9000,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
415,1307,3,38.5,7.2500,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
416,1308,3,24.0,8.0500,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [457]:
# Write the submission file: passenger id plus predicted label, without the
# DataFrame index.
(
    test
    .loc[:, ['PassengerId', 'Survived']]
    .to_csv('data/submission_FE_nn.csv', index=False)
)