In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math

# Basic exploration
* Load the dataset
* Get basic stats on numeric features
* Get basic stats on cathegorical features

In [69]:
df_train = pd.read_csv('Data/train.csv', index_col=0)
df_test = pd.read_csv('Data/test.csv', index_col=0)
ids_test = df_test.index

# Split dataframe in features + labels
y = df_train["Survived"]
df_train = df_train.drop(["Survived"], axis=1)

print("Train shape: {}. Test shape: {}".format(df_train.shape, df_test.shape))

df = pd.concat([df_train, df_test])

print("\nColumns in dataset: {}".format(df.columns))
df.head()

Train shape: (891, 10). Test shape: (418, 10)

Columns in dataset: Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [70]:
df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1046.0,1309.0,1309.0,1308.0
mean,2.294882,29.881138,0.498854,0.385027,33.295479
std,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.17,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,39.0,1.0,0.0,31.275
max,3.0,80.0,8.0,9.0,512.3292


In [71]:
df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,1309,1309,1309,295,1307
unique,1307,2,929,186,3
top,"Connolly, Miss. Kate",male,CA. 2343,C23 C25 C27,S
freq,2,843,11,6,914


# Preprocess data
* Split features and labels
* Drop unuseful columns
* Create "Title" new feature
* Create dummies from categorical features

In [72]:
df = df.drop(['Ticket', 'Cabin'], axis=1)

In [73]:
df['Title'] = df["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)

df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
                                        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace('Ms', 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

In [74]:
dummy_cols = ["Title", "Sex", "Embarked", "Ticket", "Cabin"]

for col in dummy_cols:
    if col in df.columns:
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=False)
        df = pd.concat([df, dummies], axis=1)
                
df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,Mr,0,0,1,0,0,0,1,0,0,1
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Mrs,0,0,0,1,0,1,0,1,0,0
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Miss,0,1,0,0,0,1,0,0,0,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Mrs,0,0,0,1,0,1,0,0,0,1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,Mr,0,0,1,0,0,0,1,0,0,1


### Fill missing values with expected ones

In [75]:
for sex in df["Sex"].unique():
    for pclass in df["Pclass"].unique():
        age_guess = df[(df["Sex"] == sex) & (df["Pclass"] == pclass)]["Age"].dropna().median()

        # Convert random age float to nearest .5 age
        age_guess = int( age_guess/0.5 + 0.5 ) * 0.5
            
        df.loc[ (df["Age"].isnull()) & (df["Sex"] == sex) & (df["Pclass"] == pclass), "Age"] = age_guess

df["Age"] = df["Age"].astype(int)

df['Fare'].fillna(df['Fare'].dropna().median(), inplace=True)

df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,3,"Braund, Mr. Owen Harris",male,22,1,0,7.25,S,Mr,0,0,1,0,0,0,1,0,0,1
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71.2833,C,Mrs,0,0,0,1,0,1,0,1,0,0
3,3,"Heikkinen, Miss. Laina",female,26,0,0,7.925,S,Miss,0,1,0,0,0,1,0,0,0,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53.1,S,Mrs,0,0,0,1,0,1,0,0,0,1
5,3,"Allen, Mr. William Henry",male,35,0,0,8.05,S,Mr,0,0,1,0,0,0,1,0,0,1


In [76]:
cols_to_drop = [c for c in ['Name'] + dummy_cols if c in df.columns]
df.drop(cols_to_drop, axis=1, inplace=True)

df.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,3,22,1,0,7.25,0,0,1,0,0,0,1,0,0,1
2,1,38,1,0,71.2833,0,0,0,1,0,1,0,1,0,0
3,3,26,0,0,7.925,0,1,0,0,0,1,0,0,0,1
4,1,35,1,0,53.1,0,0,0,1,0,1,0,0,0,1
5,3,35,0,0,8.05,0,0,1,0,0,0,1,0,0,1


### Normalize input data

In [77]:
df = (df - df.mean())/df.std()
df.head(3)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.841595,-0.547094,0.481104,-0.444829,-0.503099,-0.221,-0.502433,0.853602,-0.421997,-0.150462,-0.743213,0.743213,-0.509575,-0.321917,0.657142
2,-1.545507,0.662357,0.481104,-0.444829,0.734463,-0.221,-0.502433,-1.170611,2.367873,-0.150462,1.344482,-1.344482,1.96092,-0.321917,-1.520578
3,0.841595,-0.244731,-0.478904,-0.444829,-0.490053,-0.221,1.988796,-1.170611,-0.421997,-0.150462,1.344482,-1.344482,-0.509575,-0.321917,0.657142


# Split dataset in train, test, val
* Train --> for training the model
* Val --> to check accuracy achived and prevent overfitting
* Test --> final data where to perfome predictions

I will use stratified shuffle split to randomly split train and validation

In [78]:
from sklearn.model_selection import StratifiedShuffleSplit

ss = StratifiedShuffleSplit(n_splits=1, test_size=0.1)

x_test = df.ix[ids_test, :]
df = df[~df.index.isin(ids_test)]

train_idx, val_idx = next(ss.split(df, y))

x_train, y_train = df[df.index.isin(train_idx)], y[y.index.isin(train_idx)]
x_val, y_val = df[df.index.isin(val_idx)], y[y.index.isin(val_idx)]

print("x_train: {}, y_train: {}.".format(x_train.shape, y_train.shape))
print("x_val: {}, y_val: {}.".format(x_val.shape, y_val.shape))
print("x_test: {}".format(x_test.shape))

x_train: (800, 15), y_train: (800,).
x_val: (90, 15), y_val: (90,).
x_test: (418, 15)


# Build the Neural Network

In [123]:
tf.reset_default_graph()

inputs = tf.placeholder(tf.float32, (None, x_train.shape[1]), name='input')
labels = tf.placeholder(tf.float32, (None, 2), name='label')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
learning_rate = tf.placeholder(tf.float32, name='learning_rate')

### Design the neural network

In [125]:
alpha = 0.1

nn = tf.layers.dense(inputs, 16, activation=None)
nn = tf.maximum(nn, nn*alpha)
nn = tf.nn.dropout(nn, keep_prob)

nn = tf.layers.dense(nn, 16, activation=None)
nn = tf.maximum(nn, nn*alpha)
nn = tf.nn.dropout(nn, keep_prob)

logits = tf.layers.dense(nn, 2, activation=tf.nn.softmax)

### Define functions needed to train

In [126]:
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

In [127]:
def get_batch(x, y, batch_size):
    
    n_batches = x.shape[0]//batch_size
    
    for idx in range(0, n_batches):
        idx_low, idx_high = (idx*batch_size, (idx+1)*batch_size)
        yield x[idx_low:idx_high], y[idx_low:idx_high]
    
    # If there is more data, yield the remaining
    if (idx + 1)*batch_size + 1 < x.shape[0]:
        idx_low, idx_high = (idx_high, x.shape[0])
        yield x[idx_low:idx_high], y[idx_low:idx_high]

### Train NN

In [128]:
epochs = 300
keep_probability = 0.5
lr = 0.01
batch_size = 64

print_every = 10

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
        
    for e in range(epochs+1):
            
        for x, y in get_batch(x_train, y_train, batch_size):
                
            y = [[label, 1-label] for label in y]
                
            sess.run(optimizer, feed_dict={
                                inputs: x, 
                                labels: y,
                                keep_prob: keep_probability,
                                learning_rate: lr})
            
        # Print accuracy
        if e % print_every == 0:
            acc_train = accuracy.eval({inputs: x_train, 
                                            labels: [[label, 1-label] for label in y_train],
                                            keep_prob: 1})
            
            acc_val = accuracy.eval({inputs: x_val, 
                                            labels: [[label, 1-label] for label in y_val],
                                            keep_prob: 1})
                
            print("Epoch {}/{}\tAccuracy train: {:.4f}\tAccuracy val: {:.4f}".format(e, epochs, acc_train, acc_val))
                
    labels = logits.eval({inputs: x_test, keep_prob: 1})

Epoch 0/300	Accuracy train: 0.5813	Accuracy val: 0.5444
Epoch 10/300	Accuracy train: 0.7225	Accuracy val: 0.7000
Epoch 20/300	Accuracy train: 0.7850	Accuracy val: 0.7222
Epoch 30/300	Accuracy train: 0.7962	Accuracy val: 0.7222
Epoch 40/300	Accuracy train: 0.7988	Accuracy val: 0.7222
Epoch 50/300	Accuracy train: 0.8025	Accuracy val: 0.7333
Epoch 60/300	Accuracy train: 0.8038	Accuracy val: 0.7333
Epoch 70/300	Accuracy train: 0.8000	Accuracy val: 0.7333
Epoch 80/300	Accuracy train: 0.8000	Accuracy val: 0.7333
Epoch 90/300	Accuracy train: 0.8000	Accuracy val: 0.7333
Epoch 100/300	Accuracy train: 0.7987	Accuracy val: 0.7333
Epoch 110/300	Accuracy train: 0.8012	Accuracy val: 0.7333
Epoch 120/300	Accuracy train: 0.8037	Accuracy val: 0.7444
Epoch 130/300	Accuracy train: 0.8075	Accuracy val: 0.7556
Epoch 140/300	Accuracy train: 0.8075	Accuracy val: 0.7556
Epoch 150/300	Accuracy train: 0.8087	Accuracy val: 0.7556
Epoch 160/300	Accuracy train: 0.8113	Accuracy val: 0.7556
Epoch 170/300	Accuracy tr

In [129]:
predictions = pd.DataFrame([1 if x[0] > x[1] else 0 for x in labels], index=df_test.index, columns=["Survived"])
predictions.to_csv("Data/predictions.csv", sep=";")
predictions.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
