# Titanic Survival Prediction

In [None]:
# Import our standard libraries.
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns  # for nicer plots
sns.set(style='darkgrid')  # default style
import tensorflow as tf
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Load and prepare data

In [None]:
# Data comes from the Kaggle Titanic competition (https://www.kaggle.com/c/titanic/),
# read here from a GitHub mirror. `df` holds the labeled records and `df_test`
# the Kaggle test records.
df = pd.read_csv(
    'https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')
df_test = pd.read_csv(
    'https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/test.csv')

print('Labeled data records: %d' % len(df))
print('Kaggle test data records: %d' % len(df_test))
print('First few records:')
display(df.head(10))

### Deal with missing values

In [None]:
# Count the missing (NaN) entries in every column of both datasets.
print(df.isna().sum())
print(df_test.isna().sum())

In [None]:
# Replace missing values.
# The filled frame is assigned back rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column
# object, which newer pandas versions warn about and which leaves the
# DataFrame unchanged when Copy-on-Write is active.
# Missing ages become 0, which lies below the first age-bin boundary (0.1)
# used by the bucketed models, so they land in the lowest bucket.
df = df.fillna({'Age': 0, 'Cabin': 'Unknown', 'Embarked': 'Unknown'})

df_test = df_test.fillna({'Age': 0, 'Cabin': 'Unknown', 'Fare': 0})

### Feature engineering

In [None]:
def get_title(x):
  """Extract the honorific (e.g. 'Mr.', 'Mrs.') from a passenger record.

  Names are expected to look like 'Surname, Title. Given names'. The title is
  the text between the first comma and the first following period, with the
  period kept. Taking the whole span (rather than only the first word after
  the comma) keeps multi-word titles such as 'the Countess.' intact.

  Args:
    x: A row-like mapping with a 'Name' string entry.

  Returns:
    The title including its trailing period. If the text after the comma has
    no period, its first word is returned; if the name has no comma (or
    nothing after it), 'Unknown' is returned.
  """
  _, comma, rest = x['Name'].partition(',')
  rest = rest.strip()
  if not comma or not rest:
    return 'Unknown'
  title, period, _ = rest.partition('.')
  if period:
    return title.strip() + '.'
  # No period delimits the title: fall back to the first word.
  return rest.split(' ')[0]

# Add the extracted title as a 'Title' column on both datasets.
for frame in (df, df_test):
  frame['Title'] = frame.apply(get_title, axis=1)

# Show how often each title occurs in the labeled data.
print(df['Title'].value_counts())

### Train/dev split

In [None]:
# Sample 85% of the labeled rows for training (fixed seed for a repeatable
# split); the remaining rows form the dev set.
df_train = df.sample(frac=0.85, random_state=3)
df_dev = df.drop(index=df_train.index)

print(df_train.shape, df_dev.shape, df_test.shape, sep='\n')

## Basic data analysis

In [None]:
# Class balance of the training labels.
label_values = df_train['Survived'].value_counts()
display(label_values)
# value_counts() orders classes by frequency, so each wedge label is derived
# from its class value instead of assuming 0 ('Died') always comes first.
label_names = {0: 'Died', 1: 'Survived'}
plt.pie(label_values,
        labels=[label_names[value] for value in label_values.index],
        autopct='%.2f')
plt.show()

In [None]:
# Age distribution split by outcome (Survived == 0 first, then Survived == 1).
plt.hist([df_train.loc[df_train['Survived'] == 0, 'Age'],
          df_train.loc[df_train['Survived'] == 1, 'Age']])
plt.show()

In [None]:
# 2x2 grid of histograms, one feature per panel, each split by outcome.
# Every entry is (column name, number of bins) in panel order.
plt.figure(figsize=(14, 8))
panels = [('Age', 20), ('Sex', 3), ('Pclass', 5), ('Embarked', 4)]
for position, (column, n_bins) in enumerate(panels, start=1):
  plt.subplot(2, 2, position)
  plt.hist([df_train.loc[df_train['Survived'] == outcome, column]
            for outcome in (0, 1)],
           label=['Died', 'Survived'],
           bins=n_bins)
  plt.xlabel(column)
  plt.legend()
plt.show()

## Baselines

In [None]:
def majority_baseline(x):
  """Predict the majority class for any passenger record.

  Args:
    x: A passenger record; its contents are ignored.

  Returns:
    0, the label used for 'died'.
  """
  died = 0
  return died

def rule_baseline(x):
  """Predict survival from sex alone.

  Args:
    x: A row-like mapping with a 'Sex' entry.

  Returns:
    1 ('survived') when x['Sex'] is 'female', otherwise 0 ('died').
  """
  return 1 if x['Sex'] == 'female' else 0

In [None]:
# Gold labels for each split.
Y_train = df_train['Survived']
Y_dev = df_dev['Survived']

# Apply each baseline row by row to get one prediction per passenger.
Y_train_pred_majority_baseline = df_train.apply(majority_baseline, axis=1)
Y_train_pred_rule_baseline = df_train.apply(rule_baseline, axis=1)
Y_dev_pred_majority_baseline = df_dev.apply(majority_baseline, axis=1)
Y_dev_pred_rule_baseline = df_dev.apply(rule_baseline, axis=1)

# Accuracy is the fraction of predictions equal to the gold label.
print('Majority baseline train accuracy: %.3f'
  % np.mean(Y_train_pred_majority_baseline == Y_train))
print('Majority baseline dev accuracy:   %.3f'
  % np.mean(Y_dev_pred_majority_baseline == Y_dev))
print('Rule baseline train accuracy:     %.3f'
  % np.mean(Y_train_pred_rule_baseline == Y_train))
print('Rule baseline dev accuracy:       %.3f'
  % np.mean(Y_dev_pred_rule_baseline == Y_dev))

## Model 1: Age as a Single Value

In [None]:
# Plot the loss at each epoch and show final numbers.
def show_history(history):
  """Plot per-epoch train/dev loss and print the final accuracies.

  Args:
    history: Object whose `history` attribute is a dict with 'loss',
      'val_loss', 'accuracy' and 'val_accuracy' sequences (here, the value
      returned by `model.fit`).
  """
  logs = history.history
  for key, color, label in (('loss', 'black', 'train'),
                            ('val_loss', 'red', 'dev')):
    plt.plot(logs[key], color=color, label=label)
  plt.xlabel('epoch')
  plt.ylabel('loss')
  plt.legend()
  plt.show()
  print('final train acc: %.3f' % logs['accuracy'][-1])
  print('final dev acc:   %.3f' % logs['val_accuracy'][-1])

In [None]:
def build_model_sequential():
  """Build and compile a single-input model with the Sequential API.

  The network is one Dense unit with a sigmoid activation on a single input
  value, trained with binary cross-entropy.

  Returns:
    The compiled `tf.keras.Sequential` model.
  """
  # Reset Keras state and fix TensorFlow's random seed.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  model = tf.keras.Sequential(
      [
          tf.keras.layers.InputLayer(input_shape=(1,)),
          tf.keras.layers.Dense(
              units=1, activation='sigmoid', name='Survived'),
      ],
      name='Titanic')

  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
      metrics=['accuracy'])

  return model

### Functional API

In [None]:
def build_model():
  """Build and compile the Model 1 network with the Functional API.

  A single float input named 'Age' feeds one sigmoid Dense unit named
  'Survived'; the model is trained with binary cross-entropy.

  Returns:
    The compiled `tf.keras.Model`.
  """
  # Reset Keras state and fix TensorFlow's random seed.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  layers = tf.keras.layers

  age_input = layers.Input(shape=(1,), dtype=tf.float32, name='Age')
  output_layer = layers.Dense(units=1, activation='sigmoid', name='Survived')
  survival_prob = output_layer(age_input)

  model = tf.keras.Model(
      inputs=age_input, outputs=survival_prob, name='Titanic')
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
      metrics=['accuracy'])

  return model

In [None]:
model = build_model()
model.summary()

# Train on age alone; the dev split is evaluated after every epoch.
dev_data = (df_dev[['Age']], df_dev[['Survived']])
history = model.fit(
  x=df_train[['Age']],
  y=df_train[['Survived']],
  validation_data=dev_data,
  batch_size=64,
  epochs=10,
  verbose=0)

show_history(history)

## Model 2: Age in Buckets

### Tabular data processing

In [None]:
# Take ten embarkation values and run them through a one-hot StringLookup.
emb = np.array(df_train['Embarked'].iloc[10:20])
print('Embarked:', emb)

embarked_lookup = tf.keras.layers.StringLookup(
    vocabulary=['S', 'C', 'Q'], output_mode='one_hot')
emb_id = embarked_lookup(emb)
print('Embarked ids:', emb_id)

In [None]:
tf.keras.backend.clear_session()

# Take ten ages, bucket them at the given boundaries, then one-hot the buckets.
age = np.array(df_train['Age'].iloc[10:20])
print('Ages:', age)

age_bins = [0.1, 10, 40]
age_bucketizer = tf.keras.layers.Discretization(bin_boundaries=age_bins)
age_binned = age_bucketizer(age)
print('Ages binned:', age_binned)

# The lookup vocabulary lists every bucket index: 0 .. len(age_bins).
bucket_one_hot = tf.keras.layers.IntegerLookup(
    vocabulary=np.arange(len(age_bins) + 1), output_mode='one_hot')
age_id = bucket_one_hot(age_binned)
print('Ages as one-hot vectors:', age_id)

In [None]:
def build_model():
  """Build and compile the Model 2 network: bucketed age as the only feature.

  The 'Age' input is discretized at fixed boundaries, the bucket index is
  one-hot encoded, and the result feeds one sigmoid Dense unit named
  'Survived'.

  Returns:
    The compiled `tf.keras.Model`.
  """
  # Reset Keras state and fix TensorFlow's random seed.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  layers = tf.keras.layers

  age = layers.Input(shape=(1,), dtype=tf.float32, name='Age')

  age_bins = [0.1, 1, 5, 10, 20, 30, 40, 50, 60]
  age_bucket = layers.Discretization(bin_boundaries=age_bins)(age)
  # The lookup vocabulary lists every bucket index: 0 .. len(age_bins).
  bucket_ids = np.arange(len(age_bins) + 1)
  age_one_hot = layers.IntegerLookup(
      vocabulary=bucket_ids, output_mode='one_hot')(age_bucket)

  survived = layers.Dense(
      units=1, activation='sigmoid', name='Survived')(age_one_hot)

  model = tf.keras.Model(inputs=age, outputs=survived, name='Titanic')
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
      metrics=['accuracy'])

  return model

In [None]:
model = build_model()
model.summary()

# Same training setup as before; only the model's handling of age differs.
dev_data = (df_dev[['Age']], df_dev[['Survived']])
history = model.fit(
  x=df_train[['Age']],
  y=df_train[['Survived']],
  validation_data=dev_data,
  batch_size=64,
  epochs=10,
  verbose=0)

show_history(history)

## Model 3: Multiple Inputs

In [None]:
def build_model():
  """Build and compile the Model 3 network: four one-hot encoded inputs.

  Inputs 'Age' (bucketed), 'Sex', 'Pclass' and 'Embarked' are each one-hot
  encoded, concatenated, passed through a 12-unit tanh layer ('hidden1') and
  then a sigmoid unit ('Survived').

  Returns:
    The compiled `tf.keras.Model`.
  """
  # Reset Keras state and fix TensorFlow's random seed.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  layers = tf.keras.layers

  age = layers.Input(shape=(1,), dtype=tf.float32, name='Age')
  sex = layers.Input(shape=(1,), dtype=tf.string, name='Sex')
  pclass = layers.Input(shape=(1,), dtype=tf.int64, name='Pclass')
  emb = layers.Input(shape=(1,), dtype=tf.string, name='Embarked')

  age_bins = [0.1, 1, 5, 10, 20, 30, 40, 50, 60]
  age_bucket = layers.Discretization(bin_boundaries=age_bins)(age)

  # One-hot encodings, built in the order age, sex, class, port.
  one_hot_features = [
      layers.IntegerLookup(
          vocabulary=np.arange(len(age_bins) + 1),
          output_mode='one_hot')(age_bucket),
      layers.StringLookup(
          vocabulary=['male', 'female'], output_mode='one_hot')(sex),
      layers.IntegerLookup(
          vocabulary=[1, 2, 3], output_mode='one_hot')(pclass),
      layers.StringLookup(
          vocabulary=['S', 'C', 'Q'], output_mode='one_hot')(emb),
  ]
  features = layers.Concatenate()(one_hot_features)

  hidden = layers.Dense(
      units=12, activation='tanh', name='hidden1')(features)
  survived = layers.Dense(
      units=1, activation='sigmoid', name='Survived')(hidden)

  model = tf.keras.Model(
      inputs=[age, sex, pclass, emb], outputs=survived, name='Titanic')
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
      metrics=['accuracy'])

  return model

In [None]:
model = build_model()
display(tf.keras.utils.plot_model(model))

# Each model input is fed by the single-column frame of the same name.
input_columns = ['Age', 'Sex', 'Pclass', 'Embarked']
history = model.fit(
  x={name: df_train[[name]] for name in input_columns},
  y=df_train[['Survived']],
  validation_data=(
      {name: df_dev[[name]] for name in input_columns},
      df_dev[['Survived']]),
  batch_size=64,
  epochs=20,
  verbose=0)

show_history(history)

## Model 4: Using Embeddings

In [None]:
# Take ten embarkation values, map them to integer ids, then embed the ids.
emb = np.array(df_train['Embarked'].iloc[20:30])
print('Embarked:', emb)

embarked_lookup = tf.keras.layers.StringLookup(vocabulary=['S', 'C', 'Q'])
emb_binned = embarked_lookup(emb)
print('Embarked ids:', emb_binned)

embarked_embedding = tf.keras.layers.Embedding(
    input_dim=4, output_dim=4, input_length=1)
emb_embed = embarked_embedding(emb_binned)
print('Embarked embeddings:', emb_embed)

In [None]:
def build_model():
  """Build and compile the Model 4 network: summed feature embeddings.

  Inputs 'Age' (bucketed), 'Sex', 'Pclass' and 'Embarked' are mapped to
  integer ids, each id is embedded into `embed_dim` dimensions, and the four
  embeddings are added together. The sum passes through a 12-unit tanh layer
  ('hidden1') and then a sigmoid unit ('Survived').

  Returns:
    The compiled `tf.keras.Model`.
  """
  # Reset Keras state and fix TensorFlow's random seed.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  layers = tf.keras.layers

  age = layers.Input(shape=(1,), dtype=tf.float32, name='Age')
  sex = layers.Input(shape=(1,), dtype=tf.string, name='Sex')
  pclass = layers.Input(shape=(1,), dtype=tf.int64, name='Pclass')
  emb = layers.Input(shape=(1,), dtype=tf.string, name='Embarked')

  embed_dim = 4

  # Map every raw input to an integer id.
  age_bins = [0.1, 1, 5, 10, 20, 30, 40, 50, 60]
  age_binned = layers.Discretization(bin_boundaries=age_bins)(age)
  sex_binned = layers.StringLookup(vocabulary=['male', 'female'])(sex)
  pclass_binned = layers.IntegerLookup(vocabulary=[1, 2, 3])(pclass)
  emb_binned = layers.StringLookup(vocabulary=['S', 'C', 'Q'])(emb)

  # (ids, number of distinct ids) per feature, in embedding-creation order.
  id_specs = (
      (age_binned, len(age_bins) + 1),
      (sex_binned, 3),
      (pclass_binned, 4),
      (emb_binned, 4),
  )
  embedded = [
      layers.Embedding(
          input_dim=vocab_size, output_dim=embed_dim, input_length=1)(ids)
      for ids, vocab_size in id_specs
  ]

  # The embeddings are summed; layers.Concatenate() would be the alternative
  # way to combine them.
  features = layers.Add()(embedded)

  hidden = layers.Dense(
      units=12, activation='tanh', name='hidden1')(features)
  survived = layers.Dense(
      units=1, activation='sigmoid', name='Survived')(hidden)

  model = tf.keras.Model(
      inputs=[age, sex, pclass, emb], outputs=survived, name='Titanic')
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
      metrics=['accuracy'])

  return model

In [None]:
model = build_model()
display(tf.keras.utils.plot_model(model))

# Each model input is fed by the single-column frame of the same name.
input_columns = ['Age', 'Sex', 'Pclass', 'Embarked']
history = model.fit(
  x={name: df_train[[name]] for name in input_columns},
  y=df_train[['Survived']],
  validation_data=(
      {name: df_dev[[name]] for name in input_columns},
      df_dev[['Survived']]),
  batch_size=64,
  epochs=20,
  verbose=0)

show_history(history)

In [None]:
# Write to csv in the required format for kaggle submission
def output_csv(predictions):
  """Store rounded predictions on `df_test` and save them as a CSV file.

  The predictions are flattened, rounded and cast to int, then written to the
  'Survived' column of the global `df_test`. Only the 'PassengerId' and
  'Survived' columns are saved, to 'titanic_predictions.csv'.

  Args:
    predictions: Array-like model output supporting `.reshape`; presumably
      one value per row of `df_test`.
  """
  hard_labels = predictions.reshape(-1).round().astype(int)
  df_test['Survived'] = hard_labels
  display(df_test.head(10))

  filename = 'titanic_predictions.csv'
  df_test.to_csv(filename, index=False, columns=['PassengerId', 'Survived'])
  print('Saved as:', filename)

In [None]:
# Run the current model on the Kaggle test records and save the predictions.
test_inputs = {
    name: df_test[[name]] for name in ('Age', 'Sex', 'Pclass', 'Embarked')
}
test_preds = model.predict(x=test_inputs)

output_csv(test_preds)

## Model 5: Multiple Outputs

In [None]:
def build_model():
  """Build and compile the Model 5 network: shared trunk with two outputs.

  The feature pipeline matches the embedding model: 'Age' (bucketed), 'Sex',
  'Pclass' and 'Embarked' are mapped to ids, embedded and summed, then passed
  through a 12-unit tanh layer ('hidden1'). Two heads read that layer:
  'Survived' (sigmoid, binary cross-entropy, loss weight 9.0) and 'Fare'
  (linear, mean absolute error, loss weight 1.0).

  Returns:
    The compiled `tf.keras.Model` with outputs [Survived, Fare].
  """
  # Reset Keras state and fix TensorFlow's random seed.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  layers = tf.keras.layers

  age = layers.Input(shape=(1,), dtype=tf.float32, name='Age')
  sex = layers.Input(shape=(1,), dtype=tf.string, name='Sex')
  pclass = layers.Input(shape=(1,), dtype=tf.int64, name='Pclass')
  emb = layers.Input(shape=(1,), dtype=tf.string, name='Embarked')

  embed_dim = 4

  # Map every raw input to an integer id.
  age_bins = [0.1, 1, 5, 10, 20, 30, 40, 50, 60]
  age_binned = layers.Discretization(bin_boundaries=age_bins)(age)
  sex_binned = layers.StringLookup(vocabulary=['male', 'female'])(sex)
  pclass_binned = layers.IntegerLookup(vocabulary=[1, 2, 3])(pclass)
  emb_binned = layers.StringLookup(vocabulary=['S', 'C', 'Q'])(emb)

  # (ids, number of distinct ids) per feature, in embedding-creation order.
  id_specs = (
      (age_binned, len(age_bins) + 1),
      (sex_binned, 3),
      (pclass_binned, 4),
      (emb_binned, 4),
  )
  embedded = [
      layers.Embedding(
          input_dim=vocab_size, output_dim=embed_dim, input_length=1)(ids)
      for ids, vocab_size in id_specs
  ]
  features = layers.Add()(embedded)

  hidden = layers.Dense(
      units=12, activation='tanh', name='hidden1')(features)

  # Two heads share the hidden layer.
  survived = layers.Dense(
      units=1, activation='sigmoid', name='Survived')(hidden)
  fare = layers.Dense(
      units=1, activation=None, name='Fare')(hidden)

  model = tf.keras.Model(
      inputs=[age, sex, pclass, emb],
      outputs=[survived, fare],
      name='Titanic')
  model.compile(
      loss={'Survived': 'binary_crossentropy', 'Fare': 'mean_absolute_error'},
      loss_weights={'Survived': 9.0, 'Fare': 1.0},
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
      metrics={'Survived': 'accuracy', 'Fare': 'mean_absolute_error'})

  return model

In [None]:
model = build_model()
display(tf.keras.utils.plot_model(model))

# Each model input is fed by the single-column frame of the same name.
# The fare target is log(Fare + 1) for both the train and dev splits.
input_columns = ['Age', 'Sex', 'Pclass', 'Embarked']
history = model.fit(
  x={name: df_train[[name]] for name in input_columns},
  y={
      'Survived': df_train[['Survived']],
      'Fare': np.log(df_train[['Fare']] + 1),
    },
  validation_data=(
      {name: df_dev[[name]] for name in input_columns},
      {
        'Survived': df_dev[['Survived']],
        'Fare': np.log(df_dev[['Fare']] + 1),
      }),
  batch_size=64,
  epochs=30,
  verbose=0)

# Plot the combined loss per epoch, then print the final metric of each head.
for key, color, label in (('loss', 'black', 'train'),
                          ('val_loss', 'red', 'dev')):
  plt.plot(history.history[key], color=color, label=label)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

print('final train acc: %.3f' % history.history['Survived_accuracy'][-1])
print('final dev acc:   %.3f' % history.history['val_Survived_accuracy'][-1])
print('final train mae: %.3f' % history.history['Fare_mean_absolute_error'][-1])
print('final dev mae:   %.3f' % history.history['val_Fare_mean_absolute_error'][-1])