### Data sets were acquired from https://www.kaggle.com/c/titanic

In [1]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Read the training and test CSV files, both indexed by 'PassengerId'
df, X_test = (
    pd.read_csv(csv_path, index_col=['PassengerId'])
    for csv_path in ('train.csv', 'test.csv')
)

In [2]:
# Preview the first five rows of the training frame
df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Columns __Name__ and __Ticket__ have nothing to do with predicting __Survived__, so they will be dropped.

In [3]:
# 'Name' is not used as a predictor: remove it from both the training and the test frame
# ('Ticket' is removed in a later cell)
for frame in (df, X_test):
    frame.drop(columns=['Name'], inplace=True)

In [4]:
# See the missing data in columns: per-column missing count, total and percentage
# (the `.stb` accessor comes from the sidetable import)
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,687,891,77.10%
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Ticket,0,891,0.00%
Fare,0,891,0.00%


As column __Cabin__ has 77% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Embarked__ has 2 missing rows, the rows will be dropped.

In [5]:
# Remove 'Ticket' and 'Cabin' from the training frame ('Cabin' is ~77% missing); 'Fare' is kept
df.drop(columns=['Ticket', 'Cabin'], inplace=True)

In [6]:
# Same missing-value summary, this time for the test set
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,327,418,78.23%
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Ticket,0,418,0.00%
Embarked,0,418,0.00%


As column __Cabin__ has 78% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Fare__ has only 1 missing row, it will be imputed as well.

In [7]:
# 'Cabin' is mostly missing and 'Ticket' has too many unique values to encode sensibly,
# so both are removed from the test frame as well
X_test.drop(columns=['Ticket', 'Cabin'], inplace=True)

In [8]:
# Recheck the missing data in the training columns after the column drops above
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Fare,0,891,0.00%


In [9]:
# Recheck the missing data in the test columns after the column drops above
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Embarked,0,418,0.00%


In [10]:
# Get X_train and y_train.
# The rows with a missing 'Embarked' are removed first, as announced in the
# missing-data summary above; no earlier cell actually dropped them, so they
# would otherwise reach the encoding step with a missing category.
# `df` itself is left untouched so this cell can be re-run safely.
train_rows = df.dropna(subset=['Embarked'])
y_train = train_rows['Survived'].copy()
X_train = train_rows.drop(columns='Survived')

In [11]:
# Retrieve num_cols and cat_cols.
# Uses the public select_dtypes API instead of the private _get_numeric_data, and
# builds cat_cols as a list in X_train's column order: the previous set difference
# had no stable order, so the encoded column layout could change between kernel runs.
num_cols = X_train.select_dtypes(include=['number', 'bool']).columns.tolist()
cat_cols = [col for col in X_train.columns if col not in num_cols]

In [12]:
# Encode cat_cols
X_train = pd.get_dummies(data=X_train, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=cat_cols, drop_first=True)
# The two frames are encoded independently, so force the test frame onto the training
# layout: same columns, same order; a dummy column absent from the test data is
# created and filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

### As no categorical columns have missing data, there is no need to impute them

In [13]:
# Impute numeric data
# (the enable_iterative_imputer import has to precede the IterativeImputer import)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Learn the imputation from the training rows only, then apply it to both frames
num_imputer = IterativeImputer().fit(X_train[num_cols])
for frame in (X_train, X_test):
    frame[num_cols] = num_imputer.transform(frame[num_cols])

In [14]:
def scale(X, num_cols, scaler=None):
    """Standardise the ``num_cols`` columns of ``X`` and return them as a new frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding at least the columns listed in ``num_cols``.
    num_cols : list
        Labels of the columns to scale.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given, it is only applied (not refitted),
        which lets a scaler fitted on the training data be reused for other data.
        When omitted, a new ``StandardScaler`` is fitted on ``X`` itself, which is
        the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The scaled values with columns ``num_cols`` and the index of ``X``.
        ``X`` is not modified.
    """
    if scaler is None:
        # Imported lazily so the function stays usable with a caller-supplied scaler
        from sklearn.preprocessing import StandardScaler
        scaled_values = StandardScaler().fit_transform(X[num_cols])
    else:
        scaled_values = scaler.transform(X[num_cols])
    return pd.DataFrame(data=scaled_values, columns=num_cols, index=X.index)

In [15]:
# Scaling data
# The scaler is fitted on the training rows only and then reused for the test rows,
# so both sets are expressed on the same scale (fitting a second scaler on the test
# set would standardise it with different means and variances).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train[num_cols])
X_scaled = pd.DataFrame(scaler.transform(X_train[num_cols]), columns=num_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[num_cols]), columns=num_cols, index=X_test.index)

In [16]:
# Update cat_cols: every column of X_train that is not numeric, i.e. the dummy columns.
# Kept as a list in column order: a set has no stable order and pandas >= 2.0 refuses
# a set as a column indexer.
cat_cols = [col for col in X_train.columns if col not in num_cols]

In [17]:
# Final training matrix: scaled numeric columns first, then the dummy columns.
# list(...) because cat_cols may be a set, which recent pandas versions do not accept as an indexer.
X_train = pd.concat([X_scaled, X_train[list(cat_cols)]], axis=1)

In [18]:
# Rebalance the classes with SMOTEENN (combined over- and under-sampling of X_train / y_train)
# random_state is fixed so that a Restart & Run All reproduces the same resampled data.
# NOTE(review): the train/validation split in the next cell is made after this step, so the
# validation rows are drawn from the resampled data — confirm this is intended, since it can
# make the validation accuracy look better than it is.
from imblearn.combine import SMOTEENN
sampler = SMOTEENN(random_state=42)
X_train, y_train = sampler.fit_resample(X_train, y_train)

In [19]:
# Hold out 20% of the rows for validation, stratified on the target.
# random_state is fixed so the split (and everything trained on it) is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

In [20]:
# Sanity check: (rows, features) of the validation features
X_val.shape

(131, 8)

In [21]:
# Sanity check: number of validation labels, to compare with the row count of X_val
y_val.shape

(131,)

In [22]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

# Create the keras tuner model.
def build_model(hp):
    """Build and compile a dense binary classifier from tuner hyperparameters.

    Parameters
    ----------
    hp : hyperparameter container supplied by the tuner
        Must provide ``Float``, ``Choice`` and ``Int``. The search space is:
        ``dropout`` (0 to 0.5 in steps of 0.1), ``learning_rate``
        (1e-2, 1e-3 or 1e-4), ``num_layers`` (2 to 20) and one ``units_<i>``
        per hidden layer (32 to 512 in steps of 32).

    Returns
    -------
    A compiled ``Sequential`` model: ``num_layers`` blocks of ReLU ``Dense`` +
    ``Dropout``, followed by a single sigmoid unit, trained with Adam on
    binary cross-entropy and reporting accuracy.
    """
    hp_drop_out = hp.Float('dropout', 0, 0.5, step=0.1)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = Sequential()
    for i in range(hp.Int('num_layers', 2, 20)):
        layer_units = hp.Int('units_' + str(i), min_value=32, max_value=512, step=32)
        model.add(Dense(units=layer_units, activation='relu'))
        # The same dropout rate is shared by every hidden layer
        model.add(Dropout(hp_drop_out))
    model.add(Dense(1, activation='sigmoid'))

    # The optimizer is created once, here; the previous version also built an
    # unused second Adam instance from a duplicate 'learning_rate' choice.
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [23]:
'''
Credit source: 
    https://www.tensorflow.org/tutorials/keras/keras_tuner
    https://github.com/keras-team/keras-tuner/blob/master/examples/cifar10.py
'''
import kerastuner as kt

# Hyperband tuner over build_model, ranking trials by validation accuracy.
# NOTE(review): overwrite=False presumably lets the tuner pick up trials saved by an
# earlier run in its project directory — confirm stale results are not being reused.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=16,
                     overwrite=False)

In [24]:
# Early stopping on validation accuracy: patience of 10 epochs, minimum improvement of 1e-3
es = EarlyStopping(
    monitor='val_accuracy',
    min_delta=1e-3,
    patience=10,
    verbose=0,
)
callbacks_list = [es]

In [25]:
# Run the hyperparameter search: trials are scored on (X_val, y_val) and use the early-stopping callback.
# NOTE(review): the tuner was created with max_epochs=16 while epochs=64 is passed here —
# confirm which of the two actually bounds a trial.
tuner.search(X_train, y_train, epochs=64, batch_size=32, validation_data=(X_val, y_val), callbacks=callbacks_list)

Trial 30 Complete [00h 00m 06s]
val_accuracy: 0.5343511700630188

Best val_accuracy So Far: 1.0
Total elapsed time: 00h 02m 09s
INFO:tensorflow:Oracle triggered exit


In [26]:
# Take the best hyperparameter set found by the tuner, build a fresh model from it and
# train it for 64 epochs with 20% of the training rows held out for validation
top_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_hyperparameters[0]
model = tuner.hypermodel.build(best_hps)
history = model.fit(x=X_train, y=y_train, epochs=64, validation_split=0.2)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [27]:
# 1-based number of the first epoch that reached the highest validation accuracy
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = 1 + max(range(len(val_acc_per_epoch)), key=val_acc_per_epoch.__getitem__)
print('Best epoch: %d' % best_epoch)

Best epoch: 7


In [28]:
# Build a fresh model from the best hyperparameters and train it on the training
# rows for exactly `best_epoch` epochs, the number found in the previous cell
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(x=X_train, y=y_train, epochs=best_epoch)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x122648b5788>

In [29]:
# Evaluate on the held-out validation split. These are validation figures, not test
# figures, so the printed label says so.
eval_result = hypermodel.evaluate(X_val, y_val)
print("[validation loss, validation accuracy]:", eval_result)

[test loss, test accuracy]: [0.03463554009795189, 0.9847328066825867]


In [30]:
# Retrain the model
# NOTE(review): this is the same fit call as in the earlier retraining cell, on the same
# `hypermodel` object, so training presumably continues from the current weights for another
# `best_epoch` epochs — confirm this second pass is intended.
hypermodel.fit(X_train, y_train, epochs=best_epoch)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x12264bad208>

In [31]:
# from keras.models import save_model
# hypermodel.save('best_model.h5')

In [32]:
# Load the model and predict
# from keras.models import load_model
# hypermodel = load_model('best_model.h5')

# Feed the network the same kind of features it was trained on: the scaled numeric
# columns followed by the dummy columns, in the order used for the training matrix.
# (X_test itself still holds the unscaled numeric values.)
dummy_cols = list(cat_cols)
X_test_model = pd.concat(
    [X_test_scaled, X_test.reindex(columns=dummy_cols, fill_value=0)],
    axis=1,
)

# The sigmoid output is a probability in [0, 1]; casting it straight to int truncates
# everything below 1.0 to 0, so threshold at 0.5 before converting to class labels.
y_pred = hypermodel.predict(X_test_model)
y_pred = (y_pred > 0.5).astype(int)

In [33]:
# Label the predictions with the test index under a 'Survived' column, then write them to CSV
y_pred = pd.DataFrame(data=y_pred, index=X_test.index, columns=['Survived'])
y_pred.to_csv(path_or_buf='Predictions.csv')

In [34]:
# Audible "finished" signal.
# winsound only exists on Windows, so fall back to the terminal bell elsewhere
# instead of failing the last cell with an ImportError.
duration = 100  # milliseconds
freq = 3000  # Hz
try:
    import winsound
except ImportError:
    print('\a', end='')
else:
    winsound.Beep(freq, duration)