### Data sets were acquired from https://www.kaggle.com/c/titanic

In [1]:
import pandas as pd
import sidetable
import numpy as np
import sidetable
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

df_train = pd.read_csv('train.csv', index_col=['PassengerId'])
df_test = pd.read_csv('test.csv', index_col=['PassengerId'])

In [2]:
# Have a look at the first five rows
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_train.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,687,891,77.10%
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Name,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Ticket,0,891,0.00%


In [4]:
df_train.shape

(891, 11)

In [5]:
def drop_irrelevant(df):
    return df.drop(columns=['Name', 'Ticket', 'Cabin'])
df_train = drop_irrelevant(df_train)
X_test = drop_irrelevant(df_test)

In [6]:
# Get X_train and y_train
y = df_train.Survived.copy()
X = df_train.drop(columns='Survived').copy()

In [7]:
# Retrive num_cols and cat_cols
num_cols = list(X._get_numeric_data().columns)
cat_cols = list(set(X.columns) - set(num_cols))

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

cat_pipe = make_pipeline(
            (SimpleImputer(strategy='most_frequent')),
            (OneHotEncoder(drop='first', handle_unknown='error'))
            )
num_pipe = make_pipeline(
            (IterativeImputer()),
            (StandardScaler())
            )            
preprocess_pipeline = make_column_transformer(
            (cat_pipe, cat_cols),
            (num_pipe, num_cols)
            )

In [9]:
preprocessor =  preprocess_pipeline.fit(X)
X = preprocessor.transform(X)
X_test = preprocessor.transform(X_test)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
print(f'first 5 features are selected: {df_train.columns[feat_selector.support_]}')

# check ranking of features
print(f'check ranking of features {feat_selector.ranking_}')

# call transform() on X to filter it down to selected features
X = feat_selector.transform(X)
X_test = feat_selector.transform(X_test)

first 5 features are selected: Index(['Survived', 'Age', 'SibSp', 'Embarked'], dtype='object')
check ranking of features [1 5 4 1 1 2 3 1]


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y)

In [12]:
'''
Credit source: 
    https://www.tensorflow.org/tutorials/keras/keras_tuner
    https://github.com/keras-team/keras-tuner/blob/master/examples/cifar10.py
'''

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

# Create the keras tuner model.
def build_model(hp):
    hp_drop_out = hp.Float('dropout', 0, 0.5, step=0.1)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model = Sequential()
    for i in range(hp.Int('num_layers', 2, 20)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32), activation='relu'))
#         model.add(BatchNormalization())
        model.add(Dropout(hp_drop_out))
#     model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    opt = Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]))
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return model

import kerastuner as kt

tuner = kt.Hyperband(build_model,
                     objective='val_accuracy', 
                     max_epochs=16,
                     overwrite=False)

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json


In [13]:
es = EarlyStopping(monitor='val_accuracy', verbose=0, patience=10, min_delta=1e-3)
tuner.search(X_train, y_train, epochs=64, batch_size=32, validation_data=(X_val, y_val), callbacks=[es])

INFO:tensorflow:Oracle triggered exit


In [14]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=64, validation_data=(X_val, y_val))

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [15]:
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Best epoch: 47


In [16]:
# Re-instantiate the hypermodel and train it with the optimal number of epochs from above.
hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch)

Epoch 1/47
Epoch 2/47
Epoch 3/47
Epoch 4/47
Epoch 5/47
Epoch 6/47
Epoch 7/47
Epoch 8/47
Epoch 9/47
Epoch 10/47
Epoch 11/47
Epoch 12/47
Epoch 13/47
Epoch 14/47
Epoch 15/47
Epoch 16/47
Epoch 17/47
Epoch 18/47
Epoch 19/47
Epoch 20/47
Epoch 21/47
Epoch 22/47
Epoch 23/47
Epoch 24/47
Epoch 25/47
Epoch 26/47
Epoch 27/47
Epoch 28/47
Epoch 29/47
Epoch 30/47
Epoch 31/47
Epoch 32/47
Epoch 33/47
Epoch 34/47
Epoch 35/47
Epoch 36/47
Epoch 37/47
Epoch 38/47
Epoch 39/47
Epoch 40/47
Epoch 41/47
Epoch 42/47
Epoch 43/47
Epoch 44/47
Epoch 45/47
Epoch 46/47
Epoch 47/47


<tensorflow.python.keras.callbacks.History at 0x1f840009e88>

In [17]:
eval_result = hypermodel.evaluate(X_val, y_val)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.453054815530777, 0.8283582329750061]


In [18]:
# Retrain the model
hypermodel.fit(X, y, epochs=best_epoch)

Epoch 1/47
Epoch 2/47
Epoch 3/47
Epoch 4/47
Epoch 5/47
Epoch 6/47
Epoch 7/47
Epoch 8/47
Epoch 9/47
Epoch 10/47
Epoch 11/47
Epoch 12/47
Epoch 13/47
Epoch 14/47
Epoch 15/47
Epoch 16/47
Epoch 17/47
Epoch 18/47
Epoch 19/47
Epoch 20/47
Epoch 21/47
Epoch 22/47
Epoch 23/47
Epoch 24/47
Epoch 25/47
Epoch 26/47
Epoch 27/47
Epoch 28/47
Epoch 29/47
Epoch 30/47
Epoch 31/47
Epoch 32/47
Epoch 33/47
Epoch 34/47
Epoch 35/47
Epoch 36/47
Epoch 37/47
Epoch 38/47
Epoch 39/47
Epoch 40/47
Epoch 41/47
Epoch 42/47
Epoch 43/47
Epoch 44/47
Epoch 45/47
Epoch 46/47
Epoch 47/47


<tensorflow.python.keras.callbacks.History at 0x1f84ba83708>

In [19]:
# from keras.models import save_model
# hypermodel.save('best_model.h5')

In [20]:
# Load the model and predict
# from keras.models import load_model
# hypermodel = load_model('best_model.h5')
y_pred = hypermodel.predict(X_test)
y_pred = y_pred.astype(int)

In [23]:
y_pred = pd.DataFrame(y_pred, columns=['Survived'], index=df_test.index)
# saving the dataframe 
y_pred.to_csv('Predictions.csv') 

In [None]:
import winsound
duration = 100  # milliseconds
freq = 3000  # Hz
winsound.Beep(freq, duration)