In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import sys
import tensorflow as tf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
print("Started")
print(os.getcwd())
#print(os.fspath('..'))
for dirname, _, filenames in os.walk('/kaggle/input'):
    #print("In loop")
    #print(filenames)
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.



In [None]:
# Other import stuff

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Layer, Dense, Embedding, Input, LSTM, Masking, BatchNormalization, Dropout, Activation
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras.constraints import max_norm
import time
import matplotlib.pyplot as plt
%matplotlib inline
# for jupyter notebook
#%matplotlib notebook
# for jupyter-lab
#%matplotlib widget
# Import the train_test_split function and uncomment
from sklearn.model_selection import train_test_split

In [None]:
# Pre-trained models, etc. from sklearn

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Other imports

from tensorflow_addons.metrics import RSquare # tensorflow addons
import keras_tuner as kt

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
!rm -rf ./logs/

from tensorboard.plugins.hparams import api as hp

import category_encoders as ce

<a id="TOC-01"></a>
# Import data and perform a preliminary analysis

In [None]:
# Load the Titanic training split from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
# Keep the passenger ids as a standalone Series before the column becomes the index.
PassengerId_train = train_data.PassengerId
train_data = train_data.set_index('PassengerId')
# Snapshot taken before the 'is_test' helper column is added below.
train_data_original = train_data.copy()
# Marker column: 0 flags rows that come from the training file.
train_data['is_test'] = 0
display(train_data)

In [None]:
# Load the Titanic test split from the Kaggle input directory.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Keep the passenger ids as a standalone Series before the column becomes the index.
PassengerId_test = test_data.PassengerId
test_data = test_data.set_index('PassengerId')
# Snapshot taken before the 'is_test' helper column is added below.
test_data_original = test_data.copy()
# Marker column: 1 flags rows that come from the test file.
test_data['is_test'] = 1
display(test_data)

In [None]:
# Compare the train and test data

display(train_data.describe())
display(test_data.describe())

<a id="TOC-02"></a>
# Define pre-processing functions and model functions

>Various functions to convert features to categorical-nominal, categorical-ordinal, categorical-range values

In [None]:
def build_age_prediction_regression_model(hp):
    """Build and compile a dense regression network for Keras Tuner.

    The search space is read from notebook-level globals that must exist
    before the tuner calls this function: ``init_mode``, ``activation``,
    ``weight_constraint``, ``dropout_rate``, ``learning_rate``,
    ``l2_weight_decay``, ``optimizer`` and ``input_dim``.

    Args:
        hp: Hyperparameter container handed in by the tuner; it is queried
            through ``hp.Choice`` and ``hp.Int``.

    Returns:
        A compiled ``Sequential`` model: one input Dense layer, 0-3 hidden
        Dense layers (each followed by Dropout) and a single linear output
        unit, compiled with mean-squared-error loss.
    """
    model = Sequential()
    hp_kernel_initializer = hp.Choice('kernel_initializer', init_mode)
    hp_activation = hp.Choice('activation', activation)
    hp_kernel_constraint = hp.Choice('kernel_constraint', weight_constraint)
    hp_dropout_rate = hp.Choice('dropout_rate', dropout_rate)
    hp_learning_rate = hp.Choice('learning_rate', values=learning_rate)
    hp_l2_regularizer = hp.Choice('l2_weight_decay', values=l2_weight_decay)

    # Input layer

    model.add(Dense(input_dim=input_dim,
                    units=hp.Int('units_input', min_value=32, max_value=1280, step=32),
                    kernel_regularizer=regularizers.l2(hp_l2_regularizer),
                    kernel_initializer=hp_kernel_initializer,
                    bias_initializer='zeros',
                    activation=hp_activation,
                    kernel_constraint=max_norm(hp_kernel_constraint)))
    model.add(Dropout(rate=hp_dropout_rate))

    # Hidden layers: the tuner picks how many (0 to 3) and the width of each.

    for i in range(hp.Int('layers', 0, 3)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=1280, step=32),
                        kernel_regularizer=regularizers.l2(hp_l2_regularizer),
                        kernel_initializer=hp_kernel_initializer,
                        bias_initializer='zeros',
                        activation=hp_activation,
                        kernel_constraint=max_norm(hp_kernel_constraint)
                        ))
        model.add(Dropout(rate=hp_dropout_rate))

    # Output layer: a single linear unit (no max-norm constraint here).

    model.add(Dense(units=1,
                    kernel_regularizer=regularizers.l2(hp_l2_regularizer),
                    kernel_initializer=hp_kernel_initializer,
                    bias_initializer='zeros',
                    activation='linear'))

    # Optimizer
    # Passing only the optimizer *name* to compile() would ignore the sampled
    # learning rate, so build an optimizer instance that carries it.

    hp_optimizer = hp.Choice('optimizer', optimizer)
    optimizer_instance = tf.keras.optimizers.get(
        {'class_name': hp_optimizer, 'config': {'learning_rate': hp_learning_rate}})

    # NOTE: 'accuracy' is not a meaningful score for a regression target; it is
    # kept only because a later cell reads history.history['val_accuracy'].
    model.compile(loss='mean_squared_error',
            optimizer=optimizer_instance,
            metrics=['accuracy'])
    return model

In [None]:
def build_survival_prediction_classification_model(hp):
    """Build and compile a dense binary classifier for Keras Tuner.

    The search space is read from notebook-level globals that must exist
    before the tuner calls this function: ``init_mode``, ``activation``,
    ``weight_constraint``, ``dropout_rate``, ``learning_rate``,
    ``l2_weight_decay``, ``optimizer`` and ``input_dim``.

    Args:
        hp: Hyperparameter container handed in by the tuner; it is queried
            through ``hp.Choice`` and ``hp.Int``.

    Returns:
        A compiled ``Sequential`` model: one input Dense layer, 0-3 hidden
        Dense layers (each followed by Dropout) and a single sigmoid output
        unit, compiled with binary cross-entropy loss.
    """
    model = Sequential()
    hp_kernel_initializer = hp.Choice('kernel_initializer', init_mode)
    hp_activation = hp.Choice('activation', activation)
    hp_kernel_constraint = hp.Choice('kernel_constraint', weight_constraint)
    hp_dropout_rate = hp.Choice('dropout_rate', dropout_rate)
    hp_learning_rate = hp.Choice('learning_rate', values=learning_rate)
    hp_l2_regularizer = hp.Choice('l2_weight_decay', values=l2_weight_decay)

    # Input layer

    model.add(Dense(input_dim=input_dim,
                    units=hp.Int('units_input', min_value=32, max_value=1600, step=32),
                    kernel_regularizer=regularizers.l2(hp_l2_regularizer),
                    kernel_initializer=hp_kernel_initializer,
                    bias_initializer='zeros',
                    activation=hp_activation,
                    kernel_constraint=max_norm(hp_kernel_constraint)))
    model.add(Dropout(rate=hp_dropout_rate))

    # Hidden layers: the tuner picks how many (0 to 3) and the width of each.

    for i in range(hp.Int('layers', 0, 3)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=1280, step=32),
                        kernel_regularizer=regularizers.l2(hp_l2_regularizer),
                        kernel_initializer=hp_kernel_initializer,
                        bias_initializer='zeros',
                        activation=hp_activation,
                        kernel_constraint=max_norm(hp_kernel_constraint)
                        ))
        model.add(Dropout(rate=hp_dropout_rate))

    # Output layer: a single sigmoid unit (no max-norm constraint here).

    model.add(Dense(units=1,
                    kernel_regularizer=regularizers.l2(hp_l2_regularizer),
                    kernel_initializer=hp_kernel_initializer,
                    bias_initializer='zeros',
                    activation='sigmoid'))

    # Optimizer
    # Passing only the optimizer *name* to compile() would ignore the sampled
    # learning rate, so build an optimizer instance that carries it.

    hp_optimizer = hp.Choice('optimizer', optimizer)
    optimizer_instance = tf.keras.optimizers.get(
        {'class_name': hp_optimizer, 'config': {'learning_rate': hp_learning_rate}})

    model.compile(loss='binary_crossentropy',
                optimizer=optimizer_instance,
                metrics=['binary_accuracy'])
    return model

In [None]:
# Setup callback

def get_callbacks():
    """Create the Keras callbacks used by the training runs.

    Returns:
        list: ``[early_stopping, learning_rate_reduction, checkpoint_epoch,
        checkpoint_best]`` where

        * ``early_stopping`` stops training once ``val_loss`` has not
          improved for 100 epochs,
        * ``learning_rate_reduction`` multiplies the learning rate by 0.2
          after 5 epochs without improvement of its monitored quantity,
        * ``checkpoint_epoch`` writes the weights after every epoch to
          ``checkpoints_every_epoch/checkpoint_<epoch>``,
        * ``checkpoint_best`` writes the weights to
          ``checkpoints_best_only/checkpoint`` only when ``val_loss`` improves.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=100, verbose=1)

    learning_rate_reduction = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, verbose=1)

    checkpoint_path = 'checkpoints_every_epoch/checkpoint_{epoch:03d}'
    # `save_freq` is the ModelCheckpoint argument for the save interval;
    # the previous `frequency=` keyword is not part of its signature.
    checkpoint_epoch = ModelCheckpoint(filepath=checkpoint_path,
                                 save_freq='epoch',
                                 save_weights_only=True,
                                 verbose=0)

    checkpoint_best_path = 'checkpoints_best_only/checkpoint'
    checkpoint_best = ModelCheckpoint(filepath=checkpoint_best_path,
                                      save_weights_only=True,
                                      save_freq='epoch',
                                      monitor='val_loss',
                                      save_best_only=True,
                                      verbose=0)

    return [early_stopping, learning_rate_reduction, checkpoint_epoch, checkpoint_best]


#callbacks = get_callbacks()

In [None]:
# Define a 'Score' function similar to Sklearn's RandomForestRegressor, etc. for regression analysis

def coeff_determination(y_true, y_pred):
    """Return the coefficient of determination (R^2) as a percentage.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted values. Either array-like with one prediction per
            observation, or a scalar that is used as a constant prediction.

    Returns:
        ``(1 - SS_res / SS_tot) * 100`` rounded to two decimals. A small
        epsilon (1e-7) is added to ``SS_tot`` so constant targets do not
        divide by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Flatten array inputs so an (n, 1) prediction column cannot broadcast
    # against (n,) targets into an (n, n) matrix. Scalars are left untouched
    # so a constant prediction still broadcasts over every observation.
    if actual.ndim > 0:
        actual = actual.ravel()
    if predicted.ndim > 0:
        predicted = predicted.ravel()
    ss_res = np.sum(np.square(actual - predicted))
    ss_tot = np.sum(np.square(actual - np.mean(actual)))
    return round((1 - ss_res / (ss_tot + 1e-7)) * 100, 2)


In [None]:
# Analyze a 'feature_name' for 'Survived' rate by classification, without modifying the 'feature_name'

def analyze_feature_name(dataset, feature_name):
    """Print the survival rate for every distinct value of a feature.

    The dataset is not modified. Missing values of the feature are skipped:
    they can never match an equality test, and they would make the sort fail
    on string columns.

    Args:
        dataset: DataFrame containing ``feature_name`` and a 'Survived' column.
        feature_name: Name of the column whose values are analyzed.

    Returns:
        None. One line per distinct value is printed, in sorted order.
    """
    categories = np.sort(dataset[feature_name].dropna().unique())
    for category in categories:
        in_category = dataset[feature_name] == category
        survived = dataset.loc[in_category, 'Survived']
        # count() ignores NaN, so rows with an unknown outcome are excluded
        # from both the numerator and the denominator.
        known = survived.count()
        rate = (survived.sum() / known) * 100 if known else float('nan')
        print(feature_name, "-", category, " survival rate = {0:4.1f}%".format(rate))

In [None]:
# Convert string in a 'feature_name' (such as 'Name') to pick out the last name of the person and analyze the 'feature_name'

def analyze_feature_name_to_return_last_name(dataset, feature_name):
    """Reduce a name column to the last name and print survival rates per name.

    NOTE: ``dataset`` is modified in place - the column is overwritten with
    the text before the first comma. Values without a comma are kept whole.

    Args:
        dataset: DataFrame containing ``feature_name`` (strings) and a
            'Survived' column.
        feature_name: Name of the column holding the full names.

    Returns:
        None. One line per distinct last name is printed, in order of first
        appearance.
    """
    # split(',', 1)[0] keeps a comma-free value intact, whereas slicing with
    # str.find() would drop its last character (find() returns -1).
    dataset.loc[:, feature_name] = dataset[feature_name].apply(lambda full_name: full_name.split(',', 1)[0])

    # Analyze the data

    for last_name in dataset[feature_name].unique():
        in_family = dataset[feature_name] == last_name
        survived = dataset.loc[in_family, 'Survived']
        # count() ignores NaN, so rows with an unknown outcome are excluded.
        known = survived.count()
        rate = (survived.sum() / known) * 100 if known else float('nan')
        print(feature_name, "-", last_name, " survival rate = {0:4.1f}%".format(rate))
        

In [None]:
# Reduce each string in 'feature_name' (such as 'Ticket') to its first 3 characters and analyze the 'feature_name'

def analyze_feature_name_to_return_first_3_letters(dataset, feature_name):
    """Truncate a column to 3-character prefixes and print survival rates.

    NOTE: ``dataset`` is modified in place - every value of the column is
    replaced by the first three characters of its string form.

    Args:
        dataset: DataFrame containing ``feature_name`` and a 'Survived' column.
        feature_name: Name of the column to truncate and analyze.

    Returns:
        None. One line per distinct prefix is printed, in order of first
        appearance.
    """
    dataset.loc[:, feature_name] = dataset[feature_name].apply(lambda value: str(value)[:3])

    # Survival rate per prefix; NaN outcomes are ignored by sum() and count().
    for prefix in dataset[feature_name].unique():
        has_prefix = dataset[feature_name] == prefix
        outcomes = dataset.loc[has_prefix, 'Survived']
        print(feature_name, "-", prefix, " survival rate = {0:4.1f}%".format((outcomes.sum()/outcomes.count())*100))

In [None]:
# Convert a unique class (string or numerical) 'feature_name' to categorical

def convert_unique_feature_name_to_caterorical(dataset, feature_name):
    """Replace each distinct value of a column by its rank among the sorted values.

    NOTE: ``dataset`` is modified in place - the column is overwritten with
    integer codes 0, 1, 2, ... assigned in ascending order of the original
    values.

    Args:
        dataset: DataFrame containing ``feature_name``.
        feature_name: Name of the column to encode.

    Returns:
        None.
    """
    categories = np.sort(dataset[feature_name].unique())
    # Build the whole value -> code table first and apply it in one pass.
    # Replacing values one at a time could re-map a code that was just
    # written (e.g. -5 -> 0, followed by 0 -> 1).
    codes = {category: code for code, category in enumerate(categories)}
    dataset[feature_name] = dataset[feature_name].map(codes)

In [None]:
# Convert a numerical-value 'feature_name' to categorical based on ranges

# This is another way to do the same:

#dataset['AgeBand'] = pd.cut(dataset['Age'], 5)
#dataset[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

def convert_numerical_feature_name_to_categorical(dataset, feature_name, min_value, max_value, steps):
    """Replace a numerical column by the index of the equal-width bin it falls in.

    The range ``[min_value, max_value]`` is divided into ``steps`` bins of
    equal width. Bin ``k`` covers ``(min_value + k*w, min_value + (k+1)*w]``
    with ``w = (max_value - min_value) / steps``; ``min_value`` itself belongs
    to bin 0. Values outside the range and missing values are left unchanged.

    NOTE: ``dataset`` is modified in place.

    Args:
        dataset: DataFrame containing ``feature_name``.
        feature_name: Name of the numerical column to bin.
        min_value: Lower edge of the first bin.
        max_value: Upper edge of the last bin.
        steps: Number of bins (integer).

    Returns:
        The same ``dataset`` object, after modification.
    """
    bin_edges = np.linspace(min_value, max_value, steps + 1)
    # Every bin index is computed from the original values in a single pass,
    # so an index that was just written can never be binned a second time.
    bin_index = pd.cut(dataset[feature_name], bins=bin_edges, labels=False, include_lowest=True)
    in_range = bin_index.notna()
    dataset.loc[in_range, feature_name] = bin_index[in_range].astype(int).to_numpy()

    return dataset

In [None]:
# One hot encode a list of features

def convert_feature_to_one_hot(database, features):
    """One-hot encode the listed columns.

    Each listed column is removed and its indicator columns, named
    ``<column>_<value>``, are appended to the right of the frame. The input
    frame is not modified.

    Args:
        database: DataFrame containing every column named in ``features``.
        features: Iterable of column names to encode.

    Returns:
        A new DataFrame with the encoded columns.
    """
    encoded = database
    for column in features:
        indicator_columns = pd.get_dummies(encoded[column], prefix=column)
        encoded = encoded.drop(column, axis=1).join(indicator_columns)
    return encoded

In [None]:
# Binary-encode each feature in a list (using category_encoders' BinaryEncoder)

def convert_feature_to_binary(database, features):
    """Binary-encode the listed columns, one column at a time.

    A fresh ``ce.BinaryEncoder`` is fitted for each listed column and applied
    to the frame produced by the previous step.

    Args:
        database: DataFrame containing every column named in ``features``.
        features: Iterable of column names to encode.

    Returns:
        The frame returned by the last ``fit_transform`` call (``database``
        itself when ``features`` is empty).
    """
    encoded = database
    for column in features:
        encoded = ce.BinaryEncoder(cols=[column]).fit_transform(encoded)
    return encoded

<a id="TOC-03"></a>
# Examine 'NaN'

>The first step is to check whether the data contains 'NaN' values and to rectify them where the analysis requires it.

In [None]:
# Report the number of missing values per column for both splits.
# 'Survived' is skipped for test_data (it is only looked up in train_data).
list_columns = train_data.columns.tolist()
for column in list_columns:
    print("train_data Nan in ", column, " = ", train_data[column].isna().sum())
    if column == 'Survived':
        continue
    print("test_data Nan in ", column, " = ", test_data[column].isna().sum())

In [None]:
# All the features will be updated, as required, after combining the test_data and train_data into one combined all_data
# You can use a different approach from what is shown below. You can make a combined 'list' like all_data = [train_data, test_data] and do the same operations on the
# combined list. You would then loop with 'for dataset in all_data:' to apply the operations to both datasets.

all_data = pd.concat((train_data, test_data), axis=0, sort=False) # NOTE: this will add a 'Survived' column with NaN for the test_data portion of the data.

# 'Pclass' in all_data will be updated using the most frequent value to replace NaN - THIS IS NOT REQUIRED SINCE THERE ARE NO NaN VALUES.
all_data[['Pclass']] = all_data[['Pclass']].fillna(value=all_data['Pclass'].value_counts().idxmax())

# 'Name' in all_data will be updated using 'Unknown' to replace NaN
all_data[['Name']] = all_data[['Name']].fillna('Unknown')

# 'Age': is an important feature with many missing data (NaN). Build various models to predict 'Age' for all NaN in all_data.

# 'SibSp' in all_data will be updated using the most frequent value (mode) to replace NaN - THIS IS NOT REQUIRED SINCE THERE ARE NO NaN VALUES.
all_data[['SibSp']] = all_data[['SibSp']].fillna(value=all_data['SibSp'].value_counts().idxmax())

# 'Parch' in all_data will be updated using the most frequent value (mode) to replace NaN - THIS IS NOT REQUIRED SINCE THERE ARE NO NaN VALUES.
all_data[['Parch']] = all_data[['Parch']].fillna(value=all_data['Parch'].value_counts().idxmax())

# 'Ticket': will not be currently updated. If required, it will be modified for analysis or it will be removed.

# 'Fare': in all_data will be updated using the mean value for each 'Pclass'.
# The result is assigned back to the column: calling fillna(..., inplace=True) on the
# selected column may act on a temporary copy and leave all_data unchanged.
Fare_mean = all_data.groupby('Pclass')['Fare'].mean()
all_data['Fare'] = all_data['Fare'].fillna(all_data['Pclass'].map(Fare_mean))

### Another way to do this is using 'transform':
### Fare_mean = all_data.groupby('Pclass')['Fare'].transform('mean')
### all_data['Fare'] = all_data['Fare'].fillna(Fare_mean)

##### One more option: all_data['Fare'] = all_data.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.mean()))

# 'Cabin': has too many NaNs and will not be used in analyses

# 'Embarked' in all_data will be updated using the most frequent value to replace NaN
all_data[['Embarked']] = all_data[['Embarked']].fillna(value=all_data['Embarked'].value_counts().idxmax())

# 'is_test': will not be updated since it will not be used in analyses

all_data

In [None]:
# Report the number of missing values that remain in each column of the combined frame.
list_columns = all_data.columns.tolist()
for column in list_columns:
    missing_count = all_data[column].isna().sum()
    print("all_data Nan in ", column, " = ", missing_count)

<a id="TOC-04"></a>
# Convert to Categorical

>Examine each feature and convert to categorical, and remove unwanted features.

In [None]:
# Analyze 'Name' and convert to categorical based on last name

all_data_copy = all_data.copy()
display(all_data_copy.head())
analyze_feature_name_to_return_last_name(all_data_copy, 'Name')
display(all_data_copy.head())

In [None]:
# As seen above, there are hundreds of last names and they are quite useless in predicting the survivability of a person. Discard 'all_data_copy'.
# Instead of categorizing last names, let us extract the 'Title' of each person. Titles are generally related to the age of a person
# and may be useful in estimating the 'Age' feature as well as the 'Survived' feature.

del all_data_copy

# The title is the run of letters that follows a space and ends with a period (e.g. " Mr.").
# A raw string is used so that '\.' reaches the regex engine as written instead of being an invalid string escape.
all_data['Title'] = all_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

pd.crosstab(all_data['Title'], all_data['Sex'])

In [None]:
# Analyze 'Title'


# Fold the uncommon titles into 'Rare' and map the alternate spellings onto
# 'Miss' / 'Mrs', using one lookup table applied in a single replace call.
title_replacements = dict.fromkeys(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
    'Rare',
)
title_replacements.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
all_data['Title'] = all_data['Title'].replace(title_replacements)

# Mean survival per title, then the min / max / mean age per title.
survival_by_title = all_data[['Title', 'Survived']].groupby(['Title'], as_index=False)
age_by_title = all_data[['Title', 'Age']].groupby(['Title'], as_index=False)
display(survival_by_title.mean())
display(age_by_title.min())
display(age_by_title.max())
display(age_by_title.mean())

In [None]:
# From above, 'Title' is useful in estimating 'Age'

# Convert Title to categorical-nominal values

# Integer code for each title; any title missing from the mapping becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
all_data['Title'] = all_data['Title'].map(title_mapping).fillna(0)

all_data.head()

In [None]:
display(all_data)

In [None]:
# Evaluate 'Embarked'

display(all_data.head())
analyze_feature_name(all_data, 'Embarked')

In [None]:
# Convert 'Embarked' to categorical-nominal since survival rate is dependent on feature 'Embarked'

convert_unique_feature_name_to_caterorical(all_data, 'Embarked')
display(all_data.head())

In [None]:
# Analyze 'Sex'
display(all_data)
analyze_feature_name(all_data, 'Sex')

In [None]:
# Clearly survival rate is dependent on 'Sex'. Convert 'Sex' to categorical-nominal

convert_unique_feature_name_to_caterorical(all_data, 'Sex')
display(all_data)

# Could have also done it this way:
#all_data['Sex'] = all_data['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [None]:
# Analyze 'Pclass'.

display(all_data.head())
analyze_feature_name(all_data, 'Pclass')

In [None]:
# Clearly survival rate is dependent on 'Pclass'. But, Pclass has only three values 1, 2 and 3. So there is no need to categorize.

#convert_unique_feature_name_to_caterorical(train_data, 'Pclass')
display(all_data.head())

In [None]:
# Analyze 'SibSp'.

display(all_data)
analyze_feature_name(all_data, 'SibSp')

In [None]:
# Survival rate is dependent on 'SibSp'. But, 'SibSp' has only seven distinct values (0 to 5, and 8). So there is no need to categorize.
# Later on this will be binarized to reduce it to four features.

#convert_unique_feature_name_to_caterorical(all_data, 'SibSp')
display(all_data)

In [None]:
# Analyze 'Parch'.

display(all_data)
analyze_feature_name(all_data, 'Parch')

In [None]:
# Survival rate is dependent on 'Parch'. But, 'Parch' has only eight distinct values (0 to 6, and 9). So there is no need to categorize.
# Later on this will be binarized to reduce it to four features.

#convert_unique_feature_name_to_caterorical(all_data, 'Parch')
display(all_data)

In [None]:
# Combine SibSp and Parch as 'Family_members'

all_data['Family_members'] = all_data['SibSp'] + all_data['Parch']
display(all_data)

In [None]:
# Drop SibSp and Parch

all_data = all_data.drop(['SibSp', 'Parch'], axis=1)
display(all_data)

In [None]:
# Analyze 'Family_members'.

display(all_data)
analyze_feature_name(all_data, 'Family_members')

In [None]:
# Survival rate is dependent on 'Family_members'. But, 'Family_members' has only nine distinct values (0 to 7, and 10). So there is no need to categorize.
# Later on this will be binarized to reduce it to four features.

#convert_unique_feature_name_to_caterorical(all_data, 'Parch')
display(all_data)

In [None]:
# Almost all 'Fare' values are unique. So 'Fare' will be converted to categorical-range and analyzed.
# Later on this will be binarized to reduce it to four features.

display(all_data.head())
convert_numerical_feature_name_to_categorical(all_data, 'Fare', 0, 520, 16)
analyze_feature_name(all_data, 'Fare')
display(all_data.head())

In [None]:
# Obtain the first 3 letters of 'Ticket'. Then convert 'Ticket' to categorical.

all_data_copy = all_data.copy()
display(all_data_copy)
analyze_feature_name_to_return_first_3_letters(all_data_copy, 'Ticket')

In [None]:
# From above, the ticket numbers have no correlation with survivability or 'Age'. So this will be dropped. Delete 'all_data_copy'.

del all_data_copy
#convert_unique_feature_name_to_caterorical(all_data, 'Ticket')
display(all_data.head())

In [None]:
# Remove unwanted columns:
#     Remove 'Ticket'
#     Remove 'Cabin'
#     Now that 'Title' has been extracted, remove 'Name'

# Drop the three unused columns in a single in-place call.
all_data.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)

display(all_data)

In [None]:
# One hot encode some of the features

#print(all_data.shape)
#display(all_data.head())
features = ['Pclass', 'Sex', 'Embarked']
all_data = convert_feature_to_one_hot(all_data, features)
display(all_data)

In [None]:
# Binarize some features.

print(all_data.shape)
display(all_data.head())
features = ['Family_members', 'Fare', 'Title']
all_data = convert_feature_to_binary(all_data, features)
display(all_data)

In [None]:
print(all_data.columns)

<a id="TOC-05"></a>
# Estimate 'Age' - using a Regression model

>Age is an important component to predict survivability of a passenger. Remove all 'NaN' values by estimating age using a model.

In [None]:
# 'Age' is an important feature. Build model to predict missing ages and fill in the missing data.
# Generate DataFrames for 'Age' analyses and prediction

# Based on intuition, a combination of 'Title', 'Pclass' and 'Fare' should provide a reasonable estimate of age. It is assumed
# that for each passenger class (1,2,3) the ticket value will be based on age (discounted rates for children and seniors, 
# regular rates for adults). Although the charts don't really show this. We will let ML figure this out.

# None of the other features are in any way mathematically related to 'Age'

# We will use the following steps to estimate 'Age'.
# 1 - Create a new DataFrame from 'all_data'
# 2 - Separate out rows 'with age' and 'without age'.
# 3 - Split the 'with age' data into 'train' and 'check' sets.

features_for_Age = ['Title', 'Pclass', 'Fare', 'Age']

# Collect every column whose name contains one of the feature names
# (this also picks up derived columns such as '<feature>_<suffix>').
features_for_Age_columns = [
    col
    for feature in features_for_Age
    for col in all_data.columns
    if feature in col
]

df_for_age_all = all_data[features_for_Age_columns]
age_is_missing = df_for_age_all['Age'].isna()

# DataFrame of data with 'Age'
df_with_age = df_for_age_all.loc[~age_is_missing]

# DataFrame of data without 'Age' (the all-NaN 'Age' column is removed)
df_without_age = df_for_age_all.loc[age_is_missing].drop(columns='Age')

# Features / target for the rows with a known age; these are split into 'train' and 'check' sets next.
X_with_age_to_split = df_with_age.drop(columns='Age')
y_with_age_to_split = df_with_age['Age']

In [None]:
# Hold out 20% of the rows with a known age as the 'check' set (fixed seed for repeatability).
(X_with_age_train,
 X_with_age_check,
 y_with_age_train,
 y_with_age_check) = train_test_split(
    X_with_age_to_split,
    y_with_age_to_split,
    test_size=0.2,
    random_state=82,
)

# Specific data for 'Age' analyses and prediction - NO NEED TO PERFORM StandardScaler conversion

In [None]:
# Define the hyperparameter search space.
# These module-level names are read as globals by the model-builder functions
# (build_age_prediction_regression_model / build_survival_prediction_classification_model).

#'''
# First try

# Number of input features of the age-prediction training frame.
input_dim = X_with_age_train.shape[1]
#epochs = [100, 200]
optimizer = ['Adam', 'Nadam'] # optimizer names offered to hp.Choice('optimizer', ...)
learning_rate = [0.000001, 0.0001, 0.3] # offered to hp.Choice('learning_rate', ...)
# NOTE(review): 'momentum' is not read by either model-builder function in this notebook section.
momentum = [0.4, 0.9]
init_mode = ['glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'] # kernel initializers
activation = ['relu', 'tanh'] # activation of the input and hidden Dense layers
weight_constraint = [3, 5] # passed to max_norm(...) as the kernel constraint
dropout_rate = [0.2, 0.5] # rate of every Dropout layer
l2_weight_decay = [1e-6, 1e-5, 1e-3, 1e-2] # passed to regularizers.l2(...)
# cv = 5
#'''



In [None]:
tuner_age_model = kt.Hyperband(
    build_age_prediction_regression_model,
    objective='val_loss',
    directory='kt_02_01',
    project_name='Titanic_keras_tuner',
    #max_trials = 30,
    factor=3,
    max_epochs=200,
    executions_per_trial=5,
    overwrite=True)

tuner_age_model.search_space_summary()

In [None]:
# Setup callback
# Stop when the training loss has not improved for 25 epochs.
callback_early_stopping = EarlyStopping(monitor='loss', patience = 25)
checkpoint_path = 'mlp_checkpoints_every_epoch/checkpoint_{epoch:03d}'
# Save the weights after every epoch. `save_freq` is the ModelCheckpoint argument
# for the save interval; `frequency=` is not part of its signature.
callback_checkpoint_epoch = ModelCheckpoint(filepath=checkpoint_path,
                                 save_freq='epoch',
                                 save_weights_only=True,
                                 verbose=0)

In [None]:
callbacks_age = get_callbacks()
# get_callbacks() already returns a list of callbacks, so it is passed as-is;
# wrapping it in another list would hand Keras a nested list.
# NOTE(review): Hyperband presumably assigns its own per-trial epoch budget, so `epochs=600` may have no effect - confirm.
tuner_age_model.search(X_with_age_train, y_with_age_train.values, validation_split=0.2, batch_size=32, epochs=600, callbacks=callbacks_age)

In [None]:
model_best_age = tuner_age_model.get_best_models()[0]
model_best_age.summary()

In [None]:
tuner_age_model.results_summary()

In [None]:
# Get best hyperparameters

best_hps=tuner_age_model.get_best_hyperparameters(num_trials=1)[0]
best_hps.get('optimizer')

In [None]:
# Build the model with the optimal hyperparameters and train it on the data for 600 epochs

tuner_age_hypermodel = tuner_age_model.hypermodel.build(best_hps)

In [None]:
# Extract the best epoch, i.e. the epoch at which val_loss is at its minimum.

# callbacks_age is already a list of callbacks, so it is passed as-is rather than nested in another list.
history = tuner_age_hypermodel.fit(X_with_age_train, y_with_age_train.values, epochs=600, validation_split=0.2, verbose=0, callbacks=callbacks_age)
val_acc_per_epoch = history.history['val_accuracy']
val_loss_per_epoch = history.history['val_loss']
# history lists are 0-based; epochs are counted from 1.
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Retrain the model to the best number of epochs.
# The model is rebuilt from the best hyperparameters first, so training starts from fresh
# weights instead of continuing from the model that was already fitted above.

tuner_age_hypermodel = tuner_age_model.hypermodel.build(best_hps)
tuner_age_hypermodel.fit(X_with_age_train, y_with_age_train.values, epochs=best_epoch, validation_split=0.2)

In [None]:
# evaluate() returns the loss followed by the compiled metrics; the age model is compiled
# with the 'accuracy' metric, which is not an r-squared score.
eval_result = tuner_age_hypermodel.evaluate(X_with_age_check, y_with_age_check.values)
print("[check loss (mse), check accuracy]:", eval_result)

<a id="TOC-07"></a>
# Plot learning curves for age-prediction

In [None]:
# Plot the training and validation loss

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss vs. epochs')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
acc_my_age_model = coeff_determination(y_with_age_check.values, np.squeeze(tuner_age_hypermodel.predict(X_with_age_check)))
print(acc_my_age_model)
pred_my_age_model = tuner_age_hypermodel.predict(df_without_age)

<a id="TOC-08"></a>
# Use various predictions to predict 'Age' and compare

In [None]:
# Plot the 'check' data

plt.rcParams["figure.figsize"] = (20,3)
#plt.plot(np.squeeze(y_with_age_check.values))
X_axis = np.arange(len(y_with_age_check.values))
plt.bar(X_axis - 0.2, np.squeeze(y_with_age_check.values), 0.4)
#plt.plot(np.squeeze(tuner_age_hypermodel.predict(X_with_age_check)))
plt.bar(X_axis + 0.2, np.squeeze(tuner_age_hypermodel.predict(X_with_age_check)), 0.4)
plt.title('ID vs. Age')
plt.ylabel('Age')
plt.xlabel('ID')
plt.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
# What is the score if you use a mean value for age from training data?

acc_my_age_mean = coeff_determination(y_with_age_check.values, np.mean(y_with_age_train.values))
print(acc_my_age_mean)

In [None]:
# What is the score if you use a median value for age from training data?

acc_my_age_median = coeff_determination(y_with_age_check.values, np.median(y_with_age_train.values))
print(acc_my_age_median)

In [None]:
# Perceptron and Naive Bayes are classifiers.

In [None]:
# Support Vector Machines - support vector regressor (SVR with its default kernel; the linear variant is fitted further below)

svr_age = SVR()
svr_age.fit(X_with_age_train, np.squeeze(y_with_age_train.values))
Y_pred_svr = svr_age.predict(df_without_age)
acc_svr_age = round(svr_age.score(X_with_age_check, np.squeeze(y_with_age_check.values)) * 100, 2)
acc_svr_age

In [None]:
# Plot actual vs. predicted 'Age' for the check set (SVR)

plt.rcParams["figure.figsize"] = (20,3)
#plt.plot(np.squeeze(y_with_age_check.values))
#plt.plot(np.squeeze(svr_age.predict(X_with_age_check)))
plt.bar(X_axis - 0.2, np.squeeze(y_with_age_check.values), 0.4)
plt.bar(X_axis + 0.2, np.squeeze(svr_age.predict(X_with_age_check)), 0.4)
plt.title('ID vs. Age')
plt.ylabel('Age')
plt.xlabel('ID')
plt.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
#help(DecisionTreeRegressor.score)

In [None]:
# Decision Tree Regressor

fig = plt.figure(figsize=(200,100))
#fig.set_size_inches(40, 10)

decision_tree_regressor = DecisionTreeRegressor()
decision_tree_regressor = decision_tree_regressor.fit(X_with_age_train, np.squeeze(y_with_age_train.values))
Y_pred_decision_tree_regressor = decision_tree_regressor.predict(df_without_age)
acc_decision_tree_regressor = round(decision_tree_regressor.score(X_with_age_check, np.squeeze(y_with_age_check.values))*100 , 2)
print(acc_decision_tree_regressor)
#acc_decision_tree_regressor_1 = coeff_determination(y_with_age_check.values, decision_tree_regressor.predict(X_with_age_check))
#print(acc_decision_tree_regressor_1)

In [None]:
# Create the figure in the same cell that draws and saves it: with the inline backend a
# figure created in an earlier cell is no longer the active one, so the tree would be drawn
# on a new figure and the saved file would be empty.
# The very large canvas (200 x 100 inches) is kept from the original figure setup.
fig, ax = plt.subplots(figsize=(200, 100))
tree.plot_tree(decision_tree_regressor, filled=True, ax=ax)
fig.savefig("decision_tree.png")

In [None]:
# Plot actual vs. predicted 'Age' for the check set (decision tree regressor)

#plt.rcParams["figure.figsize"] = (20,3)
#plt.plot(np.squeeze(y_with_age.values))
#plt.plot(np.squeeze(decision_tree_regressor.predict(X_with_age)))
plt.bar(X_axis - 0.2, np.squeeze(y_with_age_check.values), 0.4)
plt.bar(X_axis + 0.2, np.squeeze(decision_tree_regressor.predict(X_with_age_check)), 0.4)
plt.title('ID vs. Age')
plt.ylabel('Age')
plt.xlabel('ID')
plt.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
# Random Forest Regressor

random_forest_regressor = RandomForestRegressor(n_estimators=100)
random_forest_regressor = random_forest_regressor.fit(X_with_age_train, np.squeeze(y_with_age_train.values))
Y_pred_random_forest_regressor = random_forest_regressor.predict(df_without_age)
acc_random_forest_regressor = round(random_forest_regressor.score(X_with_age_check, np.squeeze(y_with_age_check.values)) * 100, 2)
acc_random_forest_regressor

In [None]:
# Plot actual vs. predicted 'Age' for the check set (random forest regressor)

#plt.rcParams["figure.figsize"] = (20,3)
#plt.plot(np.squeeze(y_with_age.values))
#plt.plot(np.squeeze(random_forest_regressor.predict(X_with_age)))
plt.bar(X_axis - 0.2, np.squeeze(y_with_age_check.values), 0.4)
plt.bar(X_axis + 0.2, np.squeeze(random_forest_regressor.predict(X_with_age_check)), 0.4)
plt.title('ID vs. Age')
plt.ylabel('Age')
plt.xlabel('ID')
plt.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
#help(RandomForestRegressor.score)

In [None]:
# Stochastic Gradient Descent regressor: fit on the 'Age' training split, predict
# the missing ages, and keep the R^2 score on the 'check' split as a percentage.

age_train_target = np.squeeze(y_with_age_train.values)
age_check_target = np.squeeze(y_with_age_check.values)

sgd_regressor = SGDRegressor().fit(X_with_age_train, age_train_target)
Y_pred_sgd_regressor = sgd_regressor.predict(df_without_age)
r2_sgd_regressor = sgd_regressor.score(X_with_age_check, age_check_target)
acc_sgd_regressor = round(r2_sgd_regressor * 100, 2)
acc_sgd_regressor

In [None]:
# Side-by-side bars: actual vs. SGD-predicted 'Age' on the 'check' split

age_actual = np.squeeze(y_with_age_check.values)
age_predicted = np.squeeze(sgd_regressor.predict(X_with_age_check))

ax = plt.gca()
ax.bar(X_axis - 0.2, age_actual, 0.4)
ax.bar(X_axis + 0.2, age_predicted, 0.4)
ax.set_title('ID vs. Age')
ax.set_ylabel('Age')
ax.set_xlabel('ID')
ax.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
# Linear SVR: fit on the 'Age' training split, predict the missing ages, and
# keep the R^2 score on the 'check' split as a percentage.

age_train_target = np.squeeze(y_with_age_train.values)
age_check_target = np.squeeze(y_with_age_check.values)

linear_svr = LinearSVR(max_iter=100000).fit(X_with_age_train, age_train_target)
Y_pred_linear_svr = linear_svr.predict(df_without_age)
r2_linear_svr = linear_svr.score(X_with_age_check, age_check_target)
acc_linear_svr = round(r2_linear_svr * 100, 2)
acc_linear_svr

In [None]:
# Side-by-side bars: actual vs. LinearSVR-predicted 'Age' on the 'check' split

age_actual = np.squeeze(y_with_age_check.values)
age_predicted = np.squeeze(linear_svr.predict(X_with_age_check))

ax = plt.gca()
ax.bar(X_axis - 0.2, age_actual, 0.4)
ax.bar(X_axis + 0.2, age_predicted, 0.4)
ax.set_title('ID vs. Age')
ax.set_ylabel('Age')
ax.set_xlabel('ID')
ax.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
# K-Neighbors Regressor (k = 3): fit on the 'Age' training split, predict the
# missing ages, and keep the R^2 score on the 'check' split as a percentage.

age_train_target = np.squeeze(y_with_age_train.values)
age_check_target = np.squeeze(y_with_age_check.values)

knr = KNeighborsRegressor(n_neighbors = 3).fit(X_with_age_train, age_train_target)
Y_pred_knr = knr.predict(df_without_age)
r2_knr = knr.score(X_with_age_check, age_check_target)
acc_knr = round(r2_knr * 100, 2)
acc_knr

In [None]:
# Side-by-side bars: actual vs. KNR-predicted 'Age' on the 'check' split

age_actual = np.squeeze(y_with_age_check.values)
age_predicted = np.squeeze(knr.predict(X_with_age_check))

ax = plt.gca()
ax.bar(X_axis - 0.2, age_actual, 0.4)
ax.bar(X_axis + 0.2, age_predicted, 0.4)
ax.set_title('ID vs. Age')
ax.set_ylabel('Age')
ax.set_xlabel('ID')
ax.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

<a id="TOC-09"></a>
# Summarize accuracy of 'Age' prediction by various methods

In [None]:
# Collect the 'check'-split score of every age model into one table, best first

age_model_scores = {
    'My Age Model': acc_my_age_model,
    'Support Vector Machines': acc_svr_age,
    'KNR': acc_knr,
    'Random Forest': acc_random_forest_regressor,
    'Stochastic Gradient Decent': acc_sgd_regressor,
    'Linear SVR': acc_linear_svr,
    'Decision Tree': acc_decision_tree_regressor,
}
prediction_models = pd.DataFrame({
    'Model': list(age_model_scores.keys()),
    'Score': list(age_model_scores.values()),
})
prediction_models.sort_values(by='Score', ascending=False)

In [None]:
# Show the combined data and the with-age / without-age subsets, in that order
display(all_data, df_with_age, df_without_age)

<a id="TOC-10"></a>
# Predict missing 'Age'

In [None]:
# Fill in the missing 'Age' values with the predictions of the selected age
# model (here: my age model) and rebuild `all_data` from the two parts.

#predicted_ages = age_model.predict(sc_without_age[['Pclass', 'Fare']].values)
# NOTE(review): assumes the predictions are ordered like the missing-age rows of
# `all_data` and have one value per row — confirm against where
# pred_my_age_model is produced.
predicted_ages = pred_my_age_model

# DataFrame of data with 'Age'
all_data_with_age = all_data.dropna(subset=['Age'])

# DataFrame of data without 'Age'.
# Take an explicit copy: the rows are a selection of `all_data`, and writing a
# column into such a selection is ambiguous (SettingWithCopyWarning) — the copy
# makes it an independent frame that is safe to modify.
all_data_without_age = all_data.loc[all_data['Age'].isna()].copy()
all_data_without_age.pop('Age')

all_data_without_age.loc[:, 'Age'] = predicted_ages[:]

# Recombine to form the complete train and test data, restoring the original row order

all_data_with_age = pd.concat((all_data_with_age, all_data_without_age), axis=0, sort=False)
all_data_with_age.sort_index(inplace=True)

# Rebuild all_data
# NOTE: this rebinds `all_data`, so the cell is not idempotent — on a second run
# there are no missing ages left to fill.

all_data = all_data_with_age.copy()
display(all_data)

<a id="TOC-11"></a>
# Prepare data for 'Survived' prediction

In [None]:
# Report, per column of all_data, how many values are still missing
list_columns = list(all_data.columns)
print(list_columns)
print(train_data.columns)
missing_per_column = all_data.isna().sum()
for column in list_columns:
    print("all_data Nan in ", column, " = ", missing_per_column[column])

In [None]:
# Analyze 'Age' and convert to categorical
# Shows the first rows before and after so the effect of the conversion is visible.

display(all_data.head())
# NOTE(review): helper defined elsewhere; the arguments 0, 80, 16 presumably are
# the lower bound, upper bound and bin size/count for 'Age' — confirm against
# its definition. It appears to modify all_data in place (no return value is used).
convert_numerical_feature_name_to_categorical(all_data, 'Age', 0, 80, 16)
analyze_feature_name(all_data, 'Age')
display(all_data.head())

In [None]:
# Binarize 'Age'
# Prints the shape and first rows before the conversion for comparison.

print(all_data.shape)
display(all_data.head())
features = ['Age']
# convert_feature_to_binary is defined elsewhere; its result replaces all_data
# (presumably with indicator columns for each 'Age' category — TODO confirm).
all_data = convert_feature_to_binary(all_data, features)

In [None]:
# Inspect the first rows of all_data after 'Age' has been binarized
display(all_data.head())

In [None]:
# List the column names currently present in all_data
print(all_data.columns)

In [None]:
# Experiment: drop every column whose name contains one of the listed feature
# names (except 'is_test'). A backup of the frame is kept in copy_of_all_data.
#'''
copy_of_all_data = all_data.copy()

features_for_Prediction_to_drop = ['Title']
features_for_Prediction_to_drop_columns = [
    column_name
    for feature_name in features_for_Prediction_to_drop
    for column_name in all_data.columns
    if feature_name in column_name
]
print(features_for_Prediction_to_drop_columns)

all_data = all_data.drop(columns=features_for_Prediction_to_drop_columns)
display(all_data)
#'''

In [None]:
# Rebuild train_data and test_data
# Rows are selected by the 'is_test' marker, which is then dropped. drop()
# returns a new frame, so nothing is mutated on a filtered selection of
# all_data (popping a column from such a selection triggers
# SettingWithCopyWarning).

train_data = all_data[all_data['is_test'] == 0].drop(columns='is_test')
print(train_data.shape)
display(train_data.head())

test_data = all_data[all_data['is_test'] == 1].drop(columns='is_test')
print(test_data.shape)
display(test_data.head())


In [None]:
# Finalize train and test data
# Split the target column off the training frame; work on copies so that
# train_data / test_data themselves stay untouched.

train_X = train_data.copy()
y_column_name = "Survived"
train_y = train_X.pop(y_column_name)

# Similarly setup test data
test_X = test_data.copy()
# The popped target column of the test rows is discarded (presumably it holds
# only placeholder values there — TODO confirm).
test_X.pop(y_column_name)

# Keep DataFrame references under separate names; later cells rebind
# train_X / train_y / test_X to other objects.
train_X_df = train_X
train_y_df = train_y
test_X_df = test_X

display(train_X)
display(test_X)

In [None]:
# Convert to ndarray
# train_X / train_y are stored under new *_all names; test_X is rebound from a
# DataFrame to an ndarray (the DataFrame stays reachable as test_X_df).
train_X_all = np.asarray(train_X)
train_y_all = np.asarray(train_y)
test_X = np.asarray(test_X)

<a id="TOC-12"></a>
# Select an appropriate random_state value to split the input data into 'train' and 'check' datasets

In the 'train' data we have the following:<br>
    <ul>
    <li>Total passengers = 891 (passengers and crew)</li>
    <li>Survived = 342</li>
    <li>Percent survived = 38.38%</li>
    </ul>
    
From the internet we have the following (https://comparecamp.com/titanic-statistics/):<br>
    <ul>
    <li>Total passengers = 2,223 (passengers and crew)</li>
    <li>Total passengers = 1,316 (passengers only)</li>
    <li>Total crew = 885 (crew only)</li>
    <li>Survived = 723 (approximately)</li>
    <li>Percent survived = 33.04%</li>
    </ul>
    <ul>
    <li>There were a total of 325 first-class passengers on the ship.</li>
    <li>Cost of first-class (parlor suite) one-way ticket was £870 or $4,350 ($83,200 today).</li>
    <li>285 was the number of second-class passengers aboard the RMS Titanic.</li>
    <li>Second-class tickets cost £12 or $60 ($1200 today).</li>
    <li>Third-class passengers on board were 706.</li>
    <li>£3 to £8 or $40 ($298 to $793 today) was the cost of third-class tickets.</li>
    </ul>
    
For the 'test' data we have the following:<br>
    <ul>
    <li>Total passengers = 417 (passengers and crew)</li>
        <blockquote>The overall survival rate was 33.04%. <br>The 'train' set survival rate is 38.38%.
        <br>If the 'train' and 'test' samples together represent the overall data, then
        the approximate survival rate in the 'test' data should be:
            <ul>
            <li>Survived overall = (891 + 417)*0.3304 = 432</li>
            <li>Survived 'test' = 432 - 342 = 90</li>
            <li>Survived 'test' percentage = 21.61%</li>
            </ul>
        </blockquote>
    </ul>
    
Based on this, if we select a 'train_y' split data with 'Survived' = 38.3%, then when the accuracy is 100%,
the 'check_y' data should also give 'Survived' = 38.3%.
    
In addition, the 'test' data should give 'Survived' = 21.61%. However, this estimate may not be correct
because it is based on the assumption that the <br>overall data ('train' and 'test' data) is an accurate
representation of the original Titanic data. But, if the end result does show a survival <br>rate of around 21%,
I will be quite happy.

In [None]:
# Hold out 20% of the labelled rows as a 'check' set.
# random_state = 0 was chosen following the reasoning in the notes above: it is
# reported to give a train and check split with a survival rate of about 38.3%.

train_X, check_X, train_y, check_y = train_test_split(train_X_all, train_y_all, test_size=0.2, random_state=0)

<a id="TOC-13"></a>
# Estimate 'Survived' - using a Classification model

<a id="TOC-14"></a>
# Setup Keras Tuner and Optimize Hyperparameters
    
    Experimented with GridSearchCV; the model had to run overnight. The same search (though not as thorough) done with
    Keras Tuner (Hyperband and BayesOptimization) took less than 1 hour. Though Keras Tuner is a bit of a black box,
    it appears to give reasonable results.

In [None]:
# define the grid search parameters
# Candidate values for each hyperparameter.
# NOTE(review): none of these lists is referenced again in this part of the
# notebook; presumably the model-building function (defined elsewhere) reads
# them or `input_dim` — confirm, otherwise they are leftovers of the
# GridSearchCV experiment.


#'''
# First try

# Number of input features, taken from the training matrix
input_dim = train_X.shape[1]
#epochs = [100, 200]
optimizer = ['Adam', 'Nadam'] #
learning_rate = [0.000001, 0.0001, 0.3] #
momentum = [0.4, 0.9]
init_mode = ['glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'] #
activation = ['relu', 'tanh'] #
weight_constraint = [3, 5] #
dropout_rate = [0.2, 0.5] #
weight_decay = [1e-6, 1e-5, 1e-3, 1e-2] #
# cv = 5


#'''


In [None]:
# Hyperband tuner for the 'Survived' classifier. Models come from
# build_survival_prediction_classification_model (defined elsewhere) and are
# ranked by validation binary accuracy. overwrite=True discards any previous
# results stored under directory/project_name.
tuner_survived = kt.Hyperband(
    build_survival_prediction_classification_model,
    objective='val_binary_accuracy',
    directory='kt_02_01',
    project_name='Titanic_keras_tuner',
    factor=3,
    max_epochs=50,
    executions_per_trial=1,
    overwrite=True)

# Print the hyperparameters the tuner will search over
tuner_survived.search_space_summary()

In [None]:
# Search for the best model.
# 20% of the training rows are held out as validation data for each trial.
# NOTE(review): get_callbacks() is defined elsewhere; if it already returns a
# list, wrapping it in another list below would nest the callbacks — confirm.
# NOTE(review): the tuner was created with max_epochs=50, and Hyperband
# presumably schedules the epochs of each trial itself, so epochs=200 may not
# take effect — verify.
callbacks_survived = get_callbacks()
tuner_survived.search(train_X, train_y, validation_split=0.2, batch_size=8, epochs=200, callbacks=[callbacks_survived])

In [None]:
# Extract the best model
# get_best_models() returns a list; the first entry is taken as the best one.

best_model_survived = tuner_survived.get_best_models()[0]
best_model_survived.summary()

In [None]:
# Print a summary of the tuner's trial results
tuner_survived.results_summary()

In [None]:
# Get best hyperparameters
# Only the single best set is requested (num_trials=1); the cell output shows
# its 'optimizer' value.

best_hps_survived=tuner_survived.get_best_hyperparameters(num_trials=1)[0]
best_hps_survived.get('optimizer')

In [None]:
# Build a model with the optimal hyperparameters.
# This cell only builds the model; the training for up to 600 epochs happens in the following cell.
tuner_survived_hypermodel = tuner_survived.hypermodel.build(best_hps_survived)

In [None]:
# Train for up to 600 epochs and extract the best epoch, i.e. the one where
# val_loss_per_epoch is minimum.
# NOTE(review): callbacks_survived may stop training early, in which case the
# history holds fewer than 600 entries — depends on get_callbacks(), defined elsewhere.

history = tuner_survived_hypermodel.fit(train_X, train_y, epochs=600, validation_split=0.2, batch_size=8, verbose=0, callbacks=[callbacks_survived])
val_bin_acc_per_epoch = history.history['val_binary_accuracy']
val_loss_per_epoch = history.history['val_loss']
# list.index returns the first occurrence of the minimum; +1 turns the 0-based
# position into an epoch count.
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Retrain the model to the best epoch.
# The model is rebuilt from the best hyperparameters first: calling fit() on the
# model that has already been trained would continue from its current weights
# (adding `best_epoch` more epochs on top) instead of training a fresh model for
# exactly `best_epoch` epochs.

tuner_survived_hypermodel = tuner_survived.hypermodel.build(best_hps_survived)
tuner_survived_hypermodel.fit(train_X, train_y, epochs=best_epoch, batch_size=8, validation_split=0.2)

In [None]:
# Evaluate the trained model on the held-out 'check' split.
# NOTE(review): the printed label assumes evaluate() returns
# [loss, binary_accuracy]; this depends on how the model was compiled
# (elsewhere) — confirm.
eval_result = tuner_survived_hypermodel.evaluate(check_X, check_y)
print("[check loss (val_loss), check accuracy (val_binary_accuracy)]:", eval_result)

In [None]:
# Threshold the predicted probabilities at 0.5 and compute the accuracy
# (percent, 2 decimals) on the 'check' split by direct comparison.
check_probabilities = np.squeeze(tuner_survived_hypermodel.predict(check_X, verbose=0))
predictions_check = (check_probabilities > 0.5).astype('int32')
check_hits = predictions_check == check_y
acc_my_survived_model = round(np.mean(check_hits)*100, 2)
print(acc_my_survived_model)

In [None]:
# Plot the 'check' data: given vs. predicted 'Survived' as side-by-side bars.
# The title and axis label say 'Survived' (this cell plots survival labels;
# the 'Age' wording came from the age-model plots).

plt.rcParams["figure.figsize"] = (20,3)
#plt.plot(np.squeeze(y_with_age_check.values))
# One bar position per row of the 'check' split
X_axis = np.arange(len(check_y))
plt.bar(X_axis - 0.2, check_y, 0.4)
#plt.plot(np.squeeze(tuner_survived_hypermodel.predict(X_with_age_check)))
plt.bar(X_axis + 0.2, np.squeeze(predictions_check), 0.4)
plt.title('ID vs. Survived')
plt.ylabel('Survived')
plt.xlabel('ID')
plt.legend(['Y', 'Prediction'], loc='upper right')
plt.show()

In [None]:
# Load weights from last epoch or the best epoch

def _load_latest_checkpoint(model, checkpoint_dir):
    """Load the newest checkpoint found in `checkpoint_dir` into `model`.

    Args:
        model: an already-built model exposing `load_weights(path)`.
        checkpoint_dir: directory that is searched for checkpoints.

    Returns:
        The same `model` object, with the checkpoint weights loaded.

    Raises:
        FileNotFoundError: if no checkpoint is found in `checkpoint_dir`
            (tf.train.latest_checkpoint signals this by returning None).
    """
    filepath = tf.train.latest_checkpoint(checkpoint_dir)
    if filepath is None:
        # Fail with a clear message instead of passing None to load_weights.
        raise FileNotFoundError(f"No checkpoint found in '{checkpoint_dir}'")
    model.load_weights(filepath)
    return model


def get_model_last_epoch(model):
    """
    Load the latest checkpoint from 'checkpoints_every_epoch' into the given
    model and return that same model (no new model instance is created).

    Raises FileNotFoundError if the directory contains no checkpoint.
    """
    return _load_latest_checkpoint(model, 'checkpoints_every_epoch')


def get_model_best_epoch(model):
    """
    Load the latest checkpoint from 'checkpoints_best_only' into the given
    model and return that same model (no new model instance is created).

    Which epoch counts as "best" is decided by whatever wrote that directory
    (presumably a best-only checkpoint callback configured elsewhere).

    Raises FileNotFoundError if the directory contains no checkpoint.
    """
    return _load_latest_checkpoint(model, 'checkpoints_best_only')
    

<a id="TOC-15"></a>
# Plot the 'Survived' learning curves

In [None]:
# Learning curves: training vs. validation loss per epoch

loss_curves = history.history
ax = plt.gca()
ax.plot(loss_curves['loss'])
ax.plot(loss_curves['val_loss'])
ax.set_title('Loss vs. epochs')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Confusion matrix
# Predicted probabilities on the 'check' split are thresholded at 0.5, giving
# boolean predictions that are compared against the given labels.

predict_check_y = tuner_survived_hypermodel.predict(check_X)
predict_check_y = (predict_check_y > 0.5)

# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from sklearn.metrics import confusion_matrix
# First argument is the given labels, second the predictions
cm = confusion_matrix(check_y, predict_check_y)

In [None]:
# Heat map of the confusion matrix.
# Rows are the given labels (check_y), columns the predicted labels
# (predict_check_y); the ticks name the two classes, in the sorted label order
# used by confusion_matrix. fmt='d' prints the integer counts as-is (the
# default format switches to scientific notation for counts of 100 or more).

import seaborn as sns
class_labels = ['Not survived (0)', 'Survived (1)']
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', yticklabels=class_labels, xticklabels=class_labels)
heatmap_ax.set_xlabel('Predicted (predict_check_y)')
heatmap_ax.set_ylabel('Given (check_y)')
heatmap_ax.set_title('Given vs Predicted')

In [None]:
# Manual check of the 'train' predictions: threshold at 0.5, then compare the
# predicted and the given survival rates.

train_probabilities = tuner_survived_hypermodel.predict(train_X, verbose=0)
predictions_train = (train_probabilities > 0.5).astype('int32')
# Give train_y one column so its shape matches the predictions
train_y = train_y.reshape(predictions_train.shape[0],1)

flat_predictions = list(np.squeeze(predictions_train))
flat_given = list(np.squeeze(train_y.astype('int32')))
predictions_manual_df = pd.DataFrame({
    'predictions_train' : flat_predictions,
    'train_y' : flat_given
})
display(predictions_manual_df.describe())
# Survival rates (percent, 2 decimals): given labels and model predictions
percent_survived_y = round(np.mean(train_y)*100, 2)
percent_survived_train = round(np.mean(predictions_train)*100, 2)

In [None]:
# Predict 'Survived' for the test set: probabilities above 0.5 become 1, the rest 0
test_probabilities = tuner_survived_hypermodel.predict(test_X)
pred_my_survived_model = (test_probabilities > 0.5).astype('int32')

In [None]:
# Pair every test PassengerId with its prediction and attach the predictions to
# the original test data.
# Column names are passed to the constructor and set_index is used without
# inplace: the `inplace` argument of DataFrame.set_axis was removed in pandas 2.0.
pred_my_survived_model_df = pd.DataFrame(
    zip(PassengerId_test, np.squeeze(pred_my_survived_model)),
    columns=['PassengerId', 'Predictions_My_Survived_Model'],
)
# Index by PassengerId (the column is also kept) so the concat below aligns on it.
# NOTE(review): assumes test_data_original is indexed by PassengerId — confirm.
pred_my_survived_model_df = pred_my_survived_model_df.set_index('PassengerId', drop=False)
test_data_final = pd.concat([test_data_original, pred_my_survived_model_df], axis=1)
display(test_data_final)

In [None]:
# Summary statistics of the test predictions, and the predicted survival rate in
# percent (mean of the 0/1 predictions times 100), printed with one decimal.
display(pred_my_survived_model_df.describe())
percent_survived_test_my_model = (np.mean(pred_my_survived_model_df.Predictions_My_Survived_Model)*100)
print(f'{percent_survived_test_my_model:.1f}')

In [None]:
# Build a string such as '64x32x1' from the second dimension of every layer's
# output shape, in layer order.
layer_dimensions = 'x'.join(
    str(layer.get_output_at(0).get_shape().as_list()[1])
    for layer in tuner_survived_hypermodel.layers
)

#print(layer_dimensions)

In [None]:
#sys.exit('Stopping the code at this location.')

<a id="TOC-16"></a>
# Predict using various methods and compare

In [None]:
# Generate alias variable names
# The sklearn cells below use X_train / Y_train / X_test. train_y was reshaped
# to a single column in an earlier cell, so it is squeezed back here.

X_train = train_X
Y_train = np.squeeze(train_y)
X_test = test_X

In [None]:
# Logistic Regression: fit on the training split, predict the test set, and keep
# the accuracy on the 'check' split as a percentage (2 decimals).

logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred_log = logreg.predict(X_test)
check_score_log = logreg.score(check_X, check_y)
acc_log = round(check_score_log * 100, 2)
acc_log

In [None]:
# Table of the logistic-regression coefficient per feature, largest first.
# NOTE: the 'Correlation' column holds the fitted coefficients (logreg.coef_[0]),
# not correlation values.
coeff_df = pd.DataFrame(train_X_df.columns)
coeff_df.columns = ['Feature']
# The Series is aligned to coeff_df by its default 0..n-1 index
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)


In [None]:
# Support Vector Machines: fit on the training split, predict the test set, and
# keep the accuracy on the 'check' split as a percentage (2 decimals).

svc = SVC().fit(X_train, Y_train)
Y_pred_svc = svc.predict(X_test)
check_score_svc = svc.score(check_X, check_y)
acc_svc = round(check_score_svc * 100, 2)
acc_svc

In [None]:
# K-Nearest Neighbours classifier (k = 3): fit on the training split, predict the
# test set, and keep the accuracy on the 'check' split as a percentage.

knn = KNeighborsClassifier(n_neighbors = 3).fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)
check_score_knn = knn.score(check_X, check_y)
acc_knn = round(check_score_knn * 100, 2)
acc_knn

In [None]:
# Survival rate predicted by KNN for the test set, in percent (2 decimals)
percent_survived_test_knn = round(np.mean(Y_pred_knn)*100,2)
print(percent_survived_test_knn)

In [None]:
# Gaussian Naive Bayes: fit on the training split, predict the test set, and keep
# the accuracy on the 'check' split as a percentage (2 decimals).

gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred_gaussian = gaussian.predict(X_test)
check_score_gaussian = gaussian.score(check_X, check_y)
acc_gaussian = round(check_score_gaussian * 100, 2)
acc_gaussian

In [None]:
# Perceptron: fit on the training split, predict the test set, and keep the
# accuracy on the 'check' split as a percentage (2 decimals).

perceptron = Perceptron().fit(X_train, Y_train)
# (variable name keeps its original spelling in case other cells refer to it)
Y_pred_preceptron = perceptron.predict(X_test)
check_score_perceptron = perceptron.score(check_X, check_y)
acc_perceptron = round(check_score_perceptron * 100, 2)
acc_perceptron

In [None]:
# Linear SVC: fit on the training split, predict the test set, and keep the
# accuracy on the 'check' split as a percentage (2 decimals).

linear_svc = LinearSVC().fit(X_train, Y_train)
Y_pred_linear_svc = linear_svc.predict(X_test)
check_score_linear_svc = linear_svc.score(check_X, check_y)
acc_linear_svc = round(check_score_linear_svc * 100, 2)
acc_linear_svc

In [None]:
# Stochastic Gradient Descent classifier: fit on the training split, predict the
# test set, and keep the accuracy on the 'check' split as a percentage.

sgd = SGDClassifier().fit(X_train, Y_train)
Y_pred_sgd = sgd.predict(X_test)
check_score_sgd = sgd.score(check_X, check_y)
acc_sgd = round(check_score_sgd * 100, 2)
acc_sgd

In [None]:
# Decision Tree classifier: fit on the training split, predict the test set, and
# print the accuracy on the 'check' split as a percentage (2 decimals).

decision_tree = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred_decision_tree = decision_tree.predict(X_test)
check_score_decision_tree = decision_tree.score(check_X, check_y)
acc_decision_tree = round(check_score_decision_tree * 100, 2)
print(acc_decision_tree)
#tree.plot_tree(decision_tree)

In [None]:
# Random Forest classifier (100 trees): fit on the training split and predict the
# test set; report the predicted test survival rate plus the accuracy on the
# training and 'check' splits (all in percent, 2 decimals).

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred_random_forest = random_forest.predict(X_test)

predicted_survival_rate = np.mean(Y_pred_random_forest)
percent_survived_test_random_forest = round(predicted_survival_rate * 100, 2)
print(percent_survived_test_random_forest)

train_score_random_forest = random_forest.score(X_train, Y_train)
acc_random_forest_train = round(train_score_random_forest * 100, 2)
#print(acc_random_forest_train)
check_score_random_forest = random_forest.score(check_X, check_y)
acc_random_forest_check = round(check_score_random_forest * 100, 2)
acc_random_forest_check

<a id="TOC-17"></a>
# Summarize accuracy of 'Survived' results by various methods

In [None]:
# Collect the 'check'-split accuracy of every 'Survived' model into one table, best first

survived_model_scores = {
    'My Model': acc_my_survived_model,
    'Support Vector Machines': acc_svc,
    'KNN': acc_knn,
    'Logistic Regression': acc_log,
    'Random Forest': acc_random_forest_check,
    'Naive Bayes': acc_gaussian,
    'Perceptron': acc_perceptron,
    'Stochastic Gradient Decent': acc_sgd,
    'Linear SVC': acc_linear_svc,
    'Decision Tree': acc_decision_tree,
}
prediction_models = pd.DataFrame({
    'Model': list(survived_model_scores.keys()),
    'Score': list(survived_model_scores.values()),
})
prediction_models.sort_values(by='Score', ascending=False)

<a id="TOC-18"></a>
# Prepare Submission

In [None]:
# Build the submission table (PassengerId, Survived) from the selected model's
# test predictions and write it to a CSV file. The commented-out lines are the
# alternatives for switching to another model's predictions.
predictions_test_df = pd.DataFrame({
    "PassengerId" : PassengerId_test, 
    #"Survived" : np.squeeze(Y_pred_svc).astype('int32')
    #"Survived" : np.squeeze(Y_pred_knn).astype('int32')
    #"Survived" : np.squeeze(Y_pred_svc).astype('int32')
    "Survived" : np.squeeze(pred_my_survived_model).astype('int32')
})
# Copy taken before the index change below, so the submission keeps
# PassengerId as a regular column.
harishsenapathy_submission = predictions_test_df.copy()
#display(harishsenapathy_submission)
predictions_test_df.set_index('PassengerId', drop=True, inplace=True)
#display(predictions_test_df)
# NOTE(review): assumes test_data_original is indexed by PassengerId so the
# column-wise concat aligns row by row — confirm.
test_data_final = pd.concat([test_data_original, predictions_test_df], axis=1)
display(test_data_final)
#acc_selected_model = acc_knn
acc_selected_model = acc_my_survived_model
#selected_percent_survived = percent_survived_test_knn
selected_percent_survived = percent_survived_test_my_model
# The file name encodes: layer sizes, 'check' accuracy, given and predicted
# 'train' survival rates, and the predicted 'test' survival rate.
harishsenapathy_submission.to_csv(f'harishsenapathy_submission_11_{layer_dimensions}_{acc_selected_model:.1f}_{percent_survived_y:.1f}_{percent_survived_train:.1f}_{selected_percent_survived:.1f}.csv', encoding='utf-8', index=False)

In [None]:
# Show the test feature frame with the row limit raised to 10000 for this
# display only (the option is restored when the `with` block exits).
#display(test_X_df)
with pd.option_context("display.max_rows", 10000):
    display(test_X_df)
