# Introduction

Titanic



# The Package

In [1]:
import numpy as np
import pandas as pd


from scipy.stats import chi2_contingency, kruskal, f_oneway, normaltest, bartlett
import plotly.express as px
import plotly.graph_objects as go # For pie charts

!{sys.executable} -m pip install keras
!{sys.executable} -m pip install -U keras-tuner

import tensorflow as tf
from tensorflow import keras
from keras import layers
import kerastuner
from kerastuner import RandomSearch

tf.random.set_seed(1)

# Explore Dataset

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

display(train_df.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
DROP = ['Name', 'PassengerId','Cabin']    #irrelevant 
train_df = train_df.drop(columns=DROP)


In [42]:
test_df = test_df.drop(columns=DROP)

In [44]:
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1.0,male,0.452723,0.0,0.0,330911,0.015282,Q
1,1.0,female,0.617566,0.125,0.0,363272,0.013663,S
2,0.5,male,0.815377,0.0,0.0,240276,0.018909,Q
3,1.0,male,0.353818,0.0,0.0,315154,0.016908,S
4,1.0,female,0.287881,0.125,0.111111,3101298,0.023984,S


In [5]:
display(train_df.head())

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


Now that in Age columns there are some null values, which must be dropped or filled with a mean value of the entire existing column. 

In [7]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df = train_df.dropna(subset=['Embarked'])

train_df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [8]:
train_df.shape

(889, 9)

In [22]:
CAT_FEATURES = ['Sex','Ticket','Embarked']
NUM_FEATURES = ['Pclass','Age','SibSp','Parch','Fare']

LABEL = 'Survived'
FEATURES = train_df.columns.tolist()
FEATURES.remove(LABEL)

In [23]:
for feature in NUM_FEATURES:
    box_by_label = px.box(train_df, x=LABEL, y=feature, title=feature + " by survive/die")
    box_by_label.show()

Among the numerical features, SibSp does not look like it is useful to predict survival output. 

In [24]:
train_df.corr(method="spearman")

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.336917,-0.041027,0.090944,0.140126,0.320772
Pclass,-0.336917,1.0,-0.314204,-0.045279,-0.024696,-0.686166
Age,-0.041027,-0.314204,1.0,-0.143847,-0.2166,0.121971
SibSp,0.090944,-0.045279,-0.143847,1.0,0.449607,0.450749
Parch,0.140126,-0.024696,-0.2166,0.449607,1.0,0.413106
Fare,0.320772,-0.686166,0.121971,0.450749,0.413106,1.0


#Correlation

Only Pclass and Fare seem to be correlated with the survival result as the absolute values of the outputs are over 0.357 and 0.33 respectively.

The Fare is highly correlated with the Pclass, followed by SibSp and Parch. 

SibSp is also correlated with Parch

In [25]:
SIG = 0.05
MOD_SIG = 0.1

survival = train_df[train_df[LABEL] == 1]
death = train_df[train_df[LABEL] == 0]

In [26]:
# Find out which features can be assessed using ANOVA
for col in NUM_FEATURES:
    pop1 = survival[col]
    pop2 = death[col]
    stat1, p1 = normaltest(pop1)
    stat2, p2 = normaltest(pop2)
    if p1 > SIG and p2 > SIG:
        stat, p = bartlett(pop1, pop2)
        if p > SIG:
            print(col, "meets ANOVA assumptions")
        else:
            print(col, "--> Kruskal-Wallis, variance is unequal:", p)
    else:
        print(col,  "--> Kruskal-Wallis, not normally distributed:", p1, p2)


Pclass --> Kruskal-Wallis, not normally distributed: 0.0 4.465405162782553e-20
Age --> Kruskal-Wallis, not normally distributed: 0.08338285310821116 9.34541840078704e-15
SibSp --> Kruskal-Wallis, not normally distributed: 1.5799326846744402e-33 3.7991122810189133e-94
Parch --> Kruskal-Wallis, not normally distributed: 1.0694692584470984e-27 7.8105068599282235e-90
Fare --> Kruskal-Wallis, not normally distributed: 7.892442029445252e-68 1.4147916196682132e-118


In [27]:
for col in NUM_FEATURES:
    pop1 = survival[col]
    pop2 = death[col]
    stat, p = f_oneway(pop1, pop2)
    if p <= SIG:
        print(col, "and label are not independent - keep, p =", p)
    elif p <= MOD_SIG:
        print(col, "and label may have some relationship - maybe keep, p =", p)
    else:
        print(col, "and label are independent - drop, p =", p)

Pclass and label are not independent - keep, p = 7.776916288562695e-25
Age and label are not independent - keep, p = 0.037395225426059005
SibSp and label are independent - drop, p = 0.31067537341133067
Parch and label are not independent - keep, p = 0.013136766201093403
Fare and label are not independent - keep, p = 1.0797887540536882e-14


In [28]:
for col in FEATURES:
    pop1 = survival[col]
    pop2 = death[col]
    stat, p = kruskal(pop1, pop2)
    if p <= SIG:
        print(col, "and label are not independent - keep, p =", p)
    elif p <= MOD_SIG:
        print(col, "and label may have some relationship - maybe keep, p =", p)
    else:
        print(col, "and label are independent - drop, p =", p)

Pclass and label are not independent - keep, p = 1.0178648698620555e-23
Sex and label are not independent - keep, p = 1.3601519476403678e-58
Age and label are independent - drop, p = 0.2214856745252243
SibSp and label are not independent - keep, p = 0.006726926101742026
Parch and label are not independent - keep, p = 2.9711077819070433e-05
Ticket and label are not independent - keep, p = 1.9663317989359804e-06
Fare and label are not independent - keep, p = 1.191429108987401e-21
Embarked and label are not independent - keep, p = 1.319392291772178e-06


# Rescale and Dummy encode train_df

In [29]:
def rescale(x, MIN, MAX):
    return (x-MIN)/(MAX-MIN)

def rescale_df(df,num_features):
    for feature in num_features:
        min_val = df[feature].min()
        max_val = df[feature].max()
        df[feature] = df[feature].apply(lambda x: rescale(x,min_val, max_val))
    return df

In [30]:
rescaled_train_df = rescale_df(train_df, NUM_FEATURES)

preprocessed_df = pd.get_dummies(rescaled_train_df, columns= CAT_FEATURES)

display(preprocessed_df.head())

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_C,Embarked_Q,Embarked_S
0,0,1.0,0.271174,0.125,0.0,0.014151,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0.0,0.472229,0.125,0.0,0.139136,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1.0,0.321438,0.0,0.0,0.015469,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0.0,0.434531,0.125,0.0,0.103644,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1.0,0.434531,0.0,0.0,0.015713,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
LABEL = 'Survived'
FEATURES = preprocessed_df.columns.tolist()
FEATURES.remove(LABEL)

validation_size = int(len(preprocessed_df)*0.8)
X_train = preprocessed_df[:validation_size]
X_valid = preprocessed_df[validation_size:]

X_train_df = X_train[FEATURES]
X_valid_df = X_valid[FEATURES]

y_train_df = X_train[LABEL]
y_valid_df = X_valid[LABEL]

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate,LeaveOneOut

In [34]:
clf = LogisticRegression(penalty= "l2")
result = cross_validate(clf, X_train_df, y_train_df, cv= LeaveOneOut())

avg_accuracy = sum(result['test_score'])/len(result['test_score'])
print(avg_accuracy)

0.8030942334739803


In [75]:
random_forest = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=1)
result1 = cross_validate(random_forest, X_train_df, y_train_df, cv= LeaveOneOut())

avg_accuracy1 = sum(result1['test_score'])/len(result1['test_score'])
print(avg_accuracy1)

0.8270042194092827


In [64]:
def tune_model(hp):
    model = keras.Sequential()
    for i in range(hp.Int("num_layers", min_value=2, max_value=4, step=1)):
        model.add(layers.Dense(units=hp.Int("units_" + str(i),
                                            min_value=30, max_value=690, step=30),
                               activation="relu"))
        model.add(layers.Dense(1, activation="sigmoid"))
        model.compile(optimizer="adam", loss="binary_crossentropy",
                    metrics=["accuracy", 
                             "TruePositives", "TrueNegatives",
                             "FalsePositives", "FalseNegatives"])
    return model

In [65]:
tuner_acc = RandomSearch(
    tune_model, objective="val_accuracy", 
    max_trials=10, executions_per_trial=3, project_name="titanic4"
)

tuner_acc.search_space_summary()

tuner_acc.search(X_train_df.values,
                 y_train_df.values,
                 epochs=200,
                 batch_size=446,
                 validation_data=(X_valid_df.values, y_valid_df.values),
                 callbacks=[keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)])

Trial 10 Complete [00h 00m 11s]
val_accuracy: 0.8239700198173523

Best val_accuracy So Far: 0.8651685317357382
Total elapsed time: 00h 02m 21s
INFO:tensorflow:Oracle triggered exit


In [66]:
tuner_acc.results_summary()

Results summary
Results in .\titanic4
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
num_layers: 4
units_0: 240
units_1: 390
units_2: 480
units_3: 30
Score: 0.8651685317357382
Trial summary
Hyperparameters:
num_layers: 3
units_0: 390
units_1: 630
units_2: 30
Score: 0.8539325594902039
Trial summary
Hyperparameters:
num_layers: 2
units_0: 60
units_1: 630
units_2: 570
units_3: 300
Score: 0.8258426785469055
Trial summary
Hyperparameters:
num_layers: 3
units_0: 180
units_1: 330
units_2: 540
units_3: 390
Score: 0.8239700198173523
Trial summary
Hyperparameters:
num_layers: 2
units_0: 390
units_1: 270
units_2: 600
units_3: 510
Score: 0.8239700198173523
Trial summary
Hyperparameters:
num_layers: 2
units_0: 360
units_1: 690
Score: 0.795880138874054
Trial summary
Hyperparameters:
num_layers: 3
units_0: 240
units_1: 150
units_2: 450
units_3: 660
Score: 0.7752808928489685
Trial summary
Hyperparameters:
num_layers: 3
units_0: 90
units_1: 540
uni

In [39]:
def best_model(layer_info):
    '''
    layer_info is a list with nodes per layer
    '''
    model = keras.Sequential()
    for i in range(len(layer_info)):
        model.add(layers.Dense(layer_info[i], activation="relu"))
    
    model.add(layers.Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy",
                metrics=["accuracy", 
                         "TruePositives", "TrueNegatives",
                         "FalsePositives", "FalseNegatives"])
    return model

In [43]:
rescaled_test_df = rescale_df(test_df, NUM_FEATURES)

new_test_df = pd.get_dummies(rescaled_test_df, columns= CAT_FEATURES)

display(new_test_df.head())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110469,Ticket_110489,Ticket_110813,...,Ticket_STON/O2. 3101270,Ticket_STON/OQ. 369943,Ticket_W./C. 14260,Ticket_W./C. 14266,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W.E.P. 5734,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.452723,0.0,0.0,0.015282,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1.0,0.617566,0.125,0.0,0.013663,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.5,0.815377,0.0,0.0,0.018909,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1.0,0.353818,0.0,0.0,0.016908,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.0,0.287881,0.125,0.111111,0.023984,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [73]:
best_model_acc = best_model([240,390])
best_model_acc.fit(X_train_df.values, y_train_df.values, epochs=200, batch_size=446,
                   validation_data=(X_valid_df.values, y_valid_df.values),
                   callbacks=[keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200


<tensorflow.python.keras.callbacks.History at 0x23579460be0>

In [None]:
print(prediction)