In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
titanic = pd.read_csv("Titanic.csv")
titanic.head()

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1st,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,Southampton,2.0,,"St Louis, MO"
1,2,1st,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1st,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"
3,4,1st,0,"Allison, Mr. Hudson Joshua Crei",male,30.0,1,2,113781,151.550003,C22 C26,Southampton,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1st,0,"Allison, Mrs. Hudson J C (Bessi",female,25.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


In [3]:
titanic = titanic[['pclass','sex','age','sibsp','survived']]
titanic.head()

Unnamed: 0,pclass,sex,age,sibsp,survived
0,1st,female,29.0,0,1
1,1st,male,0.9167,1,1
2,1st,female,2.0,1,0
3,1st,male,30.0,1,0
4,1st,female,25.0,1,0


In [4]:
check = titanic.isnull().sum()
check

pclass        0
sex           0
age         263
sibsp         0
survived      0
dtype: int64

In [5]:
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())
titanic['age'].isna().sum()

0

In [6]:
le = preprocessing.LabelEncoder()
titanic['pclass']= le.fit_transform(titanic['pclass'])
titanic['sex']=le.fit_transform(titanic['sex'])

In [7]:
titanic['pclass'].unique()

array([0, 1, 2])

In [8]:
titanic['sex'].unique()

array([0, 1])

In [9]:
X = titanic.drop('survived',axis=1)
y = titanic['survived']

# Step 1

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [11]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp
772,2,1,17.0,0
543,1,1,36.0,0
289,0,0,18.0,0
10,0,1,47.0,1
147,0,1,29.881135,0


# Step 2

In [12]:
clf_1 = MLPClassifier(solver='adam', hidden_layer_sizes=(100,200), activation = 'logistic',learning_rate='constant',
                    learning_rate_init = 0.001,momentum=0.9,alpha=0.0001,max_iter=1000)
clf_1 = clf_1.fit(X_train,y_train)
y_pred_test_1 = clf_1.predict(X_test)
score_1 = accuracy_score(y_test,y_pred_test_1)
score_1

0.7595419847328244

# Step 3

In [13]:
#Confusion matrix for out sample
conf_matrix_test_1 = confusion_matrix(y_test, y_pred_test_1)
TN_1 = conf_matrix_test_1[0][0]
FN_1 = conf_matrix_test_1[1][0]
TP_1 = conf_matrix_test_1[1][1]
FP_1 = conf_matrix_test_1[0][1]
sensitivity_test_1 = TP_1/(TP_1+FN_1)
sensitivity_test_percent_1 = sensitivity_test_1 * 100
specificity_test_1 = TN_1/(FP_1+TN_1)
specificity_test_percent_1 = specificity_test_1 * 100

print("out‐of‐sample percent survivors correctly predicted (on testing set) : {0:.2f}%\n".format(sensitivity_test_percent_1))
print("out-of‐sample percent fatalities correctly predicted (on testing set) : {0:.2f}%\n".format(specificity_test_percent_1))

out‐of‐sample percent survivors correctly predicted (on testing set) : 55.08%

out-of‐sample percent fatalities correctly predicted (on testing set) : 93.06%



In [14]:
clf_2 = MLPClassifier(solver='adam', hidden_layer_sizes=(500,600), activation = 'logistic',learning_rate='constant',
                    learning_rate_init = 0.001,momentum=0.9,alpha=0.0001,max_iter=1000)
clf_2 = clf_2.fit(X_train,y_train)
y_pred_test_2 = clf_2.predict(X_test)
score_2 = accuracy_score(y_test,y_pred_test_2)
score_2

0.7633587786259542

In [15]:
#Confusion matrix for out sample
conf_matrix_test_2 = confusion_matrix(y_test, y_pred_test_2)
TN_2 = conf_matrix_test_2[0][0]
FN_2 = conf_matrix_test_2[1][0]
TP_2 = conf_matrix_test_2[1][1]
FP_2 = conf_matrix_test_2[0][1]
sensitivity_test_2 = TP_2/(TP_2+FN_2)
sensitivity_test_percent_2 = sensitivity_test_2 * 100
specificity_test_2 = TN_2/(FP_2+TN_2)
specificity_test_percent_2 = specificity_test_2 * 100

print("out‐of‐sample percent survivors correctly predicted (on testing set) : {0:.2f}%\n".format(sensitivity_test_percent_2))
print("out-of‐sample percent fatalities correctly predicted (on testing set) : {0:.2f}%\n".format(specificity_test_percent_2))

out‐of‐sample percent survivors correctly predicted (on testing set) : 60.17%

out-of‐sample percent fatalities correctly predicted (on testing set) : 89.58%



# Step 4

In [16]:
df = pd.DataFrame(columns=['Parameter', 'Random Forest', 'Hidden Layer = 100,200','Hidden Layer = 500,600'])
df = df.append({'Parameter': 'Out‐of‐sample percent survivors correctly predicted ', 'Random Forest': '48.30', 'Hidden Layer = 100,200': sensitivity_test_percent_1 ,'Hidden Layer = 500,600': sensitivity_test_percent_2  }, ignore_index=True)
df = df.append({'Parameter': 'Out-of‐sample percent fatalities correctly predicted ', 'Random Forest': '94.44', 'Hidden Layer = 100,200': specificity_test_percent_1 ,'Hidden Layer = 500,600':specificity_test_percent_2 }, ignore_index=True)

df

Unnamed: 0,Parameter,Random Forest,"Hidden Layer = 100,200","Hidden Layer = 500,600"
0,Out‐of‐sample percent survivors correctly pred...,48.3,55.084746,60.169492
1,Out-of‐sample percent fatalities correctly pre...,94.44,93.055556,89.583333


### You can clearly see an improvement in the accuracy when we are using Neural Networks as compared to that of Random Forest. We can also see an improvement when we increase the Hidden Layer size in the second network structure.