In [1]:
import csv
import timeit

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.tree as tree
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_curve, auc, roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
#Titanic survival data set
# Read train.csv (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv('train.csv')
# Print column dtypes and non-null counts; columns with fewer non-null values than rows have missing data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


#### I followed all data preprocessing steps in this article.
https://medium.com/i-like-big-data-and-i-cannot-lie/how-i-scored-in-the-top-9-of-kaggles-titanic-machine-learning-challenge-243b5f45c8e9

The RangeIndex shows that there are 891 entries, but Age, Cabin, and Embarked have fewer non-null values than that, which means these 3 columns contain null/missing values. 

To fix the missing values in these variables:
- Missing Age: Group the data by each passenger’s sex, class, and title, then fill each missing age with the median age of its group.
- Missing Cabin: Fill Cabin with "U" for unknown.
- Missing Embarked: Fill with the most frequent point of embarkment.

In [3]:
# Derive a "Title" column from "Name".
def _title_from_name(full_name):
    """Return the stripped text of the second comma-separated piece, up to its first period."""
    after_comma = full_name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()


data['Title'] = data['Name'].apply(_title_from_name)

In [4]:
# Raw titles, listed under the broader group each one collapses into.
title_groups = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
    "Mrs":     ["Mme", "Ms", "Mrs"],
    "Miss":    ["Mlle", "Miss"],
    "Mr":      ["Mr"],
    "Master":  ["Master"],
}

# Flatten into the raw-title -> normalized-title lookup used by Series.map.
normalized_titles = {
    raw_title: group_name
    for group_name, raw_titles in title_groups.items()
    for raw_title in raw_titles
}

# Replace every raw title with its normalized group.
data['Title'] = data['Title'].map(normalized_titles)
# Show how many passengers fall into each normalized title.
print(data['Title'].value_counts())

Mr         517
Miss       184
Mrs        127
Master      40
Officer     18
Royalty      5
Name: Title, dtype: int64


In [5]:
# Bucket passengers by (Sex, Pclass, Title).
grouped = data.groupby(by=['Sex', 'Pclass', 'Title'])
# Median Age within each bucket (shown as the cell output).
grouped['Age'].median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        40.0
                Officer    49.0
                Royalty    40.5
        2       Miss       24.0
                Mrs        31.5
        3       Miss       18.0
                Mrs        31.0
male    1       Master      4.0
                Mr         40.0
                Officer    51.0
                Royalty    40.0
        2       Master      1.0
                Mr         31.0
                Officer    46.5
        3       Master      4.0
                Mr         26.0
Name: Age, dtype: float64

In [6]:
# Fill each missing Age with the median Age of the passenger's (Sex, Pclass, Title) group.
# transform('median') returns a Series on data's own row index, so fillna lines up
# row-for-row. groupby(...).apply(...) may instead return a result indexed by the group
# keys (this depends on the pandas version / group_keys setting), which does not align
# when assigned back to the column.
data['Age'] = data['Age'].fillna(grouped['Age'].transform('median'))

In [7]:
# Unknown cabins get the placeholder 'U'.
data['Cabin'] = data['Cabin'].fillna('U')

# value_counts() sorts by descending frequency, so its first label is the most common port.
embarked_counts = data['Embarked'].value_counts()
most_embarked = embarked_counts.index[0]

# Missing ports of embarkation take that most common value.
data['Embarked'] = data['Embarked'].fillna(most_embarked)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
Title          891 non-null object
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


Following the steps in this article, two new features are created. The first is family size per passenger. The assumption is that it is more difficult for a larger family to secure a spot on a lifeboat than for a smaller family. 

In [8]:
# Family size = parents/children + siblings/spouses + the passenger themselves.
data['FamilySize'] = data['Parch'].add(data['SibSp']).add(1)

The second feature is derived from Cabin: the first letter of the cabin code is extracted to represent it. A Kaggle discussion (https://www.kaggle.com/c/titanic/discussion/4693) mentions that the first letter of the cabin indicates the deck. Also, Pclass roughly matches the deck: first class has the top decks (A-E), second class has decks D-F, and third class has decks E-G. The assumption is that passengers on a higher deck / in a higher class were more likely to survive, because they were closer to the lifeboats.

In [9]:
# Keep only the first character of every cabin code.
data['Cabin'] = [cabin_code[0] for cabin_code in data['Cabin']]

In [10]:
# Re-inspect dtypes and non-null counts after imputation and the added Title / FamilySize columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
Title          891 non-null object
FamilySize     891 non-null int64
dtypes: float64(2), int64(6), object(6)
memory usage: 97.5+ KB


In [11]:
# Encode the categorical features numerically.
# Sex becomes a single 0/1 integer column.
sex_codes = {"male": 0, "female": 1}
data['Sex'] = data['Sex'].map(sex_codes)

# One-hot encode the remaining categorical columns, each prefixed with its column name.
pclass_dummies, title_dummies, cabin_dummies, embarked_dummies = (
    pd.get_dummies(data[column_name], prefix=column_name)
    for column_name in ('Pclass', 'Title', 'Cabin', 'Embarked')
)

# Append the dummy columns to the main frame, then drop the encoded source columns
# together with the free-text Name and Ticket columns.
data_dummies = pd.concat(
    [data, pclass_dummies, title_dummies, cabin_dummies, embarked_dummies],
    axis=1,
).drop(['Pclass', 'Title', 'Cabin', 'Embarked', 'Name', 'Ticket'], axis=1)

data_dummies.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,FamilySize,Pclass_1,Pclass_2,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,1,0,0,22.0,1,0,7.25,2,0,0,...,0,0,0,0,0,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,2,1,0,...,1,0,0,0,0,0,0,1,0,0
2,3,1,1,26.0,0,0,7.925,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,4,1,1,35.0,1,0,53.1,2,1,0,...,1,0,0,0,0,0,0,0,0,1
4,5,0,0,35.0,0,0,8.05,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [12]:
# List the columns of the encoded frame with their dtypes and non-null counts.
data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 29 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Sex              891 non-null int64
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
FamilySize       891 non-null int64
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Title_Master     891 non-null uint8
Title_Miss       891 non-null uint8
Title_Mr         891 non-null uint8
Title_Mrs        891 non-null uint8
Title_Officer    891 non-null uint8
Title_Royalty    891 non-null uint8
Cabin_A          891 non-null uint8
Cabin_B          891 non-null uint8
Cabin_C          891 non-null uint8
Cabin_D          891 non-null uint8
Cabin_E          891 non-null uint8
Cabin_F          891 non-null uint8
Cabin_G          891 non-null uint8
Cabin_T          891 non-nu

In [13]:
# Model inputs: every engineered feature except PassengerId and the target.
feature_columns = [
    'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty',
    'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
# Target, kept as a one-column DataFrame.
target_columns = ['Survived']

Xdf = data_dummies[feature_columns]
Ydf = data_dummies[target_columns]

In [14]:
# Plain NumPy arrays for scikit-learn; Y keeps the one-column 2-D shape of Ydf.
X, Y = Xdf.values, Ydf.values

In [15]:
# Hold out 25% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

n_train_rows, n_test_rows = len(X_train), len(X_test)
print("Training set in Titanic survival classification model has {} samples.".format(n_train_rows))
print("Testing set in Titanic survival classification model has {} samples.".format(n_test_rows))

Training set in Titanic survival classification model has 668 samples.
Testing set in Titanic survival classification model has 223 samples.


In [62]:
# Neural Networks
# Baseline: MLPClassifier with default hyper-parameters, trained on the unscaled features.

# Time the fit.
start_time = timeit.default_timer()
neuraln0 = MLPClassifier(random_state=1)
# ravel() turns the (n, 1) target column into the 1-D vector scikit-learn expects.
neuraln0.fit(X_train, y_train.ravel())
end_time = timeit.default_timer()
training_time = end_time - start_time

# Time prediction on the held-out set.
start_time = timeit.default_timer()
Y_pred0 = neuraln0.predict(X_test)
end_time = timeit.default_timer()
predict_time = end_time - start_time

acctrain_neuraln0 = round(neuraln0.score(X_train, y_train), 3)
print('Accuracy score (train set, baseline - no cross validation):', acctrain_neuraln0)
print("Model Training Time (s):   "+"{:.5f}".format(training_time))
# Report predict_time: the previous name `pred_time` was never defined in this notebook.
print("Model Prediction Time (s): "+"{:.5f}\n".format(predict_time))


Accuracy score (train set, baseline - no cross validation): 0.846
Model Training Time (s):   0.24109
Model Prediction Time (s): 0.00030



In [16]:
import mlrose
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer
#from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV

In [17]:
# Feature scaling: learn the per-feature min/max on the training rows only,
# then apply that same mapping to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# MLPClassifier implements a multi-layer perceptron (MLP) trained with backpropagation.
# This was done in Assignment 1; based on the reviewer's comments, the search below also
# tries different activation functions and hidden layer sizes.
# MLPs are sensitive to feature scaling, so the search runs on the min-max scaled features.

from sklearn.neural_network import MLPClassifier
import timeit
import time

start_time=time.time()

MLP = MLPClassifier(random_state=1)
# 4 layer sizes x 3 learning rates x 2 activations x 4 iteration caps = 96 candidates.
mlp_param_grid = {'hidden_layer_sizes' : [(5,),(10,),(15,),(20,)],
                  'learning_rate_init': [0.0001,0.001,0.01],
                  'activation': ['relu', 'logistic'],
                  'max_iter':[200,300,400,600]}

gsMLP = GridSearchCV(MLP,param_grid = mlp_param_grid, cv=5, scoring='accuracy',n_jobs=-1,verbose=1)
# y_train is an (n, 1) column; ravel() passes the 1-D target scikit-learn expects
# and avoids the DataConversionWarning raised for column-vector targets.
gsMLP.fit(X_train_scaled,y_train.ravel())
MLP_best = gsMLP.best_estimator_

print("process time total:{:.2f} seconds".format(time.time()-start_time))


Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  1.8min finished
  y = column_or_1d(y, warn=True)


process time total:108.75 seconds




In [25]:
# Report the winning cross-validated accuracy and the hyper-parameters that produced it.
# The values are wrapped in one-element tuples so %-formatting treats each as a single argument.
best_cv_score = gsMLP.best_score_
best_cv_params = gsMLP.best_params_
print("best score: %s" % (best_cv_score,))
print("best param: %s" % (best_cv_params,))

best score: 0.8383233532934131
best param: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.001, 'max_iter': 300}


In [26]:
# Re-run the MLP with the best parameters reported by the grid search:
# activation='relu', hidden_layer_sizes=(10,), learning_rate_init=0.001, max_iter=300.
nn1 = MLPClassifier(max_iter=300,activation='relu',
                    learning_rate_init=0.001,
                    hidden_layer_sizes=(10,),random_state=1)

# Training (timed). y_train is an (n, 1) column; ravel() passes the 1-D target
# scikit-learn expects and avoids the DataConversionWarning.
start_time = timeit.default_timer()
nn1.fit(X_train_scaled, y_train.ravel())
end_time = timeit.default_timer()
training_time = end_time - start_time
print("Model Training Time (s):   "+"{:.5f}".format(training_time))

# Predict labels for test set and assess accuracy
y_test_pred_mlp = nn1.predict(X_test_scaled)
y_test_accuracy_mlp = round(accuracy_score(y_test, y_test_pred_mlp),3)
print('Accuracy score (test set, pruned):', y_test_accuracy_mlp)

# Predict labels for train set and assess accuracy
y_train_pred_mlp = nn1.predict(X_train_scaled)
y_train_accuracy_mlp = round(accuracy_score(y_train, y_train_pred_mlp),3)
print('Accuracy score (train set, pruned):',y_train_accuracy_mlp)

  y = column_or_1d(y, warn=True)


Model Training Time (s):   0.53842
Accuracy score (test set, pruned): 0.825
Accuracy score (train set, pruned): 0.856




In [28]:
import warnings
# Suppress every warning from this point on.
# NOTE(review): a blanket "ignore" also hides any other warnings later cells would raise;
# consider a narrower filter.
warnings.filterwarnings("ignore")

In [29]:
# Same architecture as nn1 (one hidden layer of 10 relu units), but the weights are
# optimised by mlrose's genetic algorithm instead of backpropagation.
# NOTE(review): max_attempts (500) exceeds max_iters (300), so presumably the
# early-stopping criterion cannot trigger before the iteration cap — TODO confirm.
nn1_ga = mlrose.NeuralNetwork(hidden_nodes = [10], activation = 'relu', algorithm = 'genetic_alg',
                           bias = True,  is_classifier = True,
                           early_stopping=True,
                           max_attempts =500, max_iters=300)

# The genetic algorithm is stochastic. Seed NumPy's global RNG so the run is repeatable,
# matching the fixed seeds used for the other models in this notebook.
# (Assumes mlrose draws from NumPy's global RNG — see the mlrose documentation.)
np.random.seed(1)

# Training
start_time = timeit.default_timer()
nn1_ga.fit(X_train_scaled, y_train)
end_time = timeit.default_timer()
training_time = end_time - start_time
print("Model Training Time - NN1-GA (s):   "+"{:.5f}".format(training_time))

# Predict labels for test set and assess accuracy
y_test_pred_nn1ga = nn1_ga.predict(X_test_scaled)
y_test_accuracy_nn1ga = round(accuracy_score(y_test, y_test_pred_nn1ga),3)
print('Accuracy score (test set, pruned):', y_test_accuracy_nn1ga)

# Predict labels for train set and assess accuracy
y_train_pred_nn1ga = nn1_ga.predict(X_train_scaled)
y_train_accuracy_nn1ga = round(accuracy_score(y_train, y_train_pred_nn1ga),3)
print('Accuracy score (train set, pruned):',y_train_accuracy_nn1ga)

Model Training Time - NN1-GA (s):   77.18175
Accuracy score (test set, pruned): 0.78
Accuracy score (train set, pruned): 0.817
