In [2]:
# importing libraires
import numpy as np
import pandas as pd

# importing metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
# loading dataset
titanic_df = pd.read_csv('datasets/titanic-processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,1,1,37.0,1,0,53.1,0,0,1
1,1,2,0,21.0,0,0,10.5,0,0,1
2,0,2,1,25.0,0,0,13.0,0,0,1
3,1,1,1,48.0,0,0,26.55,0,0,1
4,1,1,0,32.0,0,0,76.2917,1,0,0


In [4]:
# spliting data into train and test data
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [21]:
# defining summarize function
def summarize_classification(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred, normalize=True)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print('Test data count :', len(y_test))
    print('Accuracy Score :', accuracy)
    print('Precision Score :', precision)
    print('Recall Score :', recall)
    print()

In [22]:
# using hyperparameter tunning for DecisionTree
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2, 4, 5, 6, 7, 9, 10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 5}

In [23]:
# finding out best max depth value for this model
for i in range(len(parameters['max_depth'])):
    print('Parameters :', grid_search.cv_results_['params'][i])
    print('Mean Test Score :', grid_search.cv_results_['mean_test_score'][i])
    print('Rank Test Score:', grid_search.cv_results_['rank_test_score'][i])

Parameters : {'max_depth': 2}
Mean Test Score : 0.7803163444639719
Rank Test Score: 5
Parameters : {'max_depth': 4}
Mean Test Score : 0.7996485061511424
Rank Test Score: 2
Parameters : {'max_depth': 5}
Mean Test Score : 0.8066783831282952
Rank Test Score: 1
Parameters : {'max_depth': 6}
Mean Test Score : 0.7943760984182777
Rank Test Score: 3
Parameters : {'max_depth': 7}
Mean Test Score : 0.7855887521968365
Rank Test Score: 4
Parameters : {'max_depth': 9}
Mean Test Score : 0.7662565905096661
Rank Test Score: 6
Parameters : {'max_depth': 10}
Mean Test Score : 0.7644991212653779
Rank Test Score: 7


In [24]:
# buliding decision reg
decision_tree_model = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [25]:
# predicting y
y_pred = decision_tree_model.predict(x_test)

In [26]:
# summarizing the scores
summarize_classification(y_test, y_pred)

Test data count : 143
Accuracy Score : 0.8111888111888111
Precision Score : 0.7291666666666666
Recall Score : 0.7142857142857143



In [31]:
# buliding hyperparameters for Logistic Regression
parameters = {'penalty':['l1', 'l2'], 'C':[0.1, 0.4, 0.8, 1, 2, 5]}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 0.8, 'penalty': 'l1'}

In [34]:
# finding out best max depth value for this model
for i in range(len(parameters['C'])):
    print('Parameters :', grid_search.cv_results_['params'][i])
    print('Mean Test Score :', grid_search.cv_results_['mean_test_score'][i])
    print('Rank Test Score:', grid_search.cv_results_['rank_test_score'][i])

Parameters : {'C': 0.1, 'penalty': 'l1'}
Mean Test Score : 0.7697715289982425
Rank Test Score: 12
Parameters : {'C': 0.1, 'penalty': 'l2'}
Mean Test Score : 0.7750439367311072
Rank Test Score: 10
Parameters : {'C': 0.4, 'penalty': 'l1'}
Mean Test Score : 0.7803163444639719
Rank Test Score: 3
Parameters : {'C': 0.4, 'penalty': 'l2'}
Mean Test Score : 0.7803163444639719
Rank Test Score: 3
Parameters : {'C': 0.8, 'penalty': 'l1'}
Mean Test Score : 0.7838312829525483
Rank Test Score: 1
Parameters : {'C': 0.8, 'penalty': 'l2'}
Mean Test Score : 0.7750439367311072
Rank Test Score: 10


In [36]:
# building logistic model
logistic_model = LogisticRegression(solver='liblinear',\
                                    penalty=grid_search.best_params_['penalty'],\
                                    C=grid_search.best_params_['C']).\
                                    fit(x_train, y_train)

In [37]:
# predicting y
y_pred = logistic_model.predict(x_test)

In [38]:
# summarizing scores
summarize_classification(y_test, y_pred)

Test data count : 143
Accuracy Score : 0.7902097902097902
Precision Score : 0.6938775510204082
Recall Score : 0.6938775510204082

