In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic = pd.read_csv('datasets/titanic_processed.csv')
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [3]:
X = titanic.drop('Survived', axis= 1)
Y = titanic['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2) 

In [4]:
def summarize(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize= True)
    num_acc = accuracy_score(y_test, y_pred, normalize= False)
    
    pre = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("test data count: ", len(y_test))
    print("accuracy_count: ", num_acc)
    print("accuracy score: ", acc)
    print("precison score: ", pre)
    print("recall score: ", recall)
    print()

### decision tree

In [5]:
from sklearn.model_selection import GridSearchCV

param = {'max_depth': [2,4,5,7,9,10]}

grid_se = GridSearchCV(DecisionTreeClassifier(),
                      param,
                      cv= 3,
                      return_train_score= True)
grid_se.fit(x_train, y_train)
grid_se.best_params_

{'max_depth': 4}

In [6]:
for i in range(6):
    print("parameters: ", grid_se.cv_results_['params'][i])
    print("mean tests core: ", grid_se.cv_results_['mean_test_score'][i])
    print("rank: ", grid_se.cv_results_['rank_test_score'][i])

parameters:  {'max_depth': 2}
mean tests core:  0.7873387171632785
rank:  2
parameters:  {'max_depth': 4}
mean tests core:  0.789083820662768
rank:  1
parameters:  {'max_depth': 5}
mean tests core:  0.7837928153717627
rank:  4
parameters:  {'max_depth': 7}
mean tests core:  0.785547201336675
rank:  3
parameters:  {'max_depth': 9}
mean tests core:  0.7767752715121136
rank:  5
parameters:  {'max_depth': 10}
mean tests core:  0.7662582381880627
rank:  6


In [7]:
decision_tree = DecisionTreeClassifier(max_depth = grid_se.best_params_['max_depth']).fit(x_train, y_train)

In [8]:
y_pred = decision_tree.predict(x_test)
summarize(y_test, y_pred)

test data count:  143
accuracy_count:  117
accuracy score:  0.8181818181818182
precison score:  0.8
recall score:  0.7142857142857143



### logistic regression

In [9]:
param = {'penalty': ['l1', 'l2'],
         'C': [0.1, 0.4, 0.8, 1, 2, 5]}
grid_se = GridSearchCV(LogisticRegression(solver= 'liblinear'),
                      param,
                      cv= 3,
                      return_train_score= True)
grid_se.fit(x_train, y_train)
grid_se.best_params_

{'C': 5, 'penalty': 'l2'}

In [10]:
for i in range(12):
    print("parameters: ", grid_se.cv_results_['params'][i])
    print("mean tests core: ", grid_se.cv_results_['mean_test_score'][i])
    print("rank: ", grid_se.cv_results_['rank_test_score'][i])

parameters:  {'C': 0.1, 'penalty': 'l1'}
mean tests core:  0.7714935486865312
rank:  10
parameters:  {'C': 0.1, 'penalty': 'l2'}
mean tests core:  0.7785482224078715
rank:  7
parameters:  {'C': 0.4, 'penalty': 'l1'}
mean tests core:  0.7715121136173767
rank:  9
parameters:  {'C': 0.4, 'penalty': 'l2'}
mean tests core:  0.7679940592221294
rank:  12
parameters:  {'C': 0.8, 'penalty': 'l1'}
mean tests core:  0.7820848417339645
rank:  4
parameters:  {'C': 0.8, 'penalty': 'l2'}
mean tests core:  0.7697577276524644
rank:  11
parameters:  {'C': 1, 'penalty': 'l1'}
mean tests core:  0.7820848417339645
rank:  4
parameters:  {'C': 1, 'penalty': 'l2'}
mean tests core:  0.7768124013738049
rank:  8
parameters:  {'C': 2, 'penalty': 'l1'}
mean tests core:  0.7803118908382066
rank:  6
parameters:  {'C': 2, 'penalty': 'l2'}
mean tests core:  0.7838392276988767
rank:  2
parameters:  {'C': 5, 'penalty': 'l1'}
mean tests core:  0.7838206627680312
rank:  3
parameters:  {'C': 5, 'penalty': 'l2'}
mean tests 

In [11]:
logistic = LogisticRegression(solver= 'liblinear',
                              penalty = grid_se.best_params_['penalty'],
                              C = grid_se.best_params_['C']).fit(x_train, y_train)

In [12]:
y_pred = logistic.predict(x_test)
summarize(y_test, y_pred)

test data count:  143
accuracy_count:  115
accuracy score:  0.8041958041958042
precison score:  0.78
recall score:  0.6964285714285714

