In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

titanic = pd.read_csv('titanic_cleaned_features.csv')
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,family_cnt,cabin_ind
0,0,3,0,22.0,7.25,1,0
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


In [5]:
features = titanic.drop('Survived', axis=1)
labels = titanic['Survived']
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, test_size=0.5, random_state=42)

In [26]:
rf = RandomForestClassifier()
lr = LogisticRegression()
scores = cross_val_score(rf, X_train, Y_train, cv=5)
print(scores)
scores2 = cross_val_score(lr, X_train, Y_train, cv=5)
print(scores2)

[0.81308411 0.81308411 0.80373832 0.79439252 0.82075472]
[0.85981308 0.8317757  0.73831776 0.71962617 0.8490566 ]


In [30]:
#parameter tuning would further improve these results
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None]
}

parameters2 = {
    'C': [100, 10, 1.0, 0.1, 0.01]
}
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, Y_train)
print_results(cv)
cv2 = GridSearchCV(lr, parameters2, cv=5)
cv2.fit(X_train, Y_train)
print_results(cv2)

BEST PARAMS: {'max_depth': 10, 'n_estimators': 100}

0.775 (+/-0.148) for {'max_depth': 2, 'n_estimators': 5}
0.794 (+/-0.124) for {'max_depth': 2, 'n_estimators': 50}
0.802 (+/-0.109) for {'max_depth': 2, 'n_estimators': 100}
0.796 (+/-0.066) for {'max_depth': 10, 'n_estimators': 5}
0.826 (+/-0.063) for {'max_depth': 10, 'n_estimators': 50}
0.826 (+/-0.053) for {'max_depth': 10, 'n_estimators': 100}
0.813 (+/-0.057) for {'max_depth': 20, 'n_estimators': 5}
0.809 (+/-0.025) for {'max_depth': 20, 'n_estimators': 50}
0.811 (+/-0.018) for {'max_depth': 20, 'n_estimators': 100}
0.792 (+/-0.044) for {'max_depth': None, 'n_estimators': 5}
0.801 (+/-0.022) for {'max_depth': None, 'n_estimators': 50}
0.807 (+/-0.032) for {'max_depth': None, 'n_estimators': 100}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


BEST PARAMS: {'C': 1.0}

0.794 (+/-0.116) for {'C': 100}
0.794 (+/-0.116) for {'C': 10}
0.8 (+/-0.118) for {'C': 1.0}
0.777 (+/-0.134) for {'C': 0.1}
0.708 (+/-0.098) for {'C': 0.01}


In [18]:
#Better results with RF classifier than LR Classifier
#[82.4, 82, 81.7] are our candidate models for X_val and Y_val
rf1 = RandomForestClassifier(n_estimators=5, max_depth=10)
rf1.fit(X_train, Y_train)

rf2 = RandomForestClassifier(n_estimators=100, max_depth=10)
rf2.fit(X_train, Y_train)

rf3 = RandomForestClassifier(n_estimators=100, max_depth=None)
rf3.fit(X_train, Y_train)

RandomForestClassifier()

In [22]:
for mdl in [rf1, rf2, rf3]:
    y_pred = mdl.predict(X_val)
    accuracy = round(accuracy_score(Y_val, y_pred), 3)
    precision = round(precision_score(Y_val, y_pred), 3)
    recall = round(recall_score(Y_val, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                         mdl.n_estimators,
                                                                         accuracy,
                                                                         precision,
                                                                         recall))

MAX DEPTH: 10 / # OF EST: 5 -- A: 0.832 / P: 0.838 / R: 0.75
MAX DEPTH: 10 / # OF EST: 100 -- A: 0.838 / P: 0.862 / R: 0.737
MAX DEPTH: None / # OF EST: 100 -- A: 0.804 / P: 0.806 / R: 0.711


In [24]:
#select model2 and pass test data 
y_pred = rf2.predict(X_test)
accuracy = round(accuracy_score(Y_test, y_pred), 3)
precision = round(precision_score(Y_test, y_pred), 3)
recall = round(recall_score(Y_test, y_pred), 3)
print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(rf2.max_depth,
                                                                     rf2.n_estimators,
                                                                     accuracy,
                                                                     precision,
                                                                     recall))

MAX DEPTH: 10 / # OF EST: 100 -- A: 0.798 / P: 0.754 / R: 0.662
