In [2]:
# importing python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# importing model, metrics and preprocessing libs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# importing classification model
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the Titanic dataset from 'datasets/titanic-processed.csv'
# (the path is relative to the notebook's working directory).
titanic_df = pd.read_csv('datasets/titanic-processed.csv')
# Preview five randomly sampled rows.
titanic_df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
228,0,2,1,47.0,0,0,15.0,0,0,1
230,0,3,1,32.0,0,0,7.75,0,1,0
496,1,1,0,44.0,0,0,27.7208,1,0,0
583,0,3,0,41.0,0,2,20.2125,0,0,1
429,0,1,1,50.0,1,0,106.425,1,0,0


In [4]:
# Feature columns: every column except the target. The target is excluded by
# name rather than by position so that 'Survived' can never leak into the
# features if the column order of the CSV changes.
FEATURES = [col for col in titanic_df.columns if col != 'Survived']
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
# Notebook-wide registry of evaluation results.
# Keys are model labels; values are the dicts returned by build_models.
result_dict = {}

In [13]:
# helper that scores a set of predictions against the true labels
def score_of_classifiers(y_train, y_pred):
    """Return accuracy, precision and recall of ``y_pred``.

    Parameters
    ----------
    y_train : array-like
        Ground-truth labels to score against. Despite the name this is not
        restricted to training labels: build_models also passes test labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_train``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'`` and ``'recall'`` mapped to the
        corresponding scikit-learn metric values.
    """
    metric_fns = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {label: metric(y_train, y_pred) for label, metric in metric_fns}

In [14]:
# helper that splits the data, fits one model and scores it
def build_models(classification_fn,
                 name_of_y_col,
                 names_of_x_cols,
                 dataset,
                 test_frac=0.2,
                 random_state=None
                ):
    """Split ``dataset``, fit a classifier and evaluate it on both splits.

    Parameters
    ----------
    classification_fn : callable
        Called as ``classification_fn(x_train, y_train)``; must return a
        fitted model exposing ``predict``.
    name_of_y_col : str
        Name of the target column in ``dataset``.
    names_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame holding both the features and the target.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different random split on every call); pass an integer
        to make the split, and therefore the reported scores, reproducible.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the score dicts produced by
        score_of_classifiers; ``'confusion_matrix'`` holds a crosstab of
        predicted (rows) versus actual (columns) labels on the test split.
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state
    )
    model = classification_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = score_of_classifiers(y_train, y_pred_train)
    test_summary = score_of_classifiers(y_test, y_pred)

    # y_test keeps its original index; y_pred is aligned with it positionally.
    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
            'training' : train_summary,
            'test' : test_summary,
            'confusion_matrix' : model_crosstab
           }

In [15]:
# helper that prints the training and test scores of every evaluated model
def compare_results(results=None):
    """Print the training and test scores of each stored model.

    Parameters
    ----------
    results : dict or None, default None
        Mapping of model label to a dict with ``'training'`` and ``'test'``
        entries, each a mapping of score name to value. When omitted, the
        notebook-level ``result_dict`` is used, which is the original
        behaviour of this function.

    Returns
    -------
    None
        The report is written to standard output.
    """
    if results is None:
        # Fall back to the notebook-wide registry only when nothing is passed.
        results = result_dict

    sections = (('Training Data:', 'training'), ('Test Data:', 'test'))
    for key, summary in results.items():
        print('Classification :', key)

        for heading, split in sections:
            print()
            print(heading)
            for score, value in summary[split].items():
                print(score, value)

        print()

In [16]:
# logistic regression trainer
def logistic_reg(x_train, y_train):
    """Fit a ``LogisticRegression`` (``solver='liblinear'``) and return it.

    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    classifier = LogisticRegression(solver='liblinear')
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [17]:
# Train and evaluate logistic regression on all features, store the result
# under 'survived_logistic', then print every result collected so far.
result_dict['survived_logistic'] = build_models(logistic_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273



In [18]:
# linear discriminant analysis trainer
def linear_discriminant_reg(x_train, y_train, solver='svd'):
    """Fit a ``LinearDiscriminantAnalysis`` model and return it.

    ``solver`` is forwarded to the estimator (default ``'svd'``);
    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    classifier = LinearDiscriminantAnalysis(solver=solver)
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [19]:
# Train and evaluate linear discriminant analysis on all features, store the
# result under 'survived_linear_discriminant', then print every stored result.
result_dict['survived_linear_discriminant'] = build_models(linear_discriminant_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.7978910369068541
precision 0.7725118483412322
recall 0.7086956521739131

Test Data:
accuracy 0.7902097902097902
precision 0.7413793103448276
recall 0.7413793103448276



In [20]:
# Avoid the dummy-variable trap introduced by one-hot encoding: refit linear
# discriminant analysis without the last feature column (FEATURES[0:-1]).
# Note: this reuses the 'survived_linear_discriminant' key, so the entry
# stored by the earlier all-features run is replaced.
result_dict['survived_linear_discriminant'] = build_models(linear_discriminant_reg, 'Survived', FEATURES[0:-1], titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.804920913884007
precision 0.7952380952380952
recall 0.7106382978723405

Test Data:
accuracy 0.7622377622377622
precision 0.6557377049180327
recall 0.7547169811320755



In [21]:
# quadratic discriminant analysis trainer
def quadratic_discriminant_reg(x_train, y_train):
    """Fit a ``QuadraticDiscriminantAnalysis`` with default settings and return it.

    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    classifier = QuadraticDiscriminantAnalysis()
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [24]:
# Avoid the dummy-variable trap introduced by one-hot encoding: fit quadratic
# discriminant analysis without the last feature column (FEATURES[0:-1]),
# store the result under 'survived_quadratic_discriminant' and print all results.
result_dict['survived_quadratic_discriminant'] = build_models(quadratic_discriminant_reg, 'Survived', FEATURES[0:-1], titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.789103690685413
precision 0.7706422018348624
recall 0.7058823529411765

Test Data:
accuracy 0.8181818181818182
precision 0.74
recall 0.74

Classification : survived_quadratic_discriminant

Training Data:
accuracy 0.8031634446397188
precision 0.7857142857142857
recall 0.7112068965517241

Test Data:
accuracy 0.7692307692307693
precision 0.7346938775510204
recall 0.6428571428571429



In [28]:
# stochastic gradient descent classifier trainer
def sgd_reg(x_train, y_train, max_iter=1000000, tol=1e-3):
    """Fit an ``SGDClassifier`` and return it.

    ``max_iter`` and ``tol`` are forwarded to the estimator;
    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    classifier = SGDClassifier(max_iter=max_iter, tol=tol)
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [29]:
# Train and evaluate the SGD classifier on all features, store the result
# under 'survived_sgd_reg', then print every result collected so far.
result_dict['survived_sgd_reg'] = build_models(sgd_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.789103690685413
precision 0.7706422018348624
recall 0.7058823529411765

Test Data:
accuracy 0.8181818181818182
precision 0.74
recall 0.74

Classification : survived_quadratic_discriminant

Training Data:
accuracy 0.8031634446397188
precision 0.7857142857142857
recall 0.7112068965517241

Test Data:
accuracy 0.7692307692307693
precision 0.7346938775510204
recall 0.6428571428571429

Classification : survived_sgd_reg

Training Data:
accuracy 0.7135325131810193
precision 0.8505747126436781
recall 0.33035714285714285

Test Data:
accuracy 0.6433566433566433
precision 0.8421052631578947
recall 0.25



In [35]:
# linear support vector classifier trainer
def linear_svm_reg(x_train, y_train, C=1.0, max_iter=100000, tol=1e-3):
    """Fit a ``LinearSVC`` (``dual=False``) and return it.

    ``C``, ``max_iter`` and ``tol`` are forwarded to the estimator;
    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    svc_options = {'C': C, 'max_iter': max_iter, 'tol': tol, 'dual': False}
    classifier = LinearSVC(**svc_options)
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [36]:
# Train and evaluate the linear SVM on all features, store the result
# under 'survived_svm_reg', then print every result collected so far.
result_dict['survived_svm_reg'] = build_models(linear_svm_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.789103690685413
precision 0.7706422018348624
recall 0.7058823529411765

Test Data:
accuracy 0.8181818181818182
precision 0.74
recall 0.74

Classification : survived_quadratic_discriminant

Training Data:
accuracy 0.8031634446397188
precision 0.7857142857142857
recall 0.7112068965517241

Test Data:
accuracy 0.7692307692307693
precision 0.7346938775510204
recall 0.6428571428571429

Classification : survived_sgd_reg

Training Data:
accuracy 0.7135325131810193
precision 0.8505747126436781
recall 0.33035714285714285

Test Data:
accuracy 0.6433566433566433
precision 0.8421052631578947
recall 0.25

Classification : survived_svm_reg

Training Data:
accuracy 0.8066783831282952
precision 0.79292929

In [37]:
# radius-based nearest neighbours classifier trainer
def radius_neighbors_reg(x_train, y_train, radius=40.0):
    """Fit a ``RadiusNeighborsClassifier`` and return it.

    ``radius`` is forwarded to the estimator (default ``40.0``);
    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    classifier = RadiusNeighborsClassifier(radius=radius)
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [38]:
# Train and evaluate the radius-neighbors classifier on all features and store
# the result under 'survived_radius_neighbors_reg'. The trainer passed here
# must be radius_neighbors_reg: passing linear_svm_reg would record a second
# set of linear-SVM scores under the radius-neighbors label.
# NOTE(review): with the default radius, prediction may fail for a test row
# that has no training neighbour within the radius - confirm on this dataset.
result_dict['survived_radius_neighbors_reg'] = build_models(radius_neighbors_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.789103690685413
precision 0.7706422018348624
recall 0.7058823529411765

Test Data:
accuracy 0.8181818181818182
precision 0.74
recall 0.74

Classification : survived_quadratic_discriminant

Training Data:
accuracy 0.8031634446397188
precision 0.7857142857142857
recall 0.7112068965517241

Test Data:
accuracy 0.7692307692307693
precision 0.7346938775510204
recall 0.6428571428571429

Classification : survived_sgd_reg

Training Data:
accuracy 0.7135325131810193
precision 0.8505747126436781
recall 0.33035714285714285

Test Data:
accuracy 0.6433566433566433
precision 0.8421052631578947
recall 0.25

Classification : survived_svm_reg

Training Data:
accuracy 0.8066783831282952
precision 0.79292929

In [40]:
# decision tree classifier trainer
def decision_tree_reg(x_train, y_train, max_depth=None, max_features=None):
    """Fit a ``DecisionTreeClassifier`` and return it.

    ``max_depth`` and ``max_features`` are forwarded to the estimator;
    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    tree_options = {'max_depth': max_depth, 'max_features': max_features}
    classifier = DecisionTreeClassifier(**tree_options)
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [42]:
# Train and evaluate the decision tree on all features, store the result
# under 'survived_decision_tree_reg', then print every result collected so far.
result_dict['survived_decision_tree_reg'] = build_models(decision_tree_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.789103690685413
precision 0.7706422018348624
recall 0.7058823529411765

Test Data:
accuracy 0.8181818181818182
precision 0.74
recall 0.74

Classification : survived_quadratic_discriminant

Training Data:
accuracy 0.8031634446397188
precision 0.7857142857142857
recall 0.7112068965517241

Test Data:
accuracy 0.7692307692307693
precision 0.7346938775510204
recall 0.6428571428571429

Classification : survived_sgd_reg

Training Data:
accuracy 0.7135325131810193
precision 0.8505747126436781
recall 0.33035714285714285

Test Data:
accuracy 0.6433566433566433
precision 0.8421052631578947
recall 0.25

Classification : survived_svm_reg

Training Data:
accuracy 0.8066783831282952
precision 0.79292929

In [43]:
# Gaussian naive Bayes trainer
def naive_bayes_reg(x_train, y_train, priors=None):
    """Fit a ``GaussianNB`` model and return it.

    ``priors`` is forwarded to the estimator (default ``None``);
    ``x_train`` and ``y_train`` are passed straight to ``fit``.
    """
    classifier = GaussianNB(priors=priors)
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(x_train, y_train)

In [44]:
# Train and evaluate Gaussian naive Bayes on all features, store the result
# under 'survived_naive_bayes_reg', then print every result collected so far.
result_dict['survived_naive_bayes_reg'] = build_models(naive_bayes_reg, 'Survived', FEATURES, titanic_df)
compare_results()

Classification : survived_logistic

Training Data:
accuracy 0.7838312829525483
precision 0.7722772277227723
recall 0.6695278969957081

Test Data:
accuracy 0.8181818181818182
precision 0.7843137254901961
recall 0.7272727272727273

Classification : survived_linear_discriminant

Training Data:
accuracy 0.789103690685413
precision 0.7706422018348624
recall 0.7058823529411765

Test Data:
accuracy 0.8181818181818182
precision 0.74
recall 0.74

Classification : survived_quadratic_discriminant

Training Data:
accuracy 0.8031634446397188
precision 0.7857142857142857
recall 0.7112068965517241

Test Data:
accuracy 0.7692307692307693
precision 0.7346938775510204
recall 0.6428571428571429

Classification : survived_sgd_reg

Training Data:
accuracy 0.7135325131810193
precision 0.8505747126436781
recall 0.33035714285714285

Test Data:
accuracy 0.6433566433566433
precision 0.8421052631578947
recall 0.25

Classification : survived_svm_reg

Training Data:
accuracy 0.8066783831282952
precision 0.79292929