In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the pre-processed Titanic data and preview its first five rows.
titanic = pd.read_csv('datasets/titanic_processed.csv')
titanic.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [3]:
# Every column after the first one is used as a model input.
features = titanic.columns[1:].tolist()
features

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
# Collects the summary returned by build_model for each model, keyed by a label.
result_dict = dict()

In [5]:
def summarize_result(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels for the same samples.

    Returns
    -------
    dict
        ``'accuracy'`` (fraction of correct predictions), ``'precision'``,
        ``'recall'`` and ``'accuracy_count'`` (number of correct predictions).
    """
    return {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        # normalize=False yields the count of correct predictions, not a ratio.
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [10]:
def build_model(classifier,
                y_cols,
                x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Split the data, fit a classifier and summarize train/test performance.

    Parameters
    ----------
    classifier : callable
        Called as ``classifier(x_train, y_train)``; must return a fitted
        model exposing ``predict``.
    y_cols : str
        Name of the target column in ``dataset``.
    x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Data holding both the feature and target columns.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split``. Give an integer to obtain the same
        split on every run, so that scores are reproducible and different
        models can be compared on identical train/test rows. ``None`` keeps
        the previous behaviour of a fresh random split on each call.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the ``summarize_result`` output
        for the respective split; ``'confusion matrix'`` is a crosstab of the
        test split with predicted labels as rows and true labels as columns.
    """
    X = dataset[x_cols]
    Y = dataset[y_cols]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)
    model = classifier(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_result(y_train, y_pred_train)
    test_summary = summarize_result(y_test, y_pred)

    # Pair each true test label with its prediction to build the crosstab.
    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
        'training': train_summary,
        'test': test_summary,
        'confusion matrix': model_crosstab
    }

In [7]:
def compare_result():
    """Print the training and test scores of every entry in ``result_dict``."""
    sections = (('training data', 'training'), ('test data', 'test'))

    for label, summary in result_dict.items():
        print('classification: ', label)

        for heading, section in sections:
            print()
            print(heading)
            for metric, value in summary[section].items():
                print(metric, value)

        print()

### Logistic Regression

In [8]:
def logistic(x_train, y_train):
    """Return a ``LogisticRegression`` (``solver='liblinear'``) fitted on the training data."""
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [12]:
# Fit logistic regression on all features, store its scores, then print every result so far.
result_dict['survived - logistic'] = build_model(
    classifier=logistic,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108



### LDA and QDA

In [13]:
def linear_discriminant(x_train, y_train, solver= 'svd'):
    """Return a ``LinearDiscriminantAnalysis`` model fitted on the training data.

    ``solver`` is forwarded unchanged to ``LinearDiscriminantAnalysis``.
    """
    return LinearDiscriminantAnalysis(solver=solver).fit(x_train, y_train)

In [15]:
# Fit LDA on all features, store its scores, then print every result so far.
result_dict['survived - LDA'] = build_model(
    classifier=linear_discriminant,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112



In [16]:
def quadratic_discriminant(x_train, y_train):
    """Return a default ``QuadraticDiscriminantAnalysis`` model fitted on the training data."""
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

In [17]:
# Fit QDA on all features, store its scores, then print every result so far.
result_dict['survived - QDA'] = build_model(
    classifier=quadratic_discriminant,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.6098418277680141
precision 0.7777777777777778
recall 0.030837004405286344
accuracy_count 347

test data
accuracy 0.6013986013986014
precision 0.8333333333333334
recall 0.08196721311475409
accuracy_count 86





**One-hot encoding can result in collinearity of features (the dummy variable trap). Instead, use dummy encoding, where we drop one of the one-hot encoded columns.**

In [19]:
# Dummy-variable trap: refit QDA without the last feature column (one of the
# one-hot 'Embarked_*' columns). This replaces the earlier 'survived - QDA' entry.
result_dict['survived - QDA'] = build_model(
    classifier=quadratic_discriminant,
    y_cols='Survived',
    x_cols=features[:-1],
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.7943760984182777
precision 0.7636363636363637
recall 0.721030042918455
accuracy_count 452

test data
accuracy 0.8181818181818182
precision 0.7735849056603774
recall 0.7454545454545455
accuracy_count 117



### SGD

In [28]:
def sgd(x_train, y_train, max_iter= 11000, tol= 1e-3, random_state= None):
    """Return an ``SGDClassifier`` fitted on the training data.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to ``fit``.
    max_iter : int, default 11000
        Forwarded to ``SGDClassifier``.
    tol : float, default 1e-3
        Forwarded to ``SGDClassifier``.
    random_state : int or None, default None
        Forwarded to ``SGDClassifier``. Give an integer to make the fit
        repeatable; ``None`` keeps the previous unseeded behaviour.
        Because ``build_model`` calls the classifier with only
        ``(x_train, y_train)``, bind a seed with e.g.
        ``functools.partial(sgd, random_state=0)``.

    Returns
    -------
    SGDClassifier
        The fitted model.
    """
    model = SGDClassifier(max_iter=max_iter, tol=tol, random_state=random_state)
    model.fit(x_train, y_train)

    return model

In [29]:
# Fit the SGD classifier on all features, store its scores, then print every result so far.
result_dict['survived - SGD'] = build_model(
    classifier=sgd,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.7943760984182777
precision 0.7636363636363637
recall 0.721030042918455
accuracy_count 452

test data
accuracy 0.8181818181818182
precision 0.7735849056603774
recall 0.7454545454545455
accuracy_count 117

classification:  survived - SGD

training data
accuracy 0.7750439367311072
precision 0.734375
recall 0.6467889908256881
accuracy_count 441

test data
accuracy 0.7832167832167832
precision 0.8545454545454545
r

### SVM

In [31]:
def linear_svc(x_train, y_train, C=1.0, max_iter= 2000, tol= 1e-3):
    """Return a ``LinearSVC`` fitted on the training data.

    ``C``, ``max_iter`` and ``tol`` are forwarded unchanged; the model is
    always created with ``dual=False``.
    """
    svc_options = {'C': C, 'max_iter': max_iter, 'tol': tol, 'dual': False}
    return LinearSVC(**svc_options).fit(x_train, y_train)

In [32]:
# Fit the linear SVC on all features, store its scores, then print every result so far.
result_dict['survived - linear SVC'] = build_model(
    classifier=linear_svc,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.7943760984182777
precision 0.7636363636363637
recall 0.721030042918455
accuracy_count 452

test data
accuracy 0.8181818181818182
precision 0.7735849056603774
recall 0.7454545454545455
accuracy_count 117

classification:  survived - SGD

training data
accuracy 0.7750439367311072
precision 0.734375
recall 0.6467889908256881
accuracy_count 441

test data
accuracy 0.7832167832167832
precision 0.8545454545454545
r

### Radius Neighbors

In [41]:
def radius_neighbor(x_train, y_train, radius=30.0):
    """Return a ``RadiusNeighborsClassifier`` fitted on the training data.

    ``radius`` is forwarded unchanged to ``RadiusNeighborsClassifier``.
    """
    return RadiusNeighborsClassifier(radius=radius).fit(x_train, y_train)

In [42]:
# Fit the radius-neighbors classifier on all features, store its scores, then print every result so far.
result_dict['survived - radius neighbor'] = build_model(
    classifier=radius_neighbor,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.7943760984182777
precision 0.7636363636363637
recall 0.721030042918455
accuracy_count 452

test data
accuracy 0.8181818181818182
precision 0.7735849056603774
recall 0.7454545454545455
accuracy_count 117

classification:  survived - SGD

training data
accuracy 0.7750439367311072
precision 0.734375
recall 0.6467889908256881
accuracy_count 441

test data
accuracy 0.7832167832167832
precision 0.8545454545454545
r

### Decision Trees

In [43]:
def decision_tree(x_train, y_train, max_depth= None, max_features= None):
    """Return a ``DecisionTreeClassifier`` fitted on the training data.

    ``max_depth`` and ``max_features`` are forwarded unchanged to
    ``DecisionTreeClassifier``.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    return tree.fit(x_train, y_train)

In [44]:
# Fit the decision tree on all features, store its scores, then print every result so far.
result_dict['survived - decision tree'] = build_model(
    classifier=decision_tree,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.7943760984182777
precision 0.7636363636363637
recall 0.721030042918455
accuracy_count 452

test data
accuracy 0.8181818181818182
precision 0.7735849056603774
recall 0.7454545454545455
accuracy_count 117

classification:  survived - SGD

training data
accuracy 0.7750439367311072
precision 0.734375
recall 0.6467889908256881
accuracy_count 441

test data
accuracy 0.7832167832167832
precision 0.8545454545454545
r

### Naive Bayes

In [45]:
def naive_bayes(x_train, y_train, priors= None):
    """Return a ``GaussianNB`` model fitted on the training data.

    ``priors`` is forwarded unchanged to ``GaussianNB``.
    """
    return GaussianNB(priors=priors).fit(x_train, y_train)

In [46]:
# Fit Gaussian naive Bayes on all features, store its scores, then print every result so far.
result_dict['survived - naive bayes'] = build_model(
    classifier=naive_bayes,
    y_cols='Survived',
    x_cols=features,
    dataset=titanic,
)
compare_result()

classification:  survived - logistic

training data
accuracy 0.804920913884007
precision 0.7922705314009661
recall 0.7068965517241379
accuracy_count 458

test data
accuracy 0.7552447552447552
precision 0.7058823529411765
recall 0.6428571428571429
accuracy_count 108

classification:  survived - LDA

training data
accuracy 0.7978910369068541
precision 0.7546296296296297
recall 0.7244444444444444
accuracy_count 454

test data
accuracy 0.7832167832167832
precision 0.8333333333333334
recall 0.6349206349206349
accuracy_count 112

classification:  survived - QDA

training data
accuracy 0.7943760984182777
precision 0.7636363636363637
recall 0.721030042918455
accuracy_count 452

test data
accuracy 0.8181818181818182
precision 0.7735849056603774
recall 0.7454545454545455
accuracy_count 117

classification:  survived - SGD

training data
accuracy 0.7750439367311072
precision 0.734375
recall 0.6467889908256881
accuracy_count 441

test data
accuracy 0.7832167832167832
precision 0.8545454545454545
r