In [221]:
from numpy import mean
from numpy import std

import pandas as pd
import time
import datetime as dt
import pickle

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import xgboost as xgb

from sklearn.model_selection import GridSearchCV, cross_val_predict

In [3]:
df = pd.read_csv("train_bin.csv")

In [4]:
X = df.drop("Survived", axis=1)
y = df.Survived

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [38]:
def fit_model(model, X_train, y_train, cv, X_test=X_test, y_test=y_test):
    model = model.fit(X_train, y_train)
    acc = round(model.score(X_test, y_test) * 100, 2)
    
    test_pred = cross_val_predict(model,
                                  X_test,
                                  y_test,
                                  cv = cv)
    
    acc_cv = round(accuracy_score(y_test, test_pred) * 100, 2)
    
    return test_pred, acc, acc_cv

# Logistic Regression

In [20]:
start = time.time()

train_pred_logistic, acc_logistic, acc_cv_logistic = fit_model(LogisticRegression(random_state = 23),
                                                               X_train,
                                                               y_train,
                                                               5)

logistic_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_logistic}")
print(f"5-fold CV Accuracy: {acc_cv_logistic}")
print(f"Running time: {logistic_time}")

Accuracy: 79.41
5-fold CV Accuracy: 74.71
Running time: 0:00:00.295659


In [190]:
parameters = {'C':[0.01, 0.1, 1, 10], 
              'penalty':["l1", "l2", "none"], 
              "solver":["liblinear", "lbfgs"]}

In [None]:
lr_clf = GridSearchCV(LogisticRegression(random_state=23, max_iter=1000),
                      parameters,
                      refit=True,
                      verbose=3)

lr_clf.fit(X_train, y_train)

In [224]:
lr_clf.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [225]:
lr_clf.best_score_

0.7912250048534265

In [226]:
lr_predictions = lr_clf.predict(X_test)
print(classification_report(y_test, lr_predictions))

              precision    recall  f1-score   support

           0       0.75      0.80      0.78        76
           1       0.83      0.79      0.81        94

    accuracy                           0.79       170
   macro avg       0.79      0.79      0.79       170
weighted avg       0.80      0.79      0.79       170



In [196]:
filename = "lr_model.pkl"
pickle.dump(lr_clf, open(filename, "wb"))

# Naive Bayes

In [21]:
start = time.time()

train_pred_nb, acc_nb, acc_cv_nb = fit_model(GaussianNB(),
                                             X_train,
                                             y_train,
                                             5)

nb_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_nb}")
print(f"5-fold CV Accuracy: {acc_cv_nb}")
print(f"Running time: {nb_time}")

Accuracy: 55.29
5-fold CV Accuracy: 68.82
Running time: 0:00:00.079290


# Linear SVC

In [22]:
start = time.time()

train_pred_linsvc, acc_linsvc, acc_cv_linsvc = fit_model(LinearSVC(),
                                                         X_train,
                                                         y_train,
                                                         5)

linsvc_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_linsvc}")
print(f"5-fold CV Accuracy: {acc_cv_linsvc}")
print(f"Running time: {linsvc_time}")

Accuracy: 81.18
5-fold CV Accuracy: 78.24
Running time: 0:00:00.092836


# K Neighbours Classifier

In [152]:
start = time.time()

train_pred_kn, acc_kn, acc_cv_kn = fit_model(KNeighborsClassifier(),
                                                               X_train.values,
                                                               y_train,
                                                               5,
                                                               X_test.values)

kn_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_kn}")
print(f"5-fold CV Accuracy: {acc_cv_kn}")
print(f"Running time: {kn_time}")

Accuracy: 80.59
5-fold CV Accuracy: 72.94
Running time: 0:00:00.250943


In [227]:
parameters = {'n_neighbors': [3, 4, 5, 6, 7],  
              'weights': ["uniform", "distance"], 
              'leaf_size':[10, 20, 30, 40, 50],
              'p': [1, 2, 3]}
                

In [None]:
knn_clf = GridSearchCV(KNeighborsClassifier(),
                       parameters,
                       refit=True,
                       verbose=3)

knn_clf.fit(X_train.values, y_train)

In [231]:
knn_clf.best_params_

{'leaf_size': 10, 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [229]:
knn_clf.best_score_

0.7970879440885265

In [230]:
knn_predictions = knn_clf.predict(X_test.values)
print(classification_report(y_test, knn_predictions))

              precision    recall  f1-score   support

           0       0.75      0.78      0.76        76
           1       0.81      0.79      0.80        94

    accuracy                           0.78       170
   macro avg       0.78      0.78      0.78       170
weighted avg       0.78      0.78      0.78       170



In [197]:
filename = "knn_model.pkl"
pickle.dump(knn_clf, open(filename, "wb"))

# SVC

In [102]:
start = time.time()

train_pred_svc, acc_svc, acc_cv_svc = fit_model(SVC(),
                                                X_train,
                                                y_train,
                                                5)

svc_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_svc}")
print(f"5-fold CV Accuracy: {acc_cv_svc}")
print(f"Running time: {svc_time}")

Accuracy: 80.0
5-fold CV Accuracy: 75.88
Running time: 0:00:00.228903


In [109]:
parameters = {'C': [0.1, 1, 10, 100],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'gamma':['scale', 'auto'],
              'kernel': ['linear', 'rbf', 'poly'],
              'degree': [1, 2, 3, 4, 5]}  

In [None]:
svc_clf = GridSearchCV(SVC(),
                   parameters,
                   refit=True,
                   verbose=3)

svc_clf.fit(X_train, y_train)

In [111]:
svc_clf.best_params_

{'C': 10, 'degree': 1, 'gamma': 'scale', 'kernel': 'rbf'}

In [181]:
svc_predictions = svc_clf.predict(X_test)
print(classification_report(y_test, svc_predictions))

              precision    recall  f1-score   support

           0       0.76      0.79      0.77        76
           1       0.82      0.80      0.81        94

    accuracy                           0.79       170
   macro avg       0.79      0.79      0.79       170
weighted avg       0.80      0.79      0.79       170



In [198]:
filename = "svc_model.pkl"
pickle.dump(svc_clf, open(filename, "wb"))

# Random Forest

In [29]:
start = time.time()

train_pred_rf, acc_rf, acc_cv_rf = fit_model(RandomForestClassifier(),
                                                X_train,
                                                y_train,
                                                5)

rf_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_rf}")
print(f"5-fold CV Accuracy: {acc_cv_rf}")
print(f"Running time: {rf_time}")

Accuracy: 78.82
5-fold CV Accuracy: 75.88
Running time: 0:00:01.084869


In [166]:
parameters = {'n_estimators': [5, 10, 50, 100, 250, 1000],  
              'criterion': ["gini", "entropy"],
              'min_samples_split': [2, 3, 4, 5, 6]}

In [None]:
rf_clf = GridSearchCV(RandomForestClassifier(),
                      parameters,
                      refit=True,
                      verbose=3)
 
rf_clf.fit(X_train, y_train)

In [168]:
rf_clf.best_params_

{'criterion': 'entropy', 'min_samples_split': 4, 'n_estimators': 100}

In [169]:
rf_clf.best_score_

0.8069695204814599

In [182]:
rf_predictions = svc_clf.predict(X_test)
print(classification_report(y_test, rf_predictions))

              precision    recall  f1-score   support

           0       0.76      0.79      0.77        76
           1       0.82      0.80      0.81        94

    accuracy                           0.79       170
   macro avg       0.79      0.79      0.79       170
weighted avg       0.80      0.79      0.79       170



In [199]:
filename = "rf_model.pkl"
pickle.dump(rf_clf, open(filename, "wb"))

# XGBoost

In [13]:
dtrain = xgb.DMatrix(X_train.values, y_train.values)

In [54]:
start = time.time()

train_pred_xg, acc_xg, acc_cv_xg = fit_model(xgb.XGBClassifier(objective="binary:hinge", 
                                                            use_label_encoder=False),
                                          X_train.values,
                                          y_train,
                                          5,
                                          X_test.values)

xg_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_xg}")
print(f"5-fold CV Accuracy: {acc_cv_xg}")
print(f"Running time: {xg_time}")

Accuracy: 77.65
5-fold CV Accuracy: 77.06
Running time: 0:00:00.777178


In [172]:
parameters = {'n_estimators':[3000, 4000, 5000], 
              'max_depth':[3, 4, 5], 
              "learning_rate":[0.005, 0.01, 0.02], 
              "gamma":[0.01, 0.1, 0.2]}

In [None]:
xg_clf = GridSearchCV(xgb.XGBClassifier(objective="binary:logistic", 
                                        use_label_encoder=False,
                                        eval_metric="logloss"),
                                        parameters,
                                        refit=True,
                                        verbose=3)

xg_clf.fit(X_train.values, y_train.values)

In [183]:
xg_clf.best_params_

{'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 3000}

In [184]:
xg_clf.best_score_

0.8148320714424383

In [185]:
grid_predictions = xg_clf.predict(X_test.values)
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.75      0.76      0.76        76
           1       0.81      0.80      0.80        94

    accuracy                           0.78       170
   macro avg       0.78      0.78      0.78       170
weighted avg       0.78      0.78      0.78       170



In [200]:
filename = "xgboost_model.pkl"
pickle.dump(xg_clf, open(filename, "wb"))

In [None]:
{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [240]:
def get_voting():
    # define the base models
    models = list()
    models.append(("svc_clf", SVC(C=10, 
                                  degree=1, 
                                  gamma="scale", 
                                  kernel="rbf",
                                  probability=True)))
    models.append(("knn_clf", KNeighborsClassifier(leaf_size=10, 
                                                   n_neighbors=5, 
                                                   p=1, 
                                                   weights="distance")))
    models.append(("xg_clf", xgb.XGBClassifier(objective="binary:logistic", 
                                               use_label_encoder=False,
                                               eval_metric="logloss",
                                               gamma=0.1,
                                               learning_rate=0.01,
                                               max_depth=5,
                                               n_estimators=3000)))
    models.append(("lr_clf", LogisticRegression(C=1, 
                                                penalty="l2",
                                                solver="liblinear",
                                                max_iter=1000, 
                                                random_state=23)))
    models.append(("rf_clf", RandomForestClassifier(criterion="entropy", 
                                                    min_samples_split=4, 
                                                    n_estimators=100,
                                                    random_state =23)))
    
    # define the voting ensemble
    ensemble = VotingClassifier(estimators=models, voting="soft")
    return ensemble

def get_stacking():
    # define the base models
    models = list()
    models.append(("svc_clf", SVC(C=10, 
                                  degree=1, 
                                  gamma="scale", 
                                  kernel="rbf",
                                  probability=True)))
    models.append(("knn_clf", KNeighborsClassifier(leaf_size=10, 
                                                   n_neighbors=5, 
                                                   p=1, 
                                                   weights="distance")))
    models.append(("xg_clf", xgb.XGBClassifier(objective="binary:logistic", 
                                               use_label_encoder=False,
                                               eval_metric="logloss",
                                               gamma=0.1,
                                               learning_rate=0.01,
                                               max_depth=5,
                                               n_estimators=3000)))
    models.append(("lr_clf", LogisticRegression(C=1, 
                                                penalty="l2",
                                                solver="liblinear",
                                                max_iter=1000, 
                                                random_state=23)))
    models.append(("rf_clf", RandomForestClassifier(criterion="entropy", 
                                                    min_samples_split=4, 
                                                    n_estimators=100,
                                                    random_state =23)))
    
    # define the stacking ensemble
    ensemble = StackingClassifier(estimators=models, final_estimator=LogisticRegression())
    return ensemble


def get_models():
    models = dict()
    models["svc_clf"] = SVC(C=10, 
                            degree=1, 
                            gamma="scale", 
                            kernel="rbf",
                            probability=True)
    models["knn_clf"] = KNeighborsClassifier(leaf_size=10, 
                                             n_neighbors=5, 
                                             p=1, 
                                             weights="distance")
    models["xg_clf"] = xgb.XGBClassifier(objective="binary:logistic", 
                                         use_label_encoder=False,
                                         eval_metric="logloss",
                                         gamma=0.1,
                                         learning_rate=0.01,
                                         max_depth=5,
                                         n_estimators=3000)
    models["lr_clf"] = LogisticRegression(C=1, 
                                          penalty="l2",
                                          solver="liblinear",
                                          max_iter=1000, 
                                          random_state=23)
    models["rf_clf"] = RandomForestClassifier(criterion="entropy", 
                                              min_samples_split=4, 
                                              n_estimators=100,
                                              random_state =23)
    
    models["soft_voting"] = get_voting()
    models["stacking"] = get_stacking()
    
    return models

def evaluate_model(model, X, y):
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_val_score(model, X.values, y, scoring="accuracy", cv=cv, n_jobs=-1, error_score="raise")
	return scores

In [241]:
models = get_models()

results, names = list(), list()
for name, model in models.items():
	scores = evaluate_model(model, X_train, y_train)
	results.append(scores)
	names.append(name)
	print(f"{name}: {mean(scores):.3f}, ({std(scores):.3f})")

svc_clf: 0.821, (0.059)
knn_clf: 0.796, (0.071)
xg_clf: 0.816, (0.059)
lr_clf: 0.799, (0.052)
rf_clf: 0.809, (0.072)
soft_voting: 0.823, (0.064)
stacking: 0.819, (0.062)


## I can see a slight edge here when using the soft_voting Ensemble Classifier, so I think I will use that for the submission. 