In [69]:
import pandas as pd
import time
import datetime as dt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, cross_val_predict

In [3]:
df = pd.read_csv("train_bin.csv")

In [4]:
X = df.drop("Survived", axis=1)
y = df.Survived

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [38]:
def fit_model(model, X_train, y_train, cv, X_test=X_test, y_test=y_test):
    model = model.fit(X_train, y_train)
    acc = round(model.score(X_test, y_test) * 100, 2)
    
    test_pred = cross_val_predict(model,
                                  X_test,
                                  y_test,
                                  cv = cv)
    
    acc_cv = round(accuracy_score(y_test, test_pred) * 100, 2)
    
    return test_pred, acc, acc_cv

# Logistic Regression

In [20]:
start = time.time()

train_pred_logistic, acc_logistic, acc_cv_logistic = fit_model(LogisticRegression(),
                                                               X_train,
                                                               y_train,
                                                               5)

logistic_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_logistic}")
print(f"5-fold CV Accuracy: {acc_cv_logistic}")
print(f"Running time: {logistic_time}")

Accuracy: 79.41
5-fold CV Accuracy: 74.71
Running time: 0:00:00.295659


# Naive Bayes

In [21]:
start = time.time()

train_pred_nb, acc_nb, acc_cv_nb = fit_model(GaussianNB(),
                                             X_train,
                                             y_train,
                                             5)

nb_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_nb}")
print(f"5-fold CV Accuracy: {acc_cv_nb}")
print(f"Running time: {nb_time}")

Accuracy: 55.29
5-fold CV Accuracy: 68.82
Running time: 0:00:00.079290


# Linear SVC

In [22]:
start = time.time()

train_pred_linsvc, acc_linsvc, acc_cv_linsvc = fit_model(LinearSVC(),
                                                         X_train,
                                                         y_train,
                                                         5)

linsvc_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_linsvc}")
print(f"5-fold CV Accuracy: {acc_cv_linsvc}")
print(f"Running time: {linsvc_time}")

Accuracy: 81.18
5-fold CV Accuracy: 78.24
Running time: 0:00:00.092836


# K Neighbours Classifier

In [39]:
start = time.time()

train_pred_kn, acc_kn, acc_cv_kn = fit_model(KNeighborsClassifier(),
                                                               X_train.values,
                                                               y_train,
                                                               5,
                                                               X_test.values)

kn_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_kn}")
print(f"5-fold CV Accuracy: {acc_cv_kn}")
print(f"Running time: {kn_time}")

Accuracy: 80.59
5-fold CV Accuracy: 72.94
Running time: 0:00:00.050925


# SVC

In [28]:
start = time.time()

train_pred_svc, acc_svc, acc_cv_svc = fit_model(SVC(),
                                                X_train,
                                                y_train,
                                                5)

svc_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_svc}")
print(f"5-fold CV Accuracy: {acc_cv_svc}")
print(f"Running time: {svc_time}")

Accuracy: 80.0
5-fold CV Accuracy: 75.88
Running time: 0:00:00.088298


# Random Forest

In [29]:
start = time.time()

train_pred_rf, acc_rf, acc_cv_rf = fit_model(RandomForestClassifier(),
                                                X_train,
                                                y_train,
                                                5)

rf_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_rf}")
print(f"5-fold CV Accuracy: {acc_cv_rf}")
print(f"Running time: {rf_time}")

Accuracy: 78.82
5-fold CV Accuracy: 75.88
Running time: 0:00:01.084869


# XGBoost

In [13]:
dtrain = xgb.DMatrix(X_train.values, y_train.values)

In [54]:
start = time.time()

train_pred_xg, acc_xg, acc_cv_xg = fit_model(xgb.XGBClassifier(objective="binary:hinge", 
                                                            use_label_encoder=False),
                                          X_train.values,
                                          y_train,
                                          5,
                                          X_test.values)

xg_time = dt.timedelta(seconds=(time.time() - start))
print(f"Accuracy: {acc_xg}")
print(f"5-fold CV Accuracy: {acc_cv_xg}")
print(f"Running time: {xg_time}")

Accuracy: 77.65
5-fold CV Accuracy: 77.06
Running time: 0:00:00.777178


In [95]:
parameters = {'n_estimators':[5000], 'max_depth':[4], "learning_rate":[0.01], "gamma":[0.1]}

In [96]:
clf = GridSearchCV(xgb.XGBClassifier(objective="binary:hinge", 
                                     use_label_encoder=False),
                   parameters,
                   refit=True,
                   verbose=3)

clf.fit(X_train.values, y_train.values)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END gamma=0.1, learning_rate=0.01, max_depth=4, n_estimators=5000;, score=0.804 total time=   6.1s
[CV 2/5] END gamma=0.1, learning_rate=0.01, max_depth=4, n_estimators=5000;, score=0.814 total time=   4.2s
[CV 3/5] END gamma=0.1, learning_rate=0.01, max_depth=4, n_estimators=5000;, score=0.853 total time=   5.0s
[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=4, n_estimators=5000;, score=0.812 total time=   9.9s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=4, n_estimators=5000;, score=0.812 total time=   8.7s


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='binary:hinge', predictor=None,
                                     random_state=None, reg_alpha=None,
                          

In [97]:
clf.best_params_

{'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 5000}

In [98]:
grid_predictions = clf.predict(X_test.values)
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.75      0.78      0.76        76
           1       0.81      0.79      0.80        94

    accuracy                           0.78       170
   macro avg       0.78      0.78      0.78       170
weighted avg       0.78      0.78      0.78       170



In [89]:
clf

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_est...rs=100, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='binary:hinge', predictor=None,
                                     random_state=None, reg_alpha=None,
                            