# Importing Required Packages

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import pickle

import warnings
warnings.filterwarnings("ignore")

sns.set()

# Reading Data

In [2]:
train_features_backup = pd.read_csv("../Data/train_features_df.csv")
test_features_backup = pd.read_csv("../Data/test_features_df.csv")

train_features_df = train_features_backup.copy()
train_features_df = train_features_df.drop(["Age_label_enc", "Fare_label_enc"], axis = 1)
train_features_df["Indices"] = train_features_df.index

test_features_df = test_features_backup.copy()
test_features_df = test_features_df.drop(["Age_label_enc", "Fare_mean_enc"], axis = 1)

In [3]:
train_df = pd.read_csv("../Data/train.csv")
test_df = pd.read_csv("../Data/test.csv")

train_preprocessed_df = pd.read_csv("../Data/preprocessed_train_df.csv")
test_preprocessed_df = pd.read_csv("../Data/preprocessed_test_df.csv")

# Train Test Split

In [67]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Building and testing Ensemble on Train Set

## All Models

In [68]:
lr_model = pickle.load(open("logistic_regression_model.sav", 'rb'))
svm_model = pickle.load(open("final_best_svm.sav", 'rb'))
dt_model = DecisionTreeClassifier(random_state = 42).fit(X_train[:, :-1], y_train)
rf_model = pickle.load(open("best_ext_rf.sav", 'rb'))
xgb_model = XGBClassifier(
                    objective = "binary:logistic", 
                    random_state = 42
                ).fit(X_train[:, :-1], y_train, early_stopping_rounds=5, eval_set=[(X_test[:, :-1], y_test)])

[0]	validation_0-error:0.18994
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.18436
[2]	validation_0-error:0.18994
[3]	validation_0-error:0.17877
[4]	validation_0-error:0.17877
[5]	validation_0-error:0.17877
[6]	validation_0-error:0.17877
[7]	validation_0-error:0.17318
[8]	validation_0-error:0.17877
[9]	validation_0-error:0.16760
[10]	validation_0-error:0.16760
[11]	validation_0-error:0.17318
[12]	validation_0-error:0.16760
[13]	validation_0-error:0.16201
[14]	validation_0-error:0.16760
[15]	validation_0-error:0.16201
[16]	validation_0-error:0.16201
[17]	validation_0-error:0.16760
[18]	validation_0-error:0.16201
Stopping. Best iteration:
[13]	validation_0-error:0.16201



# First Level Model Predictions

In [73]:
train_first_preds = pd.DataFrame()
train_first_preds["lr"] = lr_model.predict(X_train[:, :-1])
train_first_preds["svm"] = svm_model.predict(X_train[:, :-1])
# train_first_preds["dt"] = dt_model.predict(X_train[:, :-1])
train_first_preds["rf"] = rf_model.predict(X_train[:, :-1])
# train_first_preds["xgb"] = xgb_model.predict(X_train[:, :-1])

test_first_preds = pd.DataFrame()
test_first_preds["lr"] = lr_model.predict(X_test[:, :-1])
test_first_preds["svm"] = svm_model.predict(X_test[:, :-1])
# test_first_preds["dt"] = dt_model.predict(X_test[:, :-1])
test_first_preds["rf"] = rf_model.predict(X_test[:, :-1])
# test_first_preds["xgb"] = xgb_model.predict(X_test[:, :-1])

# Second Level Predictions

In [80]:
base_ens_model = DecisionTreeClassifier(random_state = 42).fit(train_first_preds.values, y_train)

In [81]:
np.where(base_ens_model.predict(test_first_preds.values) == y_test, 1, 0).mean()

0.8268156424581006

## Optimizing Second Level Predictions

In [82]:
base_dt = DecisionTreeClassifier(random_state = 42)

param_grid = [
    {
        "max_features" : ["log2", "sqrt", "auto"],
        "criterion" : ["entropy", "gini"],
        "max_depth" : [i for i in range(2, 21)],
        "min_samples_split" : [i for i in range(1, 21)],
        "min_samples_leaf" : [i for i in range(1, 21)]
    }
]

dt_grid = GridSearchCV(base_dt, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

dt_grid_fit = dt_grid.fit(train_first_preds, y_train)

Fitting 3 folds for each of 45600 candidates, totalling 136800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 4492 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 12492 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 22396 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 34588 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 49564 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 71764 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 95764 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 122964 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 136800 out of 136800 | elapsed:  4.6min finished


In [83]:
dt_grid_fit.best_params_

{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [85]:
dt_best = dt_grid_fit.best_estimator_

In [87]:
np.where(dt_best.predict(test_first_preds) == y_test, 1, 0).mean()

0.8268156424581006

# Submission for Decision Tree Ensemble

In [100]:
X, y = train_features_df.values, train_df["Survived"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

train_first_preds = pd.DataFrame()
train_first_preds["lr"] = lr_model.predict(X[:, :-1])
train_first_preds["svm"] = svm_model.predict(X[:, :-1])
# train_first_preds["dt"] = dt_model.predict(X_train[:, :-1])
train_first_preds["rf"] = rf_model.predict(X[:, :-1])
# train_first_preds["xgb"] = xgb_model.predict(X_train[:, :-1])

test_first_preds = pd.DataFrame()
test_first_preds["lr"] = lr_model.predict(test_features_df.values)
test_first_preds["svm"] = svm_model.predict(test_features_df.values)
# test_first_preds["dt"] = dt_model.predict(X_test[:, :-1])
test_first_preds["rf"] = rf_model.predict(test_features_df.values)
# test_first_preds["xgb"] = xgb_model.predict(X_test[:, :-1])

In [101]:
base_dt = DecisionTreeClassifier(random_state = 42)

param_grid = [
    {
        "max_features" : ["log2", "sqrt", "auto"],
        "criterion" : ["entropy", "gini"],
        "max_depth" : [i for i in range(2, 21)],
        "min_samples_split" : [i for i in range(1, 21)],
        "min_samples_leaf" : [i for i in range(1, 21)]
    }
]

dt_grid = GridSearchCV(base_dt, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

dt_grid_fit = dt_grid.fit(train_first_preds, y)

Fitting 3 folds for each of 45600 candidates, totalling 136800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 4492 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 12492 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 23692 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done 38092 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 55692 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 76492 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 100492 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 127692 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 136800 out of 136800 | elapsed:  3.8min finished


In [103]:
dt_ens_sub8 = pd.read_csv("../Data/gender_submission.csv")
dt_ens_sub8["Survived"] = dt_grid_fit.best_estimator_.predict(test_first_preds.values)
dt_ens_sub8.to_csv("sub8_dt_ensemble.csv", index = False)

# Exporting Ensemble

In [105]:
filename = "best_dt_ensemble.sav"
pickle.dump(dt_grid_fit.best_estimator_, open(filename, 'wb'))

In [106]:
filename = "best_dt_ensemble.sav"
loaded_model = pickle.load(open(filename, 'rb'))

In [107]:
loaded_model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=3, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')