# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import pickle

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Switch matplotlib over to seaborn's default styling for any plots drawn later.
sns.set()

# Reading Data

In [2]:
# Engineered feature tables, read once. The *_backup frames are left untouched
# so the working frames below can be rebuilt without re-reading the files.
train_features_backup = pd.read_csv("../Data/train_features_df.csv")
test_features_backup = pd.read_csv("../Data/test_features_df.csv")

# Encodings excluded from modelling. The SAME list is removed from both splits:
# the models below consume `.values` (purely positional), so dropping
# "Fare_label_enc" from train but "Fare_mean_enc" from test — as this cell did
# before — would silently feed a different Fare encoding at prediction time.
# NOTE(review): assumes both CSVs share the same columns; a KeyError here means
# the test file is missing one of these encodings.
DROPPED_FEATURES = ["Age_label_enc", "Fare_label_enc"]

# `drop` returns a new frame, so the backups are not modified.
train_features_df = train_features_backup.drop(columns = DROPPED_FEATURES)
test_features_df = test_features_backup.drop(columns = DROPPED_FEATURES)

# Fail fast if the two splits disagree on feature names or order.
assert list(train_features_df.columns) == list(test_features_df.columns), (
    "train/test feature columns differ: "
    f"{list(train_features_df.columns)} vs {list(test_features_df.columns)}"
)

# Row position is appended as the LAST train column; later cells strip it with
# `[:, :-1]` before fitting or predicting.
train_features_df["Indices"] = train_features_df.index

In [3]:
# Raw competition files; `train_df` supplies the "Survived" labels used below.
train_df, test_df = (
    pd.read_csv("../Data/train.csv"),
    pd.read_csv("../Data/test.csv"),
)

# Preprocessed counterparts of the raw files.
train_preprocessed_df, test_preprocessed_df = (
    pd.read_csv("../Data/preprocessed_train_df.csv"),
    pd.read_csv("../Data/preprocessed_test_df.csv"),
)

# Train Test Split

In [4]:
# Feature matrix (its last column is the "Indices" helper) and survival labels.
X = train_features_df.values
y = train_df["Survived"].values

# Reproducible 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

# Building and Testing the Ensemble on the Train Set

## All Models

In [6]:
def _load_pickled_model(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes, instead of being left open for the garbage collector.

    NOTE: unpickling can execute arbitrary code — only load files you trust.
    """
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


# Models saved to disk by earlier work, loaded as-is.
lr_model = _load_pickled_model("logistic_regression_model.sav")
knn_model = _load_pickled_model("grid_search_knn_model.sav")
svm_model = _load_pickled_model("final_best_svm.sav")
rf_model = _load_pickled_model("best_ext_rf.sav")

# Models fitted here on the training split; `[:, :-1]` strips the trailing
# "Indices" helper column so it is never used as a feature.
dt_model = DecisionTreeClassifier(random_state = 42).fit(X_train[:, :-1], y_train)

# NOTE(review): early stopping is monitored on (X_test, y_test), the same
# hold-out split that is used to score the ensembles further down, so those
# hold-out scores are optimistic. A separate validation split would avoid this.
xgb_model = XGBClassifier(
                    objective = "binary:logistic",
                    random_state = 42
                ).fit(
                    X_train[:, :-1],
                    y_train,
                    early_stopping_rounds = 5,
                    eval_set = [(X_test[:, :-1], y_test)]
                )

[0]	validation_0-error:0.18994
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.18436
[2]	validation_0-error:0.18994
[3]	validation_0-error:0.17877
[4]	validation_0-error:0.17877
[5]	validation_0-error:0.17877
[6]	validation_0-error:0.17877
[7]	validation_0-error:0.17318
[8]	validation_0-error:0.17877
[9]	validation_0-error:0.16760
[10]	validation_0-error:0.16760
[11]	validation_0-error:0.17318
[12]	validation_0-error:0.16760
[13]	validation_0-error:0.16201
[14]	validation_0-error:0.16760
[15]	validation_0-error:0.16201
[16]	validation_0-error:0.16201
[17]	validation_0-error:0.16760
[18]	validation_0-error:0.16201
Stopping. Best iteration:
[13]	validation_0-error:0.16201



## Adding First Level Predictions

In [31]:
# Base learners whose class predictions become the meta-features, in column
# order. lr_model, svm_model and rf_model are loaded but not part of this stack.
stacked_models = {"knn": knn_model, "dt": dt_model, "xgb": xgb_model}

# Both splits without the trailing "Indices" helper column.
split_train_features = X_train[:, :-1]
split_test_features = X_test[:, :-1]

# NOTE(review): dt_model and xgb_model were fitted on this same training split,
# so their train-side predictions are in-sample rather than out-of-fold.
train_first_preds = pd.DataFrame()
for model_name, base_model in stacked_models.items():
    train_first_preds[model_name] = base_model.predict(split_train_features)

test_first_preds = pd.DataFrame()
for model_name, base_model in stacked_models.items():
    test_first_preds[model_name] = base_model.predict(split_test_features)

## Base Ensemble Models

In [43]:
# Logistic-regression meta-learner trained on the first-level predictions.
lr_ens = LogisticRegression(random_state = 42)
lr_ens.fit(train_first_preds.values, y_train)

# Hold-out accuracy: share of test rows where the stacked prediction is correct.
lr_holdout_preds = lr_ens.predict(test_first_preds.values)
(lr_holdout_preds == y_test).mean()

0.8770949720670391

In [44]:
# Support-vector meta-learner trained on the first-level predictions.
svm_ens = SVC(random_state = 42)
svm_ens.fit(train_first_preds.values, y_train)

# Hold-out accuracy: share of test rows where the stacked prediction is correct.
svm_holdout_preds = svm_ens.predict(test_first_preds.values)
(svm_holdout_preds == y_test).mean()

0.888268156424581

In [46]:
# Random-forest meta-learner (100 trees) trained on the first-level predictions.
rf_ens = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf_ens.fit(train_first_preds.values, y_train)

# Hold-out accuracy: share of test rows where the stacked prediction is correct.
rf_holdout_preds = rf_ens.predict(test_first_preds.values)
(rf_holdout_preds == y_test).mean()

0.888268156424581

## Base Submissions

In [48]:
# Full labelled data this time (no hold-out), for the submission models.
X = train_features_df.values
y = train_df["Survived"].values

# Base learners feeding the stack, in column order. They are reused exactly as
# fitted/loaded earlier — nothing is refit in this cell.
submission_models = {"knn": knn_model, "dt": dt_model, "xgb": xgb_model}

# Train side drops the trailing "Indices" helper column; the competition test
# frame has no such column and is used whole.
labelled_features = X[:, :-1]
competition_features = test_features_df.values

# NOTE: these two names are reused from the hold-out experiment and now refer
# to the full train set and the competition test set respectively.
train_first_preds = pd.DataFrame()
for model_name, base_model in submission_models.items():
    train_first_preds[model_name] = base_model.predict(labelled_features)

test_first_preds = pd.DataFrame()
for model_name, base_model in submission_models.items():
    test_first_preds[model_name] = base_model.predict(competition_features)

In [49]:
# Logistic-regression meta-learner refit on first-level predictions for the
# whole labelled set.
full_lr_ens = LogisticRegression(random_state = 42)
full_lr_ens.fit(train_first_preds, y)

# Start from the sample submission file, overwrite its "Survived" column with
# the stacked predictions, and write the result out without the index.
sub8_base_lr_ens = pd.read_csv("../Data/gender_submission.csv")
lr_submission_preds = full_lr_ens.predict(test_first_preds)
sub8_base_lr_ens["Survived"] = lr_submission_preds
sub8_base_lr_ens.to_csv("sub8_base_lr_ens.csv", index = False)

In [51]:
# Support-vector meta-learner refit on first-level predictions for the whole
# labelled set.
full_svm_ens = SVC(random_state = 42)
full_svm_ens.fit(train_first_preds, y)

# Start from the sample submission file, overwrite its "Survived" column with
# the stacked predictions, and write the result out without the index.
sub8_base_svm_ens = pd.read_csv("../Data/gender_submission.csv")
svm_submission_preds = full_svm_ens.predict(test_first_preds)
sub8_base_svm_ens["Survived"] = svm_submission_preds
sub8_base_svm_ens.to_csv("sub8_base_svm_ens.csv", index = False)