In [None]:
import numpy as np
import pandas as pd
from IPython.display import display 

### Metrics and preprocessing
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import Imputer, StandardScaler

from imblearn.over_sampling import SMOTE

from xgboost.sklearn import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
from sklearn.base import clone
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from mlxtend.classifier import StackingClassifier
from mlens.visualization import corrmat
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw training and test tables.
df = pd.read_csv('./all/health-diagnostics-train.csv')
test = pd.read_csv('./all/health-diagnostics-test.csv')

# The files use the literal string '#NULL!' for missing values; map it to NaN so
# the affected columns can be parsed as numbers.
df = df.replace('#NULL!', np.nan)
# Convert the string-typed feature columns (every column except the last one) to
# numeric, addressing them by label. Assigning by label replaces each column
# outright, so the numeric dtype is kept and columns that were already numeric
# are neither touched nor shifted out of position.
train_obj_cols = df.iloc[:, :-1].select_dtypes(include='object').columns
df[train_obj_cols] = df[train_obj_cols].apply(pd.to_numeric)
# Training rows with any missing value are discarded rather than imputed.
df = df.dropna()

test = test.replace('#NULL!', np.nan)
# Same conversion for the test table, done in place on the selected columns so
# that columns already parsed as numeric are kept instead of being dropped.
test_obj_cols = test.select_dtypes(include='object').columns
test[test_obj_cols] = test[test_obj_cols].apply(pd.to_numeric)

# Missing test values are filled with each column's most frequent value; the
# imputer is fitted on the test table itself.
imp = Imputer(strategy='most_frequent')
imp.fit(test)
F_test = imp.transform(test)

In [None]:
# Separate the predictors from the label. reset_index(drop=True) renumbers the
# rows 0..n-1 without keeping the previous index as a column.
X = df.drop('target', axis=1).reset_index(drop=True)
# Reset the index on a new Series instead of in place on the Series pulled out of
# df, so the 'target' column held by df keeps the same index as df itself.
y = df['target'].reset_index(drop=True)
# Wrap the imputed test values in a frame labelled with the training feature names.
# NOTE(review): assumes F_test has exactly one column per column of X, in the
# same order — confirm against the test file.
F_test = pd.DataFrame(F_test, columns=X.columns)

In [None]:
# Seed passed as random_state to every stochastic estimator in this notebook so
# repeated runs build the same models.
# NOTE(review): no earlier cell assigns `seed`, so a fresh-kernel run failed with
# NameError here. 42 is a placeholder — replace it with the value used for any
# previously reported results if that is known.
seed = 42

# --- Base learners (level 0 of the stack) ---
# Logistic regression with class weights derived from the class frequencies.
lrc = LogisticRegression(C=10,class_weight='balanced', random_state=seed)
# Linear model with log loss fitted by SGD; class 1 is weighted 99x class 0.
sgd = SGDClassifier(loss='log', class_weight={0:0.01,1:0.99}, random_state=seed)
# Small, shallow random forest using the same 0.01 / 0.99 class weighting.
rfc = RandomForestClassifier(
    n_estimators=25,
    max_depth=5,
    min_impurity_decrease=0.02,
    min_samples_leaf=0.003,
    min_samples_split=0.01,
    class_weight={0:0.01,1:0.99},
    max_features='auto',
    random_state=seed
)
nb = GaussianNB()

# --- Meta-learner (level 1) ---
# Gradient boosting with many trees and a small learning rate; each tree is
# fitted on half of the samples (subsample=0.5).
gbc = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=4,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.005, 
    random_state=seed
)

# Stack the four base learners under the gradient-boosting meta-classifier.
sclf = StackingClassifier(classifiers=[lrc, sgd, rfc, nb], 
                          meta_classifier=gbc)

In [None]:
# Stacking variant whose meta-classifier is a plain logistic regression.
# `seed` is read from the kernel namespace; this cell does not assign it.

# Base learners (level 0).
nb = GaussianNB()
lrc = LogisticRegression(
    random_state=seed,
    class_weight='balanced',
    C=10,
)
sgd = SGDClassifier(
    random_state=seed,
    class_weight={0: 0.01, 1: 0.99},  # class 1 weighted 99x class 0
    loss='log',
)
rfc = RandomForestClassifier(
    random_state=seed,
    class_weight={0: 0.01, 1: 0.99},
    n_estimators=25,
    max_depth=5,
    max_features='auto',
    min_samples_split=0.01,
    min_samples_leaf=0.003,
    min_impurity_decrease=0.02,
)

# Meta-learner (level 1): logistic regression with default settings apart from
# the random state.
meta_lrc = LogisticRegression(random_state=seed)

# Rebinds `sclf` to this stack; the base-learner order matches the list below.
sclf = StackingClassifier(
    classifiers=[lrc, sgd, rfc, nb],
    meta_classifier=meta_lrc,
)