In [1]:
import numpy as np
import pandas as pd
from IPython.display import display 

### Training and test set
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import Imputer, StandardScaler

from imblearn.over_sampling import SMOTE

from xgboost.sklearn import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
from sklearn.base import clone
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from mlxtend.classifier import StackingClassifier
from mlens.visualization import corrmat
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

  from numpy.core.umath_tests import inner1d
[MLENS] backend: threading


In [2]:
# Read the labelled training data and the test data from CSV.
df = pd.read_csv('./all/health-diagnostics-train.csv')
test = pd.read_csv('./all/health-diagnostics-test.csv')

# The literal string '#NULL!' is treated as a missing value and replaced by NaN.
df.replace('#NULL!',np.nan, inplace=True)
# NOTE(review): the numeric-converted object columns are written back by
# position into every column except the last one. This presumably relies on
# all columns but the final one having object dtype -- confirm against the data.
df.iloc[:, 0:(len(df.columns)-1)] = df.select_dtypes(include='object').apply(pd.to_numeric)
# Training rows that still contain any NaN are discarded.
df.dropna(inplace=True)
test.replace('#NULL!',np.nan, inplace=True)
# NOTE(review): only object-dtype columns survive this line; any column that
# pandas already parsed as numeric is dropped from `test` -- confirm intended.
test = test.select_dtypes(include='object').apply(pd.to_numeric)
# Test rows are kept and their gaps filled with the most frequent value.
# Note that the imputer is fitted on the test frame itself, not on `df`.
imp = Imputer(strategy='most_frequent')
imp.fit(test)
F_test = imp.transform(test)

In [3]:
# Split the cleaned training frame into features and target. Rows were dropped
# from `df` earlier, so both pieces get a fresh 0..n-1 index. The non-mutating
# reset_index is used on purpose: resetting in place on the Series returned by
# df['target'] can alter the column object held by `df` itself (depending on
# the pandas version), leaving the frame internally misaligned.
X = df.drop('target', axis=1).reset_index(drop=True)
y = df['target'].reset_index(drop=True)

# Feature variant without the 'fam-history' column (input of the 'noFam' model).
# X already carries the fresh index, so no further reset is needed.
NoFam_X = X.drop('fam-history', axis=1)

# Re-wrap the imputed test values under the training feature names and build
# the matching no-'fam-history' variant.
F_test = pd.DataFrame(F_test, columns=X.columns)
NoFam_test = F_test.drop('fam-history', axis=1)

In [4]:
# Single seed reused by the train/test split and by every estimator that is
# given random_state=seed; it also seeds NumPy's global random generator.
seed = 111
np.random.seed(seed)

In [5]:
# Hold out 25% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [6]:
# Index labels of the two partitions, kept so that NoFam_X can be cut into the
# very same rows as X_train / X_test.
train_idx = X_train.index
test_idx = X_test.index

In [7]:
# Cut the no-'fam-history' features into the same partitions as X_train / X_test.
# train_idx and test_idx are index *labels*, so they are looked up with the
# label-based .loc; positional .iloc only gave the right rows as long as the
# index happened to be a clean 0..n-1 range.
# (Despite the Fam_ prefix, these frames exclude the 'fam-history' column.)
Fam_Xtrain, Fam_Xtest = NoFam_X.loc[train_idx], NoFam_X.loc[test_idx]

In [8]:
def get_models():
    """Create the unfitted base learners of the ensemble.

    Returns a dict mapping a short name to a new estimator instance. The
    insertion order is lrc, sgd, rfc, nb, noFam; code that iterates over the
    dict sees the models in that order. Seeded estimators use the
    notebook-level ``seed``.
    """
    # Explicit class weights: class 1 counts 99x as much as class 0.
    skewed_weights = {0: 0.01, 1: 0.99}

    return {
        'lrc': LogisticRegression(
            C=10,
            class_weight='balanced',
            random_state=seed,
        ),
        'sgd': SGDClassifier(
            loss='log',
            class_weight=dict(skewed_weights),
            random_state=seed,
        ),
        'rfc': RandomForestClassifier(
            n_estimators=25,
            max_depth=5,
            min_impurity_decrease=0.02,
            min_samples_leaf=0.003,
            min_samples_split=0.01,
            class_weight=dict(skewed_weights),
            max_features='auto',
            random_state=seed,
        ),
        'nb': GaussianNB(),
        # Shallower, larger forest registered under the 'noFam' key.
        'noFam': RandomForestClassifier(
            n_estimators=110,
            max_depth=2,
            min_samples_leaf=0.001,
            min_samples_split=0.03,
            class_weight=dict(skewed_weights),
            random_state=seed,
        ),
    }

In [9]:
def train_base_learners(base_learners, inp, fam_inp, out, verbose=True):
    """Fit every model in ``base_learners`` in place.

    The model stored under the key 'noFam' is fitted on ``fam_inp``; all other
    models are fitted on ``inp``. ``out`` is the target for every model.
    Progress messages are printed when ``verbose`` is true. Returns None.
    """
    if verbose:
        print("Fitting models.")
    for name, model in base_learners.items():
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        # Only the 'noFam' entry is trained on the alternative feature set.
        features = fam_inp if name == 'noFam' else inp
        model.fit(features, out)
        if verbose:
            print("done")

In [10]:
def predict_base_learners(pred_base_learners, inp, fam_inp, verbose=True):
    """Collect the base learners' probabilities into one matrix.

    Returns an array with one row per row of ``inp`` and one column per model,
    in the iteration order of ``pred_base_learners``. Each column holds column
    index 1 of that model's ``predict_proba`` output. The model stored under
    'noFam' is scored on ``fam_inp``; every other model on ``inp``.
    """
    n_rows = inp.shape[0]
    P = np.zeros((n_rows, len(pred_base_learners)))

    if verbose:
        print("Generating base learner predictions.")
    for col, (name, model) in enumerate(pred_base_learners.items()):
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        features = fam_inp if name == 'noFam' else inp
        # Keep only the second probability column of each model.
        P[:, col] = model.predict_proba(features)[:, 1]
        if verbose:
            print("done")
    return P

In [11]:
def ensemble_predict(base_learners, meta_learner, inp, fam_inp, verbose=True):
    """Score new rows with the two-level ensemble.

    The base learners' prediction matrix is built first and then handed to
    ``meta_learner.predict``. Returns ``(prediction_matrix, meta_predictions)``.
    """
    base_matrix = predict_base_learners(base_learners, inp, fam_inp, verbose=verbose)
    final_labels = meta_learner.predict(base_matrix)
    return base_matrix, final_labels

In [16]:
def stacking(base_learners, meta_learner, X_stack, Fam_Xstack, y_stack, generator):
    """Fit a stacked ensemble.

    The base learners are first fitted in place on the complete stacking set.
    Fresh clones of them are then fitted fold by fold, and their predictions
    on each held-out fold are collected as training data for the meta learner.

    Parameters
    ----------
    base_learners : dict
        Name -> estimator; fitted in place on all of the stacking data.
    meta_learner : estimator
        Fitted in place on the stacked out-of-fold base predictions.
    X_stack, Fam_Xstack : DataFrame
        Full feature set and the feature set for the 'noFam' model. Rows are
        selected from both with the same positional indices.
    y_stack : Series
        Target values, selected positionally like the features.
    generator : cross-validation splitter or None
        Object whose ``split(X, y)`` yields (train, test) positional indices.
        ``None`` falls back to an unshuffled 10-fold ``KFold``.

    Returns
    -------
    tuple
        ``(base_learners, meta_learner)``, both fitted.
    """
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X_stack, Fam_Xstack, y_stack, verbose=False)
    print("done")

    # Honour the caller's splitter instead of silently rebuilding one here.
    if generator is None:
        generator = KFold(n_splits=10)

    cv_preds, cv_y = [], []
    # y is passed too so that stratified splitters can be used as well.
    for fold, (fit_rows, holdout_rows) in enumerate(generator.split(X_stack, y_stack), start=1):
        # Unfitted copies: the fold models must never see the held-out rows.
        fold_base_learners = {name: clone(model) for name, model in base_learners.items()}
        train_base_learners(
            fold_base_learners,
            X_stack.iloc[fit_rows, :],
            Fam_Xstack.iloc[fit_rows, :],
            y_stack.iloc[fit_rows],
            verbose=False,
        )
        fold_P_base = predict_base_learners(
            fold_base_learners,
            X_stack.iloc[holdout_rows, :],
            Fam_Xstack.iloc[holdout_rows, :],
            verbose=False,
        )

        cv_preds.append(fold_P_base)
        cv_y.append(y_stack.iloc[holdout_rows])
        print("Fold %i done" % fold)

    # Stack the per-fold blocks into one training set for the meta learner.
    X_meta = np.vstack(cv_preds)
    y_meta = np.hstack(cv_y)
    meta_learner.fit(X_meta, y_meta)
    return base_learners, meta_learner

In [17]:
# Meta learner: a small random forest fitted on the base learners'
# out-of-fold probabilities. It is seeded like the seeded base learners so
# that refitting does not depend on the state of NumPy's global generator
# (and therefore on the order in which cells were executed).
meta_rfc = RandomForestClassifier(
    n_estimators=25,
    max_depth=4,
    max_features=3,
    min_impurity_decrease=0.02,
    min_samples_leaf=0.003,
    min_samples_split=0.01,
    class_weight={0: 0.01, 1: 0.99},
    random_state=seed)

In [18]:
# Fit the stacked ensemble on the training partition using 10 unshuffled folds.
# random_state is left out: KFold ignores it unless shuffle=True, and newer
# scikit-learn releases raise a ValueError for that combination.
weak_learners, meta_learner = stacking(get_models(), meta_rfc, X_train, Fam_Xtrain, y_train, KFold(n_splits=10))

Fitting final base learners...



done




Fold 1 done




Fold 2 done




Fold 3 done




Fold 4 done




Fold 5 done




Fold 6 done




Fold 7 done




Fold 8 done




Fold 9 done




Fold 10 done


In [19]:
# Score the held-out partition: P is the matrix of base-learner probabilities
# (one column per model), y_pred the meta learner's predictions.
P, y_pred = ensemble_predict(weak_learners, meta_learner, X_test, Fam_Xtest)

Generating base learner predictions.
lrc... done
sgd... done
rfc... done
nb... done
noFam... done


In [36]:
# Side-by-side table for the held-out rows: one probability column per base
# learner, then the ensemble prediction and the true label.
# Column names come from the fitted learner dict, whose iteration order is the
# column order of P, so the labels cannot drift from the data.
model_out = pd.DataFrame(P, columns=list(weak_learners))
# y_test still carries its original row labels; drop them so that all three
# pieces line up by position when concatenated.
actuals = pd.Series(y_test).reset_index(drop=True)
preds = pd.Series(y_pred)
# NOTE: this rebinds `test`, which until here held the frame read from
# './all/health-diagnostics-test.csv'.
test = pd.concat([model_out, preds, actuals], axis=1)
test.columns = list(model_out.columns) + ['pred', 'actual']

In [90]:
# How many held-out rows get an 'rfc' probability above 0.6?
(model_out['rfc'] > 0.6).sum()

0

In [82]:
# False positives: ensemble predicted 1, actual label is 0.
fp = test.loc[test['pred'].eq(1) & test['actual'].eq(0)]

In [83]:
# False negatives: ensemble predicted 0, actual label is 1.
fn = test.loc[test['pred'].eq(0) & test['actual'].eq(1)]

In [84]:
# Display the false negatives. The saved output contains a `noFam_pred` column
# that is only created by the final cell of this notebook, so this cell was
# evidently executed after that one; a top-to-bottom run will not show it.
fn

Unnamed: 0,lrc,sgd,rfc,nb,noFam,pred,actual,noFam_pred
2591,0.620718,0.273747,0.116641,0.119808,0.188115,0,1,1
2868,0.29124,0.124522,0.116641,2e-05,0.148271,0,1,0
6064,0.438295,0.158655,0.116641,3.3e-05,0.135719,0,1,0
6689,0.243004,0.123404,0.116641,4.2e-05,0.134157,0,1,0


In [85]:
# True positives: ensemble predicted 1 and the actual label is 1.
tp = test.loc[test['pred'].eq(1) & test['actual'].eq(1)]

In [86]:
# Display the true positives. As with the false-negative table, the saved
# output includes `noFam_pred`, which is added only in the notebook's last
# cell, so this cell was evidently run after it.
tp

Unnamed: 0,lrc,sgd,rfc,nb,noFam,pred,actual,noFam_pred
2306,0.99998,0.992868,0.599202,1.0,0.199182,1,1,1
3093,0.980694,0.740276,0.599202,1.0,0.1337,1,1,0
3603,0.995736,0.92039,0.599202,1.0,0.223515,1,1,1
3876,0.990908,0.859528,0.599202,1.0,0.164697,1,1,1
4484,0.992774,0.844202,0.599202,1.0,0.170946,1,1,1
4771,0.992209,0.853002,0.599202,1.0,0.171424,1,1,1
5846,0.984684,0.767661,0.599202,1.0,0.142253,1,1,0
6033,0.993529,0.835964,0.599202,1.0,0.17302,1,1,1
6673,0.998167,0.889185,0.596727,1.0,0.39579,1,1,1


In [73]:
# 25 evenly spaced candidate decision thresholds from 0.01 to 0.5 inclusive.
boundaries = np.linspace(start=0.01, stop=0.5, num=25)

In [74]:
# Sweep the candidate thresholds for the 'noFam' probabilities: binarise at
# each threshold, score the hard labels with roc_auc_score against y_test and
# keep the first threshold that reaches the highest score.
best_boundary, best_score = 0, 0
for boundary in boundaries:
    thresholded = np.where(test['noFam'] > boundary, 1, 0)
    score = roc_auc_score(y_test, thresholded)
    if score > best_score:
        best_boundary, best_score = boundary, score

In [75]:
# Highest roc_auc_score reached by the threshold sweep over `boundaries`.
best_score

0.673476931178353

In [76]:
# Threshold from `boundaries` at which that highest score was reached.
best_boundary

0.15291666666666667

In [78]:
# Hard 0/1 call from the 'noFam' probability at a fixed cut-off of 0.15
# (presumably the swept best_boundary of ~0.153, rounded -- confirm).
test['noFam_pred'] = (test['noFam'] > 0.15).astype(int)