In [1]:
import numpy as np
import pandas as pd

### Training and test set
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.model_selection import KFold
from mlens.visualization import corrmat
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display                               
from ipywidgets import interactive

  from numpy.core.umath_tests import inner1d
[MLENS] backend: threading


In [2]:
def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Draw the ROC curves of every base learner and of the ensemble on one figure.

    Parameters
    ----------
    ytest : true binary labels, forwarded to ``roc_curve``.
    P_base_learners : 2-D array indexed as ``[:, i]``; column ``i`` holds the
        scores of base learner ``i``.
    P_ensemble : scores of the ensemble, forwarded to ``roc_curve``.
    labels : legend labels, one per column of ``P_base_learners``.
    ens_label : legend label for the ensemble curve.
    """
    n_learners = P_base_learners.shape[1]
    # One colour per curve: slot 0 is used for the ensemble, slots 1..n for
    # the base learners.
    palette = [plt.cm.rainbow(frac)
               for frac in np.linspace(0, 1.0, n_learners + 1)]

    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal

    for col, colour in zip(range(n_learners), palette[1:]):
        fpr, tpr, _ = roc_curve(ytest, P_base_learners[:, col])
        plt.plot(fpr, tpr, label=labels[col], c=colour)

    ens_fpr, ens_tpr, _ = roc_curve(ytest, P_ensemble)
    plt.plot(ens_fpr, ens_tpr, label=ens_label, c=palette[0])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()

In [3]:
# Single seed for NumPy's global RNG; also passed as `random_state` in later cells.
seed = 222
np.random.seed(seed)

In [4]:
# Load the raw data; the `cand_pty_affiliation` column is turned into the target below.
df = pd.read_csv('./data/election.csv')

In [5]:
"""Split Data into train and test sets."""
# Binary target: 1 where the candidate's party is "REP", 0 otherwise.
y = (df["cand_pty_affiliation"] == "REP").astype(int)

# One-hot encode every remaining column, then discard columns with zero
# standard deviation (they carry no signal).
X = pd.get_dummies(df.drop(columns=["cand_pty_affiliation"]), sparse=True)
X = X.drop(columns=X.columns[X.std() == 0])

# 95% of the rows are held out for testing; only 5% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.95, random_state=seed
)
labels = X.columns

# Combining Decision Trees

In [6]:
def display_roc_auc(fpr, tpr, roc_auc):
    """Plot one ROC curve with its AUC shown in the legend.

    Parameters
    ----------
    fpr, tpr : x and y coordinates of the curve.
    roc_auc : number formatted with two decimals into the legend label.
    """
    import matplotlib.pyplot as plt

    auc_label = 'AUC = %0.2f' % roc_auc
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=auc_label)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

def plot_tree(crit, split, depth, min_split, min_leaf=0.2, min_decrease=0.001):
    """Fit a decision tree on the full-feature training split and show its diagnostics.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the notebook
    namespace. Displays the test ROC-AUC, the ROC curve and the rendered tree.

    Parameters
    ----------
    crit : split criterion passed as ``criterion``.
    split : splitter strategy passed as ``splitter``.
    depth : maximum tree depth; coerced to ``int`` (``None`` is passed through).
    min_split : passed as ``min_samples_split``.
    min_leaf : passed as ``min_samples_leaf``.
    min_decrease : passed as ``min_impurity_decrease``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    estimator = DecisionTreeClassifier(
        random_state=0,
        criterion=crit,
        splitter=split,
        # The widget supplies floats (1.0, 2.0, ...); max_depth must be an integer.
        max_depth=None if depth is None else int(depth),
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
        min_impurity_decrease=min_decrease,
    )
    estimator.fit(X_train, y_train)
    graph = Source(tree.export_graphviz(
        estimator,
        out_file=None,
        # Take the names from the frame that was actually fitted: the global
        # `labels` is rebound by a later cell to a different column set.
        feature_names=list(X_train.columns),
        class_names=['Dem', 'Rep'],
        filled=True,
    ))
    # Score on positive-class probabilities so the displayed ROC-AUC matches
    # the plotted curve; hard 0/1 predictions reduce the curve to one threshold.
    probs = estimator.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)
    display(roc_auc)
    display_roc_auc(fpr, tpr, roc_auc)
    display(SVG(graph.pipe(format='svg')))
    return estimator

# Interactive explorer: every keyword below becomes a dropdown whose selected
# value is passed to plot_tree.
inter = interactive(
    plot_tree,
    crit=["gini", "entropy"],
    split=["best"],
    # Depth must be an integer; np.linspace would offer floats (1.0, 2.0, ...).
    depth=list(range(1, 36)),
    min_split=np.linspace(.01, .5, 100),
    min_leaf=np.linspace(.01, .5, 100),
    min_decrease=np.linspace(.001, .01, 100),
)
display(inter)

interactive(children=(Dropdown(description='crit', options=('gini', 'entropy'), value='gini'), Dropdown(descri…

In [None]:
# Tree fitted on the full feature set.
dt1 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=0.01, min_samples_split=0.01, min_impurity_decrease=0.004, random_state=0)
dt1.fit(X_train, y_train)
dt1_proba = dt1.predict_proba(X_test)
y_pred = dt1.predict(X_test)
# Score the positive-class probabilities, not the hard 0/1 labels: the averaged
# ensemble later in the notebook is scored on probabilities, so this keeps the
# numbers comparable.
roc_auc_score(y_test, dt1_proba[:, 1])

In [None]:
"""Split Data into train and test sets."""
# Same target as before; this time `transaction_amt` is removed from the features.
y = 1 * (df.cand_pty_affiliation == "REP")
X = df.drop(["cand_pty_affiliation", "transaction_amt"], axis=1)

X = pd.get_dummies(X, sparse=True)
X.drop(X.columns[X.std() == 0], axis=1, inplace=True)
# The split must mirror the full-data split (same test_size and random_state)
# so the reduced test rows are the same rows, in the same order, as X_test /
# y_test. Later cells average dt1 and dt2 predictions element-wise and score
# them against y_test, which only works when the two test sets line up.
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X, y, test_size=0.95, random_state=seed)
labels = X.columns

In [None]:
def display_roc_auc(fpr, tpr, roc_auc):
    """Plot one ROC curve with its AUC shown in the legend.

    Parameters
    ----------
    fpr, tpr : x and y coordinates of the curve.
    roc_auc : number formatted with two decimals into the legend label.
    """
    import matplotlib.pyplot as plt

    auc_label = 'AUC = %0.2f' % roc_auc
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=auc_label)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

def plot_tree(crit, split, depth, min_split, min_leaf=0.2, min_decrease=0.001):
    """Fit a decision tree on the reduced-feature training split and show its diagnostics.

    Reads ``X_train_red``, ``y_train_red``, ``X_test_red`` and ``y_test_red``
    from the notebook namespace. Displays the test ROC-AUC, the ROC curve and
    the rendered tree.

    Parameters
    ----------
    crit : split criterion passed as ``criterion``.
    split : splitter strategy passed as ``splitter``.
    depth : maximum tree depth; coerced to ``int`` (``None`` is passed through).
    min_split : passed as ``min_samples_split``.
    min_leaf : passed as ``min_samples_leaf``.
    min_decrease : passed as ``min_impurity_decrease``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    estimator = DecisionTreeClassifier(
        random_state=0,
        criterion=crit,
        splitter=split,
        # The widget supplies floats (1.0, 2.0, ...); max_depth must be an integer.
        max_depth=None if depth is None else int(depth),
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
        min_impurity_decrease=min_decrease,
    )
    estimator.fit(X_train_red, y_train_red)
    graph = Source(tree.export_graphviz(
        estimator,
        out_file=None,
        # Names come from the fitted frame so they always match the tree's
        # features, whatever the global `labels` currently points to.
        feature_names=list(X_train_red.columns),
        class_names=['Dem', 'Rep'],
        filled=True,
    ))
    # Score on positive-class probabilities so the displayed ROC-AUC matches
    # the plotted curve; hard 0/1 predictions reduce the curve to one threshold.
    probs = estimator.predict_proba(X_test_red)[:, 1]
    fpr, tpr, _ = roc_curve(y_test_red, probs)
    roc_auc = auc(fpr, tpr)
    display(roc_auc)
    display_roc_auc(fpr, tpr, roc_auc)
    display(SVG(graph.pipe(format='svg')))
    return estimator

# Interactive explorer for the reduced-feature tree: every keyword below
# becomes a dropdown whose selected value is passed to plot_tree.
inter = interactive(
    plot_tree,
    crit=["gini", "entropy"],
    split=["best"],
    # Depth must be an integer; np.linspace would offer floats (1.0, 2.0, ...).
    depth=list(range(1, 36)),
    min_split=np.linspace(.01, .5, 100),
    min_leaf=np.linspace(.01, .5, 100),
    min_decrease=np.linspace(.001, .01, 100),
)
display(inter)

In [None]:
# Tree fitted on the reduced feature set.
dt2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=0.01, min_samples_split=0.01, min_impurity_decrease=0.003, random_state=0)
dt2.fit(X_train_red, y_train_red)
dt2_proba = dt2.predict_proba(X_test_red)
y_pred = dt2.predict(X_test_red)
# Score the positive-class probabilities, not the hard 0/1 labels: the averaged
# ensemble in the following cells is scored on probabilities, so this keeps the
# numbers comparable.
roc_auc_score(y_test_red, dt2_proba[:, 1])

In [None]:
# Positive-class (column 1) probabilities from each of the two trees.
p1 = dt1_proba[:, 1]
p2 = dt2_proba[:, 1]

# Heatmap of the correlation between the two trees' predictions.
display(
    sns.heatmap(
        pd.DataFrame({"full_data": p1, "red_data": p2}).corr(),
        annot=True,
    )
)

In [None]:
# Unweighted average of the two trees' positive-class probabilities, scored on the test labels.
p = np.vstack((p1, p2)).mean(axis=0)
roc_auc_score(y_test, p)

# Comparing the decision-tree ensemble with a random forest

In [None]:
# Small random forest for comparison with the two-tree average.
# The notebook's seed variable is lower-case `seed`; `SEED` is never defined
# and raised NameError here.
rf = RandomForestClassifier(
    n_estimators=10,
    max_features=3,
    random_state=seed
)

rf.fit(X_train, y_train)
p = rf.predict_proba(X_test)[:, 1]
print("Average of decision tree ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

# Using an array of different model types as base learners

In [None]:
def get_models():
    """Generate a library of base learners.

    Returns
    -------
    dict
        Maps a display name to a new, unfitted scikit-learn classifier. Models
        that take a ``random_state`` are seeded with the notebook-level ``seed``.
    """
    # MLPClassifier is not among the notebook's top-level imports; bring it
    # into scope here so the function does not fail with NameError.
    from sklearn.neural_network import MLPClassifier

    nb = GaussianNB()
    svc = SVC(C=30, kernel='rbf', probability=True, verbose=3)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=30, random_state=seed)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=seed)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=seed)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=seed)

    models = {'svm': svc,
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'logistic': lr,
              }

    return models

In [None]:
def train_predict(model_list):
    """Fit every model in ``model_list`` on the training split and collect test predictions.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the notebook
    namespace.

    Parameters
    ----------
    model_list : dict
        Maps a model name to an estimator exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per test sample and one column per model (named after the
        dict keys), holding the predicted probability of class 1.
    """
    P = np.zeros((y_test.shape[0], len(model_list)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    # Iterate over the argument, not the notebook-level `models` dict, so the
    # function works for whichever model library the caller passes in.
    for i, (name, m) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(X_train, y_train)
        P.iloc[:, i] = m.predict_proba(X_test)[:, 1]
        cols.append(name)
        print("done")

    P.columns = cols
    print("Done.\n")
    return P

In [None]:
def score_models(P, y):
    """Print the ROC-AUC of every column of ``P`` measured against ``y``.

    Parameters
    ----------
    P : DataFrame with one column of scores per model.
    y : true labels, passed as the first argument of ``roc_auc_score``.
    """
    print("Scoring models.")
    for name in P.columns:
        auc_value = roc_auc_score(y, P[name])
        print("%-26s: %.3f" % (name, auc_value))
    print("Done.\n")

In [None]:
# Build the model library, fit it on the training split and print each model's test ROC-AUC.
models = get_models()
P = train_predict(models)
score_models(P, y_test)

In [None]:
# Pairwise correlation between the models' predicted probabilities.
corrmat(P.corr(), inflate=False)
plt.show()

In [None]:
# Pairwise correlation between the models' errors, where an error is the
# prediction thresholded at 0.5 minus the true label.
corrmat(P.apply(lambda pred: 1*(pred >= 0.5) - y_test.values).corr(), inflate=False)
plt.show()

In [None]:
# Simple ensemble: score the row-wise mean of all models' predicted probabilities.
print("Ensemble ROC-AUC score: %.3f" % roc_auc_score(y_test, P.mean(axis=1)))

In [None]:
# Share of test rows each model assigns to REP at a 0.5 cut-off.
rep_share = (P >= 0.5).mean()
# Build the DEM/REP-by-model table explicitly. Labelling the rows of a
# value_counts() result by position is fragile: its row order follows class
# frequency, and a row is missing when a model predicts only one class.
p = pd.DataFrame([1 - rep_share, rep_share], index=["DEM", "REP"])
p.loc["REP", :].sort_values().plot(kind="bar")
# Reference line at the observed share of class 1 in the test labels
# (computed rather than hard-coded, so it stays correct if the data change).
true_share = y_test.mean()
plt.axhline(true_share, color="k", linewidth=0.5)
plt.text(0., true_share - 0.02, "True share republicans")
plt.show()

In [None]:
# Re-score the simple average with the MLP column left out.
include = [name for name in P.columns if name != "mlp-nn"]
print(
    "Truncated ensemble ROC-AUC score: %.3f"
    % roc_auc_score(y_test, P[include].mean(axis=1))
)

# Adding a meta learner and generating its training data with K-fold cross-validation

In [None]:
# New, unfitted model library to serve as the base learners of the stacked ensemble.
base_learners = get_models()

In [None]:
# Meta learner: a gradient-boosting classifier that is later fitted on the
# base learners' predictions.
meta_learner = GradientBoostingClassifier(
    loss="exponential",
    learning_rate=0.005,
    n_estimators=1000,
    subsample=0.5,
    max_depth=3,
    max_features=4,
    random_state=seed,
)

In [None]:
# Split the training data in half: one half fits the base learners, the other
# half is predicted by them to produce the meta learner's training data.
xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(X_train, y_train, test_size=0.5, random_state=seed)

In [None]:
def train_base_learners(base_learners, inp, out, verbose=True):
    """Fit each estimator in ``base_learners`` in place on ``(inp, out)``.

    Parameters
    ----------
    base_learners : dict mapping a name to an object with a ``fit`` method.
    inp, out : features and targets handed unchanged to every ``fit`` call.
    verbose : when true, print a progress line per model.

    Returns nothing; the estimators are modified in place.
    """
    if verbose:
        print("Fitting models.")
    for name, model in base_learners.items():
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        model.fit(inp, out)
        if verbose:
            print("done")

In [None]:
# Fit the base learners on the first half of the training data only.
train_base_learners(base_learners, xtrain_base, ytrain_base)

In [None]:
def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Collect each base learner's class-1 probability into one matrix.

    Parameters
    ----------
    pred_base_learners : dict mapping a name to an object with ``predict_proba``.
    inp : samples to predict; ``inp.shape[0]`` sets the number of rows.
    verbose : when true, print a progress line per model.

    Returns
    -------
    numpy.ndarray of shape ``(inp.shape[0], len(pred_base_learners))``; column
    ``i`` holds column 1 of the ``predict_proba`` output of the i-th learner
    in dict order.
    """
    n_rows = inp.shape[0]
    pred_matrix = np.zeros((n_rows, len(pred_base_learners)))

    if verbose:
        print("Generating base learner predictions.")

    for col, name in enumerate(pred_base_learners):
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        # Two classes: the class-1 column alone carries all the information.
        pred_matrix[:, col] = pred_base_learners[name].predict_proba(inp)[:, 1]
        if verbose:
            print("done")

    return pred_matrix

In [None]:
# Base-learner probabilities on the held-out half; these are the meta learner's input features.
P_base = predict_base_learners(base_learners, xpred_base)

In [None]:
# Fit the meta learner on the base-learner predictions against the held-out labels.
meta_learner.fit(P_base, ypred_base)

In [None]:
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Run the two-stage ensemble on ``inp``.

    Returns a pair: the base-learner prediction matrix produced by
    ``predict_base_learners``, and column 1 of the meta learner's
    ``predict_proba`` evaluated on that matrix.
    """
    base_preds = predict_base_learners(base_learners, inp, verbose=verbose)
    ensemble_proba = meta_learner.predict_proba(base_preds)
    return base_preds, ensemble_proba[:, 1]

In [None]:
# Score the hold-out-trained ensemble on the test split.
P_pred, p = ensemble_predict(base_learners, meta_learner, X_test)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

In [None]:
def stacking(base_learners, meta_learner, X, y, generator):
    """Train a stacked ensemble using cross-validated base-learner predictions.

    The given base learners are fitted on all of ``(X, y)``. For every fold
    yielded by ``generator.split(X)``, cloned base learners are fitted on the
    fold's training rows and predict its test rows; the meta learner is then
    fitted on those stacked out-of-fold predictions.

    Parameters
    ----------
    base_learners : dict mapping a name to an estimator.
    meta_learner : estimator fitted on the out-of-fold prediction matrix.
    X, y : indexed as ``X[idx, :]`` and ``y[idx]`` with the fold indices.
    generator : object whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.

    Returns
    -------
    tuple ``(base_learners, meta_learner)``, both fitted in place.
    """
    # The learners used at prediction time are fitted on the complete data.
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X, y, verbose=False)
    print("done")

    # Collect out-of-fold predictions to serve as the meta learner's training set.
    print("Generating cross-validated predictions...")
    fold_preds = []
    fold_targets = []
    for fold_no, (train_idx, test_idx) in enumerate(generator.split(X), start=1):
        x_fit, y_fit = X[train_idx, :], y[train_idx]
        x_held, y_held = X[test_idx, :], y[test_idx]

        # Unfitted copies, so the final learners fitted above are left untouched.
        fold_learners = {}
        for name, model in base_learners.items():
            fold_learners[name] = clone(model)

        train_base_learners(fold_learners, x_fit, y_fit, verbose=False)
        fold_preds.append(
            predict_base_learners(fold_learners, x_held, verbose=False))
        fold_targets.append(y_held)
        print("Fold %i done" % fold_no)

    print("CV-predictions done")

    # Predictions and targets are concatenated in the same fold order, which
    # keeps the rows aligned.
    meta_X = np.vstack(fold_preds)
    meta_y = np.hstack(fold_targets)

    print("Fitting meta learner...", end="")
    meta_learner.fit(meta_X, meta_y)
    print("done")

    return base_learners, meta_learner

In [None]:
# Train with stacking: a new model library and a clone of the meta learner,
# using a 2-fold split over the training data passed as NumPy arrays.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), X_train.values, y_train.values, KFold(2))

# Score the cross-validated stacked ensemble on the test split.
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, X_test, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

In [None]:
from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 10 folds, seeded with the notebook-wide seed
sl = SuperLearner(
    folds=10,
    random_state=seed,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners as one layer and the meta learner on top;
# proba=True is set on both
sl.add(list(base_learners.values()), proba=True) 
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(X_train, y_train)

# Predict the test set
p_sl = sl.predict_proba(X_test)

# Column 1 of the output is scored against the test labels
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(y_test, p_sl[:, 1]))

In [None]:
# Compare the plain average of the model library with the Super Learner.
# The curve labelled "Simple average" must be drawn from P.mean(axis=1) and the
# one labelled "Super Learner" from p_sl; passing `p` and P.mean(axis=1)
# instead swapped the labels and left the Super Learner unplotted.
simple_average = P.mean(axis=1).values.reshape(-1, 1)
plot_roc_curve(y_test, simple_average, p_sl[:, 1], ["Simple average"], "Super Learner")