In [1]:
# Reload edited modules automatically so changes to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Put the parent directory on the import path; presumably this is where the
# local `binsel` package imported further down lives — confirm.
sys.path.append('..')

In [2]:
import numpy as np
# Seed NumPy's global RNG up front so later randomised steps are repeatable
# (assumes estimators created without an explicit random_state draw from it
# — TODO confirm).
np.random.seed(42)
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Load Toy Dataset

In [3]:
from sklearn.datasets import load_breast_cancer
# Load the dataset container, then unpack the feature matrix and the labels.
# `tmp` is only a short-lived handle; it is deleted in the next cell.
tmp = load_breast_cancer()
X, y = tmp.data, tmp.target

In [4]:
# X and y have been extracted above, so the dataset container is no longer needed.
del tmp

# Data Splitting

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 34% of the samples for evaluation; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.34,
    random_state=42,
)

In [6]:
# Sanity check: number of samples in the training and the held-out split.
len(X_train), len(X_test)

(375, 194)

In [7]:
# Drop the unsplit arrays so later cells can only use the train/test splits.
del X, y

# Modeling

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class PredictorTransformer(BaseEstimator, TransformerMixin):
    """Adapter that exposes a predictor's output as one transformed feature.

    ``fit`` trains the wrapped ``model`` and ``transform`` returns its
    ``predict`` output as a single column, so a predictor can be placed
    wherever a transformer is expected.

    Parameters
    ----------
    model : object
        Estimator providing ``fit`` and ``predict``. The instance passed in
        is fitted directly; no copy is made.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        """Fit the wrapped model, forwarding every argument unchanged.

        Returns
        -------
        self
        """
        wrapped = self.model
        wrapped.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return ``model.predict(X)`` reshaped to ``(-1, 1)``.

        ``transform_params`` is accepted for signature compatibility but is
        not used.
        """
        predictions = self.model.predict(X)
        return predictions.reshape(-1, 1)

In [9]:
# pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
# preprocessing
from sklearn.preprocessing import StandardScaler
# submodels
from sklearn.gaussian_process import GaussianProcessClassifier
import sklearn.gaussian_process.kernels as gpk
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
# hard voting
from binsel import BinSel


# Level-1 base classifiers as (step name, unfitted estimator) pairs.
# Keep the order stable: it determines the column order produced by the
# FeatureUnion, which is presumably what the indices reported by the
# level-2 selector refer to.
level1_models = [
    # Gaussian process classifiers, one per kernel
    ('gpc1_dot', GaussianProcessClassifier(gpk.DotProduct())),
    ('gpc2_mater', GaussianProcessClassifier(gpk.Matern())),
    ('gpc3_pair', GaussianProcessClassifier(gpk.PairwiseKernel())),
    ('gpc4_rbf', GaussianProcessClassifier(gpk.RBF())),
    ('gpc5_quad', GaussianProcessClassifier(gpk.RationalQuadratic())),
    # k-nearest neighbours
    ('knn_3', KNeighborsClassifier(n_neighbors=3)),
    ('knn_5', KNeighborsClassifier(n_neighbors=5)),
    ('knn_7', KNeighborsClassifier(n_neighbors=7)),
    # support vector machines
    ('svm1_lin', SVC(kernel='linear', C=0.02)),
    ('svm2_rbf', SVC(kernel='rbf', C=100, gamma=0.001)),
    # single decision trees of increasing depth
    ('dt_3', DecisionTreeClassifier(max_depth=3)),
    ('dt_4', DecisionTreeClassifier(max_depth=4)),
    ('dt_5', DecisionTreeClassifier(max_depth=5)),
    # tree ensembles
    ('ada', AdaBoostClassifier(n_estimators=16, random_state=42)),
    ('gbm', GradientBoostingClassifier(
        n_estimators=16, max_depth=3, random_state=42,
        min_samples_leaf=2, max_features=0.4, subsample=0.3)),
    ('rf', RandomForestClassifier(
        n_estimators=16, max_depth=3, min_samples_leaf=2,
        max_features=0.4, random_state=42, oob_score=True)),
    # generative / discriminant models
    ('gnb', GaussianNB()),
    ('lda', LinearDiscriminantAnalysis()),
    ('qda', QuadraticDiscriminantAnalysis()),
    # multi-layer perceptrons, one per activation
    ('mlp1_logit', MLPClassifier(
        hidden_layer_sizes=64, solver='lbfgs', activation='logistic')),
    ('mlp2_tanh', MLPClassifier(
        hidden_layer_sizes=64, solver='lbfgs', activation='tanh')),
    ('mlp3_relu', MLPClassifier(
        hidden_layer_sizes=64, solver='lbfgs', activation='relu')),
]

# scale -> one prediction column per level-1 model -> BinSel on those columns
pipe = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('level1', FeatureUnion(transformer_list=[
        (name, PredictorTransformer(model)) for name, model in level1_models
    ])),
    # disabled step, kept for reference:
    #('twostep', TwoStepFit(Pipeline)),
    ('level2', BinSel(
        preselect=0.8,             # keep the best 80% of the features
        n_select=3, max_rho=0.4,   # then look for 3 weakly correlated features
        subsample=0.6, oob_score=True, random_state=42)),
])

# Optimization

In [10]:
%%time
# Fit the whole pipeline on the training split; %%time reports how long it takes.
pipe.fit(X_train, y_train)

CPU times: user 11.2 s, sys: 648 ms, total: 11.8 s
Wall time: 6.46 s


Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('level1', FeatureUnion(n_jobs=None,
       transformer_list=[('gpc1_dot', PredictorTransformer(model=GaussianProcessClassifier(copy_X_train=True, kernel=DotProduct(sigma_0=1),
             max_iter_predict=100, multi_class='o... preselect=0.8,
    random_state=42, replace=False, subsample=0.6, unique=True,
    verbose=False))])

# Check Results

## Best Model

In [11]:
# Look the final step up by name instead of by its position in `pipe.steps`.
level2 = pipe.named_steps['level2']
level2.idx, level2.neg

((4, 10, 21), (0, 0, 0))

In [12]:
# Top-10 Models
# NOTE(review): 'indicies' (sic) is left unchanged so the column label keeps
# matching the saved output; consider renaming it to 'indices'.
ranking = pd.DataFrame(
    pipe.named_steps['level2'].res,
    columns=['indicies', 'negation', 'oob score'],
)
ranking.head(10)

Unnamed: 0,indicies,negation,oob score
0,"(4, 10, 21)","(0, 0, 0)",1.0
1,"(8, 10, 18)","(0, 0, 0)",1.0
2,"(5, 8, 21)","(0, 1, 0)",1.0
3,"(5, 14, 21)","(0, 0, 0)",1.0
4,"(6, 11, 18)","(0, 0, 0)",0.98654
5,"(6, 15, 21)","(0, 0, 0)",0.985801
6,"(5, 6, 10)","(0, 0, 0)",0.985693
7,"(6, 17, 18)","(0, 0, 0)",0.985324
8,"(10, 17, 18)","(0, 0, 0)",0.98435
9,"(6, 10, 18)","(0, 0, 0)",0.973643


## Scores

In [13]:
# Score the fitted pipeline on both splits, then report the two numbers.
# NOTE(review): the second line is labelled "Validation" but is computed on
# the held-out X_test / y_test split.
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print("  Training Score: {:5.4f}".format(train_score))
print("Validation Score: {:5.4f}".format(test_score))

  Training Score: 0.9947
Validation Score: 0.9845


## Predictions

In [14]:
y_pred = pipe.predict(X_test)
# Keep only the second probability column, i.e. the probability of class 1.
y_proba = pipe.predict_proba(X_test)[:, 1]
# Put the hard labels and the probabilities side by side as two columns.
side_by_side = np.column_stack((y_pred, y_proba))
pd.DataFrame(side_by_side, columns=['binary', 'proba'])

Unnamed: 0,binary,proba
0,1.0,1.000000
1,0.0,0.000000
2,0.0,0.000000
3,1.0,1.000000
4,1.0,1.000000
5,0.0,0.000000
6,0.0,0.000000
7,0.0,0.000000
8,0.0,0.333333
9,1.0,1.000000
