In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings("ignore")

In [13]:
seed = 2017
np.random.seed(seed)

data = load_iris()
idx = np.random.permutation(150)
X = data.data[idx]
y = data.target[idx]

#### Building an ensemble
Instantiating a fully specified ensemble is straightforward and requires three steps: first create the instance, second add the intermediate layers, and finally the meta estimator.

In [14]:
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC

In [15]:
# --- Build ---
# Passing a scoring function will create cv scores during fitting
# the scorer should be a simple function accepting to vectors and returning a scalar
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

# Build the first layer
ensemble.add([RandomForestClassifier(random_state=seed), SVC()])

# Attach the final meta estimator 
ensemble.add_meta(LogisticRegression())

# --- Use ---

# Fit ensemble 
ensemble.fit(X[:75], y[:75])

# Predict 
preds = ensemble.predict(X[75:])


Fitting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Fit complete                        | 00:00:00

Predicting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Predict complete                    | 00:00:00


In [18]:
print("Fit data:\n%r" % ensemble.data)


Fit data:
                                   score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  randomforestclassifier       0.84     0.06  0.04  0.00  0.00  0.00
layer-1  svc                          0.89     0.05  0.00  0.00  0.00  0.00



In [19]:
data = pd.DataFrame

In [21]:
data.count

<function pandas.core.frame.DataFrame.count(self, axis=0, level=None, numeric_only=False)>

In [22]:
data.shape

<property at 0x11ea4df98>

In [23]:
print("Prediction score: %.3f" % accuracy_score(preds, y[75:]))


Prediction score: 0.960


#### Multi-layer ensembles
With each call to the `add` method, another layer is added to the ensemble. Note that all ensembles are sequential in the order layers are added. For instance, in the above example, we could add a second layer as follows.

In [28]:
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

# Build the first layer
ensemble.add([RandomForestClassifier(random_state=seed), LogisticRegression()])

# Build the second layer
ensemble.add([LogisticRegression(), SVC()])

# Attach the final meta estimator
ensemble.add_meta(SVC())

SuperLearner(array_check=None, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=9787, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...816620>)],
   n_jobs=-1, name='group-14', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=2017, sample_size=20,
       scorer=<function accuracy_score at 0x1a21816620>, shuffle=False,
       verbose=False)

In [29]:
ensemble.fit(X[:75], y[:75])
preds = ensemble.predict(X[75:])
print("Fit data:\n%r" % ensemble.data)

Fit data:
                                   score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  logisticregression           0.75     0.14  0.02  0.01  0.00  0.00
layer-1  randomforestclassifier       0.84     0.06  0.04  0.01  0.00  0.00
layer-2  logisticregression           0.67     0.12  0.00  0.00  0.00  0.00
layer-2  svc                          0.89     0.00  0.00  0.00  0.00  0.00



#### The scoring function

In [37]:
from mlens.metrics import make_scorer

In [38]:
accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True)

#### A simple evaluation
Before throwing preprocessing into the mix, let’s see how to evaluate a set of estimator. First, we need a list of estimator and a dictionary of parameter distributions that maps to each estimator. The estimators should be put in a list, either as is or as a named tuple ((name, est)). If you don’t name the estimator, the Evaluator will automatically name the model as the class name in lower case. This name must be the key in the parameter dictionary.

In [39]:
from mlens.model_selection import Evaluator 
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint

In [40]:
# Name the estimators
ests = [('gnb', GaussianNB()), ('knn', KNeighborsClassifier())]

# Mapping parameters to these / Since gnb doesn't have any parameters we can just skip it 
pars = {'n_neighbors': randint(2, 20)}
params = {'knn': pars}

We can now run an evaluation over these estimators and parameter distributions by calling the fit method.



In [41]:
evaluator = Evaluator(accuracy_scorer, cv=10, random_state=seed, verbose=1)
evaluator.fit(X, y, ests, params, n_iter=10)

Launching job
Job           done | 00:00:01


<mlens.model_selection.model_selection.Evaluator at 0x10f9665f8>

The full history of the evaluation can be found in `cv_results`. To compare models with their best parameters, we can pass the `results` attribute to a pandas.DataFrame or print it as a table. We use `m` to denote mean values and `s` to denote standard deviation across folds for brevity. Note that the timed prediction is for the training set, for comparability with training time.

In [42]:
print("Score comparison with best params founds:\n\n%r" % evaluator.results)


Score comparison with best params founds:

       test_score-m  test_score-s  train_score-m  train_score-s  fit_time-m  fit_time-s  pred_time-m  pred_time-s               params
gnb           0.960         0.033          0.957          0.006       0.009       0.006        0.004        0.001                     
knn           0.967         0.033          0.980          0.005       0.002       0.001        0.051        0.005  {'n_neighbors': 15}



#### Preprocessing
Next, suppose we want to compare the models across a set of preprocessing pipelines. To do this, we first need to specify a dictionary of preprocessing pipelines to run through. Each entry in the dictionary should be a list of transformers to apply sequentially.

In [43]:
from mlens.preprocessing import Subset
from sklearn.preprocessing import StandardScaler

In [44]:
# Map preprocessing cases through a dictionary
preprocess_cases = {'none': [],
                    'sc': [StandardScaler()],
                    'sub': [Subset([0, 1])]
                    }

The fit methods determines automatically whether there is any preprocessing or any estimator jobs to run, so all we need to do is specify the arguments we want to be processed. If a previous preprocessing job was fitted, those pipelines are stored and will be used for subsequent estimator fits.

This can be helpful if the preprocessing is time-consuming, for instance if the preprocessing pipeline is an ensemble itself. All ensembles implement a `transform` method that, in contrast to the predict method, regenerates the predictions made during the fit``call. More precisely, the ``transform method uses the estimators fitted with cross-validation to construct predictions, whereas the `predict` method uses the final estimators fitted on all data. This allows us use ensembles as preprocessing steps that mimicks how that ensemble would produce predictions for a subsequent meta learner or layer. Since fitting large ensembles is highly time-consuming, fixing the lower layers as preprocessing input is highly valuable for tuning the higher layers and / or the final meta learner. 

In [45]:
evaluator.fit(X, y, preprocessing=preprocess_cases)


Launching job
Job           done | 00:00:00


<mlens.model_selection.model_selection.Evaluator at 0x10f9665f8>

#### Model Selection across preprocessing pipelines
To evaluate the same set of estimators across all pipelines with the same parameter distributions, there is no need to take any heed of the preprocessing pipeline, just carry on as in the simple case:

In [47]:
evaluator.fit(X, y, ests, params, n_iter=10)
print("\nComparison across preprocessing pipelines:\n\n%r" % evaluator.results)


Launching job
Job           done | 00:00:04

Comparison across preprocessing pipelines:

             test_score-m  test_score-s  train_score-m  train_score-s  fit_time-m  fit_time-s  pred_time-m  pred_time-s               params
none  gnb           0.960         0.033          0.957          0.006       0.005       0.002        0.005        0.002                     
none  knn           0.967         0.033          0.980          0.005       0.002       0.003        0.037        0.028  {'n_neighbors': 15}
sc    gnb           0.960         0.033          0.957          0.006       0.006       0.002        0.005        0.002                     
sc    knn           0.960         0.044          0.965          0.003       0.002       0.000        0.056        0.005   {'n_neighbors': 8}
sub   gnb           0.780         0.133          0.791          0.020       0.005       0.002        0.003        0.001                     
sub   knn           0.800         0.126          0.837          0