In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp sklearn

# Scikit-Learn

Helper classes for Scikit Learn estimators.

In [None]:
#hide
from nbdev.showdoc import *
from nbdev.export import notebook2script

In [None]:
#export
import sklearn

from sklearn.metrics import *
from functools import lru_cache


class ScikitLearner:
    """Helper class to fine-tune estimator parameters based on validation dataset performance.
    Feel free to implement your own version."""
    def __init__(self, learner, X_train, X_val, y_train, y_val, predict_proba=True):
        """Create the class instance.
        
        **Parameters**
        
        - learner: the scikit-learn estimator
        - X_train, X_val, y_train, y_val: train and validation datasets
        - predict_proba: whether the estimator can predict probability
        """
        self.learner = learner
        self.predict_proba = predict_proba
        # Tri-state flag: None = not probed yet, True/False = whether a fitted
        # clone's `predict_proba` call succeeded the first time it was tried.
        self._proba = None
        self.X_train, self.X_val, self.y_train, self.y_val = X_train, X_val, y_train, y_val

    def __getattr__(self, key):
        """Delegate attributes that are not found on the wrapper to the wrapped estimator."""
        # `__getattr__` only runs after normal lookup failed. If `learner` itself
        # is missing (e.g. a bare instance created by copy/pickle before its state
        # is restored), delegating would recurse until RecursionError.
        if key == 'learner':
            raise AttributeError(key)
        return getattr(self.learner, key)

    def __dir__(self):
        """List the wrapper's own attributes together with those of the wrapped estimator."""
        return set(super().__dir__() + list(self.__dict__.keys()) +
                   dir(self.learner))

    @lru_cache(maxsize=None)
    def valid_loss_with_params(self, loss_func=None, callbacks=tuple(), **params):
        """Calculate loss of the estimator on validation set, and also the values of callbacks.

        Results are memoized with `lru_cache`, so every argument (including the
        values in `params`) must be hashable.

        **Parameters**
        
        - loss_func: loss function to use. By default, if the estimator supports predicting
        probability, it will be log_loss, otherwise it will be the negative accuracy score.
        - callbacks: callbacks to also evaluate. Default is empty tuple.
        - params: parameters to use when training the estimator.

        **Returns**
        
        Return values will be a two element tuple.
        - the first is a one-element list holding the value of the loss function
        - the other is a list with the values of all callbacks
        """
        return self.loss_with_params(self.X_val,
                                     self.y_val,
                                     loss_func=loss_func,
                                     callbacks=callbacks,
                                     **params)

    def loss_with_params(self, X, y, loss_func=None, callbacks=tuple(), **params):
        """Fit a clone of the estimator on the training data and score it on `X`, `y`.

        **Parameters**

        - X, y: the dataset to predict on and the matching targets
        - loss_func: called as `loss_func(y, y_pred)`. Defaults to log_loss when
        probabilities were predicted, otherwise to the negative accuracy score.
        - callbacks: extra callables, each called as `callback(y, y_pred)`
        - params: parameters set on the cloned estimator before fitting

        **Returns**

        A two element tuple `([loss], [callback values...])`.
        """
        estimator = sklearn.clone(self.learner)
        estimator.set_params(**params)
        estimator.fit(self.X_train, self.y_train)
        # Track what was actually predicted in this call so the default loss
        # always matches the kind of prediction it receives.
        used_proba = False
        if self.predict_proba and self._proba is None:
            # First call: probe whether probability prediction really works and
            # remember the outcome for subsequent calls.
            try:
                y_pred = estimator.predict_proba(X)
                self._proba = used_proba = True
            except Exception:
                # Best-effort fallback; unlike a bare `except`, this lets
                # KeyboardInterrupt/SystemExit propagate.
                self._proba = False
                y_pred = estimator.predict(X)
        elif self.predict_proba and self._proba:
            y_pred = estimator.predict_proba(X)
            used_proba = True
        else:
            y_pred = estimator.predict(X)
        if loss_func is None:
            if used_proba:
                loss_func = log_loss
            else:
                # Negated so that lower is better, like a loss.
                loss_func = lambda y_true, y_hat: -accuracy_score(y_true, y_hat)
        return [loss_func(y, y_pred)], [callback(y, y_pred) for callback in callbacks]


In [None]:
# Render the docstrings of the public entry points with nbdev's `show_doc`.
show_doc(ScikitLearner.__init__)
show_doc(ScikitLearner.valid_loss_with_params)

## Example of using `ScikitLearner`
`Digits` is a sub-class of `ScikitLearner`. Check `sample_cases` for more information.

In [None]:
from eptune.sample_cases import Digits
from eptune.algorithms import eaSimpleWithExtraLog, eaMuPlusLambdaWithExtraLog, eaMuCommaLambdaWithExtraLog
from eptune.parameter import *

from deap import base
from deap import creator
from deap import tools
from deap import algorithms
from functools import partial
from sklearn.metrics import *
from sklearn.svm import SVC

# DEAP toolbox that collects the operators registered further down.
toolbox = base.Toolbox()
# Search space for the SVC hyper-parameters `C`, `kernel` and `gamma`.
# NOTE(review): the exact sampling behaviour of these parameter classes is
# defined in `eptune.parameter` — confirm there.
params = [
    LogFloatParameter([0.1, 1000], 'C'),
    CategoricalParameter(['poly', 'rbf', 'sigmoid'], "kernel"),
    LogFloatParameter([1e-6, 1e6], 'gamma')
]

# `Digits` (described in this notebook as a `ScikitLearner` sub-class) wraps a default SVC.
svc_digits = Digits(SVC())


def initParams(cls):
    """Build a `cls` instance from one freshly drawn value per parameter.

    `cls.params` is iterated in order; each parameter contributes an entry keyed
    by its `name` whose value is obtained with `next(parameter)`.
    """
    drawn = {}
    for parameter in cls.params:
        key = parameter.name
        drawn[key] = next(parameter)
    return cls(drawn)


def evaluate(params):
    """Score one parameter dict with `svc_digits`, also reporting `accuracy_score` as a callback."""
    extra_metrics = (accuracy_score, )
    return svc_digits.valid_loss_with_params(callbacks=extra_metrics, **params)


# Single-objective fitness; in DEAP a negative weight means the value is minimized.
creator.create("Loss", base.Fitness, weights=(-1.0, ))
# An individual is a dict of hyper-parameters carrying the search space and a `Loss` fitness.
creator.create("Parameters", dict, params=params, fitness=creator.Loss)
toolbox.register("individual", initParams, creator.Parameters)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register('evaluate', evaluate)
from eptune.utils import ConcurrentMap
# NOTE(review): 10 is presumably the number of concurrent workers — confirm in eptune.utils.
pmap = ConcurrentMap(10)
# The algorithms pick up `toolbox.map` to evaluate individuals.
toolbox.register('map', pmap.map)

toolbox.register("select", tools.selTournament, tournsize=3)

from eptune.crossover import cxDictUniform
toolbox.register("mate", cxDictUniform, indpb=0.5)

from eptune.mutation import mutDictRand
# The mutation operator is pre-bound to the same search space used to create individuals.
toolbox.register("mutate", partial(mutDictRand, params=params, indpb=0.6))

import numpy
# Per-generation statistics over the fitness values of the population.
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", numpy.mean, axis=0)
stats.register("std", numpy.std, axis=0)
stats.register("min", numpy.min, axis=0)
stats.register("max", numpy.max, axis=0)
# Keeps the 2 best individuals seen; shared (and cleared) by the later runs in this notebook.
hof = tools.HallOfFame(2)


def run():
    """Run `eaSimpleWithExtraLog` for 16 generations on a fresh population of 10.

    Uses whatever operators are currently registered on `toolbox`, together with
    the shared `hof` and `stats` objects, and returns the algorithm's result unchanged.
    """
    initial_population = toolbox.population(10)
    settings = dict(cxpb=0.6,
                    mutpb=0.6,
                    ngen=16,
                    halloffame=hof,
                    elitism=True,
                    stats=stats)
    return eaSimpleWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = run()

In [None]:
fig = logbook.plot(['min','avg'])

Print Hall of Fame with extra info:

In [None]:
# Pair every hall-of-fame individual with its `extra` attribute.
[(individual, individual.extra) for individual in hof.items]

Use SVC with probability support, which is much slower than the default SVC.

In [None]:
def accuracy(y_true, y_pred):
    """Accuracy for score-style predictions: take the argmax over the last axis as the label."""
    predicted_labels = y_pred.argmax(-1)
    return accuracy_score(y_true, predicted_labels)

# Same sample case as before, but the SVC is created with `probability=True`.
svc_digits_proba = Digits(SVC(probability=True))

def evaluate_proba(params):
    """Score one parameter dict with `svc_digits_proba`, reporting `accuracy` as a callback."""
    extra_metrics = (accuracy,)
    return svc_digits_proba.valid_loss_with_params(callbacks=extra_metrics, **params)

from eptune.crossover import cxDictBlenderIfCan
# Re-registering an alias replaces the operator registered earlier under the same name.
toolbox.register("mate", cxDictBlenderIfCan, alpha=0.5, indpb=0.8, fix_invalid=True)
toolbox.register('evaluate', evaluate_proba)

# Because we are using the same HallofFame, we need to clear it before use.
hof.clear()
# Close the previous ConcurrentMap before creating a new one for this run.
pmap.close()
pmap = ConcurrentMap(10)
toolbox.register('map', pmap.map)
def run_proba():
    """Run `eaSimpleWithExtraLog` for 16 generations on a fresh population of 10.

    Same as the first run except for a lower mutation probability (0.3); it uses
    the operators currently registered on `toolbox`.
    """
    initial_population = toolbox.population(10)
    settings = dict(cxpb=0.6,
                    mutpb=0.3,
                    ngen=16,
                    halloffame=hof,
                    elitism=True,
                    stats=stats)
    return eaSimpleWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = run_proba()

In [None]:
fig = logbook.plot(['min','avg'])

In [None]:
# Pair every hall-of-fame individual with its `extra` attribute.
[(individual, individual.extra) for individual in hof.items]

## Using other algorithms to optimize

### Using `eaMuPlusLambdaWithExtraLog`

In [None]:
# Reset the shared hall of fame and replace the ConcurrentMap before the next run.
hof.clear()
pmap.close()
pmap = ConcurrentMap(10)
from eptune.crossover import cxDictBlenderIfCan
toolbox.register("mate", cxDictBlenderIfCan, alpha=0.5, indpb=0.8, fix_invalid=True)
# Switch back to the evaluation function that uses the SVC without probability support.
toolbox.register('evaluate', evaluate)
toolbox.register('map', pmap.map)
def run_mu_plus_lambda():
    """Run `eaMuPlusLambdaWithExtraLog` (mu=8, lambda_=10) for 16 generations.

    Starts from a fresh population of 16 and uses the operators currently
    registered on `toolbox`.
    """
    initial_population = toolbox.population(16)
    settings = dict(mu=8,
                    lambda_=10,
                    cxpb=0.3,
                    mutpb=0.6,
                    ngen=16,
                    halloffame=hof,
                    stats=stats)
    return eaMuPlusLambdaWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = run_mu_plus_lambda()

In [None]:
fig = logbook.plot(['min','avg'])

### Using `eaMuCommaLambdaWithExtraLog`

In [None]:
# Reset the shared hall of fame and replace the ConcurrentMap before the next run.
hof.clear()
pmap.close()
pmap = ConcurrentMap(10)
toolbox.register('evaluate', evaluate)
toolbox.register('map', pmap.map)
def run_mu_comma_lambda():
    """Run `eaMuCommaLambdaWithExtraLog` (mu=10, lambda_=20) for 16 generations.

    Starts from a fresh population of 10 and uses the operators currently
    registered on `toolbox`.
    """
    initial_population = toolbox.population(10)
    settings = dict(mu=10,
                    lambda_=20,
                    cxpb=0.3,
                    mutpb=0.6,
                    ngen=16,
                    halloffame=hof,
                    stats=stats)
    return eaMuCommaLambdaWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = run_mu_plus_lambda()

In [None]:
fig = logbook.plot(['min','avg'])

In [None]:
#export
import sklearn

from sklearn.metrics import *
from functools import lru_cache, partial
from sklearn.model_selection import cross_val_predict


class ScikitLearnerCV:
    "Make use of sklearn cross_val_predict interface to optimize parameters."

    def __init__(self, learner, X, y):
        """Create the class instance.
        
        **Parameters**
        
        - learner: the scikit-learn estimator
        - X, y: the dataset used for cross-validated prediction
        """
        self.learner = learner
        self.X = X
        self.y = y

    def __getattr__(self, key):
        """Delegate attributes that are not found on the wrapper to the wrapped estimator."""
        # `__getattr__` only runs after normal lookup failed. If `learner` itself
        # is missing (e.g. a bare instance created by copy/pickle before its state
        # is restored), delegating would recurse until RecursionError.
        if key == 'learner':
            raise AttributeError(key)
        return getattr(self.learner, key)

    def __dir__(self):
        """List the wrapper's own attributes together with those of the wrapped estimator."""
        return set(super().__dir__() + list(self.__dict__.keys()) +
                   dir(self.learner))

    @lru_cache(maxsize=None)
    def cv_loss_with_params(self,
                            loss_func=None,
                            callbacks=tuple(),
                            groups=None,
                            cv=None,
                            n_jobs=None,
                            verbose=0,
                            pre_dispatch='2*n_jobs',
                            method='predict',
                            fit_params=None,
                            **params):
        """Calculate the cross-validated loss of the estimator, and also the values of callbacks.

        Results are memoized with `lru_cache`, so every argument must be hashable
        (a plain dict for `fit_params` is not), and passing a new `cv` object on
        each call produces a new cache entry each time.

        **Parameters**
        
        - loss_func: loss function to use. By default it is log_loss when `method` is
        'predict_proba', otherwise it is the negative accuracy score.
        - callbacks: callbacks to also evaluate. Default is empty tuple.
        - groups, cv, n_jobs, verbose, pre_dispatch, method, fit_params: forwarded to
        sklearn's `cross_val_predict`.
        - params: parameters to use when training the estimator.

        **Returns**
        
        Return values will be a two element tuple.
        - the first is a one-element list holding the value of the loss function
        - the other is a list with the values of all callbacks
        """
        return self.loss_with_params(self.X,
                                     self.y,
                                     loss_func=loss_func,
                                     callbacks=callbacks,
                                     groups=groups,
                                     cv=cv,
                                     n_jobs=n_jobs,
                                     verbose=verbose,
                                     pre_dispatch=pre_dispatch,
                                     method=method,
                                     fit_params=fit_params,
                                     **params)

    def loss_with_params(self,
                         X,
                         y,
                         loss_func=None,
                         callbacks=tuple(),
                         groups=None,
                         cv=None,
                         n_jobs=None,
                         verbose=0,
                         pre_dispatch='2*n_jobs',
                         method='predict',
                         fit_params=None,
                         **params):
        """Score a clone of the estimator on `X`, `y` using `cross_val_predict`.

        **Parameters**

        - X, y: the dataset passed to `cross_val_predict` and the matching targets
        - loss_func: called as `loss_func(y, y_pred)`; see `cv_loss_with_params` for the default
        - callbacks: extra callables, each called as `callback(y, y_pred)`
        - groups, cv, n_jobs, verbose, pre_dispatch, method, fit_params: forwarded to
        `cross_val_predict`
        - params: parameters set on the cloned estimator

        **Returns**

        A two element tuple `([loss], [callback values...])`.
        """
        estimator = sklearn.clone(self.learner)
        estimator.set_params(**params)
        cv_kwargs = dict(groups=groups,
                         cv=cv,
                         n_jobs=n_jobs,
                         verbose=verbose,
                         pre_dispatch=pre_dispatch,
                         method=method)
        # Only forward `fit_params` when the caller supplied it: omitting it equals
        # the default, and newer scikit-learn releases deprecate/reject the keyword.
        if fit_params is not None:
            cv_kwargs['fit_params'] = fit_params
        y_pred = cross_val_predict(estimator, X, y, **cv_kwargs)
        if loss_func is None:
            if method == 'predict_proba':
                loss_func = log_loss
            else:
                # Negated so that lower is better, like a loss.
                loss_func = lambda y_true, y_hat: -accuracy_score(y_true, y_hat)
        return [loss_func(y, y_pred)
                ], [callback(y, y_pred) for callback in callbacks]

In [None]:
# Render the docstrings of the public entry points with nbdev's `show_doc`.
show_doc(ScikitLearnerCV.__init__)
show_doc(ScikitLearnerCV.cv_loss_with_params)

## Example of using `ScikitLearnerCV`
`DigitsCV` is a sub-class of `ScikitLearnerCV`. Check `sample_cases` for more information.

### Optimize parameters based on CV result

In [None]:
from eptune.sample_cases import DigitsCV
from sklearn.model_selection import StratifiedKFold

cv_svc_digits = DigitsCV(SVC())
# One shared splitter for all evaluations: `cv_loss_with_params` is wrapped in
# `lru_cache`, which keys on its arguments, so building a new `StratifiedKFold`
# per call would never give a cache hit and would grow the cache on every call.
cv_splitter = StratifiedKFold(n_splits=3)


def cv_evaluate(params):
    """Score one parameter dict by 3-fold stratified cross-validation on `cv_svc_digits`.

    `accuracy_score` is reported as an extra callback alongside the loss.
    """
    return cv_svc_digits.cv_loss_with_params(callbacks=(accuracy_score, ),
                                             cv=cv_splitter,
                                             **params)


# Reset the shared hall of fame and replace the ConcurrentMap before the next run.
hof.clear()
pmap.close()
pmap = ConcurrentMap(10)
# From here on individuals are scored by cross-validation.
toolbox.register('evaluate', cv_evaluate)
toolbox.register('map', pmap.map)
from eptune.crossover import cxDictBlenderIfCan
toolbox.register("mate", cxDictBlenderIfCan, alpha=1.2, indpb=0.5, fix_invalid=True)

from eptune.mutation import mutDictRand
toolbox.register("mutate", partial(mutDictRand, params=params, indpb=0.6))



def runcv():
    """Run `eaSimpleWithExtraLog` for 16 generations on a fresh population of 10.

    Uses the operators currently registered on `toolbox` together with the
    shared `hof` and `stats` objects.
    """
    initial_population = toolbox.population(10)
    settings = dict(cxpb=0.5,
                    mutpb=0.4,
                    ngen=16,
                    halloffame=hof,
                    elitism=True,
                    stats=stats)
    return eaSimpleWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = runcv()

In [None]:
fig = logbook.plot(['min', 'avg'])

## Using other algorithms to optimize

### Using `eaMuPlusLambdaWithExtraLog`

In [None]:
# Reset the shared hall of fame and replace the ConcurrentMap before the next run.
hof.clear()
pmap.close()
pmap = ConcurrentMap(10)
from eptune.crossover import cxDictBlenderIfCan
# Re-register crossover and mutation with higher per-key probabilities (0.9) for this run.
toolbox.register("mate", cxDictBlenderIfCan, alpha=1.2, indpb=0.9, fix_invalid=True)

from eptune.mutation import mutDictRand
toolbox.register("mutate", partial(mutDictRand, params=params, indpb=0.9))

toolbox.register('evaluate', cv_evaluate)
toolbox.register('map', pmap.map)
def runcv_mu_plus_lambda():
    """Run `eaMuPlusLambdaWithExtraLog` (mu=10, lambda_=10) for 16 generations.

    Starts from a fresh population of 10 and uses the operators currently
    registered on `toolbox`.
    """
    initial_population = toolbox.population(10)
    settings = dict(mu=10,
                    lambda_=10,
                    cxpb=0.4,
                    mutpb=0.4,
                    ngen=16,
                    halloffame=hof,
                    stats=stats)
    return eaMuPlusLambdaWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = runcv_mu_plus_lambda()

In [None]:
fig = logbook.plot(['min', 'avg'])

### Using `eaMuCommaLambdaWithExtraLog`

In [None]:
# Reset the shared hall of fame and replace the ConcurrentMap before the next run.
hof.clear()
pmap.close()
pmap = ConcurrentMap(10)
toolbox.register('evaluate', cv_evaluate)
toolbox.register('map', pmap.map)
def runcv_mu_comma_lambda():
    """Run `eaMuCommaLambdaWithExtraLog` (mu=10, lambda_=10) for 16 generations.

    Starts from a fresh population of 10 and uses the operators currently
    registered on `toolbox`.
    """
    initial_population = toolbox.population(10)
    settings = dict(mu=10,
                    lambda_=10,
                    cxpb=0.3,
                    mutpb=0.5,
                    ngen=16,
                    halloffame=hof,
                    stats=stats)
    return eaMuCommaLambdaWithExtraLog(initial_population, toolbox, **settings)

In [None]:
%time population, logbook = runcv_mu_comma_lambda()

In [None]:
fig = logbook.plot(['min', 'avg'])

You can check the content of the Hall of Fame at any time.

In [None]:
# Pair every hall-of-fame individual with its `extra` attribute.
[(individual, individual.extra) for individual in hof.items]

In [None]:
#hide
notebook2script()