In [None]:
# <api>
import pandas as pd
import numpy as np
from time import time
from operator import itemgetter
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from enum import Enum
import logging

try:
    from exceptions import Exception
except:
    pass

logger = logging.getLogger(__name__)

In [None]:
# <api>
class BinaryClassifier(Enum):
    """
    Supported binary classifier families.

    Each member resolves, through the `model` property, to the producer
    module under `work.marvin.binary_classifier_models` that implements
    the parameter-space estimation, search and training for that family.
    """
    GBM = 'GBM'
    XGB = 'XGBOOST'
    LGB = 'LightGBM'
    RF = 'RF'
    LR = 'LR'

    @property
    def model(self):
        """
        Return the producer module backing this classifier family.

        The import is done lazily inside each branch so that only the
        module for the selected member has to be importable.
        """
        if self is BinaryClassifier.GBM:
            import work.marvin.binary_classifier_models.bestGbdtModelProducer
            return work.marvin.binary_classifier_models.bestGbdtModelProducer
        elif self is BinaryClassifier.XGB:
            import work.marvin.binary_classifier_models.bestXgboostModelProducer
            return work.marvin.binary_classifier_models.bestXgboostModelProducer
        elif self is BinaryClassifier.LGB:
            import work.marvin.binary_classifier_models.bestLightgbmModelProducer
            return work.marvin.binary_classifier_models.bestLightgbmModelProducer
        elif self is BinaryClassifier.RF:
            import work.marvin.binary_classifier_models.bestRfModelProducer
            return work.marvin.binary_classifier_models.bestRfModelProducer
        elif self is BinaryClassifier.LR:
            import work.marvin.binary_classifier_models.bestLrModelProducer
            return work.marvin.binary_classifier_models.bestLrModelProducer

    def produceBestModel(self, traindf, datamapper, target, fig_path=None,
                         callbacks=None, verbose=0):
        """
        auto classifier model generation, 3 steps:
        1. estimate optimal model parameters space for gridsearch,
           depends on sample size and feature size
        2. run gridsearch to find best parameter set
        3. train the best model using the best parameter set

        Step 1 is done here via `parameterGrid`; steps 2 and 3 are
        delegated to `produceBestModel` of the producer module.

        Parameters
        ----------
        traindf : training data, passed through to the producer module
            (presumably a DataFrame -- confirm against the producers).

        datamapper : object passed through to the producer module; for
            every member except `LR` its `.shape` attribute is read to
            size the parameter grid.

        target : passed through to the producer module (presumably the
            name of the target column -- confirm).

        fig_path : passed through to the producer module.

        callbacks : list or None
            Currently unused; accepted for interface compatibility.

        verbose : int
            Passed through to the producer module.

        Returns
        -------
        Whatever the producer module's `produceBestModel` returns.

        Raises
        ------
        Exception
            Any error raised by the producer module is logged and
            re-raised unchanged.
        """
        # estimate optimal parameters grid space
        configspace = self.parameterGrid(datamapper)
        try:
            return self.model.produceBestModel(traindf,
                                               datamapper,
                                               target,
                                               configspace,
                                               fig_path=fig_path,
                                               verbose=verbose)
        except Exception as e:
            logger.error("search {} with {} error: {}".format(self, datamapper, e))
            raise

    def optimizeBestModel(self, traindf, datamapper, target,
                          configspace_manual=None,
                          configspace=None,
                          score='roc_auc',
                          test_metric=roc_auc_score,
                          fig_path=None,
                          search_alg='GP', n_calls=100,
                          seed=27, callbacks=None, verbose=0):
        """
        auto classifier model generation, 3 steps:
        1. estimate optimal model parameters space for hyperparam search,
           depends on sample size and feature size
        2. run hyperparam search to find best parameter set
        3. train the best model using the best parameter set

        Parameters
        ----------
        traindf, datamapper, target : passed through to the producer
            module's `optimizeBestModel`.

        configspace_manual : optional
            Manual search-space settings; when None it is obtained from
            `self.configspace_manual()`.

        configspace : optional
            Search space; when None it is built with
            `self.configspace(datamapper, configspace_manual)`.

        score : function or string
            Scoring specification passed through to the producer module.

        test_metric : function
            Metric passed through to the producer module.

        fig_path : passed through to the producer module.

        search_alg : str or HyperOpt
            Value identifying a `HyperOpt` member ('GP', 'RF' or 'GBRT');
            its `search` method is handed to the producer module as the
            optimization routine.

        n_calls : int
            Passed through to the producer module.

        seed : int
            Passed through to the producer module.

        callbacks : list or None
            Currently unused; accepted for interface compatibility.

        verbose : int
            Passed through to the producer module.

        Returns
        -------
        Whatever the producer module's `optimizeBestModel` returns.

        Raises
        ------
        ValueError
            If `search_alg` does not name a `HyperOpt` member.
        Exception
            Any error raised by the producer module is logged and
            re-raised unchanged.
        """
        if configspace_manual is None:
            configspace_manual = self.configspace_manual()
        if configspace is None:
            configspace = self.configspace(datamapper, configspace_manual)
        optimize_method = HyperOpt(search_alg).search
        try:
            return self.model.optimizeBestModel(traindf,
                                                datamapper,
                                                target,
                                                configspace,
                                                optimize_method,
                                                score=score,
                                                test_metric=test_metric,
                                                fig_path=fig_path,
                                                n_calls=n_calls,
                                                verbose=verbose,
                                                seed=seed)
        except Exception as e:
            logger.error("optimize {} with {} error: {}".format(self, search_alg, e))
            raise

    def parameterGrid(self, datamapper):
        """
        Build the grid-search parameter grid for this classifier.

        `LR` uses a fixed grid over the penalty type. All other members
        ask their producer module to size the grid from
        `datamapper.shape`; the `RF` result is wrapped in a list, the
        others are returned as produced.
        """
        if self is BinaryClassifier.LR:
            return [{'penalty': ['l1', 'l2']}]
        else:
            param_grid = self.model.parameterGridInitialization(datamapper.shape)
            return [param_grid] if self is BinaryClassifier.RF else param_grid

    def configspace(self, datamapper, configspace_manual=None):
        """
        Build the hyperparameter search space for this classifier.

        `LR` uses a fixed space over the penalty type (and ignores
        `configspace_manual`). All other members delegate to the producer
        module's `configSpaceInitialization` with `datamapper.shape` and
        `configspace_manual`.
        """
        if self is BinaryClassifier.LR:
            return {'penalty': ['l1', 'l2']}
        else:
            return self.model.configSpaceInitialization(datamapper.shape, configspace_manual)

    def configspace_manual(self):
        """Return the producer module's `configSpaceManualInitialization()` result."""
        return self.model.configSpaceManualInitialization()

In [None]:
# <api>
class HyperOpt(Enum):
    """
    Hyperparameter optimizers provided by `skopt`.

    Each member maps (through `method`) to one of the `skopt`
    `*_minimize` functions; `search` runs that optimizer against a
    cross-validated score.
    """
    GP = "GP"
    RF = "RF"
    GBRT = "GBRT"

    @property
    def method(self):
        """
        Return the `skopt` minimizer for this member.

        `skopt` is imported lazily so that it is only required when a
        search is actually run.
        """
        if self is HyperOpt.GP:
            from skopt import gp_minimize
            return gp_minimize
        elif self is HyperOpt.RF:
            from skopt import forest_minimize
            return forest_minimize
        elif self is HyperOpt.GBRT:
            from skopt import gbrt_minimize
            return gbrt_minimize

    def search(self, X_train, y_train,
               model_class, param_grid,
               score='roc_auc',
               n_calls=100,
               verbose=0):
        """
        General method for applying the selected `skopt` minimizer to the
        data.

        Parameters
        ----------
        X_train : np.array
            The design matrix, dimension `(n_samples, n_features)`.

        y_train : list or np.array
            The target, of dimension `n_samples`.

        model_class : classifier
            A classifier model in the mode of `sklearn`, with at least
            `fit` and `predict` methods operating on things like
            `X` and `y`.

        param_grid : dict
            Map from parameter names to pairs of values specifying the
            upper and lower ends of the space from which to sample.
            The values can also be directly specified as `skopt`
            objects like `Categorical`. Must not be empty.

        score : function or string
            An appropriate score function or string recognizable by
            `sklearn.model_selection.cross_val_score`. Higher values are
            treated as better; the score is negated internally because
            the `skopt` minimizers minimize their objective.

        n_calls : int
            Number of evaluations to do.

        verbose : int
            Forwarded to `cross_validated_scorer`; when > 0 every result
            entry is also logged at debug level.

        Returns
        -------
        list of dict
            One entry per evaluation, with keys 'loss' and 'params'.
            'params' stores the sampled values from `param_grid` for that
            run. 'loss' stores the mean cross-validated score of that run
            (higher is better); the key name is kept because `assess`
            selects the entry with the largest 'loss'.

        Raises
        ------
        ValueError
            If `param_grid` is empty.
        Exception
            Any error raised during the optimization is logged and
            re-raised unchanged.

        Example
        -------
            skopt_grid = {
                'max_depth': (4, 12),
                'learning_rate': (0.01, 0.5),
                'n_estimators': (20, 200),
                'gamma': (0, 0.5),
                'min_child_weight': (1, 5),
                'subsample': (0.1, 1),
                'colsample_bytree': (0.1, 1)}
            res = HyperOpt('RF').search(X, y, XGBClassifier, skopt_grid,
                                        'roc_auc', n_calls=10)
            best = max(res, key=lambda r: r['loss'])
        """
        logger.debug("********************  HyperOpt start ********************")
        if not param_grid:
            raise ValueError("param_grid must define at least one parameter")
        param_keys = list(param_grid.keys())
        param_vecs = list(param_grid.values())

        def skopt_scorer(param_vec):
            params = dict(zip(param_keys, param_vec))
            logger.info(params)
            cv_score = cross_validated_scorer(
                X_train, y_train, model_class, params, score, verbose=verbose)
            # skopt minimizes its objective while the cross-validated score
            # is greater-is-better, so hand the optimizer the negated score.
            return -cv_score

        try:
            outcome = self.method(skopt_scorer, param_vecs, n_calls=n_calls)
            results = []
            for neg_score, param_vec in zip(outcome.func_vals, outcome.x_iters):
                # Undo the negation so callers keep seeing the raw score.
                entry = {'loss': -neg_score,
                         'params': dict(zip(param_keys, param_vec))}
                results.append(entry)
                if verbose > 0:
                    logger.debug(entry)

            logger.debug("********************  HyperOpt end ********************")
        except Exception as e:
            logger.error(e)
            raise

        return results

In [None]:
# <api>
def prepareDataforTraining(transformed, datamapper=None, train_size=0.75):
    """
    Split a dataset into a training part and a test part.

    Thin convenience wrapper: the split itself is delegated to sklearn's
    `train_test_split`, called with only `train_size` set.

    Parameters
    ----------
    transformed : DataFrame or np.array
        The design matrix, dimension `(n_samples, n_features)`.

    datamapper : sklearn-pandas DataFrameMapper
        Accepted for interface compatibility; not used.

    train_size : float
        Ratio of training rows to all rows, forwarded to
        `train_test_split`.

    Returns
    -------
    tuple
        `(train_part, test_part)`.
    """
    split_parts = train_test_split(transformed, train_size=train_size)
    train_part, test_part = split_parts
    return (train_part, test_part)

In [None]:
def pipelineTrainEvaluation(pipeline, train, target):
    """
    Fit a predefined pipeline on `train` and report in-sample quality.

    The pipeline is fitted on every column of `train` except `target`,
    then used to predict probabilities for the same rows. The AUC is
    therefore computed on the training data itself.

    Parameters
    ----------
    pipeline : PMMLPipeLine, a valid mapper to PMML
        Must provide `fit`, `predict_proba` and `cross_val_score`.
        NOTE(review): plain sklearn pipelines have no `cross_val_score`
        method -- confirm the pipeline type used by callers provides it.
    train    : DataFrame, training data object
    target   : string, name of target column

    Returns
    -------
    tuple
        `(pipeline, predprob, auc, cv_score)` where `predprob` is a
        DataFrame with columns 'predprob' (second column of
        `predict_proba`) and 'ytrue' (the target values), `auc` is
        `roc_auc_score` over those two columns, and `cv_score` is the
        result of `pipeline.cross_val_score(..., scoring='roc_auc')`.
    """
    # Every column except the target is treated as a feature.
    feature_cols = train.columns.difference([target])

    pipeline.fit(train[feature_cols], train[target])

    class_proba = pipeline.predict_proba(train[feature_cols])
    predprob = pd.DataFrame(class_proba[:, 1], columns=['predprob'])
    predprob['ytrue'] = train[target].values

    auc = roc_auc_score(y_true=predprob['ytrue'],
                        y_score=predprob['predprob'])
    cv_score = pipeline.cross_val_score(train[feature_cols],
                                        train[target],
                                        scoring='roc_auc')

    return pipeline, predprob, auc, cv_score

In [None]:
# <api>
def run_experiments(
        experimental_run,
        trainX,
        trainY,
        model_class,
        score='roc_auc',
        test_metric=roc_auc_score,
        n_folds=1,
        verbose=0,
        random_state=None,
        dataset_name=None):
    """
    Basic experimental framework.

    Parameters
    ----------
    experimental_run : list of tuples
        These tuples should have exactly three members: the first a
        search function (e.g. the `search` method of a `HyperOpt`
        member), the second an appropriate `param_grid` dict for that
        function, and the third a dict specifying keyword arguments to
        the search function.

    trainX : np.array or DataFrame
        Design matrix of dimension `(n_samples, n_features)`.

    trainY : iterable
        Target of dimension `n_samples`.

    model_class : classifier
        A classifier model in the mode of `sklearn`, with at least
        `fit` and `predict` methods operating on things like
        `X` and `y`.

    score : function or string
        An appropriate score function or string recognizable by
        `sklearn.model_selection.cross_val_score`. In `sklearn`, scores
        are positive and we are maximizing so we always want higher to mean
        better.

    test_metric : function
        An `sklearn.metrics` function.

    n_folds : int
        Number of outer cross-validation folds. With `n_folds <= 1` no
        split is made: the full data is used both for the search and for
        the final assessment.

    verbose : int
        Forwarded to `assess`.

    random_state : int
        Forwarded to `get_cross_validation_split` when `n_folds > 1`.

    dataset_name : str or None
        Informal name to give the dataset. Purely for
        book-keeping.

    Returns
    -------
    list of dict
       Each dict is a results dictionary of the sort returned
       by `assess`, one per entry of `experimental_run`.
    """
    if n_folds <= 1:
        folds = [(trainX, trainY, trainX, trainY)]
    else:
        # The split is a one-shot generator; materialize it so that every
        # experiment iterates over the same folds instead of only the first
        # experiment seeing any data.
        folds = list(get_cross_validation_split(trainX, trainY,
                                                n_folds=n_folds,
                                                random_state=random_state))

    all_results = []
    # This loop can easily be parallelized, but doing so can
    # be tricky on some systems, since `cross_val_score`
    # calls `joblib` even if `n_jobs=1`, resulting in
    # nested parallel jobs even if there is no actual
    # parallelization elsewhere in the experimental run
    for search_func, param_grid, kwargs in experimental_run:
        all_results.append(
            assess(
                folds,
                search_func=search_func,
                model_class=model_class,
                param_grid=param_grid,
                score=score,
                test_metric=test_metric,
                dataset_name=dataset_name,
                search_func_args=kwargs,
                verbose=verbose))
        logger.info("******************** assess end *******************")
    return all_results

In [None]:
# <api>
def assess(
        cv_split,
        search_func,
        model_class,
        param_grid,
        n_folds=1,
        score='roc_auc',
        test_metric=roc_auc_score,
        dataset_name=None,
        search_func_args=None,
        callbacks=None,
        verbose=0):
    """
    The core of the experimental framework. For every outer fold in
    `cv_split`, `search_func` is run on the training part to find the
    optimal hyperparameters; a model with those parameters is then
    fitted on the training part and assessed on the test part.

    Parameters
    ----------
    cv_split : iterable of 4-tuple of (X_train, y_train, X_test, y_test)

    search_func : function
        The search function to use. It is called as
        `search_func(X_train, y_train, model_class, param_grid, score,
        verbose=verbose, **search_func_args)` and must return a list of
        dicts with keys 'loss' and 'params'. It must expose `__name__`.
        This choice has to be compatible with `param_grid`.

    model_class : classifier
        A classifier model in the mode of `sklearn`, with at least
        `fit` and `predict` methods operating on things like
        `X` and `y`. It must expose `__name__`.

    param_grid : dict
        Map from parameter names to appropriate specifications of
        appropriate values for that parameter. This has to be compatible
        with `search_func`, and all the values must be suitable
        arguments to `model_class` instances.

    n_folds : int
        Currently unused; the folds are fully defined by `cv_split`.

    score : function or string
        An appropriate score function or string recognizable by
        `sklearn.model_selection.cross_val_score`. In `sklearn`, scores
        are positive and we are maximizing so we always want higher to mean
        better.

    test_metric : function
        An `sklearn.metrics` function, called as
        `test_metric(y_test, predictions)` where `predictions` is the
        output of the fitted model's `predict` on `X_test`.

    dataset_name : str or None
        Name for the dataset being analyzed. For book-keeping and
        display only.

    search_func_args : dict or None
        Keyword arguments to feed to `search_func`; None means no extra
        arguments.

    callbacks : list or None
        Currently unused; accepted for interface compatibility.

    verbose : int
        Forwarded to `search_func`.

    Returns
    -------
    dict
        Accumulated information about the experiment:

        {'Test accuracy': list of float,
         'Cross-validation time':list of float,
         'Parameters sampled': list of int,
         'Iteration details': list of list,
         'Method': search_func.__name__,
         'Model': model_class.__name__,
         'Dataset': dataset_name,
         'Best parameters': list of dict,
         'Mean test accuracy': float,
         'Mean cross-validation time': float,
         'Mean parameters sampled': float}
    """
    if search_func_args is None:
        search_func_args = {}

    logger.info(search_func)
    logger.info(model_class)

    data = {'Test accuracy': [],
            'Cross-validation time': [],
            'Parameters sampled': [],
            'Iteration details': [],
            'Method': search_func.__name__,
            'Model': model_class.__name__,
            'Dataset': dataset_name,
            'Best parameters': []}

    for X_train, y_train, X_test, y_test in cv_split:
        start = time()
        results = search_func(
            X_train,
            y_train,
            model_class,
            param_grid,
            score,
            verbose=verbose,
            **search_func_args)

        # 'loss' holds a greater-is-better score, so the best run is the
        # one with the largest value.
        ranked = sorted(results, key=itemgetter('loss'), reverse=True)
        best_params = ranked[0]['params']
        data['Best parameters'].append(best_params)
        bestmod = model_class(**best_params)
        bestmod.fit(X_train, y_train)
        # Assess on the held-out part: the predictions are compared with
        # `y_test` below, so they must be made for `X_test`.
        predictions = bestmod.predict(X_test)
        # The recorded time covers the search plus the final fit/predict.
        data['Cross-validation time'].append(time() - start)
        data['Parameters sampled'].append(len(results))
        data['Iteration details'].append(results)
        data['Test accuracy'].append(test_metric(y_test, predictions))

    data['Mean test accuracy'] = np.mean(data['Test accuracy'])
    data['Mean cross-validation time'] = np.mean(data['Cross-validation time'])
    data['Mean parameters sampled'] = np.mean(data['Parameters sampled'])
    return data

In [None]:
# <api>
def searchBestParamsSkopt(train, labels_train, skopt_grid, search_alg, train_alg,
                          score='roc_auc',
                          test_metric=roc_auc_score,
                          n_folds=1,
                          verbose=0, n_calls=100):
    """
    Run a single search experiment and return the best fold's outcome.

    Builds a one-entry experiment from `search_alg`, `skopt_grid` and
    `n_calls`, runs it through `run_experiments`, and picks the fold with
    the highest 'Test accuracy' (the first one on ties).

    Parameters
    ----------
    train, labels_train : data and target handed to `run_experiments`.
    skopt_grid : dict, the parameter space for `search_alg`.
    search_alg : function, the search function to run.
    train_alg : classifier class handed to `run_experiments` as
        `model_class`.
    score, test_metric, n_folds, verbose : forwarded to
        `run_experiments`.
    n_calls : int, passed to `search_alg` as keyword argument `n_calls`.

    Returns
    -------
    tuple
        `(best_params, trace)`: the 'Best parameters' and the
        'Iteration details' recorded for the selected fold.
    """
    search_kwargs = {'n_calls': n_calls}
    outcomes = run_experiments(
        [(search_alg, skopt_grid, search_kwargs)],
        train,
        labels_train,
        train_alg,
        score=score,
        test_metric=test_metric,
        n_folds=n_folds,
        verbose=verbose)

    # Only one experiment was submitted, so only one summary comes back.
    summary = outcomes[0]
    fold_scores = summary['Test accuracy']
    best_fold = fold_scores.index(max(fold_scores))
    best_params = summary['Best parameters'][best_fold]
    trace = summary['Iteration details'][best_fold]
    return best_params, trace

In [None]:
# <api>
def get_cross_validation_split(X, y, n_folds=5, random_state=None):
    """
    Use `StratifiedKFold` to create an `n_folds` cross-validator for
    the dataset defined by `X` and `y`, and yield the data of each fold.

    Parameters
    ----------
    X : DataFrame, np.array, list or tuple
        Design matrix. Objects with `.iloc` are sliced positionally via
        `.iloc`; lists and tuples are converted to arrays; anything else
        is indexed directly with the fold's index array.

    y : Series, np.array, list or tuple
        Target; handled the same way as `X`.

    n_folds : int
        Number of folds.

    random_state : int or None
        When given, the rows are shuffled with this seed before
        splitting (`StratifiedKFold` only accepts a seed together with
        `shuffle=True`). When None the folds are built without
        shuffling.

    Yields
    ------
    tuple
        `(X_train, y_train, X_test, y_test)`, once per fold.
    """
    def take(data, index):
        # Positional row selection for pandas objects and array-likes.
        if hasattr(data, 'iloc'):
            return data.iloc[index]
        return data[index]

    # Plain sequences cannot be indexed with an index array; convert once.
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_folds,
                          shuffle=random_state is not None,
                          random_state=random_state)
    for train_index, test_index in skf.split(X, y):
        # Yield inside the loop so that every fold is produced, not just
        # the last one.
        yield (take(X, train_index), take(y, train_index),
               take(X, test_index), take(y, test_index))

In [None]:
# <api>
def cross_validated_scorer(
        X_train, y_train, model_class, params, score, verbose=0, kfolds=5):
    """
    Score one parameter set by cross-validation.

    A model is built as `model_class(**params)` and evaluated with
    `sklearn.model_selection.cross_val_score`; the mean over the folds is
    returned.

    Parameters
    ----------
    X_train : np.array
        The design matrix, dimension `(n_samples, n_features)`.

    y_train : list or np.array
        The target, of dimension `n_samples`.

    model_class : classifier
        A classifier model in the mode of `sklearn`, with at least
        `fit` and `predict` methods operating on things like
        `X` and `y`.

    params : dict
        Map from parameter names to single appropriate values
        for that parameter. This will be used to build a model
        from `model_class`.

    score : function or string
        An appropriate score function or string recognizable by
        `sklearn.model_selection.cross_val_score`, passed as `scoring`.

    verbose : int
        Passed to `cross_val_score`; when > 0 the parameters and the
        resulting score are also logged at info level.

    kfolds : int
        Number of cross-validation runs to do, passed as `cv`.

    Returns
    -------
    float
       Mean score over the `kfolds` runs (the value is returned as
       computed, without any sign change).
    """
    log_details = verbose > 0

    if log_details:
        logger.info("*********************** params **********************")
        logger.info(params)

    estimator = model_class(**params)
    fold_scores = cross_val_score(estimator,
                                  X_train,
                                  y=y_train,
                                  scoring=score,
                                  cv=kfolds,
                                  verbose=verbose,
                                  n_jobs=1)
    cv_score = fold_scores.mean()

    if log_details:
        logger.info("********************** cv_score *********************")
        logger.info(cv_score)

    return cv_score