In [None]:
# <api>
import math
import numpy as np
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV

import sklearn.metrics as metrics
from skopt.space import Categorical
import work.marvin.binary_classifier_models.modelfit as modelfit

import logging
logger = logging.getLogger(__name__)

In [None]:
# <api>
def bestModelProducer(data, target, datamapper, fig_path=None):
    """
    Automatic random-forest model generation.

    Pipeline:
    1. split ``data`` with ``modelfit.prepareDataforTraining``;
    2. fit ``datamapper`` on the non-target columns of the training split and
       derive the grid-search space from ``datamapper.shape``;
    3. run the grid search and build a classifier configured with the best
       parameter set (delegated to ``produceBestRFmodel``).

    :param data: raw input handed to ``modelfit.prepareDataforTraining``
    :param target: name of the label column
    :param datamapper: feature mapper exposing ``fit_transform`` and ``shape``
    :param fig_path: forwarded untouched to ``produceBestRFmodel``
    :return: ``(model, traindf, testdf)``; this function itself never calls
             ``fit`` on the returned model
    """
    train_split, test_split = modelfit.prepareDataforTraining(data)

    # Fit the mapper on every column except the target. The transformed output
    # is discarded here; only the mapper's state is used below.
    feature_columns = train_split.columns.difference([target])
    datamapper.fit_transform(train_split[feature_columns])

    # NOTE(review): relies on the mapper exposing a ``shape`` attribute after
    # fitting -- confirm against the mapper implementation.
    search_space = parameterGridInitialization(datamapper.shape)
    best_rf = produceBestRFmodel(train_split, datamapper, target, search_space, fig_path)
    return best_rf, train_split, test_split

## Parameter Initialization

In [None]:
# <api>
def max_feature_space(feature_size):
    """
    Build the candidate ``max_features`` values for the grid search.

    Candidates are taken around ``sqrt(feature_size)`` in steps of 2:
    from ``sqrt - 3`` when the square root exceeds 10, otherwise from
    ``sqrt``, up to (but excluding) ``1.5 * sqrt``.

    :param feature_size: number of input features (non-negative)
    :return: list of integer candidates; never empty when ``feature_size >= 1``
    :raises ValueError: from ``math.sqrt`` when ``feature_size`` is negative
    """
    fs_sqrt = math.sqrt(feature_size)
    lower = int(fs_sqrt - 3) if fs_sqrt > 10 else int(fs_sqrt)
    upper = int(fs_sqrt * 1.50)
    candidates = list(range(lower, upper, 2))

    # With a single feature both bounds truncate to 1 and the range is empty,
    # which would hand the grid search an unusable (empty) dimension. Fall
    # back to the one value that is still valid.
    if not candidates and feature_size >= 1:
        candidates = [max(1, int(fs_sqrt))]
    return candidates


# <api>
def n_estimators_space(train_size):
    """
    Candidate ``n_estimators`` values for the grid search.

    More than 2000 training rows yields 50..290 in steps of 20; anything
    smaller yields 20..90 in steps of 10.

    :param train_size: number of training rows
    :return: list of integer tree counts
    """
    start, stop, step = (50, 301, 20) if train_size > 2000 else (20, 100, 10)
    return list(range(start, stop, step))


# <api>
def min_samples_leaf_space(train_size):
    """
    Candidate ``min_samples_leaf`` values for the grid search.

    :param train_size: accepted for symmetry with the other space builders;
                       the returned candidates do not depend on it
    :return: a fresh list ``[50, 100, 200, 500]``
    """
    return list((50, 100, 200, 500))

In [None]:
# <api>
def parameterGridInitialization(shape):
    """
    Assemble the grid-search parameter grid for a random forest.

    :param shape: indexable ``(rows, columns)``; the feature count is taken
                  as ``columns - 1``
    :return: dict with keys ``'n_estimators'``, ``'max_features'`` and
             ``'min_samples_leaf'``, each mapped to a list of candidates
    """
    n_features = shape[1] - 1
    n_rows = shape[0]

    tree_counts = n_estimators_space(n_rows)
    leaf_sizes = min_samples_leaf_space(n_rows)
    feature_counts = max_feature_space(n_features)

    grid = dict()
    grid['n_estimators'] = tree_counts
    grid['max_features'] = feature_counts
    grid['min_samples_leaf'] = leaf_sizes
    return grid

In [None]:
# <api>
def configSpaceInitialization(shape):
    """
    Assemble the skopt search space for a random forest.

    :param shape: indexable ``(rows, columns)``; the feature count is taken
                  as ``columns - 1``
    :return: dict of ``(low, high)`` tuples for ``'max_features'``,
             ``'min_samples_leaf'``, ``'min_samples_split'`` and
             ``'n_estimators'``, plus ``'n_jobs'`` pinned to
             ``Categorical((-1,))``

    With at least 1000 rows the leaf/split bounds are ``(50, 500)`` and the
    tree-count bounds ``(50, 800)``; otherwise they are ``(20, rows)`` and
    ``(20, 200)``. No clamping is applied, so fewer than 20 rows gives a
    lower bound above the upper bound.
    """
    n_rows = shape[0]
    n_features = shape[1] - 1

    if n_rows >= 1000:
        leaf_split_bounds, tree_bounds = (50, 500), (50, 800)
    else:
        leaf_split_bounds, tree_bounds = (20, n_rows), (20, 200)

    return {
        'max_features': (2, n_features),
        'min_samples_leaf': leaf_split_bounds,
        'min_samples_split': leaf_split_bounds,
        'n_estimators': tree_bounds,
        'n_jobs': Categorical((-1,)),
    }

## Best RF Model Producer

In [None]:
# <api>
def rfGridSearch(train, labels_train, configspace, seed=27):
    """
    Grid-search random-forest hyper-parameters with 5-fold cross-validation,
    scored by ROC AUC.

    :param train: feature matrix passed to ``GridSearchCV.fit``
    :param labels_train: labels passed to ``GridSearchCV.fit``
    :param configspace: parameter grid (dict of candidate lists)
    :param seed: ``random_state`` of the forest used during the search
    :return: tuple ``(n_estimators, max_features, min_samples_leaf)`` read
             from the best estimator's parameters
    """
    # Routine diagnostic, not a failure: keep it out of the ERROR stream.
    logger.info("RF grid search parameter space: %s", configspace)

    gsearch = GridSearchCV(estimator=RandomForestClassifier(oob_score=True, random_state=seed),
                           param_grid=configspace, scoring='roc_auc',
                           n_jobs=-1, iid=False, cv=5)
    gsearch.fit(train, labels_train)

    # Read from the refitted estimator rather than the searched grid so that
    # parameters absent from ``configspace`` still resolve to their defaults.
    best_parameters = gsearch.best_estimator_.get_params()
    best_n_estimators = best_parameters['n_estimators']
    best_max_features = best_parameters['max_features']
    best_min_samples_leaf = best_parameters['min_samples_leaf']
    return best_n_estimators, best_max_features, best_min_samples_leaf

In [None]:
# <api>
def produceBestRFmodel(traindf, datamapper, target,
                       configspace, fig_path=None, seed=27):
    """
    Grid-search the forest hyper-parameters and return a classifier
    configured with the best values found.

    :param traindf: training frame containing features and the target column
    :param datamapper: mapper whose ``fit_transform`` produces the feature matrix
    :param target: name of the label column
    :param configspace: parameter grid handed to ``rfGridSearch``
    :param fig_path: accepted but not used by this function
    :param seed: random state for both the search and the returned model
    :return: a ``RandomForestClassifier`` instance; it is constructed here
             but ``fit`` is not called on it
    """
    # Features: every column except the target, pushed through the mapper.
    feature_frame = traindf[traindf.columns.difference([target])]
    features = datamapper.fit_transform(feature_frame)
    labels = np.array(traindf[target])

    # Search for the best parameter set.
    n_trees, n_feats, leaf_size = rfGridSearch(features, labels, configspace, seed=seed)

    forest_settings = dict(
        n_estimators=n_trees,
        min_samples_leaf=leaf_size,
        max_features=n_feats,
        n_jobs=-1,
        oob_score=True,
        random_state=seed,
    )
    return RandomForestClassifier(**forest_settings)

In [None]:
# <api>
def produceBestModel(traindf, datamapper, target, configspace, fig_path=None, seed=27, verbose=0):
    """
    Thin alias for ``produceBestRFmodel``.

    All arguments except ``verbose`` are forwarded positionally;
    ``verbose`` is accepted but not used.
    """
    rf_model = produceBestRFmodel(traindf, datamapper, target, configspace, fig_path, seed)
    return rf_model

In [None]:
# <api>
def optimizeBestModel(traindf, datamapper, target,
                      configspace, search_alg,
                      fig_path=None, n_calls=100, seed=27, verbose=0):
    """
    Search the forest hyper-parameters with
    ``modelfit.searchBestParamsSkopt`` and return a classifier configured
    with the best values found.

    :param traindf: training frame containing features and the target column
    :param datamapper: mapper whose ``fit_transform`` produces the feature matrix
    :param target: name of the label column
    :param configspace: search space forwarded to the search helper
    :param search_alg: search algorithm selector forwarded to the search helper
    :param fig_path: accepted but not used by this function
    :param n_calls: evaluation budget forwarded to the search helper
    :param seed: ``random_state`` of the returned model
    :param verbose: accepted but not used by this function
    :return: a ``RandomForestClassifier`` instance; it is constructed here
             but ``fit`` is not called on it
    """
    # datamapper transform
    train = datamapper.fit_transform(traindf[traindf.columns.difference([target])])
    train = np.array(train)
    labels_train = np.array(traindf[target])

    # Run the skopt search to get the best parameter set; the returned
    # search trace is not needed here.
    best_params, _trace = modelfit.searchBestParamsSkopt(train, labels_train,
                                                         configspace, search_alg,
                                                         RandomForestClassifier, n_calls)

    rf_kwargs = {'n_estimators': best_params['n_estimators'],
                 'min_samples_leaf': best_params['min_samples_leaf'],
                 'max_features': best_params['max_features'],
                 'oob_score': True,
                 'random_state': seed}

    # The search space may also tune these (configSpaceInitialization includes
    # both). Forward them when present so the returned model matches the
    # configuration the search actually evaluated.
    # NOTE(review): assumes best_params is dict-like (supports ``in``) -- confirm
    # against modelfit.searchBestParamsSkopt.
    for optional_key in ('min_samples_split', 'n_jobs'):
        if optional_key in best_params:
            rf_kwargs[optional_key] = best_params[optional_key]

    return RandomForestClassifier(**rf_kwargs)