In [None]:
# <api>
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
import work.marvin.binary_classifier_models.modelfit as modelfit

import logging

# Module-level logger named after this module (``__name__``).
logger = logging.getLogger(__name__)

In [None]:
# <api>
def bestModelProducer(data, target, datamapper, fig_path=None):
    """
    Split ``data`` and pick a logistic-regression configuration by grid search.

    Steps:
    1. split ``data`` into train/test frames with
       ``modelfit.prepareDataforTraining``,
    2. run ``produceBestLRmodel`` on the training frame with the fixed
       parameter space defined below (L1 vs. L2 penalty),
    3. hand back its result together with both frames.

    Note: the parameter space is a constant; it is not derived from the
    sample or feature size.

    Parameters
    ----------
    data
        Passed unchanged to ``modelfit.prepareDataforTraining``.
    target
        Name of the label column, forwarded to ``produceBestLRmodel``.
    datamapper
        Feature transformer, forwarded to ``produceBestLRmodel``.
    fig_path
        Forwarded to ``produceBestLRmodel``.

    Returns
    -------
    tuple
        ``(bestModel, traindf, testdf)``. ``bestModel`` is exactly what
        ``produceBestLRmodel`` returns, i.e. a ``(model, trace)`` pair whose
        model has been configured but not fitted.
    """
    traindf, testdf = modelfit.prepareDataforTraining(data)

    # Every grid entry must be a list of candidate values, so the single
    # ``n_jobs`` setting is wrapped in a one-element list.
    configspace = {'penalty': ['l1', 'l2'], 'n_jobs': [-1]}

    bestModel = produceBestLRmodel(traindf, datamapper, target,
                                   configspace, fig_path)
    return bestModel, traindf, testdf

In [None]:
# <api>
def produceBestLRmodel(traindf, datamapper, target,
                       configspace,
                       fig_path=None,
                       seed=27, verbose=0):
    """
    Grid-search LogisticRegression settings and return the winning config.

    Parameters
    ----------
    traindf
        Training frame; every column except ``target`` is used as a feature.
    datamapper
        Object exposing ``fit_transform(frame)``; it is fitted on the feature
        columns of ``traindf``.
    target
        Name of the label column in ``traindf``.
    configspace
        Parameter grid for the search. When it is a dict, any value that is
        not a list/tuple/ndarray is wrapped in a one-element list, so a fixed
        setting such as ``{'n_jobs': -1}`` is accepted. Anything that is not a
        dict (e.g. a list of grids) is passed through untouched.
    fig_path
        Accepted for interface compatibility; not used by this function.
    seed
        ``random_state`` for both the searched and the returned estimator.
    verbose
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_lr, trace)``. ``best_lr`` is a new, *unfitted*
        ``LogisticRegression`` carrying every parameter selected by the
        search (plus ``n_jobs=-1`` and ``random_state=seed`` unless the search
        chose otherwise). ``trace`` is a list of
        ``{"params": ..., "loss": ...}`` dicts, one per grid point.
    """
    # datamapper transform on everything but the label column
    feature_columns = traindf.columns.difference([target])
    train = np.array(datamapper.fit_transform(traindf[feature_columns]))
    labels_train = np.array(traindf[target])

    # The search wants a sequence of candidates per parameter; promote bare
    # scalars so fixed settings do not have to be wrapped by every caller.
    sequence_types = (list, tuple, np.ndarray)
    if hasattr(configspace, 'items'):
        param_grid = {
            name: (candidates if isinstance(candidates, sequence_types)
                   else [candidates])
            for name, candidates in configspace.items()
        }
    else:
        param_grid = configspace

    # running grid search to get the best parameter set
    gsearch = GridSearchCV(estimator=LogisticRegression(random_state=seed),
                           param_grid=param_grid,
                           scoring='roc_auc',
                           n_jobs=1, iid=False, cv=5,
                           verbose=verbose)
    gsearch.fit(train, labels_train)

    # NOTE: the "loss" key is kept for callers, but with scoring='roc_auc'
    # the value stored is the second field of each grid_scores_ entry
    # (a score where higher is better), not a loss.
    trace = [{"params": entry[0], "loss": entry[1]}
             for entry in gsearch.grid_scores_]

    # Rebuild the estimator from *all* selected parameters, not just the
    # penalty, so that any other searched hyper-parameter (e.g. C) survives.
    model_kwargs = {'n_jobs': -1, 'random_state': seed}
    model_kwargs.update(gsearch.best_params_)
    best_lr = LogisticRegression(**model_kwargs)
    return best_lr, trace

In [None]:
# <api>
def produceBestModel(traindf, datamapper, target,
                     configspace=None, fig_path=None,
                     seed=27, verbose=0):
    """
    Run ``produceBestLRmodel`` with a default parameter grid when none is given.

    Parameters
    ----------
    traindf, datamapper, target
        Forwarded unchanged to ``produceBestLRmodel``.
    configspace
        Parameter grid for the search. A falsy value (``None`` or an empty
        dict) selects the default L1/L2 penalty grid.
    fig_path, seed, verbose
        Forwarded unchanged to ``produceBestLRmodel``.

    Returns
    -------
    Whatever ``produceBestLRmodel`` returns.
    """
    if not configspace:
        # Grid values are lists of candidates, hence the one-element list
        # for the fixed ``n_jobs`` setting.
        configspace = {'penalty': ['l1', 'l2'], 'n_jobs': [-1]}
    # Keyword forwarding keeps the arguments aligned with the callee's
    # signature and makes sure ``verbose`` is no longer dropped.
    return produceBestLRmodel(traindf, datamapper, target, configspace,
                              fig_path=fig_path, seed=seed, verbose=verbose)

In [None]:
# <api>
def optimizeBestModel(traindf, datamapper, target, configspace=None,
                      search_alg=None, fig_path=None,
                      n_calls=None, seed=27, verbose=0):
    """
    Run ``produceBestLRmodel`` with a default parameter grid when none is given.

    Parameters
    ----------
    traindf, datamapper, target
        Forwarded unchanged to ``produceBestLRmodel``.
    configspace
        Parameter grid for the search. A falsy value (``None`` or an empty
        dict) selects the default L1/L2 penalty grid.
    search_alg, n_calls
        Accepted but ignored by this implementation; the search is always
        the grid search done by ``produceBestLRmodel``.
    fig_path, seed, verbose
        Forwarded unchanged to ``produceBestLRmodel``.

    Returns
    -------
    Whatever ``produceBestLRmodel`` returns.
    """
    if not configspace:
        # Grid values are lists of candidates, hence the one-element list
        # for the fixed ``n_jobs`` setting.
        configspace = {'penalty': ['l1', 'l2'], 'n_jobs': [-1]}
    # ``verbose`` used to be dropped here; forward it with the rest.
    return produceBestLRmodel(traindf, datamapper, target, configspace,
                              fig_path=fig_path, seed=seed, verbose=verbose)