In [18]:
from skconfig.condition import EqualsCondition
from skconfig.validator import BaseValidator
from skconfig.forbidden import ForbiddenEquals, ForbiddenIn, ForbiddenAnd
from skconfig.parameter import StringParam, BoolParam, FloatIntervalParam, NoneParam, UnionParam, IntParam, IntIntervalParam
from skconfig.distribution import UniformBoolDistribution, UniformFloatDistribution, CategoricalDistribution, UnionDistribution, ConstantDistribution, UniformIntDistribution
from skconfig.sampler import Sampler
from sklearn.linear_model import LogisticRegression

## Validation

skconfig creates a DSL for defining the search space for a sklearn model. For example, we can defined a LogRegressionValidator as follows:

In [19]:
class LogRegressionValidator(BaseValidator):
    estimator = LogisticRegression
    
    penalty = StringParam("l2", "l1")
    dual = BoolParam()
    tol = FloatIntervalParam(lower=0, include_lower=False)
    C = FloatIntervalParam(lower=0)
    fit_intercept = BoolParam()
    intercept_scaling = FloatIntervalParam(lower=0, include_lower=False)
    class_weight = NoneParam()
    random_state = UnionParam(IntParam(), NoneParam())
    solver = StringParam("newton-cg", "lbfgs", "liblinear", "sag", "saga", "warn")
    max_iter = IntIntervalParam(lower=1)
    multi_class = StringParam("ovr", "multinomial", "auto", "warn")
    verbose = IntParam()
    warm_start = BoolParam()
    n_jobs = UnionParam(NoneParam(), IntIntervalParam(lower=-1))
    
    forbiddens = [
        ForbiddenAnd([ForbiddenEquals("penalty", "l1"), ForbiddenIn("solver", ["newton-cg", "sag", "lbfgs"])]),
        ForbiddenAnd([ForbiddenEquals("solver", "liblinear"), ForbiddenEquals("multi_class", "multinomial")]),
    ]

With this validator object, we can validate a set of parameters:

In [23]:
validator = LogRegressionValidator()

validator.validate_params(multi_class="ovr") # Does not raise

In [15]:
validator.validate_params(penalty="hello world")

InvalidParamChoices: penalty must be one of ('l2', 'l1')

In [4]:
validator.validate_params(solver="liblinear", multi_class="multinomial")

ForbiddenValue: solver and multi_class with value liblinear and multinomial is forbidden

In [5]:
validator.validate_params(penalty="l1", solver="sag")

ForbiddenValue: penalty and solver with value l1 and ['newton-cg', 'sag', 'lbfgs'] is forbidden

In [6]:
params_dict = {"penalty": "l1", "solver": "sag"}
validator.validate_params(**params_dict)

ForbiddenValue: penalty and solver with value l1 and ['newton-cg', 'sag', 'lbfgs'] is forbidden

Or validate a estimator:

In [7]:
est = LogisticRegression()
validator.validate_estimator(est)

## Sampling

To sample the parameter space, a skconfig has a DSL for defining the distribution to be sampled from:

In [8]:
validator = LogRegressionValidator()
sampler = Sampler(validator, 
             dual=UniformBoolDistribution(),
             C=UniformFloatDistribution(0.0, 1.0),
             solver=CategoricalDistribution(["newton-cg", "lbfgs", "liblinear", "sag", "saga"]),
             random_state=UnionDistribution(ConstantDistribution(None), UniformIntDistribution(0, 10)),
             penalty=CategoricalDistribution(["l2", "l1"]),
             multi_class=CategoricalDistribution(["ovr", "multinomial"])
)

In [12]:
params_sample = sampler.sample(5)
params_sample

[{'C': 0.49609443571092127,
  'dual': False,
  'multi_class': 'ovr',
  'penalty': 'l2',
  'solver': 'saga',
  'random_state': None},
 {'C': 0.7169968253334416,
  'dual': True,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'saga',
  'random_state': 0},
 {'C': 0.7899051166909798,
  'dual': False,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'lbfgs',
  'random_state': None},
 {'C': 0.4268914739541635,
  'dual': False,
  'multi_class': 'ovr',
  'penalty': 'l1',
  'solver': 'saga',
  'random_state': 2},
 {'C': 0.3945090918540334,
  'dual': True,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'lbfgs',
  'random_state': 4}]

Create an estimator from the first param from params_sample

In [10]:
est = LogisticRegression(**params_sample[0])
est.get_params()

{'C': 0.7215182701146491,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'newton-cg',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

## Serialization

The sampler can be serialized into a json:

In [24]:
import json
from IPython.display import JSON

serialized = sampler.to_dict()
json_serialized = json.dumps(serialized, indent=2)
JSON(serialized)

<IPython.core.display.JSON object>

In [25]:
sampler_dict = json.loads(json_serialized)
sampler_new = Sampler(validator).from_dict(sampler_dict)
sampler_new

dual: UniformBoolDistribution(default=True)
C: UniformFloatDistribution(lower=0.0, upper=1.0, default=0.0, log=False)
solver: CategoricalDistribution(choices=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], default=newton-cg)
random_state: UnionDistribution(dists=[{'type': 'ConstantDistribution', 'value': None}, {'lower': 0, 'upper': 10, 'default': 0, 'log': False, 'type': 'UniformIntDistribution'}])
penalty: CategoricalDistribution(choices=['l2', 'l1'], default=l2)
multi_class: CategoricalDistribution(choices=['ovr', 'multinomial'], default=ovr)