diff --git a/stability_selection/bootstrap.py b/stability_selection/bootstrap.py new file mode 100644 index 0000000..8f77eb3 --- /dev/null +++ b/stability_selection/bootstrap.py @@ -0,0 +1,76 @@ +""" +=============================== +Bootstrap helper functions +=============================== + +This module contains helper functions for stability_selection.py that do bootstrap sampling +""" + +import numpy as np + +from sklearn.utils.random import sample_without_replacement + + +__all__ = ['bootstrap_without_replacement', 'complementary_pairs_bootstrap'] + + +def bootstrap_without_replacement(n_samples, n_subsamples, random_state=None): + """ + Bootstrap without replacement. It is a wrapper around + sklearn.utils.random.sample_without_replacement. + + Parameters + ---------- + n_samples : int + Number of total samples + n_subsamples : int + Number of subsamples in the bootstrap sample + random_state : int, RandomState instance or None, optional, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Returns + ------- + out : array of size [n_subsamples,] + The sampled subsets of integer. The subset of selected integer might + not be randomized, see the method argument. + """ + return sample_without_replacement(n_samples, n_subsamples, random_state=random_state) + + +def complementary_pairs_bootstrap(n_samples, n_subsamples, random_state=None): + """ + Complementary pairs bootstrap. Two subsamples A and B are generated, such + that |A| = n_subsamples, the union of A and B equals {0, ..., n_samples - 1}, + and the intersection of A and B is the empty set. + + Parameters + ---------- + n_samples : int + Number of total samples + n_subsamples : int + Number of subsamples in the bootstrap sample + random_state : int, RandomState instance or None, optional, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Returns + ------- + A : array of size [n_subsamples,] + The sampled subsets of integer. The subset of selected integer might + not be randomized, see the method argument. + B : array of size [n_samples - n_subsamples,] + The complement of A. + """ + subsample = bootstrap_without_replacement(n_samples, n_subsamples, random_state) + complementary_subsample = np.setdiff1d(np.arange(n_samples), subsample) + + return subsample, complementary_subsample diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index 408f627..90a6053 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -21,13 +21,20 @@ from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline from sklearn.utils import check_X_y, check_array, safe_mask, check_random_state -from sklearn.utils.random import sample_without_replacement from sklearn.utils.validation import check_is_fitted from warnings import warn +from .bootstrap import bootstrap_without_replacement, complementary_pairs_bootstrap + __all__ = ['StabilitySelection', 'plot_stability_path'] +DEFAULT_LAMBDA_GRID = np.logspace(-5, -2, 25) +BOOTSTRAP_FUNC_MAPPING = { + 'subsample': bootstrap_without_replacement, + 'complementary_pairs': complementary_pairs_bootstrap +} + def _return_estimator_from_pipeline(pipeline): """Returns the final estimator in a Pipeline, or the estimator if it is not""" @@ -37,7 +44,17 @@ def _return_estimator_from_pipeline(pipeline): return pipeline -def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, threshold=None, random_state=None): +def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, n_samples, n_subsamples, random_state=None): + for _ in range(n_bootstrap_iterations): + subsample = bootstrap_func(n_samples, n_subsamples, random_state) + if isinstance(subsample, tuple): + for item in subsample: + yield item + else: + yield subsample + + +def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, threshold=None): """ Fits base_estimator on a bootstrap sample of the original data, and returns a mas of the variables that are \ selected by the fitted model. @@ -59,17 +76,24 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, thres lambda_value : float Value of the penalization parameter + threshold : string, float, optional default None + The threshold value to use for feature selection. Features whose + importance is greater or equal are kept while the others are + discarded. If "median" (resp. "mean"), then the ``threshold`` value is + the median (resp. the mean) of the feature importances. A scaling + factor (e.g., "1.25*mean") may also be used. If None and if the + estimator has a parameter penalty set to l1, either explicitly + or implicitly (e.g, Lasso), the threshold used is 1e-5. + Otherwise, "mean" is used by default. + Returns ------- selected_variables : array-like, shape = [n_features] Boolean mask of selected variables. """ - n_samples = X.shape[0] - bootstrap = sample_without_replacement(n_samples, n_samples // 2, random_state=random_state) - X_train, y_train = X[safe_mask(X, bootstrap), :], y[bootstrap] base_estimator.set_params(**{lambda_name: lambda_value}) - base_estimator.fit(X_train, y_train) + base_estimator.fit(X, y) # TODO: Reconsider if we really want to use SelectFromModel here or not selector_model = _return_estimator_from_pipeline(base_estimator) @@ -116,7 +140,7 @@ def plot_stability_path(stability_selection, threshold_highlight=None, **kwargs) class StabilitySelection(BaseEstimator, TransformerMixin): - """Stability selection fits the estimator `base_estimator` on bootstrap samples of the original data set, for + """Stability selection [1] fits the estimator `base_estimator` on bootstrap samples of the original data set, for different values of the regularization parameter for `base_estimator`. Variables that reliably get selected by the model in these bootstrap samples are considered to be stable variables. @@ -126,7 +150,7 @@ class StabilitySelection(BaseEstimator, TransformerMixin): The base estimator used for stability selection. lambda_name : str. - The name of the penalizatin parameter for the estimator `base_estimator`. + The name of the penalization parameter for the estimator `base_estimator`. lambda_grid : array-like. Grid of values of the penalization parameter to iterate over. @@ -134,10 +158,23 @@ class StabilitySelection(BaseEstimator, TransformerMixin): n_bootstrap_iterations : integer. Number of bootstrap samples to create. - threshold: float. + sample_fraction : float, optional + The fraction of samples to be used in each bootstrap sample. + Should be between 0 and 1. If 1, all samples are used. + + threshold : float. Threshold defining the minimum cutoff value for the stability scores. - bootstrap_threshold: string, float, optional default None + bootstrap_func : str or callable fun (default=bootstrap_without_replacement) + The function used to subsample the data. This parameter can be: + - A string, which must be one of + - 'subsample': For subsampling without replacement. + - 'complementary_pairs': For complementary pairs subsampling [2]. + - A function that takes n_samples, and a random state + as inputs and returns a list of sample indices in the range + (0, n_samples-1). By default, indices are uniformly subsampled. + + bootstrap_threshold : string, float, optional default None The threshold value to use for feature selection. Features whose importance is greater or equal are kept while the others are discarded. If "median" (resp. "mean"), then the ``threshold`` value is @@ -179,15 +216,27 @@ class StabilitySelection(BaseEstimator, TransformerMixin): ---------- stability_scores_ : array, shape = [n_features, n_alphas] Array of stability scores for each feature for each value of the penalization parameter. + + References + ---------- + + .. [1] Meinshausen, N. and Buhlmann, P., 2010. Stability selection. Journal of the + Royal Statistical Society: Series B (Statistical Methodology), 72(4), pp.417-473. + .. [2] Shah, R.D. and Samworth, R.J., 2013. Variable selection with error control: + another look at stability selection. Journal of the Royal Statistical Society: + Series B (Statistical Methodology), 75(1), pp.55-80. """ def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', lambda_grid=None, - n_bootstrap_iterations=100, threshold=0.6, - bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', random_state=None): + n_bootstrap_iterations=100, sample_fraction=0.5, threshold=0.6, + bootstrap_func=bootstrap_without_replacement, bootstrap_threshold=None, verbose=0, + n_jobs=1, pre_dispatch='2*n_jobs', random_state=None): self.base_estimator = base_estimator self.lambda_name = lambda_name self.lambda_grid = lambda_grid self.n_bootstrap_iterations = n_bootstrap_iterations + self.sample_fraction = sample_fraction self.threshold = threshold + self.bootstrap_func = bootstrap_func self.bootstrap_threshold = bootstrap_threshold self.verbose = verbose self.n_jobs = n_jobs @@ -196,7 +245,11 @@ def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name= def _validate_input(self): if not isinstance(self.n_bootstrap_iterations, int) or self.n_bootstrap_iterations <= 0: - raise ValueError('n_bootstrap_iterations should be a positive integer, got %s' % self.n_bootstrap_iterations) + raise ValueError('n_bootstrap_iterations should be a positive integer, got %s' % + self.n_bootstrap_iterations) + + if not isinstance(self.sample_fraction, float) or not (0.0 < self.sample_fraction <= 1.0): + raise ValueError('sample_fraction should be a float in (0, 1], got %s' % self.threshold) if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0): raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold) @@ -206,7 +259,16 @@ def _validate_input(self): 'with that name' % (self.lambda_name, self.base_estimator.__class__.__name__)) if self.lambda_grid is None: - self.lambda_grid = np.logspace(-5, -2, 25) + self.lambda_grid = DEFAULT_LAMBDA_GRID + + if isinstance(self.bootstrap_func, str): + if self.bootstrap_func not in BOOTSTRAP_FUNC_MAPPING.keys(): + raise ValueError('bootstrap_func is set to %s, but must be one of %s or a callable' % + (self.bootstrap_func, BOOTSTRAP_FUNC_MAPPING.keys())) + + self.bootstrap_func = BOOTSTRAP_FUNC_MAPPING[self.bootstrap_func] + elif not callable(self.bootstrap_func): + raise ValueError('bootstrap_func must be one of %s or a callable' % BOOTSTRAP_FUNC_MAPPING.keys()) def fit(self, X, y): """Fit the stability selection model on the given data. @@ -224,6 +286,7 @@ def fit(self, X, y): X, y = check_X_y(X, y) n_samples, n_variables = X.shape + n_subsamples = np.floor(self.sample_fraction * n_samples).astype(int) n_lambdas = self.lambda_grid.shape[0] base_estimator = clone(self.base_estimator) @@ -235,12 +298,16 @@ def fit(self, X, y): print("Fitting estimator for lambda = %.5f (%d / %d) on %d bootstrap samples" % (lambda_value, idx + 1, n_lambdas, self.n_bootstrap_iterations)) + bootstrap_samples = _bootstrap_generator(self.n_bootstrap_iterations, self.bootstrap_func, n_samples, + n_subsamples, random_state=random_state) + selected_variables = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch - )(delayed(_fit_bootstrap_sample)(clone(base_estimator), X, y, self.lambda_name, lambda_value, - threshold=self.bootstrap_threshold, random_state=random_state) - for _ in range(self.n_bootstrap_iterations)) + )(delayed(_fit_bootstrap_sample)(clone(base_estimator), X=X[safe_mask(X, subsample), :], + y=y[subsample], lambda_name=self.lambda_name, lambda_value=lambda_value, + threshold=self.bootstrap_threshold) + for subsample in bootstrap_samples) stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0) diff --git a/stability_selection/tests/test_common.py b/stability_selection/tests/test_common.py index 32349d3..dff0e68 100644 --- a/stability_selection/tests/test_common.py +++ b/stability_selection/tests/test_common.py @@ -51,4 +51,14 @@ def test_check_wrong_lambda_name(): def test_automatic_lambda_grid(): selector = StabilitySelection() selector._validate_input() - assert_array_equal(np.logspace(-5, -2, 25), selector.lambda_grid) \ No newline at end of file + assert_array_equal(np.logspace(-5, -2, 25), selector.lambda_grid) + + +@raises(ValueError) +def test_bootstrap_func(): + StabilitySelection(bootstrap_func='nonexistent')._validate_input() + + +@raises(ValueError) +def test_callable_bootstrap_func(): + StabilitySelection(bootstrap_func=0.5)._validate_input() diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 12a5648..4207475 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -68,4 +68,25 @@ def test_stability_selection_regression(): chosen_betas = selector.get_support(indices=True) - assert_almost_equal(important_betas, chosen_betas) \ No newline at end of file + assert_almost_equal(important_betas, chosen_betas) + + +def test_different_bootstrap(): + n, p, k = 500, 1000, 5 + + X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + + base_estimator = Pipeline([ + ('scaler', StandardScaler()), + ('model', Lasso()) + ]) + + lambdas_grid = np.logspace(-1, 1, num=10) + + selector = StabilitySelection(base_estimator=base_estimator, lambda_name='model__alpha', lambda_grid=lambdas_grid, + bootstrap_func='complementary_pairs') + selector.fit(X, y) + + chosen_betas = selector.get_support(indices=True) + + assert_almost_equal(important_betas, chosen_betas)