lasso.py

"""
This module contains a class `lasso`_ that implements
post selection for the lasso
as described in `post selection LASSO`_.
.. _covTest: http://arxiv.org/abs/1301.7161
.. _Kac Rice: http://arxiv.org/abs/1308.3020
.. _Spacings: http://arxiv.org/abs/1401.3889
.. _post selection LASSO: http://arxiv.org/abs/1311.6238
.. _sample carving: http://arxiv.org/abs/1410.2597
"""
from __future__ import division

import warnings, functools
from copy import copy

import numpy as np
import pandas as pd
from scipy.stats import norm as ndist, t as tdist
from scipy.linalg import block_diag

from regreg.api import (
    glm,
    weighted_l1norm,
    simple_problem,
    smooth_sum,
    squared_error,
    identity_quadratic,
    quadratic_loss,
)

from .sqrt_lasso import solve_sqrt_lasso, estimate_sigma
from .debiased_lasso import debiasing_matrix, pseudoinverse_debiasing_matrix

from ..constraints.affine import (
    constraints,
    selection_interval,
    interval_constraints,
    sample_from_constraints,
    gibbs_test,
    stack,
)

from ..distributions.discrete_family import discrete_family
from ..truncated.gaussian import truncated_gaussian_old as TG
from ..glm import pairs_bootstrap_glm


class lasso(object):
    r"""
    A class for the LASSO for post-selection inference.
    The problem solved is
    .. math::
        \text{minimize}_{\beta} \frac{1}{2n} \|y-X\beta\|^2_2 +
            \lambda \|\beta\|_1
    where $\lambda$ is `lam`.
    """

    def __init__(
        self,
        loglike,
        feature_weights,
        covariance_estimator=None,
        ignore_inactive_constraints=False,
    ):
        r"""
        Create a new post-selection dor the LASSO problem
        Parameters
        ----------
        loglike : `regreg.smooth.glm.glm`
            A (negative) log-likelihood as implemented in `regreg`.
        feature_weights : np.ndarray
            Feature weights for L-1 penalty. If a float,
            it is brodcast to all features.
        covariance_estimator : callable (optional)
            If None, use the parameteric
            covariance estimate of the selected model.
        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """

        self.loglike = loglike
        if np.asarray(feature_weights).shape == ():
            feature_weights = np.ones(loglike.shape) * feature_weights
        self.feature_weights = np.asarray(feature_weights)

        self.covariance_estimator = covariance_estimator
        self.ignore_inactive_constraints = ignore_inactive_constraints

    def fit(self, lasso_solution=None, solve_args={"tol": 1.0e-12, "min_its": 50}):
        """
        Fit the lasso using `regreg`.
        This sets the attributes `soln`, `onestep` and
        forms the constraints necessary for post-selection inference
        by calling `form_constraints()`.
        Parameters
        ----------
        lasso_solution : optional
             If not None, this is taken to be the solution
             of the optimization problem. No checks
             are done, though the implied affine
             constraints will generally not be satisfied.
        solve_args : keyword args
             Passed to `regreg.problems.simple_problem.solve`.
        Returns
        -------
        soln : np.float
             Solution to lasso.

        Notes
        -----
        If `self` already has an attribute `lasso_solution`
        this will be taken to be the solution and
        no optimization problem will be solved. Supplying
        the optional argument `lasso_solution` will
        overwrite `self`'s `lasso_solution`.
        """

        self._penalty = weighted_l1norm(self.feature_weights, lagrange=1.0)
        if lasso_solution is None and not hasattr(self, "lasso_solution"):
            problem = simple_problem(self.loglike, self._penalty)
            self.lasso_solution = problem.solve(**solve_args)
        elif lasso_solution is not None:
            self.lasso_solution = lasso_solution

        lasso_solution = (
            self.lasso_solution
        )  # shorthand after setting it correctly above

        if not np.all(lasso_solution == 0):
            self.active = np.nonzero(lasso_solution != 0)[0]
            self.inactive = lasso_solution == 0
            self.active_signs = np.sign(lasso_solution[self.active])
            self._active_soln = lasso_solution[self.active]
            H = self.loglike.hessian(self.lasso_solution)
            H_AA = H[self.active][:, self.active]
            H_AAinv = np.linalg.inv(H_AA)
            Q = self.loglike.quadratic
            G_Q = Q.objective(self.lasso_solution, "grad")
            G = self.loglike.gradient(self.lasso_solution) + G_Q
            G_A = G[self.active]
            G_I = self._G_I = G[self.inactive]
            dbeta_A = H_AAinv.dot(G_A)
            self.onestep_estimator = self._active_soln - dbeta_A
            self.active_penalized = self.feature_weights[self.active] != 0

            if self.active_penalized.sum():
                self._constraints = constraints(
                    -np.diag(self.active_signs)[self.active_penalized],
                    (self.active_signs * dbeta_A)[self.active_penalized],
                    covariance=H_AAinv,
                )
            else:
                self._constraints = constraints(
                    np.identity(self.active.shape[0]),
                    1.0e12 * np.ones(self.active.shape[0]) * H_AAinv.max(),
                    # XXX np.inf seems to fail tests
                    covariance=H_AAinv,
                )

            if self.inactive.sum():
                # inactive constraints

                H_IA = H[self.inactive][:, self.active]
                H_II = H[self.inactive][:, self.inactive]
                inactive_cov = H_II - H_IA.dot(H_AAinv).dot(H_IA.T)
                irrepresentable = H_IA.dot(H_AAinv)
                inactive_mean = irrepresentable.dot(-G_A)
                self._inactive_constraints = constraints(
                    np.vstack(
                        [
                            np.identity(self.inactive.sum()),
                            -np.identity(self.inactive.sum()),
                        ]
                    ),
                    np.hstack(
                        [
                            self.feature_weights[self.inactive],
                            self.feature_weights[self.inactive],
                        ]
                    ),
                    covariance=inactive_cov,
                    mean=inactive_mean,
                )
                if not self._inactive_constraints(G_I):
                    warnings.warn(
                        "inactive constraint of KKT conditions not satisfied -- perhaps need to solve with more accuracy"
                    )

                if self.covariance_estimator is not None:
                    # make full constraints

                    # A: active
                    # I: inactive
                    # F: full, (A,I) stacked

                    _cov_FA = self.covariance_estimator(
                        self.onestep_estimator, self.active, self.inactive
                    )

                    _cov_IA = _cov_FA[len(self.active) :]
                    _cov_AA = _cov_FA[: len(self.active)]

                    _beta_bar = self.onestep_estimator

                    if not self.ignore_inactive_constraints:
                        # X_{-E}^T(y - X_E \bar{\beta}_E)

                        _inactive_score = -G_I - inactive_mean

                        _indep_linear_part = _cov_IA.dot(np.linalg.inv(_cov_AA))

                        # we "fix" _nuisance, effectively conditioning on it

                        _nuisance = _inactive_score - _indep_linear_part.dot(_beta_bar)
                        _upper_lim = (
                            self.feature_weights[self.inactive]
                            - _nuisance
                            - inactive_mean
                        )
                        _lower_lim = (
                            _nuisance
                            + self.feature_weights[self.inactive]
                            + inactive_mean
                        )

                        _upper_linear = _indep_linear_part
                        _lower_linear = -_indep_linear_part

                        C = self._constraints
                        _full_linear = np.vstack(
                            [C.linear_part, _upper_linear, _lower_linear]
                        )

                        _full_offset = np.hstack([C.offset, _upper_lim, _lower_lim])

                        self._constraints = constraints(
                            _full_linear, _full_offset, covariance=_cov_AA
                        )
                    else:
                        self._constraints.covariance[:] = _cov_AA

                    if not self._constraints(_beta_bar):
                        warnings.warn(
                            "constraints of KKT conditions on one-step estimator "
                            + " not satisfied -- perhaps need to solve with more"
                            + "accuracy"
                        )

            else:
                self._inactive_constraints = None
        else:
            self.active = []
            self.active_signs = []
            self.inactive = np.arange(lasso_solution.shape[0])
            self._constraints = None
            self._inactive_constraints = None
        return self.lasso_solution

    def summary(
        self,
        alternative="twosided",
        level=0.95,
        compute_intervals=False,
        dispersion=None,
        truth=None,
    ):
        """
        Summary table for inference adjusted for selection.

        Parameters
        ----------

        alternative : str
            One of ["twosided","onesided"]

        level : float
            Form level*100% selective confidence intervals.

        compute_intervals : bool
            Should we compute confidence intervals?

        dispersion : float
            Scalar to multiply `self.constraints.covaraince`

        truth : np.array
            True values of each beta for selected variables. If not None, a column 'pval' are p-values
            computed under these corresponding null hypotheses.

        Returns
        -------
        pval_summary : np.recarray
            Array with one entry per active variable.
            Columns are 'variable', 'pval', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'.
        """

        if alternative not in ["twosided", "onesided"]:
            raise ValueError("alternative must be one of ['twosided', 'onesided']")

        if truth is None:
            truth = np.zeros_like(self.active_signs)

        if dispersion is None:
            dispersion = 1.0

        result = []
        C = self.constraints
        if C is not None:
            _cov = C.covariance.copy()
            C.covariance = _cov * dispersion
            one_step = self.onestep_estimator
            for i in range(one_step.shape[0]):
                eta = np.zeros_like(one_step)
                eta[i] = self.active_signs[i]
                _alt = {"onesided": "greater", "twosided": "twosided"}[alternative]
                if C.linear_part.shape[0] > 0:  # there were some constraints
                    _pval = C.pivot(
                        eta, one_step, null_value=truth[i], alternative=_alt
                    )
                else:
                    obs = (eta * one_step).sum()
                    sd = np.sqrt((eta * C.covariance.dot(eta)))
                    Z = obs / sd
                    _pval = 2 * ndist.sf(np.fabs(Z))

                alpha = 1 - level
                if compute_intervals:
                    if C.linear_part.shape[0] > 0:  # there were some constraints
                        try:
                            _interval = C.interval(eta, one_step, alpha=alpha)
                        except OverflowError:
                            _interval = (-np.inf, np.inf)
                        _interval = sorted(
                            [
                                _interval[0] * self.active_signs[i],
                                _interval[1] * self.active_signs[i],
                            ]
                        )
                    else:
                        _interval = (
                            obs - ndist.ppf(1 - alpha / 2) * sd,
                            obs + ndist.ppf(1 - alpha / 2) * sd,
                        )
                else:
                    _interval = [np.nan, np.nan]
                _bounds = np.array(C.bounds(eta, one_step))
                sd = _bounds[-1]
                lower_trunc, est, upper_trunc = sorted(
                    _bounds[:3] * self.active_signs[i]
                )

                result.append(
                    (
                        self.active[i],
                        _pval,
                        self.lasso_solution[self.active[i]],
                        one_step[i],
                        _interval[0],
                        _interval[1],
                        lower_trunc,
                        upper_trunc,
                        sd,
                    )
                )
            C.covariance = _cov

        df = pd.DataFrame(
            index=self.active,
            data=dict(
                [
                    (n, d)
                    for n, d in zip(
                        [
                            "variable",
                            "pvalue",
                            "lasso",
                            "onestep",
                            "lower_confidence",
                            "upper_confidence",
                            "lower_trunc",
                            "upper_trunc",
                            "sd",
                        ],
                        np.array(result).T,
                    )
                ]
            ),
        )
        if len(df) > 0:
            df["variable"] = df["variable"].astype(int)
        return df

    @property
    def soln(self):
        """
        Solution to the lasso problem, set by `fit` method.
        """
        if not hasattr(self, "lasso_solution"):
            self.fit()
        return self.lasso_solution

    @property
    def constraints(self):
        """
        Affine constraints for this LASSO problem.
        These are the constraints determined only
        by the active block.
        """
        return self._constraints

    @classmethod
    def gaussian(
        klass,
        X,
        Y,
        feature_weights,
        sigma=1.0,
        covariance_estimator=None,
        quadratic=None,
    ):
        r"""
        Squared-error LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\lambda$ is `feature_weights`.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        Y : ndarray
            Shape (n,) -- the response.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        sigma : float (optional)
            Noise variance. Set to 1 if `covariance_estimator` is not None.
            This scales the loglikelihood by `sigma**(-2)`.
        covariance_estimator : callable (optional)
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of some of the
        rows and columns of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        if covariance_estimator is not None:
            sigma = 1.0
        loglike = glm.gaussian(X, Y, coef=1.0 / sigma**2, quadratic=quadratic)
        return klass(
            loglike,
            np.asarray(feature_weights) / sigma**2,
            covariance_estimator=covariance_estimator,
        )

    @classmethod
    def logistic(
        klass,
        X,
        successes,
        feature_weights,
        trials=None,
        covariance_estimator=None,
        quadratic=None,
    ):
        r"""
        Logistic LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \ell(X\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\ell$ is the negative of the logistic
        log-likelihood (half the logistic deviance)
        and $\lambda$ is `feature_weights`.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        successes : ndarray
            Shape (n,) -- response vector. An integer number of successes.
            For data that is proportions, multiply the proportions
            by the number of trials first.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        trials : ndarray (optional)
            Number of trials per response, defaults to
            ones the same shape as Y.
        covariance_estimator : optional
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        loglike = glm.logistic(X, successes, trials=trials, quadratic=quadratic)
        return klass(
            loglike, feature_weights, covariance_estimator=covariance_estimator
        )

    @classmethod
    def cox(
        klass,
        X,
        times,
        status,
        feature_weights,
        covariance_estimator=None,
        quadratic=None,
    ):
        r"""
        Cox proportional hazards LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \ell^{\text{Cox}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\ell^{\text{Cox}}$ is the
        negative of the log of the Cox partial
        likelihood and $\lambda$ is `feature_weights`.
        Uses Efron's tie breaking method.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        times : ndarray
            Shape (n,) -- the survival times.
        status : ndarray
            Shape (n,) -- the censoring status.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        covariance_estimator : optional
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        loglike = glm.cox(X, times, status, quadratic=quadratic)
        return klass(
            loglike, feature_weights, covariance_estimator=covariance_estimator
        )

    @classmethod
    def poisson(
        klass, X, counts, feature_weights, covariance_estimator=None, quadratic=None
    ):
        r"""
        Poisson log-linear LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\ell^{\text{Poisson}}$ is the negative
        of the log of the Poisson likelihood (half the deviance)
        and $\lambda$ is `feature_weights`.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        counts : ndarray
            Shape (n,) -- the response.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        covariance_estimator : optional
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        loglike = glm.poisson(X, counts, quadratic=quadratic)
        return klass(
            loglike, feature_weights, covariance_estimator=covariance_estimator
        )

    @classmethod
    def sqrt_lasso(
        klass,
        X,
        Y,
        feature_weights,
        quadratic=None,
        covariance="parametric",
        sigma_estimate="truncated",
        solve_args={"min_its": 200},
    ):
        r"""
        Use sqrt-LASSO to choose variables.
        Objective function is
        $$
        \beta \mapsto \|Y-X\beta\|_2 + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\lambda$ is `feature_weights`. After solving the problem
        treat as if `gaussian` with implied variance and choice of
        multiplier. See arxiv.org/abs/1504.08031 for details.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        Y : ndarray
            Shape (n,) -- the response.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        covariance : str
            One of 'parametric' or 'sandwich'. Method
            used to estimate covariance for inference
            in second stage.
        sigma_estimate : str
            One of 'truncated' or 'OLS'. Method
            used to estimate $\sigma$ when using
            parametric covariance.
        solve_args : dict
            Arguments passed to solver.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        Unlike other variants of LASSO, this
        solves the problem on construction as the active
        set is needed to find equivalent gaussian LASSO.
        Assumes parametric model is correct for inference,
        i.e. does not accept a covariance estimator.
        """

        n, p = X.shape

        if np.asarray(feature_weights).shape == ():
            feature_weights = np.ones(p) * feature_weights
        feature_weights = np.asarray(feature_weights)

        # TODO: refits sqrt lasso more than once -- make an override for avoiding refitting?

        soln = solve_sqrt_lasso(
            X, Y, weights=feature_weights, quadratic=quadratic, solve_args=solve_args
        )[0]

        # find active set, and estimate of sigma

        active = soln != 0
        nactive = active.sum()

        if nactive:
            subgrad = np.sign(soln[active]) * feature_weights[active]
            X_E = X[:, active]
            X_Ei = np.linalg.pinv(X_E)
            sigma_E = np.linalg.norm(Y - X_E.dot(X_Ei.dot(Y))) / np.sqrt(n - nactive)
            multiplier = np.sqrt(
                (n - nactive) / (1 - np.linalg.norm(X_Ei.T.dot(subgrad)) ** 2)
            )

            # check truncation interval for sigma_E

            # the KKT conditions imply an inequality like
            # \hat{\sigma}_E \cdot LHS \leq RHS

            penalized = feature_weights[active] != 0

            if penalized.sum():
                D_E = np.sign(soln[active][penalized])  # diagonal matrix of signs
                LHS = D_E * np.linalg.solve(X_E.T.dot(X_E), subgrad)[penalized]
                RHS = D_E * X_Ei.dot(Y)[penalized]

                ratio = RHS / LHS

                group1 = LHS > 0
                upper_bound = np.inf
                if group1.sum():
                    upper_bound = min(
                        upper_bound, np.min(ratio[group1])
                    )  # necessarily these will have RHS > 0

                group2 = (LHS <= 0) * (
                    RHS <= 0
                )  # we can ignore the other possibility since this gives a lower bound of 0
                lower_bound = 0
                if group2.sum():
                    lower_bound = max(lower_bound, np.max(ratio[group2]))

                upper_bound /= multiplier
                lower_bound /= multiplier

            else:
                lower_bound = 0
                upper_bound = np.inf

            _sigma_estimator_args = (sigma_E, n - nactive, lower_bound, upper_bound)

            if sigma_estimate == "truncated":
                _sigma_hat = estimate_sigma(*_sigma_estimator_args)
            elif sigma_estimate == "OLS":
                _sigma_hat = sigma_E
            else:
                raise ValueError('sigma_estimate must be one of ["truncated", "OLS"]')
        else:
            _sigma_hat = np.linalg.norm(Y) / np.sqrt(n)
            multiplier = np.sqrt(n)
            sigma_E = _sigma_hat

        # XXX how should quadratic be changed?
        # multiply everything by sigma_E?

        if quadratic is not None:
            qc = quadratic.collapsed()
            qc.coef *= np.sqrt(n - nactive) / sigma_E
            qc.linear_term *= np.sqrt(n - nactive) / sigma_E
            quadratic = qc

        loglike = glm.gaussian(X, Y, quadratic=quadratic)

        if covariance == "parametric":
            cov_est = glm_parametric_estimator(loglike, dispersion=_sigma_hat)
        elif covariance == "sandwich":
            cov_est = glm_sandwich_estimator(loglike, B=2000)
        else:
            raise ValueError('covariance must be one of ["parametric", "sandwich"]')

        L = klass(
            loglike,
            feature_weights * multiplier * sigma_E,
            covariance_estimator=cov_est,
            ignore_inactive_constraints=True,
        )

        # these arguments are reused for data carving

        if nactive:
            L._sigma_hat = _sigma_hat
            L._sigma_estimator_args = _sigma_estimator_args
            L._weight_multiplier = multiplier * sigma_E
            L._multiplier = multiplier
            L.lasso_solution = soln

        return L


def nominal_intervals(lasso_obj, level=0.95):
    """
    Intervals for OLS parameters of active variables
    that have not been adjusted for selection.
    """
    unadjusted_intervals = []
    alpha = 1 - level
    if lasso_obj.active is not []:
        SigmaE = lasso_obj.constraints.covariance
        for i in range(lasso_obj.active.shape[0]):
            eta = np.ones_like(lasso_obj.onestep_estimator)
            eta[i] = 1.0
            center = lasso_obj.onestep_estimator[i]
            width = ndist.ppf(1 - alpha / 2.0) * np.sqrt(SigmaE[i, i])
            _interval = [center - width, center + width]
            unadjusted_intervals.append((lasso_obj.active[i], eta, center, _interval))
    return unadjusted_intervals


def glm_sandwich_estimator(loss, B=1000):
    """
    Bootstrap estimator of covariance of

    .. math::

        (\bar{\beta}_E, X_{-E}^T(y-X_E\bar{\beta}_E)
    the OLS estimator of population regression
    coefficients and inactive correlation with the
    OLS residuals.
    Returns
    -------
    estimator : callable
        Takes arguments (beta, active, inactive)
    """

    def _estimator(loss, B, beta, active, inactive):
        X, Y = loss.data
        n, p = X.shape  # shorthand

        beta_full = np.zeros(p)
        beta_full[active] = beta

        # make sure active / inactive are bool

        active_bool = np.zeros(p, np.bool)
        active_bool[active] = 1

        inactive_bool = np.zeros(p, np.bool)
        inactive_bool[inactive] = 1

        bootstrapper = pairs_bootstrap_glm(
            loss, active_bool, beta_full=beta_full, inactive=inactive_bool
        )[0]

        nactive = active_bool.sum()
        first_moment = np.zeros(p)
        second_moment = np.zeros((p, nactive))

        for b in range(B):
            indices = np.random.choice(n, n, replace=True)
            Z_star = bootstrapper(indices)
            first_moment += Z_star
            second_moment += np.multiply.outer(Z_star, Z_star[:nactive])

        first_moment /= B
        second_moment /= B

        cov = second_moment - np.multiply.outer(first_moment, first_moment[:nactive])

        return cov

    return functools.partial(_estimator, loss, B)


def glm_parametric_estimator(loglike, dispersion=None):
    """
    Parametric estimator of covariance of

    .. math::

        (\bar{\beta}_E, X_{-E}^T(y-\nabla \ell(X_E\bar{\beta}_E))
    the OLS estimator of population regression
    coefficients and inactive correlation with the
    OLS residuals.
    If `sigma` is None, it computes usual unbiased estimate
    of variance in Gaussian model and plugs it in,
    assuming parametric form is correct.
    Returns
    -------
    estimator : callable
        Takes arguments (beta, active, inactive)
    """

    def _estimator(loglike, dispersion, beta, active, inactive):
        X, Y = loglike.data
        n, p = X.shape
        nactive = len(active)

        linear_predictor = X[:, active].dot(beta)
        W = loglike.saturated_loss.hessian(linear_predictor)
        Sigma_A = X[:, active].T.dot(W[:, None] * X[:, active])
        Sigma_Ainv = np.linalg.inv(Sigma_A)

        _unscaled = np.zeros((p, len(active)))
        _unscaled[:nactive] = Sigma_Ainv

        # the lower block is left at 0 because
        # under the parametric model, these pieces
        # are independent

        # if no dispersion, use Pearson's X^2

        if dispersion is None:
            eta = X[:, active].dot(beta)
            dispersion = (
                (loglike.saturated_loss.smooth_objective(eta, "grad")) ** 2 / W
            ).sum() / (n - nactive)

        _unscaled *= dispersion**2

        return _unscaled

    return functools.partial(_estimator, loglike, dispersion)


def standard_lasso(X, y, sigma=1, lam_frac=1.0, **solve_args):
    """
    Fit a LASSO with a default choice of Lagrange parameter
    equal to `lam_frac` times $\sigma \cdot E(|X^T\epsilon|)$
    with $\epsilon$ IID N(0,1).
    Parameters
    ----------
    y : np.float
        Response vector
    X : np.float
        Design matrix
    sigma : np.float
        Noise variance
    lam_frac : float
        Multiplier for choice of $\lambda$
    solve_args : keyword args
        Passed to `regreg.problems.simple_problem.solve`.
    Returns
    -------
    lasso_selection : `lasso`
         Instance of `lasso` after fitting.
    """
    n, p = X.shape
    lam = (
        lam_frac
        * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0))
        * sigma
    )
    lasso_selector = lasso.gaussian(X, y, lam, sigma=sigma)
    lasso_selector.fit(**solve_args)

    return lasso_selector


class data_carving(lasso):
    """
    Notes
    -----
    Even if a covariance estimator is supplied,
    we assume that we can drop inactive constraints,
    i.e. the same (asymptotic) independence that
    holds for parametric model is assumed to hold here
    as well.
    """

    def __init__(
        self,
        loglike_select,
        loglike_inference,
        loglike_full,
        feature_weights,
        covariance_estimator=None,
    ):
        lasso.__init__(
            self,
            loglike_select,
            feature_weights,
            covariance_estimator=covariance_estimator,
        )
        self.loglike_inference = loglike_inference
        self.loglike_full = loglike_full

    @classmethod
    def gaussian(
        klass, X, Y, feature_weights, split_frac=0.9, sigma=1.0, stage_one=None
    ):
        n, p = X.shape
        if stage_one is None:
            splitn = int(n * split_frac)
            indices = np.arange(n)
            np.random.shuffle(indices)
            stage_one = indices[:splitn]
            stage_two = indices[splitn:]
        else:
            stage_two = [i for i in np.arange(n) if i not in stage_one]
        Y1, X1 = Y[stage_one], X[stage_one]
        Y2, X2 = Y[stage_two], X[stage_two]

        loglike = glm.gaussian(X, Y, coef=1.0 / sigma**2)
        loglike1 = glm.gaussian(X1, Y1, coef=1.0 / sigma**2)
        loglike2 = glm.gaussian(X2, Y2, coef=1.0 / sigma**2)

        return klass(loglike1, loglike2, loglike, feature_weights / sigma**2)

    @classmethod
    def logistic(
        klass,
        X,
        successes,
        feature_weights,
        trials=None,
        split_frac=0.9,
        sigma=1.0,
        stage_one=None,
    ):
        n, p = X.shape
        if stage_one is None:
            splitn = int(n * split_frac)
            indices = np.arange(n)
            np.random.shuffle(indices)
            stage_one = indices[:splitn]
            stage_two = indices[splitn:]
        else:
            stage_two = [i for i in np.arange(n) if i not in stage_one]

        if trials is None:
            trials = np.ones_like(successes)

        successes1, X1, trials1 = successes[stage_one], X[stage_one], trials[stage_one]
        successes2, X2, trials2 = successes[stage_two], X[stage_two], trials[stage_two]

        loglike = glm.logistic(X, successes, trials=trials)
        loglike1 = glm.logistic(X1, successes1, trials=trials1)
        loglike2 = glm.logistic(X2, successes2, trials=trials2)

        return klass(loglike1, loglike2, loglike, feature_weights)

    @classmethod
    def poisson(
        klass, X, counts, feature_weights, split_frac=0.9, sigma=1.0, stage_one=None
    ):
        n, p = X.shape
        if stage_one is None:
            splitn = int(n * split_frac)
            indices = np.arange(n)
            np.random.shuffle(indices)
            stage_one = indices[:splitn]
            stage_two = indices[splitn:]
        else:
            stage_two = [i for i in np.arange(n) if i not in stage_one]

        counts1, X1 = counts[stage_one], X[stage_one]
        counts2, X2 = counts[stage_two], X[stage_two]

        loglike = glm.poisson(X, counts)
        loglike1 = glm.poisson(X1, counts1)
        loglike2 = glm.poisson(X2, counts2)

        return klass(loglike1, loglike2, loglike, feature_weights)

    @classmethod
    def cox(
        klass,
        X,
        times,
        status,
        feature_weights,
        split_frac=0.9,
        sigma=1.0,
        stage_one=None,
    ):
        n, p = X.shape
        if stage_one is None:
            splitn = int(n * split_frac)
            indices = np.arange(n)
            np.random.shuffle(indices)
            stage_one = indices[:splitn]
            stage_two = indices[splitn:]
        else:
            stage_two = [i for i in np.arange(n) if i not in stage_one]

        times1, X1, status1 = times[stage_one], X[stage_one], status[stage_one]
        times2, X2, status2 = times[stage_two], X[stage_two], status[stage_two]

        loglike = glm.cox(X, times, status)
        loglike1 = glm.cox(X1, times1, status1)
        loglike2 = glm.cox(X2, times2, status2)

        return klass(loglike1, loglike2, loglike, feature_weights)

    @classmethod
    def sqrt_lasso(
        klass,
        X,
        Y,
        feature_weights,
        split_frac=0.9,
        stage_one=None,
        solve_args={"min_its": 200},
    ):
        n, p = X.shape

        if stage_one is None:
            splitn = int(n * split_frac)
            indices = np.arange(n)
            np.random.shuffle(indices)
            stage_one = indices[:splitn]
            stage_two = indices[splitn:]
        else:
            stage_two = [i for i in np.arange(n) if i not in stage_one]

        Y1, X1 = Y[stage_one], X[stage_one]
        Y2, X2 = Y[stage_two], X[stage_two]

        # TODO: refits sqrt lasso more than once

        L = lasso.sqrt_lasso(X1, Y1, feature_weights, solve_args=solve_args)
        soln = L.fit(solve_args=solve_args)
        _sigma_E1, _df1, _lower, _upper = L._sigma_estimator_args
        _df2 = max(len(stage_two) - len(L.active), 0)
        if _df2:
            X_E2 = X2[:, L.active]
            _sigma_E2 = np.linalg.norm(Y2 - X_E2.dot(np.linalg.pinv(X_E2).dot(Y2))) / (
                len(stage_two) - len(L.active)
            )
            _sigma_hat = estimate_sigma(
                np.sqrt(
                    (_sigma_E1**2 * _df1 + _sigma_E2**2 * _df2) / (_df1 + _df2)
                ),
                _df1,
                _lower,
                _upper,
                untruncated_df=_df2,
            )
        else:
            _sigma_hat = L._sigma_hat

        cov_est = glm_parametric_estimator(L.loglike, dispersion=_sigma_hat)

        loglike = glm.gaussian(X, Y)
        loglike1 = glm.gaussian(X1, Y1)
        loglike2 = glm.gaussian(X2, Y2)

        L = klass(
            loglike1,
            loglike2,
            loglike,
            feature_weights * L._weight_multiplier,
            covariance_estimator=cov_est,
        )
        L.lasso_solution = soln
        return L

    def fit(self, solve_args={"tol": 1.0e-12, "min_its": 50}):
        lasso.fit(self, solve_args=solve_args)

        n1 = self.loglike.get_data()[0].shape[0]
        n = self.loglike_full.get_data()[0].shape[0]

        _feature_weights = self.feature_weights.copy()
        _feature_weights[self.active] = 0.0
        _feature_weights[self.inactive] = np.inf

        _unpenalized_problem = simple_problem(
            self.loglike_full, weighted_l1norm(_feature_weights, lagrange=1.0)
        )
        _unpenalized = _unpenalized_problem.solve(**solve_args)
        _unpenalized_active = _unpenalized[self.active]

        s = len(self.active)

        if self.covariance_estimator is None:
            H = self.loglike_full.hessian(_unpenalized)
            H_AA = H[self.active][:, self.active]
            _cov_block = np.linalg.inv(H_AA)
            self._carve_invcov = H_AA
        else:
            C = self.covariance_estimator(
                _unpenalized_active, self.active, self.inactive
            )
            _cov_block = C[: len(self.active)][:, : len(self.active)]
            self._carve_invcov = np.linalg.pinv(_cov_block)

        _subsample_block = (n * 1.0 / n1) * _cov_block
        _carve_cov = np.zeros((2 * s, 2 * s))
        _carve_cov[:s][:, :s] = _cov_block
        _carve_cov[s:][:, :s] = _subsample_block
        _carve_cov[:s][:, s:] = _subsample_block
        _carve_cov[s:][:, s:] = _subsample_block

        _carve_linear_part = self._constraints.linear_part.dot(np.identity(2 * s)[s:])
        _carve_offset = self._constraints.offset
        self._carve_constraints = constraints(
            _carve_linear_part, _carve_offset, covariance=_carve_cov
        )

        self._carve_feasible = np.hstack([_unpenalized_active, self.onestep_estimator])
        self._unpenalized_active = _unpenalized_active

    def hypothesis_test(
        self, variable, burnin=2000, ndraw=8000, compute_intervals=False
    ):
        if variable not in self.active:
            raise ValueError("expecting an active variable")

        # shorthand
        j = list(self.active).index(variable)
        twice_s = self._carve_constraints.linear_part.shape[1]
        s = sparsity = int(twice_s / 2)

        keep = np.ones(s, np.bool)
        keep[j] = 0
        conditioning = self._carve_invcov.dot(np.identity(twice_s)[:s])[keep]

        contrast = np.zeros(2 * s)
        contrast[j] = 1.0

        # condition to remove dependence on nuisance parameters
        if len(self.active) > 1:
            conditional_law = self._carve_constraints.conditional(
                conditioning, conditioning.dot(self._carve_feasible)
            )
        else:
            conditional_law = self._carve_constraints

        observed = (contrast * self._carve_feasible).sum()

        if self._carve_constraints.linear_part.shape[0] > 0:
            _, _, _, family = gibbs_test(
                conditional_law,
                self._carve_feasible,
                contrast,
                sigma_known=True,
                white=False,
                ndraw=ndraw,
                burnin=burnin,
                how_often=10,
                UMPU=False,
            )

            pval = family.cdf(0, observed)
            pval = 2 * min(pval, 1 - pval)

        else:  # only unpenalized coefficients were nonzero, no constraints
            sd = np.sqrt(
                (contrast * self._carve_constraints.covariance.dot(contrast)).sum()
            )
            Z = observed / sd
            pval = 2 * ndist.sf(np.fabs(Z))

        return pval


class data_splitting(data_carving):
    def fit(self, solve_args={"tol": 1.0e-12, "min_its": 500}, use_full_cov=True):
        lasso.fit(self, solve_args=solve_args)

        _feature_weights = self.feature_weights.copy()
        _feature_weights[self.active] = 0.0
        _feature_weights[self.inactive] = np.inf

        _unpenalized_problem = simple_problem(
            self.loglike_inference, weighted_l1norm(_feature_weights, lagrange=1.0)
        )
        _unpenalized = _unpenalized_problem.solve(**solve_args)

        self._unpenalized_active = _unpenalized[self.active]

        if use_full_cov:
            H = self.loglike_full.hessian(_unpenalized)
            n_inference = self.loglike_inference.data[0].shape[0]
            n_full = self.loglike_full.data[0].shape[0]
            H *= 1.0 * n_inference / n_full
        else:
            H = self.loglike_inference.hessian(_unpenalized)

        H_AA = H[self.active][:, self.active]
        self._cov_inference = np.linalg.inv(H_AA)

    def hypothesis_test(self, variable, df=np.inf):
        """
        Wald test for an active variable.
        """
        if variable not in self.active:
            raise ValueError("expecting an active variable")

        # shorthand
        j = list(self.active).index(variable)

        Z = self._unpenalized_active[j] / np.sqrt(self._cov_inference[j, j])

        if df == np.inf:
            return 2 * ndist.sf(np.abs(Z))
        else:
            return 2 * tdist.sf(np.abs(Z), df)


def _data_carving_deprec(
    X,
    y,
    lam_frac=2.0,
    sigma=1.0,
    stage_one=None,
    split_frac=0.9,
    coverage=0.95,
    ndraw=8000,
    burnin=2000,
    splitting=False,
    compute_intervals=True,
    UMPU=False,
):
    """
    Fit a LASSO with a default choice of Lagrange parameter
    equal to `lam_frac` times $\sigma \cdot E(|X^T\epsilon|)$
    with $\epsilon$ IID N(0,1) on a proportion (`split_frac`) of
    the data.
    Parameters
    ----------
    y : np.float
        Response vector
    X : np.float
        Design matrix
    sigma : np.float
        Noise variance
    lam_frac : float (optional)
        Multiplier for choice of $\lambda$. Defaults to 2.
    coverage : float
        Coverage for selective intervals. Defaults to 0.95.
    stage_one : [np.array(np.int), None] (optional)
        Index of data points to be used in  first stage.
        If None, a randomly chosen set of entries is used based on
        `split_frac`.
    split_frac : float (optional)
        What proportion of the data to use in the first stage?
        Defaults to 0.9.
    ndraw : int (optional)
        How many draws to keep from Gibbs hit-and-run sampler.
        Defaults to 8000.
    burnin : int (optional)
        Defaults to 2000.
    splitting : bool (optional)
        If True, also return splitting pvalues and intervals.
    compute_intervals : bool (optional)
        Compute selective intervals?
    UMPU : bool (optional)
        Perform the UMPU test?

    Returns
    -------
    results : [(variable, pvalue, interval)
        Indices of active variables,
        selected (twosided) pvalue and selective interval.
        If splitting, then each entry also includes
        a (split_pvalue, split_interval) using stage_two
        for inference.
    stage_one : `lasso`
        Results of fitting LASSO to stage one data.
    """

    n, p = X.shape
    first_stage, stage_one, stage_two = split_model(
        X, y, sigma=sigma, lam_frac=lam_frac, split_frac=split_frac, stage_one=stage_one
    )
    splitn = stage_one.shape[0]

    L = first_stage  # shorthand
    s = sparsity = L.active.shape[0]

    if splitn < n:
        # JT: this is all about computing constraints for active
        # variables -- we already have this!

        # quantities related to models fit on
        # stage_one and full dataset

        y1, X1 = y[stage_one], X[stage_one]
        X_E = X[:, L.active]
        X_Ei = np.linalg.pinv(X_E)
        X_E1 = X1[:, L.active]
        X_Ei1 = np.linalg.pinv(X_E1)

        inv_info_E = X_Ei.dot(X_Ei.T)
        inv_info_E1 = X_Ei1.dot(X_Ei1.T)

        beta_E = X_Ei.dot(y)
        beta_E1 = X_Ei1.dot(y[stage_one])

        if n - splitn > s:
            linear_part = np.zeros((s, 2 * s))
            linear_part[:, s:] = -np.diag(L.active_signs)
            b = L.constraints.offset
            con = constraints(linear_part, b)

            # specify covariance of 2s Gaussian vector

            cov = np.zeros((2 * s, 2 * s))
            cov[:s, :s] = inv_info_E
            cov[s:, :s] = inv_info_E
            cov[:s, s:] = inv_info_E
            cov[s:, s:] = inv_info_E1

            con.covariance[:] = cov * sigma**2

            # for the conditional law
            # we will change the linear function for each coefficient

            selector = np.zeros((s, 2 * s))
            selector[:, :s] = np.identity(s)
            conditional_linear = X_E.T.dot(X_E).dot(selector)

            # a valid initial condition

            initial = np.hstack([beta_E, beta_E1])
            OLS_func = selector

        else:
            linear_part = np.zeros((s, s + n - splitn))
            linear_part[:, :s] = -np.diag(L.active_signs)
            b = L.constraints.offset
            con = constraints(linear_part, b)

            # specify covariance of Gaussian vector

            cov = np.zeros((s + n - splitn, s + n - splitn))
            cov[:s, :s] = inv_info_E1
            cov[s:, :s] = 0
            cov[:s, s:] = 0
            cov[s:, s:] = np.identity(n - splitn)

            con.covariance[:] = cov * sigma**2

            conditional_linear = np.zeros((s, s + n - splitn))
            conditional_linear[:, :s] = np.linalg.pinv(inv_info_E1)
            conditional_linear[:, s:] = X[stage_two, :][:, L.active].T

            selector1 = np.zeros((s, s + n - splitn))
            selector1[:, :s] = np.identity(s)
            selector2 = np.zeros((n - splitn, s + n - splitn))
            selector2[:, s:] = np.identity(n - splitn)

            # write the OLS estimates of full model in terms of X_E1^{dagger}y_1, y2

            OLS_func = inv_info_E.dot(conditional_linear)

            # a valid initial condition

            initial = np.hstack([beta_E1, y[stage_two]])

        pvalues = []
        intervals = []

        if splitting:
            y2, X2 = y[stage_two], X[stage_two]
            X_E2 = X2[:, L.active]
            X_Ei2 = np.linalg.pinv(X_E2)
            beta_E2 = X_Ei2.dot(y2)
            inv_info_E2 = X_Ei2.dot(X_Ei2.T)

            splitting_pvalues = []
            splitting_intervals = []

            if n - splitn < s:
                warnings.warn("not enough data for second stage of sample splitting")

            split_cutoff = np.fabs(ndist.ppf((1.0 - coverage) / 2))

        # compute p-values intervals

        cov_inv = np.linalg.pinv(con.covariance)

        for j in range(X_E.shape[1]):
            keep = np.ones(s, np.bool)
            keep[j] = 0

            eta = OLS_func[j]

            con_cp = copy(con)
            conditional_law = con_cp.conditional(
                conditional_linear[keep], X_E.T.dot(y)[keep]
            )

            # tilt so that samples are closer to observed values
            # the multiplier should be the pseudoMLE so that
            # the observed value is likely

            observed = (initial * eta).sum()

            if compute_intervals:
                _, _, _, family = gibbs_test(
                    conditional_law,
                    initial,
                    eta,
                    sigma_known=True,
                    white=False,
                    ndraw=ndraw,
                    burnin=burnin,
                    how_often=10,
                    UMPU=UMPU,
                    tilt=conditional_law.covariance.dot(eta),
                )

                lower_lim, upper_lim = family.equal_tailed_interval(
                    observed, 1 - coverage
                )

                # in the model we've chosen, the parameter beta is associated
                # to the natural parameter as below
                # exercise: justify this!

                lower_lim_final = (
                    eta.dot(conditional_law.covariance.dot(eta)) * lower_lim
                )
                upper_lim_final = (
                    eta.dot(conditional_law.covariance.dot(eta)) * upper_lim
                )

                intervals.append((lower_lim_final, upper_lim_final))
            else:  # we do not really need to tilt just for p-values
                _, _, _, family = gibbs_test(
                    conditional_law,
                    initial,
                    eta,
                    sigma_known=True,
                    white=False,
                    ndraw=ndraw,
                    burnin=burnin,
                    how_often=10,
                    UMPU=UMPU,
                )
                intervals.append((np.nan, np.nan))

            pval = family.cdf(0, observed)
            pval = 2 * min(pval, 1 - pval)

            pvalues.append(pval)

            if splitting:
                if s < n - splitn:  # enough data to generically
                    # test hypotheses. proceed as usual

                    split_pval = ndist.cdf(
                        beta_E2[j] / (np.sqrt(inv_info_E2[j, j]) * sigma)
                    )
                    split_pval = 2 * min(split_pval, 1.0 - split_pval)
                    splitting_pvalues.append(split_pval)

                    splitting_interval = (
                        beta_E2[j] - split_cutoff * np.sqrt(inv_info_E2[j, j]) * sigma,
                        beta_E2[j] + split_cutoff * np.sqrt(inv_info_E2[j, j]) * sigma,
                    )
                    splitting_intervals.append(splitting_interval)
                else:
                    splitting_pvalues.append(np.random.sample())
                    splitting_intervals.append((np.nan, np.nan))

        if not splitting:
            return zip(L.active, pvalues, intervals), L
        else:
            return (
                zip(
                    L.active, pvalues, intervals, splitting_pvalues, splitting_intervals
                ),
                L,
            )
    else:
        pvalues = [p for _, p in L.summary("twosided")["pvalue"]]
        intervals = np.array([L.intervals["lower"], L.intervals["upper"]]).T
        if splitting:
            splitting_pvalues = np.random.sample(len(pvalues))
            splitting_intervals = [(np.nan, np.nan) for _ in range(len(pvalues))]

            return (
                zip(
                    L.active, pvalues, intervals, splitting_pvalues, splitting_intervals
                ),
                L,
            )
        else:
            return zip(L.active, pvalues, intervals), L


def split_model(X, y, sigma=1, lam_frac=1.0, split_frac=0.9, stage_one=None):
    """
    Fit a LASSO with a default choice of Lagrange parameter
    equal to `lam_frac` times $\sigma \cdot E(|X^T\epsilon|)$
    with $\epsilon$ IID N(0,1) on a proportion (`split_frac`) of
    the data.
    Parameters
    ----------
    y : np.float
        Response vector
    X : np.float
        Design matrix
    sigma : np.float
        Noise variance
    lam_frac : float (optional)
        Multiplier for choice of $\lambda$. Defaults to 2.
    split_frac : float (optional)
        What proportion of the data to use in the first stage?
        Defaults to 0.9.
    stage_one : [np.array(np.int), None] (optional)
        Index of data points to be used in  first stage.
        If None, a randomly chosen set of entries is used based on
        `split_frac`.
    Returns
    -------
    first_stage : `lasso`
        Lasso object from stage one.
    stage_one : np.array(int)
        Indices used for stage one.
    stage_two : np.array(int)
        Indices used for stage two.
    """

    n, p = X.shape
    if stage_one is None:
        splitn = int(n * split_frac)
        indices = np.arange(n)
        np.random.shuffle(indices)
        stage_one = indices[:splitn]
        stage_two = indices[splitn:]
    else:
        stage_two = [i for i in np.arange(n) if i not in stage_one]
    y1, X1 = y[stage_one], X[stage_one]

    first_stage = standard_lasso(X1, y1, sigma=sigma, lam_frac=lam_frac)
    return first_stage, stage_one, stage_two


def additive_noise(
    X,
    y,
    sigma,
    lam_frac=1.0,
    perturb_frac=0.2,
    y_star=None,
    coverage=0.95,
    ndraw=8000,
    compute_intervals=True,
    burnin=2000,
):
    """

    Additive noise LASSO.
    Parameters
    ----------
    y : np.float
        Response vector
    X : np.float
        Design matrix
    sigma : np.float
        Noise variance
    lam_frac : float (optional)
        Multiplier for choice of $\lambda$. Defaults to 2.
    perturb_frac : float (optional)
        How much noise to add? Noise added has variance
        proportional to existing variance.
    coverage : float
        Coverage for selective intervals. Defaults to 0.95.
    ndraw : int (optional)
        How many draws to keep from Gibbs hit-and-run sampler.
        Defaults to 8000.
    burnin : int (optional)
        Defaults to 2000.
    compute_intervals : bool (optional)
        Compute selective intervals?

    Returns
    -------
    results : [(variable, pvalue, interval)
        Indices of active variables,
        selected (twosided) pvalue and selective interval.
        If splitting, then each entry also includes
        a (split_pvalue, split_interval) using stage_two
        for inference.
    randomized_lasso : `lasso`
        Results of fitting LASSO to randomized data.
    """

    n, p = X.shape

    # Add some noise to y and fit the LASSO at a fixed lambda

    gamma = np.sqrt(perturb_frac) * sigma
    sigma_star = np.sqrt(sigma**2 + gamma**2)
    lam = (
        lam_frac
        * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 5000)))).max(0))
        * sigma_star
    )
    y_star = y + np.random.standard_normal(n) * gamma

    randomized_lasso = L = standard_lasso(
        X, y_star, sigma=sigma_star, lam_frac=lam_frac
    )
    L.fit()

    # Form the constraint matrix on (y,y^*)
    X_E = X[:, L.active]
    X_Ei = np.linalg.pinv(X_E)
    Cov_E = X_Ei.dot(X_Ei.T)
    W_E = Cov_E.dot(L.active_signs)

    pvalues = []
    intervals = []

    beta_E = X_Ei.dot(y)

    # compute each pvalue
    for j in range(X_E.shape[1]):
        s_obs = L.active.shape[0]
        keep = np.ones(s_obs, np.bool)
        keep[j] = 0

        # form the 2s Gaussian vector we will condition on

        X_minus_j = X_E[:, keep]
        P_minus_j = X_minus_j.dot(np.linalg.pinv(X_minus_j))
        R_minus_j = np.identity(n) - P_minus_j

        theta_E = L.active_signs * (X_Ei.dot(P_minus_j.dot(y)) - lam * W_E)
        scale = np.sqrt(Cov_E[j, j])
        kappa = 1.0 / scale**2
        alpha_E = kappa * L.active_signs * Cov_E[j]
        A = np.hstack([-alpha_E.reshape((s_obs, 1)), np.identity(s_obs)])
        con = constraints(A, theta_E)
        cov = np.zeros((s_obs + 1, s_obs + 1))
        cov[0, 0] = scale**2 * sigma**2
        cov[1:, 1:] = Cov_E * gamma**2 * np.outer(L.active_signs, L.active_signs)
        con.covariance[:] = cov
        initial = np.zeros(s_obs + 1)
        initial[0] = beta_E[j]
        initial[1:] = -X_Ei.dot(y_star - y) * L.active_signs
        eta = np.zeros(s_obs + 1)
        eta[0] = 1.0

        observed = (initial * eta).sum()

        if compute_intervals:
            _, _, _, family = gibbs_test(
                con,
                initial,
                eta,
                UMPU=False,
                sigma_known=True,
                ndraw=ndraw,
                burnin=burnin,
                how_often=5,
                tilt=con.covariance.dot(eta),
            )

            lower_lim, upper_lim = family.equal_tailed_interval(observed, 1 - coverage)

            # in the model we've chosen, the parameter beta is associated
            # to the natural parameter as below
            # exercise: justify this!

            lower_lim_final = eta.dot(con.covariance.dot(eta)) * lower_lim
            upper_lim_final = eta.dot(con.covariance.dot(eta)) * upper_lim

            intervals.append((lower_lim_final, upper_lim_final))

        else:
            _, _, _, family = gibbs_test(
                con,
                initial,
                eta,
                UMPU=False,
                sigma_known=True,
                ndraw=ndraw,
                burnin=burnin,
                how_often=5,
                tilt=con.covariance.dot(eta),
            )

            intervals.append((np.nan, np.nan))

        pval = family.cdf(0, observed)
        pval = 2 * min(pval, 1 - pval)
        pvalues.append(pval)

    return zip(L.active, pvalues, intervals), randomized_lasso


## Liu, Markovic and Tibshirani method based on full model
## conditioning only on the event j \in E for each active j

# Liu, Markovic, Tibs selection


def _solve_restricted_problem(
    Qbeta_bar, Xinfo, lagrange, initial=None, wide=True, min_its=30, tol=1.0e-12
):
    p = Qbeta_bar.shape[0]

    if wide:
        X, W = Xinfo
        loss = squared_error(X * np.sqrt(W)[:, None], np.zeros(X.shape[0]))
    else:
        Q = Xinfo
        loss = quadratic_loss(Q.shape[0], Q=Q)

    loss.quadratic = identity_quadratic(0, 0, -Qbeta_bar, 0)

    lagrange = np.asarray(lagrange)
    if lagrange.shape in [(), (1,)]:
        lagrange = np.ones(p) * lagrange
    pen = weighted_l1norm(lagrange, lagrange=1.0)
    problem = simple_problem(loss, pen)
    if initial is not None:
        problem.coefs[:] = initial
    soln = problem.solve(tol=tol, min_its=min_its)
    return soln


def _truncation_interval(Qbeta_bar, Xinfo, Qi_jj, j, beta_barj, lagrange, wide=True):
    if lagrange[j] != 0:
        lagrange_cp = lagrange.copy()
    else:
        return -np.inf, np.inf
    lagrange_cp[j] = np.inf

    # TODO: use initial solution for speed
    restricted_soln = _solve_restricted_problem(
        Qbeta_bar, Xinfo, lagrange_cp, wide=wide
    )

    p = Qbeta_bar.shape[0]
    Ij = np.zeros(p)
    Ij[j] = 1.0
    nuisance = Qbeta_bar - Ij / Qi_jj * beta_barj

    if wide:
        X, W = Xinfo
        Qj = X.T.dot(X[:, j] * W)
    else:
        Q = Xinfo
        Qj = Q[j]
    center = nuisance[j] - Qj.dot(restricted_soln)
    upper = (lagrange[j] - center) * Qi_jj
    lower = (-lagrange[j] - center) * Qi_jj

    if not (beta_barj < lower or beta_barj > upper):
        warnings.warn("implied KKT constraint not satisfied")

    return lower, upper


class ROSI(lasso):
    r"""
    A class for the LASSO for post-selection inference.
    The problem solved is
    .. math::
        \text{minimize}_{\beta} \frac{1}{2n} \|y-X\beta\|^2_2 +
            \lambda \|\beta\|_1
    where $\lambda$ is `lam`.
    Notes
    -----
    In solving the debiasing problem to approximate the inverse
    of (X^TWX) in a GLM, this class makes the implicit assumption
    that the scaling of X is such that diag(X^TWX) is O(n)
    with n=X.shape[0]. That is, X's are similar to IID samples
    from a population that does not depend on n.
    """

    def __init__(self, loglike, feature_weights, approximate_inverse="BN"):
        r"""
        Create a new post-selection for the LASSO problem
        Parameters
        ----------
        loglike : `regreg.smooth.glm.glm`
            A (negative) log-likelihood as implemented in `regreg`.
        feature_weights : np.ndarray
            Feature weights for L-1 penalty. If a float,
            it is brodcast to all features.
        approximate_inverse : str (optional)
             One of "JM" (Javanmard, Montanari) or "BN" (Boot, Niedderling) or None.
             A form of approximate inverse when p is close to (or larger) than n.

        """

        self.loglike = loglike
        if np.asarray(feature_weights).shape == ():
            feature_weights = np.ones(loglike.shape) * feature_weights
        self.feature_weights = np.asarray(feature_weights)
        self.approximate_inverse = approximate_inverse

    def fit(
        self,
        lasso_solution=None,
        solve_args={"tol": 1.0e-12, "min_its": 50},
        debiasing_args={},
    ):
        """
        Fit the lasso using `regreg`.
        This sets the attributes `soln`, `onestep` and
        forms the constraints necessary for post-selection inference
        by calling `form_constraints()`.
        Parameters
        ----------
        lasso_solution : optional
             If not None, this is taken to be the solution
             of the optimization problem. No checks
             are done, though the implied affine
             constraints will generally not be satisfied.
        solve_args : keyword args
             Passed to `regreg.problems.simple_problem.solve`.
        debiasing_args : dict
             Arguments passed to `.debiased_lasso.debiasing_matrix`
             or `.debiased_lasso.pseudoinverse_debiasing_matrix` depending
             on `self.approximate_inverse`.
        Returns
        -------
        soln : np.float
             Solution to lasso.

        Notes
        -----
        If `self` already has an attribute `lasso_solution`
        this will be taken to be the solution and
        no optimization problem will be solved. Supplying
        the optional argument `lasso_solution` will
        overwrite `self`'s `lasso_solution`.
        """

        self._penalty = weighted_l1norm(self.feature_weights, lagrange=1.0)
        if lasso_solution is None and not hasattr(self, "lasso_solution"):
            problem = simple_problem(self.loglike, self._penalty)
            self.lasso_solution = problem.solve(**solve_args)
        elif lasso_solution is not None:
            self.lasso_solution = lasso_solution

        lasso_solution = (
            self.lasso_solution
        )  # shorthand after setting it correctly above

        if not np.all(lasso_solution == 0):
            self.active = np.nonzero(lasso_solution != 0)[0]
            self.inactive = lasso_solution == 0
            self.active_signs = np.sign(lasso_solution[self.active])
            self._active_soln = lasso_solution[self.active]

            X, y = self.loglike.data  # presuming GLM here
            n, p = X.shape

            W = self.loglike.saturated_loss.hessian(X.dot(lasso_solution))

            # Needed for finding truncation intervals

            self._Qbeta_bar = X.T.dot(
                W * X.dot(lasso_solution)
            ) - self.loglike.smooth_objective(lasso_solution, "grad")
            self._W = W

            if n > p and self.approximate_inverse is None:
                Q = self.loglike.hessian(lasso_solution)
                E = self.active
                Qi = np.linalg.inv(Q)
                self._QiE = Qi[E][:, E]
                _beta_bar = Qi.dot(self._Qbeta_bar)
                self._beta_barE = _beta_bar[E]
                one_step = self._beta_barE

                # Pearson's X^2 to estimate sigma

                y_hat = self.loglike.saturated_loss.mean_function(X.dot(_beta_bar))
                self._pearson_sigma = np.sqrt(
                    ((y - y_hat) ** 2 / self._W).sum() / (n - p)
                )

            elif self.approximate_inverse is None:
                raise ValueError(
                    "n is less than or equal to p, an approximate inverse is needed."
                )
            else:
                X, y = self.loglike.data

                # target is one-step estimator

                G = self.loglike.smooth_objective(lasso_solution, "grad")

                if self.approximate_inverse == "JM":
                    M = self._M = (
                        debiasing_matrix(
                            X * np.sqrt(W)[:, None], self.active, **debiasing_args
                        )
                        / n
                    )
                    # the n is to make sure we get rows of the inverse
                    # of (X^TWX) instead of (X^TWX/n).

                elif self.approximate_inverse == "BN":
                    M = self._M = pseudoinverse_debiasing_matrix(
                        X * np.sqrt(W)[:, None], self.active, **debiasing_args
                    )
                else:
                    raise ValueError(
                        'approximate inverse should be one of ["JM", "BN"]'
                    )

                Qinv_hat = np.atleast_2d(M)
                observed_target = lasso_solution[self.active] - Qinv_hat.dot(G)
                M1 = Qinv_hat.dot(X.T)
                self._QiE = (M1 * self._W[None, :]).dot(M1.T)
                Xfeat = X[:, self.active]
                Qrelax = Xfeat.T.dot(self._W[:, None] * Xfeat)
                relaxed_soln = lasso_solution[self.active] - np.linalg.inv(Qrelax).dot(
                    G[self.active]
                )
                self._beta_barE = observed_target

                # relaxed Pearson's X^2 to estimate sigma
                y_hat = self.loglike.saturated_loss.mean_function(
                    Xfeat.dot(relaxed_soln)
                )
                self._pearson_sigma = np.sqrt(
                    ((y - y_hat) ** 2 / self._W).sum() / (n - len(self.active))
                )

        else:
            self.active = []
            self.inactive = np.arange(lasso_solution.shape[0])
        return self.lasso_solution

    def summary(self, level=0.95, compute_intervals=False, dispersion=None, truth=None):
        """
        Summary table for inference adjusted for selection.
        Parameters
        ----------
        level : float
            Form level*100% selective confidence intervals.

        compute_intervals : bool
            Should we compute confidence intervals?

        dispersion : float
            Estimate of dispersion. Defaults to a Pearson's X^2 estimate in the relaxed model.

        truth : np.array
            True values of each beta for selected variables. If not None, a column 'pvalue' are p-values
            computed under these corresponding null hypotheses.

        Returns
        -------
        pval_summary : np.recarray
            Array with one entry per active variable.
            Columns are 'variable', 'pvalue', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'.
        """

        if len(self.active) > 0:
            X, y = self.loglike.data
            active_set, QiE, beta_barE, Qbeta_bar = (
                self.active,
                self._QiE,
                self._beta_barE,
                self._Qbeta_bar,
            )
            W, sigma = self._W, self._pearson_sigma
            if dispersion is None:
                sqrt_dispersion = sigma
            else:
                sqrt_dispersion = np.sqrt(dispersion)

            result = []

            # note that QiE is the dispersion free covariance
            # matrix!
            # dispersion comes into truncated Gaussian below

            if truth is None:
                truth = np.zeros(len(active_set))

            for j in range(len(active_set)):
                idx = self.active[j]
                lower, upper = _truncation_interval(
                    Qbeta_bar,
                    (X, W),
                    QiE[j, j],
                    idx,
                    beta_barE[j],
                    self.feature_weights,
                    wide=True,
                )

                sd = sqrt_dispersion * np.sqrt(QiE[j, j])
                tg = TG([(-np.inf, lower), (upper, np.inf)], scale=sd, mu=truth[j])
                pvalue = tg.cdf(beta_barE[j])
                pvalue = float(2 * min(pvalue, 1 - pvalue))

                if compute_intervals:
                    l, u = tg.equal_tailed_interval(beta_barE[j], alpha=1 - level)
                else:
                    l, u = np.nan, np.nan

                result.append(
                    (
                        idx,
                        pvalue,
                        self.lasso_solution[idx],
                        beta_barE[j],
                        sd,
                        l,
                        u,
                        lower,
                        upper,
                    )
                )

            df = pd.DataFrame(
                index=self.active,
                data=dict(
                    [
                        (n, d)
                        for n, d in zip(
                            [
                                "variable",
                                "pvalue",
                                "lasso",
                                "onestep",
                                "sd",
                                "lower_confidence",
                                "upper_confidence",
                                "lower_truncation",
                                "upper_truncation",
                            ],
                            np.array(result).T,
                        )
                    ]
                ),
            )
            df["variable"] = df["variable"].astype(int)
            return df

    @property
    def soln(self):
        """
        Solution to the lasso problem, set by `fit` method.
        """
        if not hasattr(self, "lasso_solution"):
            self.fit()
        return self.lasso_solution

    @classmethod
    def gaussian(
        klass,
        X,
        Y,
        feature_weights,
        sigma=1.0,
        covariance_estimator=None,
        quadratic=None,
        approximate_inverse=None,
    ):
        r"""
        Squared-error LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\lambda$ is `feature_weights`.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        Y : ndarray
            Shape (n,) -- the response.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        sigma : float (optional)
            Noise variance. Set to 1 if `covariance_estimator` is not None.
            This scales the loglikelihood by `sigma**(-2)`.
        covariance_estimator : callable (optional)
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of some of the
        rows and columns of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        if covariance_estimator is not None:
            sigma = 1.0
        loglike = glm.gaussian(X, Y, coef=1.0 / sigma**2, quadratic=quadratic)
        return klass(
            loglike,
            np.asarray(feature_weights) / sigma**2,
            approximate_inverse=approximate_inverse,
        )

    @classmethod
    def logistic(
        klass,
        X,
        successes,
        feature_weights,
        trials=None,
        covariance_estimator=None,
        quadratic=None,
        approximate_inverse=None,
    ):
        r"""
        Logistic LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \ell(X\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\ell$ is the negative of the logistic
        log-likelihood (half the logistic deviance)
        and $\lambda$ is `feature_weights`.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        successes : ndarray
            Shape (n,) -- response vector. An integer number of successes.
            For data that is proportions, multiply the proportions
            by the number of trials first.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        trials : ndarray (optional)
            Number of trials per response, defaults to
            ones the same shape as Y.
        covariance_estimator : optional
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        loglike = glm.logistic(X, successes, trials=trials, quadratic=quadratic)
        return klass(loglike, feature_weights, approximate_inverse=approximate_inverse)

    @classmethod
    def poisson(
        klass,
        X,
        counts,
        feature_weights,
        covariance_estimator=None,
        quadratic=None,
        approximate_inverse=None,
    ):
        r"""
        Poisson log-linear LASSO with feature weights.
        Objective function is
        $$
        \beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
        $$
        where $\ell^{\text{Poisson}}$ is the negative
        of the log of the Poisson likelihood (half the deviance)
        and $\lambda$ is `feature_weights`.
        Parameters
        ----------
        X : ndarray
            Shape (n,p) -- the design matrix.
        counts : ndarray
            Shape (n,) -- the response.
        feature_weights: [float, sequence]
            Penalty weights. An intercept, or other unpenalized
            features are handled by setting those entries of
            `feature_weights` to 0. If `feature_weights` is
            a float, then all parameters are penalized equally.
        covariance_estimator : optional
            If None, use the parameteric
            covariance estimate of the selected model.
        quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
            An optional quadratic term to be added to the objective.
            Can also be a linear term by setting quadratic
            coefficient to 0.
        Returns
        -------
        L : `selection.algorithms.lasso.lasso`

        Notes
        -----
        If not None, `covariance_estimator` should
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.
        """
        loglike = glm.poisson(X, counts, quadratic=quadratic)
        return klass(loglike, feature_weights, approximate_inverse=approximate_inverse)


class ROSI_modelQ(lasso):
    r"""
    A class for the LASSO for post-selection inference
    in which
    The problem solved is
    .. math::
        \text{minimize}_{\beta} -(X\beta)^Ty + \frac{1}{2} \beta^TQ\beta +
            \sum_i \lambda_i |\beta_i|
    where $\lambda$ is `feature_weights`.
    Notes
    -----
    In solving the debiasing problem to approximate the inverse
    of (X^TWX) in a GLM, this class makes the implicit assumption
    that the scaling of X is such that diag(X^TWX) is O(n)
    with n=X.shape[0]. That is, X's are similar to IID samples
    from a population that does not depend on n.
    """

    def __init__(
        self,
        Q,  # population or semi-supervised version of X.T.dot(X)
        X,
        y,
        feature_weights,
    ):
        r"""
        Create a new post-selection for the LASSO problem
        Parameters
        ----------
        Q : np.ndarray((p,p))
        X : np.ndarray((n, p))
        y : np.ndarray(n)
        feature_weights : np.ndarray
            Feature weights for L-1 penalty. If a float,
            it is brodcast to all features.
        """

        self.Q = Q
        self.X, self.y = X, y
        self._loss = quadratic_loss(Q.shape[0], Q=Q)
        self._linear_term = identity_quadratic(0, 0, -X.T.dot(y), 0)
        if np.asarray(feature_weights).shape == ():
            feature_weights = np.ones(Q.shape[0]) * feature_weights
        self.feature_weights = np.asarray(feature_weights)

    def fit(self, solve_args={"tol": 1.0e-12, "min_its": 50}, debiasing_args={}):
        """
        Fit the lasso using `regreg`.
        This sets the attributes `soln`, `onestep` and
        forms the constraints necessary for post-selection inference
        by calling `form_constraints()`.
        Parameters
        ----------
        lasso_solution : optional
             If not None, this is taken to be the solution
             of the optimization problem. No checks
             are done, though the implied affine
             constraints will generally not be satisfied.
        solve_args : keyword args
             Passed to `regreg.problems.simple_problem.solve`.
        Returns
        -------
        soln : np.float
             Solution to lasso.

        Notes
        -----
        If `self` already has an attribute `lasso_solution`
        this will be taken to be the solution and
        no optimization problem will be solved. Supplying
        the optional argument `lasso_solution` will
        overwrite `self`'s `lasso_solution`.
        """

        self._penalty = weighted_l1norm(self.feature_weights, lagrange=1.0)
        problem = simple_problem(self._loss, self._penalty)
        self.lasso_solution = problem.solve(self._linear_term, **solve_args)

        lasso_solution = (
            self.lasso_solution
        )  # shorthand after setting it correctly above

        if not np.all(lasso_solution == 0):
            self.active = np.nonzero(lasso_solution != 0)[0]
            self.inactive = lasso_solution == 0
            self.active_signs = np.sign(lasso_solution[self.active])
            self._active_soln = lasso_solution[self.active]

            # Needed for finding truncation intervals

            G = self._loss.smooth_objective(
                lasso_solution, "grad"
            ) + self._linear_term.objective(lasso_solution, "grad")

            self._Qbeta_bar = self.Q.dot(lasso_solution) - G

            Q = self.Q
            E = self.active
            QiE = np.linalg.inv(Q)[E]  # maybe we want to use a debised estimate
            self._QiE = QiE[:, E]
            self._beta_barE = QiE.dot(self._Qbeta_bar)

            # Pearson's X^2 to estimate sigma from relaxed estimator
            y, X = self.y, self.X
            n, p = X.shape
            relaxed_beta_barE = np.linalg.inv(Q[E][:, E]).dot(X[:, E].T.dot(y))
            self._pearson_sigma = np.sqrt(
                ((y - X[:, E].dot(relaxed_beta_barE)) ** 2).sum()
                / (n - len(self.active))
            )

        else:
            self.active = []
            self.inactive = np.arange(lasso_solution.shape[0])
        return self.lasso_solution

    def summary(self, level=0.95, compute_intervals=False, dispersion=None):
        """
        Summary table for inference adjusted for selection.
        Parameters
        ----------
        level : float
            Form level*100% selective confidence intervals.
        compute_intervals : bool
            Should we compute confidence intervals?
        dispersion : float
            Estimate of dispersion. Defaults to a Pearson's X^2 estimate in the relaxed model.
        Returns
        -------
        pval_summary : np.recarray
            Array with one entry per active variable.
            Columns are 'variable', 'pvalue', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'.
        """

        if len(self.active) > 0:
            X, y = self.X, self.y
            sigma = self._pearson_sigma
            if dispersion is None:
                sqrt_dispersion = sigma
            else:
                sqrt_dispersion = np.sqrt(dispersion)
            active_set, QiE, beta_barE, Qbeta_bar = (
                self.active,
                self._QiE,
                self._beta_barE,
                self._Qbeta_bar,
            )

            result = []

            for j in range(len(active_set)):
                idx = self.active[j]
                lower, upper = _truncation_interval(
                    Qbeta_bar,
                    self.Q,
                    QiE[j, j],
                    idx,
                    beta_barE[j],
                    self.feature_weights,
                    wide=False,
                )

                sd = sqrt_dispersion * np.sqrt(QiE[j, j])
                tg = TG([(-np.inf, lower), (upper, np.inf)], scale=sd)
                pvalue = tg.cdf(beta_barE[j])
                pvalue = float(2 * min(pvalue, 1 - pvalue))

                if compute_intervals:
                    l, u = tg.equal_tailed_interval(beta_barE[j], alpha=1 - level)
                else:
                    l, u = np.nan, np.nan

                result.append(
                    (
                        idx,
                        pvalue,
                        self.lasso_solution[idx],
                        beta_barE[j],
                        sd,
                        l,
                        u,
                        lower,
                        upper,
                    )
                )

            df = pd.DataFrame(
                index=self.active,
                data=dict(
                    [
                        (n, d)
                        for n, d in zip(
                            [
                                "variable",
                                "pvalue",
                                "lasso",
                                "onestep",
                                "sd",
                                "lower_confidence",
                                "upper_confidence",
                                "lower_truncation",
                                "upper_truncation",
                            ],
                            np.array(result).T,
                        )
                    ]
                ),
            )
            df["variable"] = df["variable"].astype(int)
            return df