In [19]:
import numpy as np

from selectinf.randomized.lasso import lasso, split_lasso
from selectinf.tests.instance import gaussian_instance

from selectinf.base import (full_targets, selected_targets)

The following chunk of code generates a simulation instance, under the generating model
$$Y = X\beta + \epsilon,$$
where $\beta$ is sparse, and $\epsilon \sim N(\mathbf{0}, \sigma^2 I_n)$, and the design matrix $X$ is drawn from either an equi-correlated or auto-correlated Gaussian distribution with correlation parameter $\rho$, with the following parameters:
1. `n`: sample size
2. `p`: number of variables
3. `s`: size of the support of $\beta$, i.e., $|\{j: \beta_j \neq 0\}|$ 
4. `sigma`: sd of the noise term
5. $\rho \in [0,1)$: the equicorrelation/autocorrelation parameter
6. `signal`: signal strength
7. `random_signs`: a boolean flag determining whether the nonzero entries of $\beta$ have random signs or are all positive
8. `scale`: whether to standardize and scale the covariates by root n
9. `center`: whether to center the data
10. `equicorrelated`: if `True`, generate equicorrelated $X$, else generate autocorrelated $X$

### Special considerations of the `scale` parameter
In the generating function, the `scale` parameter controls how the overall scale is divided between the covariates and $\beta$, while the scale of $Y$ stays the same under both options. In particular:
If `scale == False`, 
1. $X \sim N(\mathbf{0}, \Sigma)$, 
2. $\beta = \begin{bmatrix}\mathbf{1}_s & \mathbf{0}_{p-s}\end{bmatrix}$ 
3. and we scale $\beta$ as $\tilde\beta = \beta / \sqrt{n}$, 
4. and generate $Y = X \tilde\beta + \epsilon$ with the sd of $\epsilon$ equal to the $n$-free constant `sigma`, 
5. and the function returns $X,\ Y,\ \tilde\beta$.

Otherwise, if `scale == True`, 
1. $X \sim N(\mathbf{0}, \Sigma)$, 
2. $\beta = \begin{bmatrix}\mathbf{1}_s & \mathbf{0}_{p-s}\end{bmatrix}$ 
3. and we scale $X$ as $\tilde X = X / (\sqrt{n} * \text{std}(X))$, 
4. and generate $Y = \tilde X \beta + \epsilon$ with the sd of $\epsilon$ equal to the $n$-free constant `sigma`, 
5. and the function returns $\tilde X,\ Y,\ \beta$.

Finally, note that the true scale of the signal is of order $O\left(1/\sqrt{n}\right)$; this reflects the asymptotic regime in which the theory applies.

### $L1$ penalty under different values of `scale`
The LASSO solver we implemented solves a problem of form
$$\underset{\beta}{\min} \frac{1}{2}||Y - X\beta||_2^2 + \lambda_n ||\beta||_1.$$

In practice, when `scale` is `True`, we solve for
$$\underset{\beta}{\min} \frac{1}{2}||Y - \tilde X\beta||_2^2 + \lambda_n ||\beta||_1 = \frac{1}{2}||Y - \frac{X}{\sqrt{n}}\beta||_2^2 + \lambda_n ||\beta||_1,$$
setting
$$\lambda_n = C\hat\sigma\sqrt{2\log p}$$
due to its model selection quality guarantees.

However, when `scale` is `False`, due to the different scaling of $X$, we solve alternatively for
$$\underset{\beta}{\min} \frac{1}{2}||Y - X\beta||_2^2 + \lambda_n ||\beta||_1.$$ 
To remedy the change of scale of the target of inference, we scale up 
$\lambda_n$ by $\sqrt{n}$ to match up the order, that is,
$$\lambda_n = C\hat\sigma\sqrt{2n\log p}.$$


In [261]:
# --- Simulation configuration ------------------------------------------------
seed = 0                 # seed for NumPy's global RNG, so Restart & Run All is reproducible
n = 500                  # sample size
p = 50                   # number of covariates
signal_fac = 1           # multiplier inside the square root that defines `signal`
sigma = 2                # noise sd handed to the instance generator
rho = 0.5                # correlation parameter of the design
proportion = 0.5         # proportion setting intended for the data-carving example
randomizer_scale = 1.    # multiplied by `sigma` when passed to the randomized LASSO
full_dispersion = True   # estimate the dispersion from the full model (else: selected model)
level = 0.90             # confidence level of the reported intervals
scale_X = False          # whether X is scaled by 1/sqrt(n)

# Fix the random state before any data are drawn.
# NOTE(review): assumes selectinf draws from NumPy's global RNG -- confirm.
# The outputs recorded in this notebook predate the seed; re-run to refresh them.
np.random.seed(seed)

inst = gaussian_instance
const = lasso.gaussian
signal = np.sqrt(signal_fac * 2 * np.log(p))

# Only the first three return values (design, response, coefficients) are needed.
X, Y, beta = inst(n=n, p=p, s=7, sigma=sigma,
                  rho=rho, signal=signal, random_signs=False,
                  scale=scale_X, center=True,
                  equicorrelated=True)[:3]

In the previous code chunk, where we set the simulation parameters, the variable `scale_X` denotes whether to scale $X$ by $1/\sqrt{n}$. In practice, it is more common to use the covariates on their original scale, and we therefore set `scale_X=False`. Note that the function `lasso.gaussian()`, which is aliased as `const` in the running example, requires an argument called `scaled`, through which the user should pass the truth value of whether $X$ is scaled by $1/\sqrt{n}$.

In [262]:
# Penalty level C * sigma_hat * sqrt(2 log p); when X is kept on its original
# scale an extra sqrt(n) factor is applied.
weight_frac = 2
sigma_ = np.std(Y)
weight = weight_frac * sigma_ * np.sqrt(2 * np.log(p))
if not scale_X:
    weight = weight * np.sqrt(n)

# Same penalty weight for every column of X.
conv = const(
    X,
    Y,
    scaled=scale_X,
    feature_weights=weight * np.ones(X.shape[1]),
    randomizer_scale=randomizer_scale * sigma,
    ridge_term=0.,
)

# Nonzero entries of the fitted signs mark the selected variables.
signs = conv.fit()
nonzero = (signs != 0)

In [263]:
# Standard deviation of the noiseless mean X @ beta.
(X @ beta).std()

1.3929402190390485

In [264]:
# Selective inference for the coefficients selected by the randomized LASSO.
if nonzero.sum() > 0:

    # Dispersion estimate: residual sum of squares of a least-squares fit,
    # divided by the residual degrees of freedom.
    if full_dispersion:
        # Fit on all p columns.
        dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p)
    else:
        # Fit on the selected columns only.
        dispersion = np.linalg.norm(Y - X[:,nonzero].dot(np.linalg.pinv(X[:,nonzero]).dot(Y))) ** 2 / (n - nonzero.sum())

    conv.setup_inference(dispersion=dispersion)

    target_spec = selected_targets(conv.loglike,
                                   conv.observed_soln,
                                   dispersion=dispersion)

    result = conv.inference(target_spec,
                            method='selective_MLE',
                            level=level)

    pval = result['pvalue']
    intervals = np.asarray(result[['lower_confidence', 'upper_confidence']])

    # Target of inference: least-squares coefficients of the true mean X @ beta
    # regressed on the selected columns.
    beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))

    # An interval covers when the target lies strictly inside it.
    coverage = (beta_target > intervals[:, 0]) & (beta_target < intervals[:, 1])
else:
    # Nothing was selected. Bind explicit empty results so that later cells
    # neither raise NameError nor silently reuse values from an earlier run.
    print("No variables were selected; skipping inference.")
    pval = np.array([])
    intervals = np.empty((0, 2))
    beta_target = np.array([])
    coverage = np.array([], dtype=bool)

In [265]:
# Summary of the randomized-LASSO intervals.
ci_lengths = intervals[:, 1] - intervals[:, 0]
print(coverage.mean())           # Coverage rate
print(ci_lengths.mean())         # Mean CI length
print(beta[beta != 0].mean())    # Signal strength

1.0
0.4798426877254546
0.250184667969183


In [266]:
print(np.count_nonzero(nonzero))    # Selected support size

10


# Examples for Data Carving

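The following code provides an example of data carving. Note that here there is no need to set the `scaled` argument when calling `split_lasso.gaussian()`. However, the scaling issue in the choice of $\lambda_n$ described above still applies, so the penalty weight below is again multiplied by $\sqrt{n}$ when `scale_X` is `False`.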

In [267]:
# Rebind `const` from the randomized-LASSO constructor (`lasso.gaussian`) to the
# data-carving constructor; the cells below build the model through `const`.
const = split_lasso.gaussian

In [268]:
# Penalty level C * sigma_hat * sqrt(2 log p), scaled up by sqrt(n) when the
# covariates are kept on their original scale.
weight_frac = 2
sigma_ = np.std(Y)
if scale_X:
    weight = weight_frac * sigma_ * np.sqrt(2 * np.log(p))
else:
    weight = weight_frac * sigma_ * np.sqrt(2 * np.log(p)) * np.sqrt(n)

# Use the `proportion` value from the configuration cell rather than a
# hard-coded literal, so that changing the configuration takes effect here.
conv = const(X, Y, feature_weights=weight * np.ones(X.shape[1]),
             proportion=proportion)

# Nonzero entries of the fitted signs mark the selected variables.
signs = conv.fit()
nonzero = signs != 0

In [269]:
# Standard deviation of the noiseless mean X @ beta (same data as before).
(X @ beta).std()

1.3929402190390485

In [270]:
# Selective inference for the coefficients selected by the data-carving fit.
if nonzero.sum() > 0:

    # Dispersion estimate: residual sum of squares of a least-squares fit,
    # divided by the residual degrees of freedom.
    if full_dispersion:
        # Fit on all p columns.
        dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p)
    else:
        # Fit on the selected columns only.
        dispersion = np.linalg.norm(Y - X[:,nonzero].dot(np.linalg.pinv(X[:,nonzero]).dot(Y))) ** 2 / (n - nonzero.sum())

    conv.setup_inference(dispersion=dispersion)

    target_spec = selected_targets(conv.loglike,
                                   conv.observed_soln,
                                   dispersion=dispersion)

    result = conv.inference(target_spec,
                            method='selective_MLE',
                            level=level)

    pval = result['pvalue']
    intervals = np.asarray(result[['lower_confidence', 'upper_confidence']])

    # Target of inference: least-squares coefficients of the true mean X @ beta
    # regressed on the selected columns.
    beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))

    # An interval covers when the target lies strictly inside it.
    coverage = (beta_target > intervals[:, 0]) & (beta_target < intervals[:, 1])
else:
    # Nothing was selected. Bind explicit empty results so the summary cell
    # cannot silently report numbers left over from a previous fit.
    print("No variables were selected; skipping inference.")
    pval = np.array([])
    intervals = np.empty((0, 2))
    beta_target = np.array([])
    coverage = np.array([], dtype=bool)

In [271]:
# Summary of the data-carving intervals.
ci_lengths = intervals[:, 1] - intervals[:, 0]
print(coverage.mean())           # Coverage rate
print(ci_lengths.mean())         # Mean CI length
print(beta[beta != 0].mean())    # Signal strength

1.0
0.5051216480186823
0.250184667969183


In [272]:
np.sum(nonzero)  # Selected support size

8