In [1]:
import numpy as np
import pandas as pd

import regreg.api as rr

from selectinf.group_lasso_query import (group_lasso)

from selectinf.base import selected_targets

from selectinf.base import restricted_estimator

  warn('unable to import isotonic regression from sklearn, using a pure python implementation')


In [2]:
X = np.asarray(pd.read_csv("X.csv", header=None, index_col=0))
Y = np.asarray(pd.read_csv("Y.csv", header=None, index_col=0))
groups = np.arange(50).repeat(4)
n,p = X.shape
Y = np.reshape(Y, (n,))

In [3]:
X

array([[ 0.        ,  0.10867511,  0.        , ..., -0.03976352,
        -0.03814476,  0.01531214],
       [ 0.11057861,  0.        ,  0.        , ..., -0.0465319 ,
        -0.04394881, -0.04586215],
       [ 0.        ,  0.        ,  0.        , ..., -0.03304711,
        -0.02231436, -0.03610369],
       ...,
       [ 0.        ,  0.        ,  0.11222645, ..., -0.01858433,
        -0.02500537, -0.00875401],
       [ 0.        ,  0.        ,  0.11222645, ..., -0.08024148,
         0.04924986,  0.01084571],
       [ 0.        ,  0.        ,  0.        , ...,  0.06904143,
         0.04903993, -0.03705397]])

In [4]:
Y

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,

In [5]:
Y.shape

(500,)

In [6]:
def estimate_hess():
    loglike = rr.glm.logistic(X, successes=Y, trials=np.ones(n))
    # For LASSO, this is the OLS solution on X_{E,U}
    beta_full = restricted_estimator(loglike, np.array([True] * p))
    def pi_hess(x):
        return np.exp(x) / (1 + np.exp(x)) ** 2

    # Calculation the asymptotic covariance of the MLE
    W = np.diag(pi_hess(X @ beta_full))

    return X.T @ W @ X

hess = estimate_hess()
weight_frac = 1.

sigma_ = np.std(Y)
#weights = dict([(i, 0.5) for i in np.unique(groups)])
weights = dict([(i, weight_frac * sigma_ * np.sqrt(2 * np.log(p))) for i in np.unique(groups)])

# To generalize to other link functions,
# change group_lasso.logistic() to, e.g., group_lasso.poisson(),
# and pass in arguments as defined in group_lasso_query.py;
conv = group_lasso.logistic(X=X,
                            successes=Y,
                            trials=np.ones(n),
                            groups=groups,
                            weights=weights,
                            useJacobian=True,
                            ridge_term=0.,
                            cov_rand=hess)

signs, _ = conv.fit()
nonzero = (signs != 0)

conv.setup_inference(dispersion=1)

target_spec = selected_targets(conv.loglike,
                               conv.observed_soln,
                               dispersion=1)

result,_ = conv.inference(target_spec,
                        method='selective_MLE',
                        level=0.9)

pval = result['pvalue']
intervals = np.asarray(result[['lower_confidence',
                               'upper_confidence']])

In [7]:
print("Selected Group Indices:", conv._ordered_groups)

Selected Group Indices: [5, 9, 10, 12, 14, 33, 40, 42, 43, 44]


In [53]:
print(pval)

0     0.256576
1     0.758525
2     0.919110
3     0.212195
4     0.884632
5     0.265846
6     0.820663
7     0.642729
8     0.526361
9     0.736804
10    0.531505
11    0.996099
12    0.252418
13    0.537264
14    0.029055
15    0.970745
16    0.060772
17    0.262038
18    0.503243
19    0.787372
20    0.592110
21    0.096246
22    0.598967
23    0.174697
24    0.322498
25    0.273063
26    0.890187
27    0.937532
Name: pvalue, dtype: float64


In [54]:
print(intervals)

[[ -1.99330506  10.85597146]
 [ -5.29115882   7.72357324]
 [ -7.09949922   6.27381694]
 [ -1.55229609  11.30091734]
 [ -7.16177285   6.00066002]
 [-11.38359431   2.19691534]
 [ -7.48536597   5.67203   ]
 [ -8.27855612   4.63627586]
 [ -9.23139809   4.09736979]
 [ -5.37353833   8.13336316]
 [ -4.07991806   9.08970783]
 [ -6.75722216   6.71717542]
 [-11.52481278   2.06733775]
 [ -9.6309945    4.37688381]
 [  2.20537705  15.69332249]
 [ -7.19692278   6.88299027]
 [-13.02008845  -0.85184068]
 [-11.292697     2.13598871]
 [ -9.47577663   3.99404391]
 [ -7.69575838   5.52741393]
 [ -4.09898625   8.05934224]
 [-11.93280775  -0.06665559]
 [ -4.07509544   7.90547705]
 [ -1.02677043  10.71839129]
 [ -2.42685676   9.75233936]
 [ -1.95766768   9.77690773]
 [ -5.81456324   6.880146  ]
 [ -5.54355433   6.09824787]]


In [55]:
print(result['MLE'])

0     4.431333
1     1.216207
2    -0.412841
3     4.874311
4    -0.580556
5    -4.593339
6    -0.906668
7    -1.821140
8    -2.567014
9     1.379912
10    2.504895
11   -0.020023
12   -4.728738
13   -2.627055
14    8.949350
15   -0.156966
16   -6.935965
17   -4.578354
18   -2.740866
19   -1.084172
20    1.980178
21   -5.999732
22    1.915191
23    4.845810
24    3.662741
25    3.909620
26    0.532791
27    0.277347
Name: MLE, dtype: float64
