In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from selectinf.Simulation.spline_instance import generate_gaussian_instance_from_bspline_interaction, generate_gaussian_instance_nonlinear_interaction
from selectinf.reluctant_interaction import (SPAM, split_SPAM)
from selectinf.Simulation.spline_instance import gaussian_polynomial_interaction_instance

import regreg.api as rr
from selectinf.base import selected_targets_interaction
from selectinf.base import restricted_estimator
import scipy.stats

from selectinf.Simulation.test_group_lasso_simulation import (calculate_F1_score,
                                                              naive_inference,
                                                              randomization_inference,
                                                              data_splitting)



  warn('unable to import isotonic regression from sklearn, using a pure python implementation')


In [2]:
# Simulate one data set with p_nl=30, rho=0.5 and SNR=1; the settings are
# collected in a dict so they can be inspected or tweaked before the call.
sim_kwargs_p30 = dict(
    n=1000, p_nl=30, p_l=0, s_l=0,
    nknots=6, degree=2,
    SNR=1, rho=0.5,
    center=False, scale=False,
    random_signs=True,
    intercept=True,
    structure='stronghierarchy',
    s_interaction=10,
    interaction_signal=5,
)
(design, data_interaction, Y, Y_mean, data_combined,
 groups, active, active_inter_adj, active_inter_list) = (
    generate_gaussian_instance_nonlinear_interaction(**sim_kwargs_p30)
)

[[1.         0.48005291 0.45181027]
 [0.48005291 1.         0.48116166]
 [0.45181027 0.48116166 1.        ]]
Equally spaced quantile knots used.
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
noise_sd: 16.385860530083903
main effects: 293.96278038560604
interaction: 821.1261368954865


In [6]:
# Tunable settings for the larger simulation.
p = 100
intercept_flag = True
SNR = 1

# Simulate one data set with p_nl=p and rho=0.6; keyword arguments are
# gathered in a dict and expanded into the generator call.
sim_kwargs_p100 = dict(
    n=1000, p_nl=p, p_l=0, s_l=0,
    nknots=6, degree=2,
    SNR=SNR, rho=0.6,
    center=False, scale=False,
    random_signs=True,
    intercept=intercept_flag,
    structure='stronghierarchy',
    s_interaction=10,
    interaction_signal=5,
)
(design, data_interaction, Y, Y_mean, data_combined,
 groups, active, active_inter_adj, active_inter_list) = (
    generate_gaussian_instance_nonlinear_interaction(**sim_kwargs_p100)
)

[[1.         0.60141479 0.56263191]
 [0.60141479 1.         0.57671979]
 [0.56263191 0.57671979 1.        ]]
Equally spaced quantile knots used.
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
noise_sd: 20.797297970150808
main effects: 276.47556485155184
interaction: 1113.7742483997456


In [7]:
design.shape

(1000, 201)

In [8]:
np.linalg.matrix_rank(design)

201

In [9]:
np.linalg.matrix_rank(design[:,0:10])

10

In [10]:
def proj_to_basis(X):
    """Project the all-ones vector onto the column space of ``X``.

    Parameters
    ----------
    X : array_like, shape (n, k)
        Matrix whose columns span the subspace to project onto.

    Returns
    -------
    proj : ndarray, shape (n,)
        Least-squares fit ``X @ coef`` of the ones vector.
    coef : ndarray, shape (k,)
        Least-squares coefficients of the ones vector on the columns of
        ``X``; the minimum-norm solution when ``X`` is rank-deficient.
    """
    X = np.asarray(X)
    ones = np.ones((X.shape[0],))

    # Solve the least-squares problem directly instead of forming
    # inv(X.T @ X): this is better conditioned and still returns a valid
    # projection when the columns of X are linearly dependent.
    coef, *_ = np.linalg.lstsq(X, ones, rcond=None)

    # The projection is X @ coef; the n-by-n hat matrix is never needed.
    proj = X @ coef

    return proj, coef

In [11]:
# Project the ones vector onto design columns 1-3.
# The projection is stored under its own name so it does not overwrite `p`,
# which holds the number of features chosen for the simulation.
proj_ones, c = proj_to_basis(design[:,1:4])
print(proj_ones)

[0.15827419 0.4772165  0.4945896  0.07923025 0.18541663 1.02752359
 0.63473344 0.41066698 1.74454111 0.48656909 0.43430655 0.18335882
 1.01847416 1.0334956  1.79109848 0.49834385 0.08151213 0.19558551
 0.50846711 1.32046483 0.481986   0.87831223 0.22932303 0.35254421
 0.02215425 0.22831137 0.99689575 1.03870476 0.24415561 0.14778998
 0.15880295 0.24396143 0.68336178 0.33480167 0.10098058 0.03523753
 0.03674465 0.64807629 1.27425461 0.1700734  0.80457155 0.11589692
 1.1642271  0.38510666 0.44161719 1.76085273 0.33997174 0.2125153
 1.04512647 0.23503576 0.36953032 0.30291529 0.23135328 0.93839767
 0.2829771  0.5412118  0.37442865 0.17550145 0.39600097 0.10159111
 2.11580282 0.09026305 0.59236174 0.3731047  0.22108286 0.35329946
 1.10430656 0.27984858 0.58950127 0.18213005 0.08923462 0.40809724
 0.00589792 0.13966158 0.76326217 0.44012719 0.21833052 0.35973275
 1.20653191 0.14522739 0.48424671 0.56630172 0.35555765 0.08097112
 0.43423677 0.55521584 0.33976286 0.63809814 1.15926576 0.46454

In [12]:
print(c)

[-12.10190686  -0.40670425 -13.65061244]


In [13]:
# Print the label of every group whose design columns are linearly dependent.
# Group membership is read from `groups` instead of assuming 30 blocks of
# 3 consecutive columns, which does not match the 201-column design built
# with p_nl=100.
# NOTE(review): assumes `groups` holds one group label per design column — confirm.
group_labels = np.asarray(groups)
for g in np.unique(group_labels):
    group_cols = design[:, group_labels == g]
    if np.linalg.matrix_rank(group_cols) < group_cols.shape[1]:
        print(g)
    

In [14]:
# Second-smallest eigenvalue of the Gram matrix, as a conditioning check.
# The Gram matrix is symmetric, so eigvalsh applies: it returns real
# eigenvalues in ascending order, whereas eigvals guarantees no ordering
# and therefore makes positional indexing arbitrary.
np.linalg.eigvalsh(design.T @ design)[1]

0.2384243856283088

In [15]:
np.linalg.matrix_rank(design)

201

In [16]:
# ---- settings --------------------------------------------------------------
weight_frac = 1.
# NOTE(review): the design was generated with intercept_flag = True, yet
# `intercept` is False here, so the weights[0] = 0 override below is skipped —
# confirm this is intended.
intercept = False
const = SPAM.gaussian

n, p = design.shape

# ---- noise level -----------------------------------------------------------
# With more rows than columns, use the squared residual norm of the
# pseudo-inverse least-squares fit divided by (n - p); otherwise fall back to
# the squared sample standard deviation of Y.
if n > p:
    ls_coef = np.linalg.pinv(design).dot(Y)
    ls_resid = Y - design.dot(ls_coef)
    dispersion = np.linalg.norm(ls_resid) ** 2 / (n - p)
else:
    dispersion = np.std(Y) ** 2

sigma_ = np.sqrt(dispersion)

# ---- penalty weights -------------------------------------------------------
# Every distinct group label receives the same weight.
group_penalty = weight_frac * sigma_ * np.sqrt(2 * np.log(p))
weights = {label: group_penalty for label in np.unique(groups)}
if intercept:
    # Leave group 0 unpenalized.
    weights[0] = 0

# ---- fit -------------------------------------------------------------------
conv = const(
    X=design,
    Y=Y,
    groups=groups,
    weights=weights,
    useJacobian=True,
    ridge_term=0.,
    cov_rand=design.T @ design,
)

signs, _ = conv.fit()
nonzero = (signs != 0)

selected_groups = conv.selection_variable['active_groups']
G_E = len(selected_groups)

# ---- inference targets for the (0, 1) interaction --------------------------
interaction_01 = data_interaction[(0,1)]
conv.setup_interaction(interaction=interaction_01)
conv.setup_inference(dispersion=dispersion)

target_spec = selected_targets_interaction(
    conv.loglike,
    conv.observed_soln,
    interaction=interaction_01,
    dispersion=dispersion,
)

In [17]:
# Run inference with the 'selective_MLE' method at level 0.9, then pull out
# the p-value column and the two confidence-bound columns.
inference_kwargs = dict(method='selective_MLE', level=0.9)
result, _ = conv.inference(target_spec, **inference_kwargs)

pval = result['pvalue']

bound_columns = ['lower_confidence', 'upper_confidence']
intervals = np.asarray(result[bound_columns])

MLE Shape: (40,)


In [18]:
pval

0     3.500163e-03
1     9.904441e-01
2     6.702030e-01
3     1.028578e-01
4     9.192937e-02
5     3.208643e-01
6     2.440965e-02
7     2.999859e-01
8     1.095388e-08
9     8.265957e-01
10    1.817802e-03
11    9.555185e-02
12    3.116585e-01
13    8.381326e-01
14    1.299861e-02
15    2.247442e-01
16    1.421718e-01
17    1.898152e-01
18    1.932902e-01
19    9.374557e-01
20    2.238885e-01
21    4.958051e-01
22    1.059378e-02
23    8.341228e-01
24    3.825011e-01
25    6.383229e-01
26    8.182255e-01
27    6.913752e-01
28    8.511839e-01
29    1.933324e-01
30    2.012815e-01
31    3.920650e-01
32    3.174045e-03
33    4.669721e-01
34    9.660496e-01
35    3.408558e-02
36    7.468104e-04
37    4.980299e-01
38    1.056802e-06
39    5.896294e-01
Name: pvalue, dtype: float64

In [45]:
nonzero.sum()

21

In [46]:
conv.observed_soln != 0

array([False, False, False, False, False,  True,  True,  True, False,
        True,  True,  True,  True,  True, False, False, False, False,
       False,  True,  True,  True, False,  True, False, False,  True,
       False, False, False,  True, False, False,  True,  True, False,
       False, False,  True, False, False,  True,  True,  True, False,
       False, False, False, False,  True])

In [47]:
intervals

array([[-0.5857954 ,  0.75519369],
       [ 0.05264242,  1.34080315],
       [-1.23124057,  0.11850239],
       [-0.06833274,  1.23993246],
       [-0.67156807,  0.6706625 ],
       [-1.16716501,  0.13947456],
       [-0.70368614,  0.66895727],
       [-0.79787722,  0.57114273],
       [-0.52723975,  0.83064529],
       [-1.05933866,  0.26064749],
       [-0.79122661,  0.61077865],
       [ 1.11979535,  2.45036632],
       [ 0.27490398,  1.62183793],
       [ 0.09659609,  1.4362966 ],
       [-0.50358978,  0.80382377],
       [-1.08186619,  0.22643197],
       [ 0.31384315,  1.6975646 ],
       [-0.49575287,  0.80705528],
       [ 0.14568379,  1.48169713],
       [-0.57762035,  0.71286807],
       [-0.79774907,  0.55214373],
       [-1.01325486,  0.30027736]])

In [49]:
(a,b) = intervals[-1,:]

In [51]:
b

0.300277356640935

In [19]:
# Scratch data for a column-indexing experiment: three column indices and a
# 5x5 array of zeros.
idx = list(range(1, 4))
d = np.zeros(shape=(5, 5))

In [20]:
d[:,idx] = 1

In [21]:
d

array([[0., 1., 1., 1., 0.],
       [0., 1., 1., 1., 0.],
       [0., 1., 1., 1., 0.],
       [0., 1., 1., 1., 0.],
       [0., 1., 1., 1., 0.]])

In [30]:
# Set to 1 the columns among 0..3 that are not listed in idx.
complement_cols = np.setdiff1d(np.arange(4), idx)
d[:, complement_cols] = 1

In [31]:
d

array([[1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 0.]])