In [27]:
import numpy as np
import pandas as pd
import nose.tools as nt
import seaborn as sns
import matplotlib.pyplot as plt
import time

import regreg.api as rr

from selectinf.randomized.group_lasso_query import (group_lasso,split_group_lasso)
from selectinf.randomized.group_lasso_query_quasi import (group_lasso_quasi, split_group_lasso_quasi)

# from selectinf.base import (selected_targets,selected_targets_quasi)
from selectinf.randomized.tests.instance import (quasi_poisson_group_instance, poisson_group_instance)

from selectinf.base import restricted_estimator
from selectinf.randomized.tests.test_quasipoisson_group_lasso import calculate_F1_score, naive_inference, \
    randomization_inference, randomization_inference_poisson, data_splitting
import scipy.stats

In [28]:
# Column layout of the per-coefficient confidence-interval table written to CSV.
_CONFINT_COLUMNS = ['Index', 'target', 'LCB', 'UCB', 'TP', 'sparsity size', 'F1', 'Method']


def _confint_frame(rep, sparsity, method, f1, beta, nonzero,
                   beta_target, conf_low, conf_up):
    """
    Build the confidence-interval table of one method for one replicate.

    Parameters
    ----------
    rep : int
        Replicate index, stored in the 'Index' column.
    sparsity : int
        Number of signal groups used to generate the data.
    method : str
        Label stored in the 'Method' column.
    f1 : float
        F1 score of the selection, repeated on every row.
    beta : array
        Coefficient vector returned by the data generator.
    nonzero : boolean array
        Selection mask reported by the method; ``nonzero.sum()`` rows are produced.
    beta_target, conf_low, conf_up : array-like
        Targets and interval bounds reported by the method
        (presumably one entry per selected coefficient -- the column
        assignment below fails otherwise).

    Returns
    -------
    pandas.DataFrame with columns ``_CONFINT_COLUMNS``.
    """
    n_selected = nonzero.sum()
    frame = pd.concat([pd.DataFrame(np.ones(n_selected) * rep),
                       pd.DataFrame(beta_target),
                       pd.DataFrame(conf_low),
                       pd.DataFrame(conf_up),
                       # 'TP': whether the generating coefficient at each selected position is nonzero
                       pd.DataFrame(beta[nonzero] != 0),
                       pd.DataFrame(np.ones(n_selected) * sparsity),
                       pd.DataFrame(np.ones(n_selected) * f1),
                       pd.DataFrame([method] * n_selected)],
                      axis=1)
    # Name the columns per frame so the final row-wise concat never has to
    # align frames whose columns are all labelled 0.
    frame.columns = _CONFINT_COLUMNS
    return frame


def test_comparison_quasipoisson_group_lasso_vary_s(n=500,
                                          p=200,
                                          signal_fac=0.1,
                                          s=5,
                                          sigma=2,
                                          rho=0.3,
                                          randomizer_scale=1.,
                                          full_dispersion=True,
                                          level=0.90,
                                          iter=1,
                                          sparsity_levels=(5, 8, 10)):
    """
    Compare MLE (randomized), data-splitting and naive inference after a
    group lasso selection on Poisson-generated data, across sparsity levels.

    For every sparsity level and every replicate, data are regenerated until
    all three methods report a selection (each helper signals "no selection"
    by returning ``None`` as its coverage). Coverage, average interval length
    and F1 score are then recorded per method, written to two CSV files under
    ``selectinf/randomized/tests/`` (relative to the working directory), and
    summarised with three seaborn box plots.

    Parameters
    ----------
    n, p : int
        Requested number of observations / features. The group structure is
        fixed at 50 groups of size 4, so this presumably requires ``p == 200``.
    signal_fac : float
        The signal strength is ``sqrt(signal_fac * 2 * log(p))``.
    s : int
        Unused; kept for backward compatibility. The sparsity levels that are
        actually simulated come from ``sparsity_levels``.
    sigma, randomizer_scale, full_dispersion :
        Unused by this function; kept for backward compatibility.
    rho : float
        Passed to the data generator as ``rho``.
    level : float
        Confidence level passed to the data-splitting and naive procedures.
        ``randomization_inference`` is called without a level, as before.
    iter : int
        Number of replicates per sparsity level.
    sparsity_levels : iterable of int
        Numbers of signal groups (``sgroup``) to simulate.

    Notes
    -----
    No random seed is set here, so results differ between runs.
    """
    # Operating characteristics: one row per (replicate, sparsity level, method).
    oper_char = {"sparsity size": [],
                 "coverage rate": [],
                 "avg length": [],
                 "method": [],
                 "F1 score": []}
    confint_frames = []

    signal = np.sqrt(signal_fac * 2 * np.log(p))

    for sparsity in sparsity_levels:
        for i in range(iter):
            while True:  # regenerate data until every method has a non-empty selection
                groups = np.arange(50).repeat(4)

                X, Y, beta = poisson_group_instance(n=n,
                                                    p=p,
                                                    signal=signal,
                                                    sgroup=sparsity,
                                                    groups=groups,
                                                    ndiscrete=0,
                                                    nlevels=0,
                                                    sdiscrete=0,
                                                    equicorrelated=False,
                                                    rho=rho,
                                                    random_signs=True,
                                                    center=False,
                                                    scale=True)[:3]

                # Use the realised design dimensions without clobbering the
                # caller-supplied n and p.
                n_obs, n_feat = X.shape

                # MLE inference
                coverage, length, beta_target, nonzero, conf_low, conf_up = \
                    randomization_inference(X=X, Y=Y, n=n_obs, p=n_feat,
                                            beta=beta, groups=groups)
                noselection = (coverage is None)
                print("MLE inference noselection:", noselection)
                if noselection:
                    continue

                # Data splitting (now honours `level` instead of a hard-coded 0.9)
                coverage_ds, lengths_ds, conf_low_ds, conf_up_ds, nonzero_ds, beta_target_ds = \
                    data_splitting(X=X, Y=Y, n=n_obs, p=n_feat, groups=groups, beta=beta,
                                   proportion=0.5, level=level)
                noselection = (coverage_ds is None)
                print("data splitting noselection:", noselection)
                if noselection:
                    continue

                # Naive inference
                coverage_naive, lengths_naive, nonzero_naive, conf_low_naive, conf_up_naive, \
                    beta_target_naive = \
                    naive_inference(X=X, Y=Y, groups=groups,
                                    beta=beta, const=group_lasso_quasi.quasipoisson,
                                    n=n_obs, level=level, nonzero_true=(beta != 0))
                noselection = (coverage_naive is None)
                print("naive inference noselection:", noselection)
                if noselection:
                    continue

                # (label, coverage, lengths, selection mask, targets, lower, upper)
                method_results = [
                    ('MLE', coverage, length, nonzero,
                     beta_target, conf_low, conf_up),
                    ('Data splitting', coverage_ds, lengths_ds, nonzero_ds,
                     beta_target_ds, conf_low_ds, conf_up_ds),
                    ('Naive', coverage_naive, lengths_naive, nonzero_naive,
                     beta_target_naive, conf_low_naive, conf_up_naive),
                ]

                for label, cov, lens, selected, target, lower, upper in method_results:
                    f1_score = calculate_F1_score(beta, selection=selected)

                    oper_char["sparsity size"].append(sparsity)
                    oper_char["coverage rate"].append(np.mean(cov))
                    oper_char["avg length"].append(np.mean(lens))
                    oper_char["F1 score"].append(f1_score)
                    oper_char["method"].append(label)

                    confint_frames.append(
                        _confint_frame(rep=i, sparsity=sparsity, method=label,
                                       f1=f1_score, beta=beta, nonzero=selected,
                                       beta_target=target, conf_low=lower,
                                       conf_up=upper))

                break  # all methods selected something: go to the next replicate

    oper_char_df = pd.DataFrame.from_dict(oper_char)
    oper_char_df.to_csv('selectinf/randomized/tests/quasipois_vary_sparsity.csv', index=False)

    # Concatenate once at the end; fall back to an empty, correctly-labelled
    # table when nothing was simulated (e.g. iter=0).
    if confint_frames:
        confint_df = pd.concat(confint_frames, axis=0)
    else:
        confint_df = pd.DataFrame(columns=_CONFINT_COLUMNS)
    confint_df.to_csv('selectinf/randomized/tests/quasipois_CI_vary_sparsity.csv', index=False)

    print("Mean coverage rate/length:")
    print(oper_char_df.groupby(['sparsity size', 'method']).mean())

    sns.boxplot(y=oper_char_df["coverage rate"],
                x=oper_char_df["sparsity size"],
                hue=oper_char_df["method"],
                orient="v")
    plt.show()

    len_plot = sns.boxplot(y=oper_char_df["avg length"],
                           x=oper_char_df["sparsity size"],
                           hue=oper_char_df["method"],
                           showmeans=True,
                           orient="v")
    len_plot.set_ylim(0, 8)
    plt.show()

    F1_plot = sns.boxplot(y=oper_char_df["F1 score"],
                          x=oper_char_df["sparsity size"],
                          hue=oper_char_df["method"],
                          showmeans=True,
                          orient="v")
    F1_plot.set_ylim(0, 1)
    plt.show()

In [29]:
# Run the sparsity comparison with every argument left at its default value.
test_comparison_quasipoisson_group_lasso_vary_s()

(MLE) K estimated with full model
H norm:  7.035075205121727
K norm:  6.5355821643036105
H-K norm:  2.9271783086208045
Sigma_E norm:  8.31952993962497
H^{-1} norm:  8.69927153632105
H^{-1}-Sigma_E norm:  3.9137934569214274
MLE inference noselection: False
(Data Splitting) Selection done without carving


  loss_terms = - coef * ((counts - 1) * np.log(counts))


Data splitting |E|: 0
data splitting noselection: True
(MLE) K estimated with full model
H norm:  5.452799941628569
K norm:  4.687063088179814
H-K norm:  1.6466850985620463
Sigma_E norm:  5.764316868404765
H^{-1} norm:  6.4290015723893195
H^{-1}-Sigma_E norm:  1.8547431487862562
MLE inference noselection: False
(Data Splitting) Selection done without carving


  loss_terms = - coef * ((counts - 1) * np.log(counts))


Data splitting |E|: 4
data splitting noselection: False
(Naive) True E used
Naive selection [False False False False False False False False  True  True  True  True
 False False False False False False False False False False False False
 False False False False  True  True  True  True False False False False
 False False False False False False False False False False False False
  True  True  True  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True  True  True  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False F

  loss_terms = - coef * ((counts - 1) * np.log(counts))


H norm:  6.618034858764372
K norm:  7.8056072725772285
H-K norm:  3.1115822169936997
Sigma_E norm:  9.15635134418219
H^{-1} norm:  8.010286787541501
H^{-1}-Sigma_E norm:  3.55835441192286
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/yilingh/Desktop/PhD/SI_Codes/selective-inference/env3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/6t/y2p8qwk54f7fnkwds613w0040000gs/T/ipykernel_52800/801653391.py", line 1, in <module>
    test_comparison_quasipoisson_group_lasso_vary_s()
  File "/var/folders/6t/y2p8qwk54f7fnkwds613w0040000gs/T/ipykernel_52800/4291647449.py", line 92, in test_comparison_quasipoisson_group_lasso_vary_s
    randomization_inference(X=X, Y=Y, n=n, p=p, #proportion=0.5,
  File "/Users/yilingh/Desktop/PhD/SI_Codes/selective-inference/selectinf/randomized/tests/test_quasipoisson_group_lasso.py", line 323, in randomization_inference
  File "/Users/yilingh/Desktop/PhD/SI_Codes/selective-inference/selectinf/randomized/tests/test_quasipoisson_group_lasso.py", line 297, in solve_target_restricted
    # For LASSO, this is the OLS solution on X_{E,U}
