In [None]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from pydts.examples_utils.plots import plot_events_occurrence
from pydts.cross_validation import TwoStagesCV, PenaltyGridSearchCV
from pydts.fitters import TwoStagesFitter, DataExpansionFitter
from pydts.evaluation import *
from pydts.data_generation import EventTimesSampler
from pydts.screening import SISTwoStagesFitter
from matplotlib import pyplot as plt
import warnings
import pickle
from copy import deepcopy
from sklearn.model_selection import KFold
# Let pandas display up to 500 rows before truncating a rendered frame.
pd.set_option("display.max_rows", 500)
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation notices from pandas/numpy - consider narrowing the filter.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Short alias for pandas.IndexSlice.
slicer = pd.IndexSlice
from time import time
import psutil

# Number of physical CPU cores, handed to PyDTS as the worker count.
# psutil.cpu_count(logical=False) returns None when the count cannot be determined,
# so fall back to a single worker instead of propagating None.
WORKERS = psutil.cpu_count(logical=False) or 1

# Constants Definition

In [None]:
# Directories for result files (OUTPUT_DIR) and cached simulated datasets (DATA_DIR).
# The defaults are the original hard-coded locations; set PYDTS_OUTPUT_DIR / PYDTS_DATA_DIR
# in the environment to run the notebook on another machine without editing this cell.
OUTPUT_DIR = os.environ.get('PYDTS_OUTPUT_DIR',
                            '/data/home/tomer.me/git/DiscreteTimeSurvivalPenalization/output/tmp/')
DATA_DIR = os.environ.get('PYDTS_DATA_DIR',
                          '/data/home/tomer.me/git/DiscreteTimeSurvivalPenalization/output/tmp/')

In [None]:
# Problem dimensions: covariates, patients, discrete time points and competing event types.
n_cov = 100
n_patients = 1000
d_times = 8
j_events = 2

# Covariate labels Z1 ... Z<n_cov> and the symmetric bound used to clip sampled covariates.
covariates = [f'Z{idx}' for idx in range(1, n_cov + 1)]
clip_value = 3.

# True coefficient vectors: only the first five entries are non-zero for each event type.
beta1 = np.concatenate([np.array([-0.7, -0.6, 0.8, 0.7, -0.8]), np.zeros(n_cov - 5)])
beta2 = np.concatenate([np.array([0.7, 0.8, -0.8, -0.6, -0.7]), np.zeros(n_cov - 5)])

# Time-dependent intercepts (alpha) and covariate effects (beta), keyed by event type.
real_coef_dict = {
    "alpha": {
        1: lambda t: -3.2 + 0.3 * np.log(t),
        2: lambda t: -3.3 + 0.4 * np.log(t),
    },
    "beta": {
        1: beta1,
        2: beta2,
    },
}

In [None]:
# Per-rho run budget: the loops stop after `successful_runs` completed runs and try at most
# `runs` seeds, so with these values a single failed attempt per rho is tolerated.
successful_runs, runs = 2, 3
# Correlation levels used to build the covariate covariance matrix.
rhos = [0, 0.5, 0.9]
# Offset added to every run seed; it also tags the output file names.
seed_start = 0
# Forwarded as `quantile` to SISTwoStagesFitter.fit.
quantile = 1

# SIS

In [None]:
# Accumulators for the SIS screening results. Every successful run contributes one column
# named "run_<seed>_<rho with the dot removed>".
psis_selected_models = pd.DataFrame()
psis_thresholds = pd.DataFrame()
psis_selected_models_j1 = pd.DataFrame()
psis_selected_models_j2 = pd.DataFrame()


def _selection_indicator(chosen, name):
    """Return a Series over `covariates` holding 1 for the labels in `chosen` and 0 elsewhere."""
    indicator = pd.Series(np.zeros(len(covariates)), index=covariates, name=name)
    indicator.loc[chosen] = 1
    return indicator


for idrho, rho in enumerate(rhos):
    rho_tag = str(rho).replace('.', '')

    # Building covariance matrix
    print(f'Building covariance matrix for rho: {rho}')
    start_cov = time()
    means_vector = np.zeros(n_cov)
    # Entry (h, f) equals rho ** |h - f|. For rho == 0 this is the identity matrix,
    # since 0.0 ** 0 == 1.0 on the diagonal and 0.0 everywhere else.
    cov_positions = np.arange(n_cov)
    covariance_matrix = np.power(float(rho),
                                 np.abs(cov_positions[:, None] - cov_positions[None, :]))
    end_cov = time()
    print(f'Finished building covariates matrix for rho: {rho}, time: {int(end_cov-start_cov)}')

    total = 0  # Counts successes per rho
    for _seed in range(runs):
        # Defined before the try block so the error report below can always refer to them.
        seed = 1000*idrho + _seed + seed_start
        run_name = f"run_{seed}_{rho_tag}"
        dataset_file = os.path.join(DATA_DIR, f"patients_df_{seed}_{rho_tag}.csv")
        try:
            print(f'Starting run number: {seed}')

            if os.path.isfile(dataset_file):
                # Reuse a previously sampled dataset for this (seed, rho) pair.
                print(f"Loading dataset file: {dataset_file}")
                patients_df = pd.read_csv(dataset_file, index_col=0)
            else:
                # Dataset sampling
                start = time()
                print(f"Sampling dataset file: {dataset_file}")

                ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

                # Draw the covariates from a generator seeded with the run seed so that a
                # dataset can be regenerated exactly; the global NumPy RNG used before was
                # never seeded at this point.
                rng = np.random.default_rng(seed)
                patients_df = pd.DataFrame(
                    data=rng.multivariate_normal(means_vector, covariance_matrix, size=n_patients),
                    columns=covariates)
                patients_df = patients_df.clip(lower=-1 * clip_value, upper=clip_value)
                patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
                patients_df = ets.sample_independent_lof_censoring(patients_df,
                                                                   prob_lof_at_t=0.01 * np.ones(d_times),
                                                                   seed=seed)
                patients_df = ets.update_event_or_lof(patients_df)
                patients_df.index.name = 'pid'
                patients_df = patients_df.reset_index()

                end = time()
                print(f'Total sampling time: {int(end-start)}')

                patients_df.to_csv(dataset_file)

            # The screening below is run without the 'C' and 'T' columns.
            if 'T' in patients_df.columns:
                patients_df = patients_df.drop(['C', 'T'], axis=1)

            # SIS
            start_psis = time()
            fitter = SISTwoStagesFitter()
            fitter.fit(df=patients_df, quantile=quantile, seed=seed, fit_final_model=False)

            fitter.null_model_df.to_csv(os.path.join(OUTPUT_DIR,
                                    f"null_model_df_{seed}_{rho_tag}_{seed_start}.csv"))
            fitter.marginal_estimates_df.to_csv(os.path.join(OUTPUT_DIR,
                                    f"marginal_estimates_df_{seed}_{rho_tag}_{seed_start}.csv"))

            psis_thresholds = pd.concat([psis_thresholds,
                                         pd.Series(fitter.threshold, name=run_name)], axis=1)

            # 0/1 indicators of the covariates kept overall and per event type.
            selected_model = _selection_indicator(fitter.chosen_covariates, run_name)
            selected_model_j1 = _selection_indicator(fitter.chosen_covariates_j[1], run_name)
            # Fixed: the event-2 indicator was previously filled from chosen_covariates_j[1],
            # which made the j2 table a copy of the j1 table.
            selected_model_j2 = _selection_indicator(fitter.chosen_covariates_j[2], run_name)

            psis_selected_models = pd.concat([psis_selected_models, selected_model], axis=1)
            psis_selected_models_j1 = pd.concat([psis_selected_models_j1, selected_model_j1], axis=1)
            psis_selected_models_j2 = pd.concat([psis_selected_models_j2, selected_model_j2], axis=1)

            # Persist after every run so partial results survive an interrupted session.
            psis_selected_models.to_csv(os.path.join(OUTPUT_DIR, f'psis_selected_models_{seed_start}.csv'))
            psis_selected_models_j1.to_csv(os.path.join(OUTPUT_DIR, f'psis_selected_models_j1_{seed_start}.csv'))
            psis_selected_models_j2.to_csv(os.path.join(OUTPUT_DIR, f'psis_selected_models_j2_{seed_start}.csv'))

            psis_thresholds.to_csv(os.path.join(OUTPUT_DIR, f'psis_thresholds_{seed_start}.csv'))

            end_psis = time()
            print(f'Total PSIS time: {int(end_psis-start_psis)}')
            total += 1
            if total >= successful_runs:
                break
        except Exception as e:
            # A failed run is reported and skipped; the next seed is tried instead.
            print(f"Error in run {seed}: {e}")
            print(dataset_file)

# Read and Evaluate SIS

In [None]:
# Reload the SIS screening tables written by the screening cell (first CSV column is the index).
psis_selected_models, psis_selected_models_j1, psis_selected_models_j2, psis_thresholds = (
    pd.read_csv(os.path.join(OUTPUT_DIR, csv_name), index_col=0)
    for csv_name in (
        f'psis_selected_models_{seed_start}.csv',
        f'psis_selected_models_j1_{seed_start}.csv',
        f'psis_selected_models_j2_{seed_start}.csv',
        f'psis_thresholds_{seed_start}.csv',
    )
)

In [None]:
# Result tables accumulated over all (rho, seed) runs; columns carry rho and seed as
# outer MultiIndex levels.
full_alpha_df = pd.DataFrame()
full_beta_df = pd.DataFrame()
full_cv_metric_df = pd.DataFrame()
full_rho_df = pd.DataFrame()
full_thresh_df = pd.DataFrame()

# Upper bound on cross-validation attempts per run. The previous `while` loop with a bare
# `except:` retried forever and also swallowed KeyboardInterrupt.
max_cv_tries = 10

for idrho, rho in enumerate(rhos):
    rho_tag = str(rho).replace('.', '')
    rho_names = []  # names of the runs that completed for this rho
    total = 0
    for _seed in range(runs):
        # Defined before the try block so the error report below can always refer to it.
        seed = 1000*idrho + _seed + seed_start
        try:
            run_name = f"run_{seed}_{rho_tag}"
            print(f'Starting run: {run_name}')

            # Get PSIS selected covariates (overall and per event type)
            run_results = psis_selected_models[run_name]
            selected_covariates = run_results[run_results > 0].index.tolist()

            run_results_j1 = psis_selected_models_j1[run_name]
            selected_covariates_j1 = run_results_j1[run_results_j1 > 0].index.tolist()
            run_results_j2 = psis_selected_models_j2[run_name]
            selected_covariates_j2 = run_results_j2[run_results_j2 > 0].index.tolist()

            twostep_covariates = {1: selected_covariates_j1, 2: selected_covariates_j2}
            model_columns = ['pid', 'X', 'J'] + selected_covariates

            # Load Dataset
            patients_df = pd.read_csv(os.path.join(DATA_DIR,
                                                   f"patients_df_{seed}_{rho_tag}.csv"), index_col=0)

            # Fit selected model and save results
            fitter = TwoStagesFitter()
            fitter.fit(patients_df[model_columns], covariates=twostep_covariates)
            alpha_df = fitter.alpha_df[['X', 'J', 'n_jt', 'alpha_jt']]
            beta_df = fitter.get_beta_SE()
            alpha_df.to_csv(
                os.path.join(OUTPUT_DIR, f"alpha_df_{seed}_{rho_tag}_{seed_start}.csv"))
            beta_df.to_csv(
                os.path.join(OUTPUT_DIR, f"beta_df_{seed}_{rho_tag}_{seed_start}.csv"))

            # Evaluate model using 3-fold cross validation. Each retry uses a different
            # seed (seed + tries); after max_cv_tries failures the run is reported as failed.
            cv_metric_df = None
            last_cv_error = None
            for tries in range(max_cv_tries):
                try:
                    fitter_cv = TwoStagesCV()
                    fitter_cv.cross_validate(full_df=patients_df[model_columns],
                                             covariates=twostep_covariates, n_splits=3, seed=(seed+tries))
                    tmp = pd.concat([pd.DataFrame(fitter_cv.integrated_auc).T,
                                     pd.DataFrame(fitter_cv.integrated_bs).T], axis=1)
                    tmp.columns = ['IAUC 1', 'IAUC 2', 'IBS 1', 'IBS 2']
                    cv_metric_df = pd.concat([
                        pd.Series(fitter_cv.global_auc, name='GAUC'),
                        pd.Series(fitter_cv.global_bs, name='GBS'),
                        tmp
                    ], axis=1)
                    break
                except Exception as cv_error:
                    last_cv_error = cv_error

            if cv_metric_df is None:
                raise RuntimeError(
                    f"cross validation failed {max_cv_tries} times for {run_name}") from last_cv_error
            if tries > 3:
                print(f"Run name {run_name} took more than {tries} tries to be evaluated")

            cv_metric_df.to_csv(
                os.path.join(OUTPUT_DIR, f"cv_metric_df_{seed}_{rho_tag}_{seed_start}.csv"))

            # Only extend the combined tables once the whole run has succeeded, so they stay
            # consistent with `rho_names`.
            full_beta_df = pd.concat([full_beta_df,
                                      pd.concat([pd.concat([beta_df],
                                                           keys=[seed], axis=1)],
                                                keys=[rho], axis=1)], axis=1)
            full_alpha_df = pd.concat([full_alpha_df,
                                       pd.concat([pd.concat([alpha_df],
                                                            keys=[seed], axis=1)],
                                                 keys=[rho], axis=1)], axis=1)
            full_cv_metric_df = pd.concat([full_cv_metric_df,
                                           pd.concat([pd.concat([cv_metric_df],
                                                                keys=[seed], axis=1)],
                                                     keys=[rho], axis=1)], axis=1)

            rho_names.append(run_name)
            total += 1
            if total >= successful_runs:
                break

        except Exception as e:
            # A failed run is reported and skipped; the next seed is tried instead.
            print(f"Error in run {seed}: {e}")

    # Per-event selection indicators restricted to the runs that completed for this rho.
    rho_df_j1 = psis_selected_models_j1[rho_names]
    rho_df_j2 = psis_selected_models_j2[rho_names]

    # Mean and standard deviation of the screening thresholds across those runs.
    thresh_df = pd.concat([pd.concat([psis_thresholds[rho_names].mean(axis=1),
                                      psis_thresholds[rho_names].std(axis=1)], axis=1, keys=['mean', 'STD'])],
                          axis=1,
                          keys=[rho])

    full_thresh_df = pd.concat([full_thresh_df, thresh_df], axis=1)

    # Selection counts per run. The first five covariates are the truly non-zero ones
    # (see beta1/beta2), hence the split at row 5.
    rho_results = pd.concat([
            pd.Series(rho_df_j1.sum(axis=0), name='model_size_j1'),
            pd.Series(rho_df_j2.sum(axis=0), name='model_size_j2'),
            pd.Series(rho_df_j1.iloc[:5].sum(axis=0), name='TP_j1'),
            pd.Series(rho_df_j2.iloc[:5].sum(axis=0), name='TP_j2'),
            pd.Series((1-rho_df_j1.iloc[:5]).sum(axis=0), name='FN_j1'),
            pd.Series((1-rho_df_j2.iloc[:5]).sum(axis=0), name='FN_j2'),
            pd.Series(rho_df_j1.iloc[5:].sum(axis=0), name='FP_j1'),
            pd.Series(rho_df_j2.iloc[5:].sum(axis=0), name='FP_j2'),
            pd.Series((1-rho_df_j1.iloc[5:]).sum(axis=0), name='TN_j1'),
            pd.Series((1-rho_df_j2.iloc[5:]).sum(axis=0), name='TN_j2'),
        ], axis=1)

    rho_results.to_csv(os.path.join(OUTPUT_DIR, f"rho_results_{rho_tag}_{seed_start}.csv"))

    full_rho_df = pd.concat([full_rho_df,
                             pd.concat([rho_results], keys=[rho], axis=1).reset_index(drop=True)], axis=1)

full_beta_df.to_csv(os.path.join(OUTPUT_DIR, f"psis_full_beta_df_{seed_start}.csv"))
full_alpha_df.to_csv(os.path.join(OUTPUT_DIR, f"psis_full_alpha_df_{seed_start}.csv"))
full_cv_metric_df.to_csv(os.path.join(OUTPUT_DIR, f"psis_full_cv_metric_df_{seed_start}.csv"))
full_rho_df.to_csv(os.path.join(OUTPUT_DIR, f"psis_full_rho_df_{seed_start}.csv"))
full_thresh_df.to_csv(os.path.join(OUTPUT_DIR, f"psis_full_thresh_df_{seed_start}.csv"))

# Adding LASSO regularization: PSIS-L

In [None]:
# Grid of log-penalizer values searched for each event type, and the number of CV folds.
n_splits = 3
step = 3
penalizers = np.arange(start=-8., stop=-1.9, step=step)

In [None]:
# Reload the SIS screening tables from disk before the LASSO stage (first CSV column is the index).
psis_selected_models, psis_selected_models_j1, psis_selected_models_j2, psis_thresholds = (
    pd.read_csv(os.path.join(OUTPUT_DIR, csv_name), index_col=0)
    for csv_name in (
        f'psis_selected_models_{seed_start}.csv',
        f'psis_selected_models_j1_{seed_start}.csv',
        f'psis_selected_models_j2_{seed_start}.csv',
        f'psis_thresholds_{seed_start}.csv',
    )
)

In [None]:
# Result tables accumulated over all (rho, seed) runs of the SIS + LASSO procedure.
lasso_full_beta_df = pd.DataFrame()
lasso_full_alpha_df = pd.DataFrame()
lasso_full_cv_metric_df = pd.DataFrame()
lasso_full_rho_df = pd.DataFrame()
lasso_full_eta_df = pd.DataFrame()
# NOTE(review): not referenced anywhere in this cell; kept in case a later cell reads it.
lasso_cv_metric_df = pd.DataFrame()
psisL_etas = pd.DataFrame()
# 0/1 selection indicators per run. Built directly as integer frames rather than through
# an all-NaN object frame followed by fillna(0).
psisL_selected_models_j1 = pd.DataFrame(0, index=psis_selected_models.index,
                                        columns=psis_selected_models.columns)
psisL_selected_models_j2 = pd.DataFrame(0, index=psis_selected_models.index,
                                        columns=psis_selected_models.columns)

# Coefficients whose absolute value is below these cut-offs count as removed by the LASSO.
lasso_thresh_1 = 0.001
lasso_thresh_2 = 0.001

for idrho, rho in enumerate(rhos):
    rho_tag = str(rho).replace('.', '')
    rho_names = []  # names of the runs that completed for this rho
    total = 0
    for _seed in range(runs):
        # Defined before the try block so the error report below can always refer to it.
        seed = 1000*idrho + _seed + seed_start
        try:
            # Load PSIS results
            run_name = f"run_{seed}_{rho_tag}"

            # psis selected covariates (overall and per event type)
            run_results = psis_selected_models[run_name]
            selected_covariates = run_results[run_results > 0].index.tolist()

            run_results_j1 = psis_selected_models_j1[run_name]
            selected_covariates_j1 = run_results_j1[run_results_j1 > 0].index.tolist()
            run_results_j2 = psis_selected_models_j2[run_name]
            selected_covariates_j2 = run_results_j2[run_results_j2 > 0].index.tolist()

            twostep_covariates = {1: selected_covariates_j1, 2: selected_covariates_j2}
            model_columns = ['pid', 'X', 'J'] + selected_covariates

            thresholds = psis_thresholds[run_name]

            print(f'Starting run number: {seed}')
            print(f'PSIS selected_covariates: {selected_covariates}')

            # index_col=0 matches how the dataset is read elsewhere in the notebook and avoids
            # a spurious "Unnamed: 0" column.
            patients_df = pd.read_csv(os.path.join(DATA_DIR,
                                                   f"patients_df_{seed}_{rho_tag}.csv"), index_col=0)

            # Regularization parameters {eta_1, eta_2} tuning
            penalty_cv_search = PenaltyGridSearchCV()
            # Fixed: these kwargs were built but never handed to the grid search, so the
            # per-event covariate sets and the worker count were ignored during tuning.
            # NOTE(review): assumes PenaltyGridSearchCV.cross_validate accepts a
            # `twostages_fit_kwargs` argument - confirm against the installed PyDTS version.
            twostages_fit_kwargs = {'nb_workers': WORKERS, 'covariates': twostep_covariates}
            gauc_cv_results = penalty_cv_search.cross_validate(full_df=patients_df[model_columns],
                                                               l1_ratio=1,
                                                               penalizers=np.exp(penalizers),
                                                               n_splits=n_splits,
                                                               seed=seed,
                                                               twostages_fit_kwargs=twostages_fit_kwargs)
            # The grid is searched on the exp scale; store the chosen pair on the log scale.
            chosen_eta = np.log(gauc_cv_results['Mean'].idxmax())
            psisL_etas = pd.concat([psisL_etas,
                                    pd.Series(chosen_eta, index=['eta_1', 'eta_2'], name=run_name)], axis=1)

            # Per-fold metrics of the model that uses the chosen penalizer pair.
            chosen_gauc = []
            chosen_iauc1 = []
            chosen_iauc2 = []
            chosen_gbs = []
            chosen_ibs1 = []
            chosen_ibs2 = []

            for i_fold in range(n_splits):
                mixed_two_step = penalty_cv_search.folds_grids[i_fold].get_mixed_two_stages_fitter(np.exp(chosen_eta))
                test_df = patients_df[patients_df['pid'].isin(penalty_cv_search.test_pids[i_fold])]
                pred_df = mixed_two_step.predict_prob_events(test_df)
                chosen_gauc.append(global_auc(pred_df))
                chosen_gbs.append(global_brier_score(pred_df))
                iauc = events_integrated_auc(pred_df)
                ibs = events_integrated_brier_score(pred_df)
                chosen_iauc1.append(iauc[1])
                chosen_iauc2.append(iauc[2])
                chosen_ibs1.append(ibs[1])
                chosen_ibs2.append(ibs[2])

            cv_metric_df = pd.DataFrame([chosen_gauc, chosen_iauc1, chosen_iauc2,
                                         chosen_gbs, chosen_ibs1, chosen_ibs2],
                                        index=['GAUC', 'IAUC 1', 'IAUC 2', 'GBS', 'IBS 1', 'IBS 2']).T

            lasso_full_cv_metric_df = pd.concat([lasso_full_cv_metric_df,
                                                 pd.concat([pd.concat([cv_metric_df],
                                                                      keys=[seed], axis=1)],
                                                           keys=[rho], axis=1)], axis=1)

            # Lasso covariates selection: refit on the full data with the chosen penalizers.
            L1_regularized_fitter = TwoStagesFitter()
            fit_beta_kwargs = {
                'model_kwargs': {
                    1: {'penalizer': np.exp(chosen_eta[0]), 'l1_ratio': 1},
                    2: {'penalizer': np.exp(chosen_eta[1]), 'l1_ratio': 1}
            }}
            L1_regularized_fitter.fit(df=patients_df[model_columns],
                                      fit_beta_kwargs=fit_beta_kwargs, covariates=twostep_covariates)

            lasso_beta = L1_regularized_fitter.get_beta_SE()

            lasso_full_beta_df = pd.concat([lasso_full_beta_df,
                                            pd.concat([pd.concat([lasso_beta],
                                                                 keys=[seed], axis=1)],
                                                      keys=[rho], axis=1)], axis=1)
            lasso_full_alpha_df = pd.concat([lasso_full_alpha_df,
                                             pd.concat([pd.concat([L1_regularized_fitter.alpha_df[['X', 'J', 'n_jt', 'alpha_jt']]],
                                                                  keys=[seed], axis=1)],
                                                       keys=[rho], axis=1)], axis=1)

            # Covariates whose penalized coefficient stays at or above the cut-off are kept.
            j1_covs = lasso_beta.loc[lasso_beta.loc[:, 'j1_params'].abs() >= lasso_thresh_1, 'j1_params'].index.tolist()
            j2_covs = lasso_beta.loc[lasso_beta.loc[:, 'j2_params'].abs() >= lasso_thresh_2, 'j2_params'].index.tolist()

            psisL_selected_models_j1.loc[j1_covs, run_name] = 1
            psisL_selected_models_j2.loc[j2_covs, run_name] = 1

            print(f'PSIS-L selected_covariates j=1: {j1_covs}')
            print(f'PSIS-L selected_covariates j=2: {j2_covs}')

            rho_names.append(run_name)
            total += 1
            if total >= successful_runs:
                break

        except Exception as e:
            # A failed run is reported and skipped; the next seed is tried instead.
            print(f"Error in run {seed}: {e}")

    # Per-event selection indicators restricted to the runs that completed for this rho.
    rho_df_j1 = psisL_selected_models_j1[rho_names]
    rho_df_j2 = psisL_selected_models_j2[rho_names]

    # Mean and standard deviation of the chosen log-penalizers across those runs.
    eta_df = pd.concat([pd.concat([psisL_etas[rho_names].mean(axis=1),
                                   psisL_etas[rho_names].std(axis=1)], axis=1, keys=['mean', 'STD'])],
                       axis=1,
                       keys=[rho])

    lasso_full_eta_df = pd.concat([lasso_full_eta_df, eta_df], axis=1)

    # Selection counts per run. The first five covariates are the truly non-zero ones
    # (see beta1/beta2), hence the split at row 5.
    rho_results = pd.concat([
            pd.Series(rho_df_j1.sum(axis=0), name='model_size_j1'),
            pd.Series(rho_df_j2.sum(axis=0), name='model_size_j2'),
            pd.Series(rho_df_j1.iloc[:5].sum(axis=0), name='TP_j1'),
            pd.Series(rho_df_j2.iloc[:5].sum(axis=0), name='TP_j2'),
            pd.Series((1-rho_df_j1.iloc[:5]).sum(axis=0), name='FN_j1'),
            pd.Series((1-rho_df_j2.iloc[:5]).sum(axis=0), name='FN_j2'),
            pd.Series(rho_df_j1.iloc[5:].sum(axis=0), name='FP_j1'),
            pd.Series(rho_df_j2.iloc[5:].sum(axis=0), name='FP_j2'),
            pd.Series((1-rho_df_j1.iloc[5:]).sum(axis=0), name='TN_j1'),
            pd.Series((1-rho_df_j2.iloc[5:]).sum(axis=0), name='TN_j2'),
        ], axis=1)
    rho_results.to_csv(os.path.join(OUTPUT_DIR, f"lasso_rho_results_{rho_tag}_{seed_start}.csv"))

    lasso_full_rho_df = pd.concat([lasso_full_rho_df,
                                   pd.concat([rho_results], keys=[rho], axis=1).reset_index(drop=True)], axis=1)

lasso_full_beta_df.to_csv(os.path.join(OUTPUT_DIR, f"lasso_psis_full_beta_df_{seed_start}.csv"))
lasso_full_alpha_df.to_csv(os.path.join(OUTPUT_DIR, f"lasso_psis_full_alpha_df_{seed_start}.csv"))
lasso_full_cv_metric_df.to_csv(os.path.join(OUTPUT_DIR, f"lasso_psis_full_cv_metric_df_{seed_start}.csv"))
lasso_full_rho_df.to_csv(os.path.join(OUTPUT_DIR, f"lasso_psis_full_rho_df_{seed_start}.csv"))
lasso_full_eta_df.to_csv(os.path.join(OUTPUT_DIR, f"lasso_psis_full_thresh_df_{seed_start}.csv"))
psisL_etas.to_csv(os.path.join(OUTPUT_DIR, f"lasso_selection_psisL_etas_{seed_start}.csv"))
psisL_selected_models_j1.to_csv(os.path.join(OUTPUT_DIR, f'psisL_selected_models_j1_{seed_start}.csv'))
psisL_selected_models_j2.to_csv(os.path.join(OUTPUT_DIR, f'psisL_selected_models_j2_{seed_start}.csv'))

# Results tables

In [None]:
# Columns reported in the model-size table (the TP/TN counts are left out).
order = ['model_size_j1', 'FP_j1', 'FN_j1', 'model_size_j2', 'FP_j2', 'FN_j2']

# Mean and standard deviation over runs: one row per rho, one column per reported count.
psis_mean = full_rho_df.mean(axis=0).unstack(level=0).T[order]
psis_std = full_rho_df.std(axis=0).unstack(level=0).T[order]

# Put the count name on the outer column level with (Mean, SE) underneath it.
psis = (
    pd.concat([psis_mean, psis_std], keys=['Mean', 'SE'], axis=1)
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)[order]
)
psis.index.name = r"$\rho$"

In [None]:
# Columns reported in the model-size table (the TP/TN counts are left out).
order = ['model_size_j1', 'FP_j1', 'FN_j1', 'model_size_j2', 'FP_j2', 'FN_j2']

# Mean and standard deviation over runs: one row per rho, one column per reported count.
psisL_mean = lasso_full_rho_df.mean(axis=0).unstack(level=0).T[order]
psisL_std = lasso_full_rho_df.std(axis=0).unstack(level=0).T[order]

# Put the count name on the outer column level with (Mean, SE) underneath it.
psisL = (
    pd.concat([psisL_mean, psisL_std], keys=['Mean', 'SE'], axis=1)
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)[order]
)
psisL.index.name = r"$\rho$"

In [None]:
# LaTeX column headers for the reported selection counts.
model_size_labels = {
    'model_size_j1': r"|$\mathcal{M}_1$|",
    'model_size_j2': r"|$\mathcal{M}_2$|",
    'FP_j1': r"$\mbox{FP}_1$",
    'FN_j1': r"$\mbox{FN}_1$",
    'FP_j2': r"$\mbox{FP}_2$",
    'FN_j2': r"$\mbox{FN}_2$",
}

# Stack the SIS and SIS-L summaries row-wise, round to two decimals and relabel the columns.
model_size_table = (
    pd.concat([psis, psisL], keys=["SIS", "SIS-L"], axis=0)
    .round(2)
    .rename(columns=model_size_labels)
)
model_size_table

In [None]:
# Emit the model-size table as LaTeX source; escape=False keeps the math-mode headers intact.
model_size_latex = model_size_table.to_latex(escape=False)
print(model_size_latex)

In [None]:
# Metrics reported in the performance table, in display order.
order = ['GAUC', 'GBS', 'IAUC 1', 'IAUC 2', 'IBS 1', 'IBS 2']

# full_cv_metric_df has (rho, seed, metric) column levels and one row per CV fold.
# Average over seeds within each (rho, metric) pair. Grouping the transposed frame by its
# row levels replaces `groupby(level=[0, 2], axis=1)`, whose `axis` argument is deprecated
# in recent pandas releases; the result is the same.
psis_seed_avg = full_cv_metric_df.T.groupby(level=[0, 2]).mean().T

# Mean and standard deviation across folds: one row per rho, one column per metric.
psis_mean = psis_seed_avg.mean(axis=0).unstack()
psis_mean = psis_mean[order]
psis_std = psis_seed_avg.std(axis=0).unstack()
psis_std = psis_std[order]

# Put the metric name on the outer column level with (Mean, SE) underneath it.
psis = pd.concat([psis_mean, psis_std], keys=['Mean', 'SE'], axis=1)
psis.columns = psis.columns.swaplevel(1, 0)
psis = psis.sort_index(axis=1)[order]
psis.index.name = r"$\rho$"

In [None]:
# lasso_full_cv_metric_df has (rho, seed, metric) column levels and one row per CV fold.
# Average over seeds within each (rho, metric) pair. Grouping the transposed frame by its
# row levels replaces `groupby(level=[0, 2], axis=1)`, whose `axis` argument is deprecated
# in recent pandas releases; the result is the same.
psisL_seed_avg = lasso_full_cv_metric_df.T.groupby(level=[0, 2]).mean().T

# Mean and standard deviation across folds, restricted to the metrics listed in `order`.
psisL_mean = psisL_seed_avg.mean(axis=0).unstack()
psisL_mean = psisL_mean[order]
psisL_std = psisL_seed_avg.std(axis=0).unstack()
psisL_std = psisL_std[order]

# Put the metric name on the outer column level with (Mean, SE) underneath it.
psisL = pd.concat([psisL_mean, psisL_std], keys=['Mean', 'SE'], axis=1)
psisL.columns = psisL.columns.swaplevel(1, 0)
psisL = psisL.sort_index(axis=1)[order]
psisL.index.name = r"$\rho$"

In [None]:
# LaTeX column headers for the performance metrics.
metric_labels = {
    'GAUC': 'AUC',
    'GBS': 'BS',
    'IAUC 1': r'$\mbox{AUC}_1$',
    'IAUC 2': r'$\mbox{AUC}_2$',
    'IBS 1': r'$\mbox{BS}_1$',
    'IBS 2': r'$\mbox{BS}_2$',
}

# Place SIS and SIS-L side by side, round to three decimals, move the method level
# from the columns into the row index, then relabel the metric columns.
metrics_table = (
    pd.concat([psis, psisL], keys=["SIS", "SIS-L"], axis=1)
    .round(3)
    .stack(0)
    .rename(columns=metric_labels)
)
metrics_table

In [None]:
# Emit the performance table as LaTeX source; escape=False keeps the math-mode headers intact.
metrics_latex = metrics_table.to_latex(escape=False)
print(metrics_latex)

In [None]:
# Labels of the five covariates with non-zero true coefficients (Z1 ... Z5).
non_zero = [f'Z{k}' for k in range(1, 6)]

In [None]:
# Round the per-rho threshold summary to three decimals and use the table's Mean/SE labels.
thresh_column_labels = {'mean': 'Mean', 'STD': 'SE'}
final_full_thresh_df = full_thresh_df.round(3).rename(columns=thresh_column_labels)
final_full_thresh_df

In [None]:
# Emit the threshold summary as LaTeX source.
thresh_latex = final_full_thresh_df.to_latex()
print(thresh_latex)

In [None]:
# Round the per-rho log-penalizer summary to three decimals, then switch to the table's
# Mean/SE column labels and LaTeX row labels.
eta_column_labels = {'mean': 'Mean', 'STD': 'SE'}
eta_row_labels = {'eta_1': r'$\eta_1$', 'eta_2': r'$\eta_2$'}
final_lasso_full_eta_df = lasso_full_eta_df.round(3).rename(columns=eta_column_labels,
                                                            index=eta_row_labels)
final_lasso_full_eta_df

In [None]:
# Emit the log-penalizer summary as LaTeX source; escape=False keeps the math-mode labels intact.
eta_latex = final_lasso_full_eta_df.to_latex(escape=False)
print(eta_latex)