In [None]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import warnings
# Raise the row limit pandas uses when rendering a frame or series to 500.
pd.options.display.max_rows = 500

# Short alias for pandas' IndexSlice helper; later cells use it to build
# MultiIndex selectors for .loc lookups.
slicer = pd.IndexSlice


# ---------------------------------------------------------------------------
# Notebook configuration.
# ---------------------------------------------------------------------------

# Directory holding the per-run CSV result files read by the cells below.
OUTPUT_DIR = ''

file_number = 2
runs = 50

# Length of each coefficient vector. Only the first five entries of beta1 and
# beta2 are non-zero; the remaining n_cov - 5 entries are exactly zero.
n_cov = 35
beta1 = np.concatenate(([1.2, 1.5, -1, -0.3, -1.2], np.zeros(n_cov - 5)))
beta2 = np.concatenate(([-1.2, -1, 1.4, 1, 1], np.zeros(n_cov - 5)))

# Reference coefficients keyed by 1 and 2 (presumably the two competing risks
# — confirm): "alpha" maps each key to a linear function of t, "beta" maps
# each key to the coefficient vector defined above.
real_coef_dict = dict(
    alpha={
        1: lambda t: -4.4 + 0.3 * t,
        2: lambda t: -4.3 + 0.3 * t,
    },
    beta={
        1: beta1,
        2: beta2,
    },
)

# NOTE(review): the names suggest sample size, number of discrete times and
# number of event types — none of the three is used in the cells shown here.
n_patients = 500
d_times = 10
j_events = 2

# Grid from -6 up to (but excluding) -3.4 in increments of `step`.
step = 0.25
penalizers = np.arange(start=-6, stop=-3.4, step=step)
n_splits = 3

# Zero mean vector and a diagonal matrix with 0.4 on the diagonal, both sized
# by n_cov (presumably the covariate sampling distribution — confirm).
means_vector = np.zeros(shape=n_cov)
covariance_matrix = 0.4 * np.eye(n_cov)
clip_value = 1.5

In [None]:
# Collect the result files in OUTPUT_DIR, grouped by the substring that
# identifies each kind of output.
#
# * The directory is listed once instead of once per file kind.
# * An empty OUTPUT_DIR is treated as the current directory: os.listdir('')
#   raises FileNotFoundError, whereas os.path.join('', name) (which is how the
#   loading cell builds its paths) resolves against the working directory.
# * os.listdir returns names in arbitrary order, so the names are sorted to
#   make the order in which runs are processed reproducible.
output_filenames = sorted(os.listdir(OUTPUT_DIR or os.curdir))

gauc_files = [name for name in output_filenames if "global_auc_" in name]
grid_search_files = [name for name in output_filenames if "FP-FN_grid_search_" in name]
tp_fp_files = [name for name in output_filenames if "tp_fp_" in name]
chosen_eta_files = [name for name in output_filenames if "chosen_eta_" in name]
non_zero_count_files = [name for name in output_filenames if "nonzero_count_" in name]

In [None]:
# Build two tables across runs, one column (group) per included run:
#   summary_df        - rows of tp_fp_<run>.csv at the chosen penalizer of each risk
#   chosen_nonzero_df - nonzero_count_<run>.csv values at the chosen penalizer of each risk
#
# A run is skipped when its global-AUC file has any NaN in the 'gauc' column.
# Per-run pieces are collected in lists and concatenated once after the loop
# instead of growing the frames with pd.concat on every iteration.

# Stop after this many included runs.
# NOTE(review): the configuration cell defines runs = 50, which is not used
# here — confirm which cap is intended.
MAX_INCLUDED_RUNS = 100

skipped = 0
included = 0

summary_frames = []
nonzero_series = []

for f in gauc_files:
    gauc_filename = os.path.join(OUTPUT_DIR, f)
    gauc_df = pd.read_csv(gauc_filename, index_col=0)

    # Run identifier: the text after the last underscore, up to the first dot.
    run_number = f.split('_')[-1].split('.')[0]

    if gauc_df['gauc'].isna().any():
        skipped += 1
        continue
    included += 1

    chosen_eta_df = pd.read_csv(os.path.join(OUTPUT_DIR, f'chosen_eta_{run_number}.csv'), index_col=0)
    # First column, rows 0 and 1: used below as the penalizer for risk 1 and risk 2.
    chosen_eta = [chosen_eta_df.iloc[0, 0], chosen_eta_df.iloc[1, 0]]
    tp_fp_df = pd.read_csv(os.path.join(OUTPUT_DIR, f'tp_fp_{run_number}.csv'), index_col=0)
    nonzero_df = pd.read_csv(os.path.join(OUTPUT_DIR, f'nonzero_count_{run_number}.csv'), index_col=0)

    # Index once and reuse for both risks.
    # NOTE(review): this lookup matches chosen_eta against the 'penalizer'
    # level by exact float equality, while the nonzero lookup below rounds to
    # two decimals — confirm the stored values always agree exactly.
    tp_fp_indexed = tp_fp_df.set_index(['penalizer', 'risk', 'type'])
    tmp_summary = pd.concat(
        [pd.concat([tp_fp_indexed.loc[slicer[chosen_eta[0], 1, :]],
                    tp_fp_indexed.loc[slicer[chosen_eta[1], 2, :]]],
                   keys=[1, 2], axis=0)],
        keys=[run_number], axis=1)
    summary_frames.append(tmp_summary)

    # The first two columns after reset_index become the ('risk', 'fold')
    # index; the remaining column labels are parsed as floats and rounded to
    # two decimals so they can be matched against the rounded chosen penalizer.
    tmp_nonzero = nonzero_df.reset_index()
    tmp_nonzero.columns = ['risk', 'fold'] + [np.round(float(c), 2) for c in tmp_nonzero.columns[2:]]
    tmp_nonzero = tmp_nonzero.set_index(['risk', 'fold'])

    nonzero_df = pd.concat([tmp_nonzero.loc[slicer[1, :], np.round(float(chosen_eta[0]), 2)],
                            tmp_nonzero.loc[slicer[2, :], np.round(float(chosen_eta[1]), 2)]])
    nonzero_df.name = run_number
    nonzero_series.append(nonzero_df)

    if included == MAX_INCLUDED_RUNS:
        break

# pd.concat raises on an empty list, so fall back to an empty frame when no
# run was included.
summary_df = pd.concat(summary_frames, axis=1) if summary_frames else pd.DataFrame()
chosen_nonzero_df = pd.concat(nonzero_series, axis=1) if nonzero_series else pd.DataFrame()

summary_df

In [None]:
# Entries of the (1, 'TP') row equal to 5, and entries of the (1, 'FP') row
# equal to 0. The value 5 matches the number of non-zero entries assigned to
# beta1 in the configuration cell (presumably "all true covariates selected"
# — confirm). Only the FP selection is displayed.
risk1_tp = summary_df.loc[slicer[1, 'TP']]
tmptp = risk1_tp[risk1_tp.eq(5)]

risk1_fp = summary_df.loc[slicer[1, 'FP']]
tmpfp = risk1_fp[risk1_fp.eq(0)]

tmpfp

In [None]:
# Column-wise totals over the rows of summary_df selected for risk 1.
risk1_rows = summary_df.loc[slicer[1, :]]
risk1_rows.sum(axis=0)

In [None]:
# Row-wise mean and sample standard deviation of summary_df across its columns.
# NOTE(review): the 'SE' column holds the output of DataFrame.std, i.e. a
# standard deviation — confirm that this is the intended quantity.
row_mean = summary_df.mean(axis=1)
row_std = summary_df.std(axis=1)

res_df = pd.concat({'Mean': row_mean, 'SE': row_std}, axis=1)
res_df.index.names = ['Risk', 'Type']
res_df

In [None]:
# Same mean / standard-deviation table as res_df, displayed without renaming
# the index levels.
pd.concat({'Mean': summary_df.mean(axis=1), 'SE': summary_df.std(axis=1)}, axis=1)