In [None]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from pydts.examples_utils.generate_simulations_data import generate_quick_start_df
from pydts.examples_utils.plots import plot_example_pred_output
from pydts.examples_utils.plots import add_panel_text
from pydts.cross_validation import TwoStagesCV
from pydts.fitters import TwoStagesFitter, DataExpansionFitter

from pydts.data_generation import EventTimesSampler
from matplotlib import pyplot as plt
import warnings
import pickle
from copy import deepcopy
from sklearn.model_selection import KFold
# Show up to 500 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 500)
# Install a blanket 'ignore' filter: warnings raised after this point are suppressed.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Short alias for pd.IndexSlice.
slicer = pd.IndexSlice

In [None]:
# Directory that receives the pickled results written by this notebook.
# Defaults to the original container path; set the PYDTS_OUTPUT_DIR
# environment variable to redirect the output without editing the notebook.
OUTPUT_DIR = os.environ.get('PYDTS_OUTPUT_DIR', '/app/output')

In [None]:
# Batch identifier and number of repetitions in this batch.
file_number, runs = 2, 100

# Covariate dimension. For each event type only the first five regression
# coefficients are non-zero; the remaining ones are exactly zero.
n_cov = 100
beta1 = np.concatenate(([1.2, 1.5, -1, -0.3, -1.2], np.zeros(n_cov - 5)))
beta2 = np.concatenate(([-1.2, 1, 1, -1, 1.4], np.zeros(n_cov - 5)))

# Coefficients used to generate the data, keyed by event type (1 and 2):
# "alpha" holds functions of the time t, "beta" holds the covariate vectors.
real_coef_dict = dict(
    alpha={
        1: lambda t: -3.4 - 0.1 * np.log(t),
        2: lambda t: -3.4 - 0.2 * np.log(t),
    },
    beta={
        1: beta1,
        2: beta2,
    },
)

# Sample size, number of discrete times and number of event types.
n_patients, d_times, j_events = 10000, 15, 2



# Create the results directory up front. A missing directory would otherwise
# only surface at the pickle step, after a full run of cross-validation, and
# that run's results would be discarded by the exception handler below.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Settings shared by every run (none of them depends on the run index).
means_vector = np.zeros(n_cov)
covariance_matrix = 0.4*np.identity(n_cov)
clip_value = 1.5
covariates = [f'Z{i + 1}' for i in range(n_cov)]

# Grid of penalizer exponents; each value is passed through np.exp() before
# it is handed to the fitter.
step = 0.25
penalizers = np.arange(-7, -4.4, step=step)
n_splits = 5

for run in range(runs):
    # Derive the seed before entering the try block so that the failure
    # message always reports the seed of the run that actually failed
    # (previously it could be stale, or undefined on the very first run).
    seed = 100*file_number + run
    try:
        ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

        print(run, seed)
        np.random.seed(seed)

        # Draw the covariates from a multivariate normal and clip them
        # to [-clip_value, clip_value].
        patients_df = pd.DataFrame(
            data=np.random.multivariate_normal(means_vector, covariance_matrix, size=n_patients),
            columns=covariates,
        )
        patients_df = patients_df.clip(lower=-clip_value, upper=clip_value)

        # Sample event times from the true coefficients, then censoring with a
        # probability of 0.01 per entry of ets.times, and merge the two.
        # NOTE(review): exact semantics of these sampler methods live in pydts — confirm there.
        patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
        patients_df = ets.sample_independent_lof_censoring(patients_df, prob_lof_at_t=0.01 * np.ones_like(ets.times),
                                                           seed=seed + 1)
        patients_df = ets.update_event_or_lof(patients_df)
        patients_df.index.name = 'pid'
        patients_df = patients_df.reset_index()

        # One cross-validator per penalizer, keyed by the (un-exponentiated) grid value.
        cross_validators = {}

        for idp, penalizer in enumerate(penalizers):
            print(f"Started Penalizer: {penalizer}, {idp+1}/{len(penalizers)}")
            fit_beta_kwargs = {
                'model_kwargs': {
                    'penalizer': np.exp(penalizer),
                    # presumably l1_ratio=1 selects a pure L1 penalty — confirm against the fitter docs
                    'l1_ratio': 1
                }
            }
            start = time()
            cross_validators[penalizer] = TwoStagesCV()
            cross_validators[penalizer].cross_validate(full_df=patients_df, n_splits=n_splits, seed=seed, nb_workers=1,
                                                       fit_beta_kwargs=fit_beta_kwargs)
            end = time()
            print(f"Finished Penalizer: {penalizer}, {idp+1}/{len(penalizers)}, {int(end-start)} seconds")

        # Persist all cross-validators of this run in a single pickle named after the seed.
        with open(os.path.join(OUTPUT_DIR, f'FP-FN_cross_validators_{seed}.pkl'), 'wb') as f:
            pickle.dump(cross_validators, f)

    except Exception as e:
        # Best effort: report the failed run and continue with the next one.
        print(f"Run {run} failed: {seed}, {e}")
    
    

    
penalizers_x, mean_gauc, std_gauc = [], [], []
for penalizer in sorted(cross_validators.keys()):
    ser = pd.Series(cross_validators[penalizer].global_auc)
    penalizers_x.append(penalizer)
    mean_gauc.append(ser.mean())