In [None]:
import os
import pickle
import warnings
from time import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from pydts.data_generation import EventTimesSampler
from pydts.examples_utils.generate_simulations_data import generate_quick_start_df
from pydts.examples_utils.plots import add_panel_text
from pydts.examples_utils.plots import plot_example_pred_output
from pydts.examples_utils.plots import plot_events_occurrence
from pydts.fitters import TwoStagesFitter, DataExpansionFitter
# Let pandas display up to 500 rows before truncating a frame.
pd.set_option("display.max_rows", 500)
# NOTE(review): this silences every warning for the whole session, including
# any warning raised while the fitters run - consider narrowing the filter.
warnings.filterwarnings('ignore')
%matplotlib inline
# Shorthand used later to build MultiIndex column selections inside `.loc`.
slicer = pd.IndexSlice

In [None]:
# Directory that receives the result CSVs and pickled fit timings written below.
# NOTE(review): absolute, environment-specific path - adjust when running elsewhere.
OUTPUT_DIR = '/app/output/'

# Sampling data

In [None]:
# --- Simulation dimensions -------------------------------------------------
n_patients = 50000
n_cov = 5
d_times = 60
j_events = 2
seed = 0

# --- Coefficients handed to the sampler as `hazard_coefs` -------------------
# Keys 1 and 2 index the two event types: "alpha" holds a callable of t,
# "beta" holds one coefficient per covariate.
real_coef_dict = {
    "alpha": {
        1: lambda t: 2.5 * np.log(t) - 11.5,
        2: lambda t: 2 * np.log(t) - 10.5,
    },
    "beta": {
        1: [1.3, 1.7, -1.5, 0.5, 1.6],
        2: [-1.5, 1.5, 1.8, -1, 1.2],
    },
}

# --- Covariate distribution: zero mean, identity covariance -----------------
covariates = [f'Z{i + 1}' for i in range(n_cov)]
means_vector = np.zeros(n_cov)
covariance_matrix = np.eye(n_cov)

ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

In [None]:
# Column labels, including their exact surrounding whitespace, used in the
# comparison loop to select the coefficient and standard-error columns from the
# DataExpansionFitter result tables.
# NOTE(review): the padding presumably mirrors the headers produced upstream -
# confirm before normalising these strings.
COEF_COL = '   coef   '
STDERR_COL = ' std err '

In [None]:
# Seed NumPy's global RNG so the covariate draw below is identical on every
# Restart & Run All (the sampler calls already receive explicit seeds).
np.random.seed(seed)
patients_df = pd.DataFrame(
    data=np.random.multivariate_normal(means_vector, covariance_matrix, size=n_patients),
    columns=covariates,
)

# Sample event times from `real_coef_dict`, then censoring with a constant
# value of 0.002 for every entry of `ets.times`, and combine the two.
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.002*np.ones_like(ets.times), seed=seed+1)
patients_df = ets.update_event_or_lof(patients_df)

# Expose the row index as an explicit 'pid' column.
patients_df.index.name='pid'
patients_df = patients_df.reset_index()

plot_events_occurrence(patients_df)

In [None]:
# Compare TwoStagesFitter with DataExpansionFitter (Lee et al.) over repeated
# simulated samples: record the fit durations and the estimated coefficients
# of every trial in which BOTH fitters succeeded.

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for n_patients in [10_000]: # 5_000, 20_000, 50_000, 100_000
    case = f'Sample_size_{n_patients}'
    k_runs = 3
    two_step_fit_times = []
    lee_fit_times = []
    # 1-based ids of the trials whose results were accumulated.
    successful_runs = []
    # Reset the accumulators for every case so that a failed first trial can
    # never concatenate onto frames left over from a previous case or an
    # earlier execution of this cell.
    two_step_alpha_k_results = None
    two_step_beta_k_results = None
    lee_alpha_k_results = None
    lee_beta_k_results = None

    for k in range(k_runs):
        try:
            # Covariates are re-drawn with a different NumPy seed in each trial.
            # NOTE(review): the event-time and censoring samplers below receive
            # the same `seed` / `seed+1` in every trial - confirm this is intended.
            np.random.seed(seed+k)
            patients_df = pd.DataFrame(data=np.random.multivariate_normal(means_vector, covariance_matrix, size=n_patients),
                                    columns=covariates)

            patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
            patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.01*np.ones_like(ets.times), seed=seed+1)
            patients_df = ets.update_event_or_lof(patients_df)
            patients_df.index.name='pid'
            patients_df = patients_df.reset_index()

            # Two step fitter
            new_fitter = TwoStagesFitter()
            print(case)
            print(f'Starting two-step: {k}')
            two_step_start = time()
            new_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1))
            two_step_end = time()
            print(f'Finished two-step: {k}, {two_step_end-two_step_start}sec')


            # Lee et al fitter
            lee_fitter = DataExpansionFitter()
            print(f'Starting Lee: {k}')
            lee_start = time()
            lee_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1))
            lee_end = time()
            print(f'Finished lee: {k}, {lee_end-lee_start}sec')


            lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()
            lee_beta_ser = lee_fitter.get_beta_SE().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()
            two_step_beta_ser = new_fitter.get_beta_SE().unstack()

            # Build all four updated accumulators first and commit them together
            # below, so a failure part-way through cannot leave them out of step.
            if not successful_runs:
                # First successful trial (not necessarily k == 0): start the frames.
                # The alpha frame keeps 'J' and 'X' so they can become the index later.
                updated_two_step_alpha = new_fitter.alpha_df[['J', 'X', 'alpha_jt']]
                updated_two_step_beta = two_step_beta_ser.to_frame()
                updated_lee_alpha = lee_alpha_ser.to_frame()
                updated_lee_beta = lee_beta_ser.to_frame()
            else:
                # Later trials add one column per frame.
                updated_two_step_alpha = pd.concat([two_step_alpha_k_results, new_fitter.alpha_df['alpha_jt']], axis=1)
                updated_two_step_beta = pd.concat([two_step_beta_k_results, two_step_beta_ser], axis=1)
                updated_lee_alpha = pd.concat([lee_alpha_k_results, lee_alpha_ser], axis=1)
                updated_lee_beta = pd.concat([lee_beta_k_results, lee_beta_ser], axis=1)

            # Save results only if both fitters were successful
            two_step_alpha_k_results = updated_two_step_alpha
            two_step_beta_k_results = updated_two_step_beta
            lee_alpha_k_results = updated_lee_alpha
            lee_beta_k_results = updated_lee_beta
            two_step_fit_times.append(two_step_end - two_step_start)
            lee_fit_times.append(lee_end-lee_start)
            successful_runs.append(k + 1)

            # Cache results (the same files are overwritten after every trial
            # with the cumulative results so far)
            two_step_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'two_step_alpha_run_{k_runs}.csv'))
            two_step_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'two_step_beta_run_{k_runs}.csv'))
            lee_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'lee_alpha_run_{k_runs}.csv'))
            lee_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'lee_beta_run_{k_runs}.csv'))

            with open(os.path.join(OUTPUT_DIR, f"two_step_fit_times_{k_runs}"), "wb") as fp:
                pickle.dump(two_step_fit_times, fp)

            with open(os.path.join(OUTPUT_DIR, f"lee_fit_times_{k_runs}"), "wb") as fp:
                pickle.dump(lee_fit_times, fp)

        except Exception as e:
            # Best effort: report the failed trial and carry on with the next one.
            print(f'Failed during trial {k}')
            print(e)

    if not successful_runs:
        # Nothing was accumulated, so there is nothing to relabel or save.
        print(f'No successful trial for {case}, nothing to save')
        continue

    # Label each column with the id of the trial that produced it. When every
    # trial succeeds this is 1..k_runs; after a failure it matches the number
    # of accumulated columns instead of raising a length mismatch.
    two_step_alpha_k_results = two_step_alpha_k_results.set_index(['J', 'X'])
    two_step_alpha_k_results.columns = successful_runs
    two_step_beta_k_results.columns = successful_runs
    lee_alpha_k_results.columns = successful_runs
    lee_beta_k_results.columns = successful_runs


    # Save results
    two_step_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha_run_{k_runs}.csv'))
    two_step_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_beta_run_{k_runs}.csv'))
    lee_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_alpha_run_{k_runs}.csv'))
    lee_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_beta_run_{k_runs}.csv'))

    with open(os.path.join(OUTPUT_DIR, f"{case}_two_step_fit_times_{k_runs}"), "wb") as fp:
        pickle.dump(two_step_fit_times, fp)

    with open(os.path.join(OUTPUT_DIR, f"{case}_lee_fit_times_{k_runs}"), "wb") as fp:
        pickle.dump(lee_fit_times, fp)

In [None]:
# Read back the fit durations that the comparison loop pickled for the last
# processed `case`.
two_step_times_path = os.path.join(OUTPUT_DIR, f"{case}_two_step_fit_times_{k_runs}")
lee_times_path = os.path.join(OUTPUT_DIR, f"{case}_lee_fit_times_{k_runs}")

with open(two_step_times_path, "rb") as fp:
    two_step_fit_times = pickle.load(fp)

with open(lee_times_path, "rb") as fp:
    lee_fit_times = pickle.load(fp)

# L1 Regularization

In [None]:
# --- Simulation dimensions for the regularization examples ------------------
n_patients = 20000
n_cov = 5
d_times = 30
j_events = 2
seed = 0

# --- Coefficients handed to the sampler as `hazard_coefs` -------------------
# Keys 1 and 2 index the two event types: "alpha" holds a callable of t,
# "beta" holds one coefficient per covariate.
real_coef_dict = {
    "alpha": {
        1: lambda t: 2.5 * np.log(t) - 9.5,
        2: lambda t: 1.5 * np.log(t) - 6.5,
    },
    "beta": {
        1: [1.3, 1.7, -1.5, 0.5, 1.6],
        2: [-1.5, 1.5, 1.8, -1, 1.2],
    },
}

# --- Covariate distribution: zero mean, identity covariance -----------------
covariates = [f'Z{i + 1}' for i in range(n_cov)]
means_vector = np.zeros(n_cov)
covariance_matrix = np.eye(n_cov)

ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

In [None]:
# Seed NumPy's global RNG so the covariate draw below is identical on every
# Restart & Run All (the sampler calls already receive explicit seeds).
np.random.seed(seed)
patients_df = pd.DataFrame(
    data=np.random.multivariate_normal(means_vector, covariance_matrix, size=n_patients),
    columns=covariates,
)

# Sample event times from `real_coef_dict`, then censoring with a constant
# value of 0.002 for every entry of `ets.times`, and combine the two.
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.002*np.ones_like(ets.times), seed=seed+1)
patients_df = ets.update_event_or_lof(patients_df)

# Expose the row index as an explicit 'pid' column.
patients_df.index.name='pid'
patients_df = patients_df.reset_index()

plot_events_occurrence(patients_df)

In [None]:
# TODO: use train_test_split to decide the value of the penalizer
# (candidate selection criterion: MSE)

In [None]:
# Refit the two-stage model once per candidate penalizer, using a pure L1
# penalty (l1_ratio = 1), and print the resulting beta table for comparison.
for penalizer in [0.0005, 0.005, 0.01, 0.05]:

    fit_beta_kwargs = {'model_kwargs': {'penalizer': penalizer, 'l1_ratio': 1}}

    L1_regularized_fitter = TwoStagesFitter()
    L1_regularized_fitter.fit(
        df=patients_df.drop(columns=['C', 'T']),
        fit_beta_kwargs=fit_beta_kwargs,
    )

    # Report, framed by separator lines.
    print('**********************************************')
    print(f'Penalizer {penalizer}')
    print(L1_regularized_fitter.get_beta_SE())
    print('**********************************************')


# Large number of covariates

In [None]:
n_patients = 30000
n_cov = 100

# Coefficient vectors sized from `n_cov` (instead of a repeated literal 100):
# only the first five covariates get a non-zero coefficient, the rest stay 0.
beta_j1 = np.zeros(n_cov)
beta_j1[:5] = -np.log([0.8, 3, 3, 2.5, 2])

beta_j2 = np.zeros(n_cov)
beta_j2[:5] = -np.log([2, 3, 4, 3, 2])

# Coefficients handed to the sampler as `hazard_coefs`; keys 1 and 2 index the
# two event types.
real_coef_dict = {
    "alpha": {
        1: lambda t: -0.25 - 0.3 * np.log(t),
        2: lambda t: -0.75 - 0.15 * np.log(t)
    },
    "beta": {
        1: beta_j1,
        2: beta_j2
    }
}

ets = EventTimesSampler(d_times=14, j_event_types=2)

seed = 0
covariates = [f'Z{i + 1}' for i in range(n_cov)]

# Seed NumPy's global RNG so the uniform covariate draw is identical on every
# Restart & Run All (the sampler calls already receive explicit seeds).
np.random.seed(seed)
patients_df = pd.DataFrame(data=np.random.uniform(low=0.0, high=1.0, size=[n_patients, n_cov]),
                            columns=covariates)

# Sample event times, then censoring with a constant value of 0.01 for every
# entry of `ets.times`, and combine the two.
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.01*np.ones_like(ets.times), seed=seed+1)
patients_df = ets.update_event_or_lof(patients_df)

# Expose the row index as an explicit 'pid' column.
patients_df.index.name='pid'
patients_df = patients_df.reset_index()
patients_df

In [None]:
# Number of rows per value of the 'J' column in the sampled data.
event_type_counts = patients_df['J'].value_counts()
event_type_counts

In [None]:
# Same L1 sweep (l1_ratio = 1) on the 100-covariate data, including the
# unpenalized fit (penalizer 0.0) as a baseline.
for penalizer in [0.0, 0.001, 0.005]:

    fit_beta_kwargs = {'model_kwargs': {'penalizer': penalizer, 'l1_ratio': 1}}

    L1_regularized_fitter = TwoStagesFitter()
    L1_regularized_fitter.fit(
        df=patients_df.drop(columns=['C', 'T']),
        fit_beta_kwargs=fit_beta_kwargs,
    )

    # Report, framed by separator lines.
    print('**********************************************')
    print(f'Penalizer {penalizer}')
    print(L1_regularized_fitter.get_beta_SE())
    print('**********************************************')

# L2 Regularization

In [None]:
# Single fit with penalizer 0.003 and l1_ratio = 0 (the L2 setting named by
# this section's heading); the beta table is shown as the cell output.
fit_beta_kwargs = {'model_kwargs': {'penalizer': 0.003, 'l1_ratio': 0}}

L2_regularized_fitter = TwoStagesFitter()
L2_regularized_fitter.fit(
    df=patients_df.drop(columns=['C', 'T']),
    fit_beta_kwargs=fit_beta_kwargs,
)

L2_regularized_fitter.get_beta_SE()

# Elastic Net (EN) Regularization

In [None]:
# Single fit with penalizer 0.003 and l1_ratio = 0.5, i.e. an even mix of the
# L1 and L2 settings used above; the beta table is shown as the cell output.
fit_beta_kwargs = {'model_kwargs': {'penalizer': 0.003, 'l1_ratio': 0.5}}

EN_regularized_fitter = TwoStagesFitter()
EN_regularized_fitter.fit(
    df=patients_df.drop(columns=['C', 'T']),
    fit_beta_kwargs=fit_beta_kwargs,
)

EN_regularized_fitter.get_beta_SE()