In [None]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from pydts.examples_utils.generate_simulations_data import generate_quick_start_df
from pydts.examples_utils.plots import plot_example_pred_output
from pydts.examples_utils.plots import add_panel_text
from pydts.fitters import TwoStagesFitter, DataExpansionFitter

from pydts.data_generation import EventTimesSampler
from matplotlib import pyplot as plt
import warnings
import pickle
# Allow up to 500 rows to be rendered when a DataFrame/Series is displayed.
pd.set_option("display.max_rows", 500)
# NOTE(review): this silences every warning category for the whole session,
# including pandas/NumPy deprecation warnings raised by the cells below.
warnings.filterwarnings('ignore')
%matplotlib inline
# Shorthand used to build MultiIndex slices inside .loc[...] lookups.
slicer = pd.IndexSlice

In [None]:
# Directory that result CSVs and the figure are written to / read from.
# With '' os.path.join returns the bare file name, i.e. the working directory.
OUTPUT_DIR = ''
# Column labels used to pick the coefficient / standard-error columns out of
# the DataExpansionFitter output. The surrounding spaces are part of the labels
# and must be kept exactly — presumably they mirror padded headers of the
# fitter's summary tables; verify against pydts.
COEF_COL = '   coef   '
STDERR_COL = ' std err '

# Example Data

In [None]:
# Simulation settings: sample size, number of covariates, number of discrete
# time points and number of competing event types.
n_patients = 5000
n_cov = 5
d_times = 30
j_events = 3
covariates = [f'Z{i + 1}' for i in range(n_cov)]

# True coefficients handed to the sampler: for every event type j a
# time-dependent term alpha_j(t) and a coefficient vector beta_j holding one
# entry per covariate.
real_coef_dict = {
    "alpha": {
        1: lambda t: - 2.2 - 0.1 * np.log(t),
        2: lambda t: - 2.3 - 0.1 * np.log(t),
        3: lambda t: - 2.4 - 0.1 * np.log(t)
    },
    "beta": {
        1: -np.log([2.5, 1.5, 0.8, 3, 2]),
        2: -np.log([0.8, 3, 2.8, 2.2, 1.5]),
        3: -np.log([1.8, 0.8, 2.5, 1.2, 3])
    }
}

ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

seed = 0

# Seed NumPy's global generator before drawing the covariates. Without this
# the covariates (and therefore the whole example dataset) change on every
# kernel restart, even though a seed is passed to the event-time sampler.
# This mirrors what the resampling loop further down does with its loop seed.
np.random.seed(seed)
patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                           columns=covariates)

# Sample event times, then independent loss-to-follow-up censoring (constant
# probability 0.01 at every time point), then combine both via
# update_event_or_lof.
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_independent_lof_censoring(patients_df, prob_lof_at_t=0.01 * np.ones_like(ets.times))
patients_df = ets.update_event_or_lof(patients_df)

# Expose the row index as an explicit 'pid' column.
patients_df.index.name = 'pid'
patients_df = patients_df.reset_index()

In [None]:
from pydts.examples_utils.plots import plot_events_occurrence

# First plot leaves out the rows whose observed time X equals d_times + 1
# (presumably the observations that had no event within the d_times horizon —
# verify against EventTimesSampler); the second plot shows every row.
# Using d_times instead of the literal 31 keeps the filter correct when the
# number of time points is changed in the settings cell.
plot_events_occurrence(patients_df[patients_df['X'] != d_times + 1])
plot_events_occurrence(patients_df)

In [None]:
# Number of rows per (X, J) pair among the rows with J != 0; display the last 40.
(
    patients_df
    .loc[patients_df['J'].ne(0)]
    .groupby(['X', 'J'])['pid']
    .count()
    .iloc[-40:]
)

# Resample and Estimate K times

In [None]:
# Sample sizes to simulate; the estimation loop repeats the experiment
# k_runs times for each of them. k_runs is also part of the result file names.
n_patients_list = [5_000, 10_000, 15_000, 20_000] 
k_runs = 200

In [None]:
def _save_case_results(case, alpha_keys, two_step_alpha_runs, two_step_beta_runs,
                       lee_alpha_runs, lee_beta_runs):
    """Assemble the per-run estimates of one sample size and write them to CSV.

    Parameters
    ----------
    case : str
        Prefix of the result file names.
    alpha_keys : pd.DataFrame
        The 'J' and 'X' columns of the two-step fitter's alpha table; they
        become the row index of the two-step alpha results.
    two_step_alpha_runs, two_step_beta_runs, lee_alpha_runs, lee_beta_runs : list
        One Series per completed run, all four lists in the same run order.

    Returns
    -------
    tuple of pd.DataFrame
        (two-step alpha, two-step beta, Lee alpha, Lee beta), one column per
        completed run, labelled 1..n_completed.
    """
    run_labels = list(range(1, 1 + len(two_step_alpha_runs)))

    two_step_alpha = pd.concat([alpha_keys] + two_step_alpha_runs, axis=1).set_index(['J', 'X'])
    two_step_beta = pd.concat(two_step_beta_runs, axis=1)
    lee_alpha = pd.concat(lee_alpha_runs, axis=1)
    lee_beta = pd.concat(lee_beta_runs, axis=1)

    named_frames = [('two_step_alpha', two_step_alpha), ('two_step_beta', two_step_beta),
                    ('lee_alpha', lee_alpha), ('lee_beta', lee_beta)]
    for suffix, frame in named_frames:
        frame.columns = run_labels
        frame.to_csv(os.path.join(OUTPUT_DIR, f'{case}_{suffix}_run_{k_runs}.csv'))

    return two_step_alpha, two_step_beta, lee_alpha, lee_beta


for inp, n_patients in enumerate(n_patients_list):
    case = f'Sample_size_{n_patients}_final_3comp_censoring_d30_'

    # Per-run estimates are collected in lists that are reset for every sample
    # size, so a failed trial can neither leave the accumulators undefined nor
    # let estimates from the previous sample size leak into this one.
    alpha_keys = None
    two_step_alpha_runs, two_step_beta_runs = [], []
    lee_alpha_runs, lee_beta_runs = [], []

    for k in range(k_runs):
        try:
            # Sampling based on different seed each time
            # (3000 * inp keeps seeds distinct across sample sizes as long as k_runs <= 3000)
            loop_seed = 3000*inp+k+seed
            print(f'Sampling Patients, loop seed: {loop_seed}')
            np.random.seed(loop_seed)
            patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                                       columns=covariates)

            patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=loop_seed)
            patients_df = ets.sample_independent_lof_censoring(patients_df,
                                                               prob_lof_at_t=0.01 * np.ones_like(ets.times))
            patients_df = ets.update_event_or_lof(patients_df)
            patients_df.index.name = 'pid'
            patients_df = patients_df.reset_index()

            # Both fitters get the data without the 'C' and 'T' columns.
            fit_df = patients_df.drop(['C', 'T'], axis=1)

            # Two step fitter
            new_fitter = TwoStagesFitter()
            print(case)
            print(f'Starting two-step: {k+1}/{k_runs}')
            two_step_start = time()
            new_fitter.fit(df=fit_df, nb_workers=1)
            two_step_end = time()
            print(f'Finished two-step: {k+1}/{k_runs}, {two_step_end-two_step_start}sec')

            # Lee et al fitter
            lee_fitter = DataExpansionFitter()
            print(f'Starting Lee: {k+1}/{k_runs}')
            lee_start = time()
            lee_fitter.fit(df=fit_df)
            lee_end = time()
            print(f'Finished lee: {k+1}/{k_runs}, {lee_end-lee_start}sec')

            # Extract all four result series BEFORE touching the accumulators,
            # so a failure here leaves the lists in sync with each other.
            two_step_alpha_ser = new_fitter.alpha_df['alpha_jt']
            two_step_beta_ser = new_fitter.get_beta_SE().unstack()
            lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, slicer[:, [COEF_COL, STDERR_COL]]].unstack().sort_index()
            lee_beta_ser = lee_fitter.get_beta_SE().loc[:, slicer[:, [COEF_COL, STDERR_COL]]].unstack().sort_index()

            if alpha_keys is None:
                # Taken from the first successful run (not necessarily k == 0).
                alpha_keys = new_fitter.alpha_df[['J', 'X']]

            two_step_alpha_runs.append(two_step_alpha_ser)
            two_step_beta_runs.append(two_step_beta_ser)
            lee_alpha_runs.append(lee_alpha_ser)
            lee_beta_runs.append(lee_beta_ser)

            # Cache results (same files as the final save, overwritten each run)
            _save_case_results(case, alpha_keys, two_step_alpha_runs, two_step_beta_runs,
                               lee_alpha_runs, lee_beta_runs)

        except Exception as e:
            # Best effort: report the failed trial and carry on with the next one.
            print(f'Failed during trial {k}')
            print(e)

    n_completed = len(two_step_alpha_runs)
    if n_completed == 0:
        print(f'No successful runs for {case}; nothing saved')
        continue
    if n_completed < k_runs:
        print(f'{case}: only {n_completed}/{k_runs} runs completed')

    # Save results
    (two_step_alpha_k_results, two_step_beta_k_results,
     lee_alpha_k_results, lee_beta_k_results) = _save_case_results(
        case, alpha_keys, two_step_alpha_runs, two_step_beta_runs, lee_alpha_runs, lee_beta_runs)

# Read Results

In [None]:
final_dfs = []

# Pieces shared by every sample size: event types, row labels
# ($\beta_{jk}$, event type j / covariate k), the true coefficients in the same
# row order, and the column headers of the two estimate tables.
event_types = sorted(real_coef_dict['beta'])
beta_labels = [rf'$\beta_{{{j}{i + 1}}}$' for j in event_types for i in range(len(covariates))]
true_col = np.concatenate([real_coef_dict['beta'][j] for j in event_types])
lee_columns = pd.MultiIndex.from_tuples([('Lee et al.', 'Estimate'), ('Lee et al.', 'Estimated SE')])
two_step_columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])

for n_patients in n_patients_list:
    case = f'Sample_size_{n_patients}_final_3comp_censoring_d30_'

    # Rows: (j{j}_params | j{j}_SE, covariate); columns: one per stored run.
    two_step_beta_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_beta_run_{k_runs}.csv'),
                                          index_col=[0, 1])
    # Rows: (event type, COEF_COL | STDERR_COL, covariate); columns: one per stored run.
    lee_beta_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_beta_run_{k_runs}.csv'),
                                     index_col=[0, 1, 2])

    # Across-run mean / standard deviation, computed once and sliced per event type.
    two_step_mean = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3)
    two_step_std = two_step_beta_k_results.std(axis=1).unstack([0]).round(3)
    lee_mean = lee_beta_k_results.mean(axis=1)

    lee_tables, two_step_tables, two_step_empirical, coverage_rates = [], [], [], []

    for j in event_types:
        params_key, se_key = f'j{j}_params', f'j{j}_SE'

        # Coverage of the 95% Wald interval est +/- 1.96*se, for all covariates
        # and runs at once. Rows are aligned to `covariates` by label, so no
        # integer key is used on the string-labelled run columns.
        est = two_step_beta_k_results.loc[params_key].loc[covariates]
        se = two_step_beta_k_results.loc[se_key].loc[covariates]
        true_j = pd.Series(real_coef_dict['beta'][j], index=covariates)
        covered = (est - 1.96 * se).le(true_j, axis=0) & (est + 1.96 * se).ge(true_j, axis=0)
        # Divide by the number of runs actually stored in the file.
        coverage_rates.append(covered.sum(axis=1) / covered.shape[1])

        # Two-step: mean estimate and mean estimated SE, selected by label.
        two_step_table = two_step_mean[[params_key, se_key]].copy()
        two_step_table.columns = two_step_columns
        two_step_tables.append(two_step_table)

        # Two-step: empirical SE = std of the estimates across runs.
        two_step_empirical.append(two_step_std[params_key])

        # Lee et al.: mean estimate and mean estimated SE.
        lee_table = lee_mean.loc[slicer[j, :, :]].unstack([0]).round(3)
        lee_table.columns = lee_columns
        lee_tables.append(lee_table)

    beta_summary_comparison = pd.concat(lee_tables, axis=0)
    beta_summary_comparison.index = beta_labels

    two_step_summary = pd.concat(two_step_tables, axis=0)
    two_step_summary.index = beta_labels

    two_step_empirical_se = pd.concat(two_step_empirical, axis=0).to_frame()
    two_step_empirical_se.index = beta_labels
    two_step_empirical_se.columns = pd.MultiIndex.from_tuples([('two-step', 'Empirical SE')])

    coverage = pd.concat(coverage_rates, axis=0).round(3).to_frame()
    coverage.index = beta_labels
    coverage.columns = pd.MultiIndex.from_tuples([('two-step', 'Coverage Rate')])

    beta_summary_comparison = pd.concat(
        [beta_summary_comparison, two_step_summary, two_step_empirical_se, coverage], axis=1)
    beta_summary_comparison.index.name = r'$\beta_{jk}$'

    # True Values
    beta_summary_comparison.insert(loc=0, column='True', value=true_col)
    final_dfs.append(beta_summary_comparison.astype(float).round(3))

# One table with the sample size as the outer row level.
final_df = pd.concat(final_dfs, keys=n_patients_list)
final_df

In [None]:
# Print the summary table as LaTeX source. escape=False keeps the LaTeX math in
# the row labels (e.g. $\beta_{11}$) from being escaped.
print(final_df.to_latex(escape=False))

In [None]:
# Figure with one panel per sample size: across-run mean alpha_jt of both
# fitters against the true curves, plus (on a secondary axis) the event counts
# of one dataset simulated here for that sample size.
filename = 'alpha_different_n_J3_censoring.png'

first_model_name = 'Lee et al.'
second_model_name = 'two-step'
times = range(1, d_times+1)

lee_colors = ['tab:blue', 'tab:green', 'tab:red']
two_step_colors = ['navy', 'darkgreen', 'tab:brown']
true_colors = ['tab:blue', 'tab:green', 'tab:red']

# (event type, bar colour, bar alpha, extra ax.bar keyword arguments)
bar_specs = [
    (1, 'navy', 0.4, {'width': 0.4}),
    (2, 'darkgreen', 0.4, {'align': 'edge', 'width': 0.4}),
    (3, 'tab:red', 0.6, {'align': 'edge', 'width': -0.4}),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 11))

for idn, n_patients in enumerate(n_patients_list):
    case = f'Sample_size_{n_patients}_final_3comp_censoring_d30_'

    # Simulate one dataset of this size; only its event counts are plotted.
    np.random.seed(idn)
    patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                               columns=covariates)
    patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
    patients_df = ets.sample_independent_lof_censoring(patients_df,
                                                       prob_lof_at_t=0.01 * np.ones_like(ets.times))
    patients_df = ets.update_event_or_lof(patients_df)
    patients_df = patients_df.rename_axis('pid').reset_index()
    counts = patients_df.groupby(['J', 'X'])['pid'].count().unstack('J').fillna(0)

    # Stored estimates of all runs for this sample size.
    two_step_alpha_k_results = pd.read_csv(
        os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha_run_{k_runs}.csv'),
        index_col=['J', 'X'])
    lee_alpha_k_results = pd.read_csv(
        os.path.join(OUTPUT_DIR, f'{case}_lee_alpha_run_{k_runs}.csv'),
        index_col=[0, 1, 2])

    # Panels are filled row by row on the 2x2 grid.
    ax = axes[divmod(idn, 2)]
    ax.set_title(f'n={n_patients}', fontsize=15)
    for which in ('major', 'minor'):
        ax.tick_params(axis='both', which=which, labelsize=15)

    for j, lee_color, two_step_color, true_color in zip([1, 2, 3], lee_colors, two_step_colors, true_colors):
        # Lee et al.: mean coefficient across runs; the time point is parsed
        # out of the text between ')[' and ']' of each row label.
        lee_alpha_mean = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
        time_points = [int(label.split(')[')[1].split(']')[0]) for label in lee_alpha_mean.index]
        lee_alpha_mean = pd.Series(lee_alpha_mean.values.squeeze().astype(float), index=time_points)
        ax.scatter(lee_alpha_mean.index, lee_alpha_mean.values,
                   label=f'J={j} ({first_model_name})', color=lee_color, marker='o', alpha=0.4, s=40)

        # Two-step: mean alpha_jt across runs.
        two_step_alpha_mean = two_step_alpha_k_results.loc[slicer[j, :]].mean(axis=1)
        ax.scatter(two_step_alpha_mean.index, two_step_alpha_mean.values.squeeze(),
                   label=f'J={j} ({second_model_name})', color=two_step_color, marker='*', alpha=0.7, s=20)

        # True alpha_j(t) curve.
        true_values = [real_coef_dict['alpha'][j](t) for t in times]
        ax.plot(times, true_values, label=f'J={j} (True)', ls='--', color=true_color)

    # Axis formatting does not depend on the event type, so it is applied once per panel.
    ax.set_xlabel(r'Time', fontsize=18)
    ax.set_ylabel(r'$\alpha_{jt}$', fontsize=18)
    ax.legend(loc='upper right', fontsize=12)
    ax.set_ylim([-3.7, -0.7])

    # Secondary axis: event counts per time point and event type.
    ax2 = ax.twinx()
    for j, bar_color, bar_alpha, bar_kwargs in bar_specs:
        ax2.bar(counts.index, counts[j].values.squeeze(), label=f'J={j}', color=bar_color,
                alpha=bar_alpha, **bar_kwargs)
    ax2.legend(loc='upper center', fontsize=12)
    ax2.set_ylabel('Number of observed events', fontsize=16, color='red')
    ax2.tick_params(axis='y', colors='red')
    ax2.set_ylim([0, 1700])
    for which in ('major', 'minor'):
        ax2.tick_params(axis='both', which=which, labelsize=15)

fig.tight_layout()

if filename is not None:
    fig.savefig(os.path.join(OUTPUT_DIR, filename), dpi=300)