In [1]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from pydts.examples_utils.generate_simulations_data import generate_quick_start_df
from pydts.examples_utils.plots import plot_example_pred_output
from pydts.examples_utils.plots import add_panel_text
from pydts.fitters import TwoStagesFitter, DataExpansionFitter

from pydts.data_generation import EventTimesSampler
from matplotlib import pyplot as plt
import warnings
import pickle
pd.set_option("display.max_rows", 500)
warnings.filterwarnings('ignore')
%matplotlib inline
slicer = pd.IndexSlice

In [2]:
OUTPUT_DIR = '/app/output'

# Sampling data

In [None]:
real_coef_dict = {
    "alpha": {
        1: lambda t: -3.65 + 0.8 * np.log(t),
        2: lambda t: -3.75 + 0.9 * np.log(t)
    },
    "beta": {
        1: -np.log([0.8, 3.5, 3, 2.5, 2]),
        2: -np.log([1, 3, 4, 3, 2.5])
    }
}

n_patients = 15000
n_cov = 5
d_times = 40
j_events = 2

ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

seed = 0

covariates = [f'Z{i + 1}' for i in range(n_cov)]

In [None]:
COEF_COL = '   coef   '
STDERR_COL = ' std err '

In [None]:
patients_df = pd.DataFrame(data=pd.DataFrame(data=np.random.uniform(0,1, size=[n_patients, n_cov]),
                           columns=covariates))
                           
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.005*np.ones_like(ets.times), seed=seed+1)
patients_df = ets.update_event_or_lof(patients_df)

patients_df.index.name='pid'                 
patients_df = patients_df.reset_index()

from pydts.examples_utils.plots import plot_events_occurrence
plot_events_occurrence(patients_df)

In [None]:
patients_df.groupby(['X', 'J'])['pid'].count()[-20:]

# Run Simulations

In [None]:
for n_patients in [15_000, 20_000, 25_000]:
    print('**************************************')
    case = f'Sample_size_{n_patients}_rerun_'
    k_runs = 200
    two_step_fit_times = []
    lee_fit_times = []

    for k in range(k_runs):
        try:
            # Sampling based on different seed each time
            np.random.seed(seed+k)
            patients_df = pd.DataFrame(data=pd.DataFrame(data=np.random.uniform(0,1, size=[n_patients, n_cov]),
                                       columns=covariates))

            patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
            patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.005*np.ones_like(ets.times), seed=seed+1)
            patients_df = ets.update_event_or_lof(patients_df)
            patients_df.index.name='pid'
            patients_df = patients_df.reset_index()

            # Two step fitter
            new_fitter = TwoStagesFitter()
            print(case)
            print(f'Starting two-step: {k}')
            two_step_start = time()
            new_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1), nb_workers=1)
            two_step_end = time()
            print(f'Finished two-step: {k}, {two_step_end-two_step_start}sec')


            # Lee et al fitter
            lee_fitter = DataExpansionFitter()
            print(f'Starting Lee: {k}')
            lee_start = time()
            lee_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1))
            lee_end = time()
            print(f'Finished lee: {k}, {lee_end-lee_start}sec')


            lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()
            lee_beta_ser = lee_fitter.get_beta_SE().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()


            # Save results only if both fitters were successful
            two_step_fit_times.append(two_step_end - two_step_start)
            lee_fit_times.append(lee_end-lee_start)


            if k == 0:
                two_step_alpha_k_results = new_fitter.alpha_df[['J', 'X', 'alpha_jt']]
                two_step_beta_k_results = new_fitter.get_beta_SE().unstack().to_frame()

                lee_alpha_k_results = lee_alpha_ser.to_frame()
                lee_beta_k_results = lee_beta_ser.to_frame()

            else:
                two_step_alpha_k_results = pd.concat([two_step_alpha_k_results, new_fitter.alpha_df['alpha_jt']], axis=1)
                two_step_beta_k_results = pd.concat([two_step_beta_k_results, new_fitter.get_beta_SE().unstack()], axis=1)

                lee_alpha_k_results = pd.concat([lee_alpha_k_results, lee_alpha_ser], axis=1)
                lee_beta_k_results = pd.concat([lee_beta_k_results, lee_beta_ser], axis=1)

            # Cache results
            two_step_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'two_step_alpha_run_{k_runs}.csv'))
            two_step_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'two_step_beta_run_{k_runs}.csv'))
            lee_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'lee_alpha_run_{k_runs}.csv'))
            lee_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'lee_beta_run_{k_runs}.csv'))
            
            with open(os.path.join(OUTPUT_DIR, f"two_step_fit_times_{k_runs}"), "wb") as fp: 
                pickle.dump(two_step_fit_times, fp)

            with open(os.path.join(OUTPUT_DIR, f"lee_fit_times_{k_runs}"), "wb") as fp:   
                pickle.dump(lee_fit_times, fp)

        except Exception as e:
            print(f'Failed during trial {k}')
            print(e)
            

    two_step_alpha_k_results = two_step_alpha_k_results.set_index(['J', 'X'])
    two_step_alpha_k_results.columns = list(range(1, 1+k_runs))
    two_step_beta_k_results.columns = list(range(1, 1+k_runs))
    lee_alpha_k_results.columns = list(range(1, 1+k_runs))
    lee_beta_k_results.columns = list(range(1, 1+k_runs))


    # Save results
    two_step_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha_run_{k_runs}.csv'))
    two_step_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_beta_run_{k_runs}.csv'))
    lee_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_alpha_run_{k_runs}.csv'))
    lee_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_beta_run_{k_runs}.csv'))

    with open(os.path.join(OUTPUT_DIR, f"{case}_two_step_fit_times_{k_runs}"), "wb") as fp: 
        pickle.dump(two_step_fit_times, fp)

    with open(os.path.join(OUTPUT_DIR, f"{case}_lee_fit_times_{k_runs}"), "wb") as fp:   
        pickle.dump(lee_fit_times, fp)

# Read Results

In [None]:
k_runs = 200

final_dfs = []
n_patients_list = [15_000, 20_000, 25_000]

run_time_dfs = []

for n_patients in n_patients_list: 
    print('**************************************')
    case = f'Sample_size_{n_patients}_rerun_'
    
    with open(os.path.join(OUTPUT_DIR, f"{case}_two_step_fit_times_{k_runs}"), "rb") as fp: 
        two_step_fit_times = pickle.load(fp)

    with open(os.path.join(OUTPUT_DIR, f"{case}_lee_fit_times_{k_runs}"), "rb") as fp:   
        lee_fit_times = pickle.load(fp)
    
    run_time_dfs.append(pd.DataFrame([two_step_fit_times, lee_fit_times], 
             index=pd.MultiIndex.from_tuples([('two-step', n_patients), ('Lee et al.', n_patients)])))
    
    print(np.median(two_step_fit_times))
    print(np.median(lee_fit_times))
    
    two_step_alpha_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha_run_{k_runs}.csv'), 
                                           index_col=['J', 'X'])
    two_step_beta_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_beta_run_{k_runs}.csv'),
                                          index_col=[0, 1])
    lee_alpha_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_alpha_run_{k_runs}.csv'),
                                      index_col=[0,1,2])
    lee_beta_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_beta_run_{k_runs}.csv'),
                                     index_col=[0, 1,2])

    # Beta
    
    coverage_df = pd.DataFrame(index=two_step_beta_k_results.loc[['j1_params', 'j2_params'], :].index,
                           columns=two_step_beta_k_results.loc[['j1_params', 'j2_params'], :].columns.astype(int))

    true_col = -np.log([0.8, 3, 3, 2.5, 2, 1, 3, 4, 3, 2])   

    for idc, c in enumerate(covariates):
        for run_id in range(len(two_step_beta_k_results.columns)):
            est = two_step_beta_k_results.loc['j1_params', c][run_id]
            se = two_step_beta_k_results.loc['j1_SE', c][run_id]
            true_val = true_col[idc]
            coverage_df.loc[('j1_params', c), run_id+1] = int(( (est - 1.96*se) <= true_val ) & ( (est + 1.96*se) >= true_val))


            est = two_step_beta_k_results.loc['j2_params', c][run_id]
            se = two_step_beta_k_results.loc['j2_SE', c][run_id]
            true_val = true_col[idc + ((len(true_col)) // 2)]
            coverage_df.loc[('j2_params', c), run_id+1] = int(( (est - 1.96*se) <= true_val ) & ( (est + 1.96*se) >= true_val))
            
            
    twostep_beta1_summary = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3).iloc[:, [1,0]]
    twostep_beta2_summary = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3).iloc[:, [3,2]]
    twostep_empirical_beta1 = two_step_beta_k_results.std(axis=1).unstack([0]).round(3).iloc[:, [1,0]].iloc[:, 0]
    twostep_empirical_beta2 = two_step_beta_k_results.std(axis=1).unstack([0]).round(3).iloc[:, [3,2]].iloc[:, 0]

    lee_beta1_summary = lee_beta_k_results.mean(axis=1).loc[slicer[1,:,:]].unstack([0]).round(3)
    lee_beta2_summary = lee_beta_k_results.mean(axis=1).loc[slicer[2,:,:]].unstack([0]).round(3)


    lee_beta1_summary.columns = pd.MultiIndex.from_tuples([('Lee et al.', 'Estimate'), ('Lee et al.', 'Estimated SE')])
    lee_beta2_summary.columns = pd.MultiIndex.from_tuples([('Lee et al.', 'Estimate'), ('Lee et al.', 'Estimated SE')])
    beta_summary_comparison = pd.concat([lee_beta1_summary, lee_beta2_summary], axis=0)
    beta_summary_comparison.index = [r'$\beta_{11}$', r'$\beta_{12}$', r'$\beta_{13}$', r'$\beta_{14}$', r'$\beta_{15}$',
                                     r'$\beta_{21}$', r'$\beta_{22}$', r'$\beta_{23}$', r'$\beta_{24}$', r'$\beta_{25}$']
    twostep_beta1_summary.columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])
    twostep_beta2_summary.columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])
    tmp = pd.concat([twostep_beta1_summary.round(3), twostep_beta2_summary.round(3)], axis=0)
    tmp.index = [r'$\beta_{11}$', r'$\beta_{12}$', r'$\beta_{13}$', r'$\beta_{14}$', r'$\beta_{15}$',
                 r'$\beta_{21}$', r'$\beta_{22}$', r'$\beta_{23}$', r'$\beta_{24}$', r'$\beta_{25}$']

    tmp_std = pd.concat([twostep_empirical_beta1, twostep_empirical_beta2], axis=0).to_frame()
    tmp_std.index = [r'$\beta_{11}$', r'$\beta_{12}$', r'$\beta_{13}$', r'$\beta_{14}$', r'$\beta_{15}$',
                 r'$\beta_{21}$', r'$\beta_{22}$', r'$\beta_{23}$', r'$\beta_{24}$', r'$\beta_{25}$']
    tmp_std.columns = pd.MultiIndex.from_tuples([('two-step', 'Empirical SE')])

    cov_series = ((coverage_df.sum(axis=1) / k_runs).round(3)).to_frame()
    cov_series.index = [r'$\beta_{11}$', r'$\beta_{12}$', r'$\beta_{13}$', r'$\beta_{14}$', r'$\beta_{15}$',
                 r'$\beta_{21}$', r'$\beta_{22}$', r'$\beta_{23}$', r'$\beta_{24}$', r'$\beta_{25}$']
    cov_series.columns = pd.MultiIndex.from_tuples([('two-step', 'Coverage Rate')])
    
    beta_summary_comparison = pd.concat([beta_summary_comparison, tmp, tmp_std, cov_series], axis=1)
    beta_summary_comparison.index.name =  r'$\beta_{jk}$'


    # True Values
    beta_summary_comparison.insert(loc=0, column='True', value=true_col)
    final_dfs.append(beta_summary_comparison.astype(float).round(3))
    
final_df = pd.concat(final_dfs, keys=n_patients_list)
final_df

In [None]:
print(final_df.to_latex(escape=False))

In [None]:
lee_alpha_k_results

In [None]:
run_time = pd.concat(run_time_dfs)
run_time

In [None]:
print(run_time.mean(axis=1).to_frame().unstack(0).round(3).to_latex())

In [None]:
k_runs = 200

n_patients_list = [15_000, 20_000, 25_000]



filename = 'alpha_different_n_.png'

first_model_name = 'Lee et al.'
second_model_name = 'two-step'
times = range(1, d_times+1)

lee_colors = ['tab:blue', 'tab:green']
two_step_colors = ['navy', 'darkgreen']
true_colors = ['tab:blue', 'tab:green']

fig, axes = plt.subplots(2, 2, figsize=(16, 16))

for idn, n_patients in enumerate(n_patients_list): 
    case = f'Sample_size_{n_patients}_rerun_'
    
    np.random.seed(idn)
    patients_df = pd.DataFrame(data=pd.DataFrame(data=np.random.uniform(0,1, size=[n_patients, n_cov]),
                               columns=covariates))

    patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
    patients_df = ets.sample_independent_lof_censoring(patients_df, prob_los_at_t=0.005*np.ones_like(ets.times), 
                                                       seed=seed+1)
    patients_df = ets.update_event_or_lof(patients_df)
    patients_df.index.name='pid'
    patients_df = patients_df.reset_index()
    counts = patients_df.groupby(['J', 'X'])['pid'].count().unstack('J').dropna()

    
    two_step_alpha_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha_run_{k_runs}.csv'), 
                                           index_col=['J', 'X'])

    lee_alpha_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_alpha_run_{k_runs}.csv'),
                                      index_col=[0,1,2])
    
    ax = axes[int(idn // 2), int(idn % 2)]
    ax.set_title(f'N={n_patients}')
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.tick_params(axis='both', which='minor', labelsize=15)

    for j in [1, 2]:

        tmp_alpha = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
        tmp_alpha.index = [int(idx.split(')[')[1].split(']')[0]) for idx in tmp_alpha.index]
        tmp_alpha = pd.Series(tmp_alpha.values.squeeze().astype(float), index=tmp_alpha.index)
    
        ax.scatter(tmp_alpha.index, tmp_alpha.values,
           label=f'J={j} ({first_model_name})', color=lee_colors[j-1], marker='o', alpha=0.4, s=40)

        tmp_alpha = two_step_alpha_k_results.loc[slicer[j, :]].mean(axis=1)
        ax.scatter(tmp_alpha.index, tmp_alpha.values.squeeze(),
           label=f'J={j} ({second_model_name})', color=two_step_colors[j-1], marker='*', alpha=0.7, s=20)
    
        true_values = [real_coef_dict['alpha'][j](t) for t in times]
        ax.plot(times, true_values, label=f'J={j} (True)', ls='--', color=true_colors[j-1])

        ax.set_xlabel(r'Time', fontsize=18)
        ax.set_ylabel(r'$\alpha_{t}$', fontsize=18)
        ax.legend(loc='upper left', fontsize=12)
        #ax.set_ylim([-3, 0.5])
    
    ax2 = ax.twinx()
    ax2.bar(counts.index, counts[1].values.squeeze(), label='J=1', color='tab:red', alpha=0.4, width=0.5)
    ax2.bar(counts.index, counts[2].values.squeeze(), label='J=2', color='tab:brown', alpha=0.6, align='edge',
            width=0.5)
    ax2.legend(loc='upper center', fontsize=12)
    ax2.set_ylabel('Number of observed events', fontsize=16, color='red')
    ax2.tick_params(axis='y', colors='red')
    ax2.set_ylim([0,2500])


In [None]:
tmp_alpha = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
tmp_alpha.index = [int(idx.split(')[')[1].split(']')[0]) for idx in tmp_alpha.index]
tmp_alpha = pd.Series(tmp_alpha.values.squeeze().astype(float), index=tmp_alpha.index)
tmp_alpha

In [None]:
tmp_alpha = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
tmp_alpha.index = [int(idx.split(')[')[1].split(']')[0]) for idx in tmp_alpha.index]
tmp_alpha = pd.Series(tmp_alpha.values.squeeze().astype(float), index=tmp_alpha.index)
tmp_alpha

In [None]:
tmp_alpha = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
tmp_alpha.index = [int(idx.split(')[')[1].split(']')[0]) for idx in tmp_alpha.index]
tmp_alpha = pd.Series(tmp_alpha.values.squeeze().astype(float), index=tmp_alpha.index)
tmp_alpha

In [32]:
tmp_alpha = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
tmp_alpha.index = [int(idx.split(')[')[1].split(']')[0]) for idx in tmp_alpha.index]
tmp_alpha = pd.Series(tmp_alpha.values.squeeze().astype(float), index=tmp_alpha.index)
tmp_alpha

10   -1.982340
11   -2.000573
12   -2.015583
13   -2.032216
14   -2.035612
15   -2.041994
16   -2.058343
17   -2.063003
18   -2.064579
19   -2.073679
1    -1.764881
20   -2.076352
21   -2.075812
22   -2.082347
23   -2.077652
24   -2.092188
25   -2.089552
26   -2.089023
27   -2.089427
28   -2.098780
29   -2.097718
2    -1.823048
30   -2.103069
31   -2.100723
32   -2.106856
33   -2.114051
34   -2.111807
35   -2.126085
36   -2.124012
37   -2.128068
38   -2.135266
39   -2.158462
3    -1.866213
40   -2.160121
41   -2.142882
42   -2.152004
43   -2.160157
44   -2.161459
45   -2.159425
46   -2.182967
47   -2.194953
48   -2.174118
49   -2.099736
4    -1.894417
50   -2.018544
5    -1.915841
6    -1.934400
7    -1.946326
8    -1.970904
9    -1.975183
dtype: float64