In [None]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from pydts.examples_utils.generate_simulations_data import generate_quick_start_df
from pydts.examples_utils.plots import plot_example_pred_output
from pydts.examples_utils.plots import add_panel_text
from pydts.fitters import TwoStagesFitter, DataExpansionFitter

from pydts.data_generation import EventTimesSampler
from matplotlib import pyplot as plt
import warnings
import pickle
# Let pandas display up to 500 rows before truncating a rendered frame.
pd.set_option("display.max_rows", 500)
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings emitted while fitting — confirm that is intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Shorthand used below for MultiIndex column selection inside `.loc[...]`.
slicer = pd.IndexSlice

In [None]:
# Folder that receives the CSV tables, timing pickles and the figure written below.
# The default is the original author's location; set the PYDTS_OUTPUT_DIR environment
# variable to redirect the output without editing the notebook.
OUTPUT_DIR = os.environ.get('PYDTS_OUTPUT_DIR',
                            '/home/tomer.me/DiscreteTimeSurvivalPenalization/output/')

In [None]:
# Ground-truth coefficients handed to the sampler as `hazard_coefs`.
# "alpha" maps each key (1, 2) to a function of time t; "beta" maps each key to a
# vector of ten values (one per covariate, presumably — n_cov is 10 below).
# NOTE(review): keys 1 and 2 look like the competing event types — confirm against
# EventTimesSampler.sample_event_times.
real_coef_dict = dict(
    alpha={
        1: lambda t: -2.5 - 0.3 * np.log(t),
        2: lambda t: -2.8 - 0.3 * np.log(t),
    },
    beta={
        1: -0.5 * np.log([0.8, 3, 3, 2.5, 4, 1, 3, 2, 2, 3]),
        2: -0.5 * np.log([1, 3, 2, 1, 4, 3, 4, 3, 3, 2]),
    },
)

# --- Size of the simulated cohort ---
n_patients = 20_000   # rows drawn per simulated data set
n_cov = 10            # number of uniform covariates Z0..Z9
j_events = 2          # number of event types passed to the sampler

# --- Sampler for the initial preview (the timing loop builds its own per setting) ---
d_times = 150
ets = EventTimesSampler(j_event_types=j_events, d_times=d_times)

# Base seed; the timing loop offsets it per repetition.
seed = 0

# Covariate column names.
covariates = [f'Z{i}' for i in range(n_cov)]

# Column labels (padding included) selected from the DataExpansionFitter summary tables.
COEF_COL = '   coef   '
STDERR_COL = ' std err '

In [None]:
# Preview cell: simulate one data set with the settings above and plot the events.
from pydts.examples_utils.plots import plot_events_occurrence

# Seed the global NumPy RNG so the uniform covariates are reproducible on every run,
# in the same way the timing loop seeds each repetition.
np.random.seed(seed)
patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                           columns=covariates)

# Adds the sampled event information to the frame (column 'T' is read right below).
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df['X'] = patients_df['T']
# NOTE(review): the timing loop sets 'C' to T + 1 instead — confirm which is intended.
patients_df['C'] = patients_df['T']

# Expose the row number as an explicit 'pid' column.
patients_df.index.name = 'pid'
patients_df = patients_df.reset_index()

# First plot excludes rows with X == d_times + 1, second plot shows every row.
# NOTE(review): presumably d_times + 1 marks subjects without an event — confirm
# against EventTimesSampler.
plot_events_occurrence(patients_df[patients_df['X'] != (d_times+1)])
plot_events_occurrence(patients_df)

In [None]:
# Benchmark grid: repetitions per setting, and the numbers of discrete times
# to compare (25 to 150 in steps of 25).
k_runs = 10
d_times_list = list(range(25, 151, 25))

In [None]:
# Timing benchmark.
# For every number of discrete times in `d_times_list`, simulate `k_runs` data sets,
# fit both TwoStagesFitter ("two-step") and DataExpansionFitter ("Lee"), record the
# wall-clock fit durations and collect the estimated coefficients of every run.
# Results: final_two_step / final_lee map d_times -> list of durations in seconds.
final_two_step = {}
final_lee = {}

# Create the output folder up front so a bad path fails immediately instead of
# surfacing as a swallowed per-run error after the (expensive) fits have finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _save_result_tables(case, tables):
    """Write every table in `tables` to OUTPUT_DIR as '<case>_<name>_run_<k_runs>.csv'."""
    for name, table in tables.items():
        table.to_csv(os.path.join(OUTPUT_DIR, f'{case}_{name}_run_{k_runs}.csv'))


for idp, d_times in enumerate(d_times_list):
    case = f'timing_d{d_times}_final_'
    two_step_timing = []
    lee_timing = []
    # 1-based numbers of the runs that completed; used as column labels afterwards.
    completed_runs = []
    # Reset the accumulators for every setting so a failed first run can never
    # append to the tables left over from the previous d_times.
    two_step_alpha_k_results = None
    two_step_beta_k_results = None
    lee_alpha_k_results = None
    lee_beta_k_results = None

    for k in range(k_runs):
        try:
            # Sampling based on different seed each time
            loop_seed = 1000*idp+k+seed
            print(f'Sampling Patients, loop seed: {loop_seed}')
            np.random.seed(loop_seed)
            ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)
            patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                                       columns=covariates)

            patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=loop_seed)
            patients_df['X'] = patients_df['T']
            patients_df['C'] = patients_df['T'] + 1
            patients_df.index.name = 'pid'
            patients_df = patients_df.reset_index()

            # Two step fitter
            new_fitter = TwoStagesFitter()
            print(case)
            print(f'Starting two-step: {k+1}/{k_runs}')
            two_step_start = time()
            new_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1), nb_workers=1)
            two_step_end = time()
            print(f'Finished two-step: {k+1}/{k_runs}, {two_step_end-two_step_start}sec')

            # Lee et al fitter
            lee_fitter = DataExpansionFitter()
            print(f'Starting Lee: {k+1}/{k_runs}')
            lee_start = time()
            lee_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1))
            lee_end = time()
            print(f'Finished lee: {k+1}/{k_runs}, {lee_end-lee_start}sec')

            # Extract all estimates before touching any accumulator, so a failure
            # here leaves the tables and the timing lists consistent with each other.
            two_step_beta_ser = new_fitter.get_beta_SE().unstack()
            lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()
            lee_beta_ser = lee_fitter.get_beta_SE().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()

            if not completed_runs:
                # First successful run: keep the (J, X) key columns next to the estimates.
                two_step_alpha_k_results = new_fitter.alpha_df[['J', 'X', 'alpha_jt']]
                two_step_beta_k_results = two_step_beta_ser.to_frame()

                lee_alpha_k_results = lee_alpha_ser.to_frame()
                lee_beta_k_results = lee_beta_ser.to_frame()
            else:
                # Build all four tables first and assign them together, so a failing
                # concat cannot leave only some of them extended.
                # NOTE(review): alpha_jt is aligned on alpha_df's row index, not on
                # (J, X) — confirm every run yields the same rows in the same order.
                (two_step_alpha_k_results, two_step_beta_k_results,
                 lee_alpha_k_results, lee_beta_k_results) = (
                    pd.concat([two_step_alpha_k_results, new_fitter.alpha_df['alpha_jt']], axis=1),
                    pd.concat([two_step_beta_k_results, two_step_beta_ser], axis=1),
                    pd.concat([lee_alpha_k_results, lee_alpha_ser], axis=1),
                    pd.concat([lee_beta_k_results, lee_beta_ser], axis=1),
                )

            # The run only counts once both fits and the extraction succeeded; this
            # keeps both timing lists and the result columns in step.
            completed_runs.append(k + 1)
            two_step_timing.append(two_step_end-two_step_start)
            lee_timing.append(lee_end-lee_start)

            # Cache results (overwritten by the final tables after the loop)
            _save_result_tables(case, {
                'two_step_alpha': two_step_alpha_k_results,
                'two_step_beta': two_step_beta_k_results,
                'lee_alpha': lee_alpha_k_results,
                'lee_beta': lee_beta_k_results,
            })

        except Exception as e:
            # Best effort: report the failure and continue with the next repetition.
            print(f'Failed during trial {k}')
            print(e)

    if completed_runs:
        # Label the columns with the run numbers that actually completed
        # (1..k_runs when nothing failed).
        two_step_alpha_k_results = two_step_alpha_k_results.set_index(['J', 'X'])
        two_step_alpha_k_results.columns = completed_runs
        two_step_beta_k_results.columns = completed_runs
        lee_alpha_k_results.columns = completed_runs
        lee_beta_k_results.columns = completed_runs

        # Save results
        _save_result_tables(case, {
            'two_step_alpha': two_step_alpha_k_results,
            'two_step_beta': two_step_beta_k_results,
            'lee_alpha': lee_alpha_k_results,
            'lee_beta': lee_beta_k_results,
        })
    else:
        print(f'No successful runs for {case}; no result tables written')

    final_two_step[d_times] = two_step_timing
    final_lee[d_times] = lee_timing

    # Checkpoint the timings after every setting so partial results survive an interruption.
    with open(os.path.join(OUTPUT_DIR, 'final_timing_two_step.pkl'), 'wb') as f:
        pickle.dump(final_two_step, f)

    with open(os.path.join(OUTPUT_DIR, 'final_timing_lee.pkl'), 'wb') as f:
        pickle.dump(final_lee, f)

In [None]:
# Display the two-step fit durations (seconds) collected per number of discrete times.
final_two_step

In [None]:
# Display the DataExpansionFitter ("Lee") fit durations (seconds) per number of discrete times.
final_lee

In [None]:
# Summarise the fit durations per number of discrete times.
# summary_df: one row per d_times with mean / std of both fitters and their ratio.
# lee_results_df / two_step_results_df: rows are runs 1..k_runs, columns are d_times.
def _timings_to_frame(timings):
    """Return a float frame (index 1..k_runs, columns d_times_list) of fit durations.

    A timing list shorter than k_runs (some run failed) is padded with NaN at the
    end instead of raising a length-mismatch error.
    """
    per_setting = {
        d: pd.Series(timings[d], index=range(1, len(timings[d]) + 1), dtype=float)
        for d in d_times_list
    }
    return pd.DataFrame(per_setting).reindex(index=range(1, k_runs + 1), columns=d_times_list)


# Built in one go so the columns are numeric rather than object dtype.
# np.std uses its default ddof=0, i.e. the population standard deviation.
summary_df = pd.DataFrame(
    {
        'Lee mean': [np.mean(final_lee[d]) for d in d_times_list],
        'Lee std': [np.std(final_lee[d]) for d in d_times_list],
        'two-step mean': [np.mean(final_two_step[d]) for d in d_times_list],
        'two-step std': [np.std(final_two_step[d]) for d in d_times_list],
    },
    index=d_times_list,
)
lee_results_df = _timings_to_frame(final_lee)
two_step_results_df = _timings_to_frame(final_two_step)

# How many times longer the Lee fit takes on average than the two-step fit.
summary_df['ratio'] = summary_df['Lee mean'] / summary_df['two-step mean']
summary_df

In [None]:
# Box plots of the fit durations of both fitters against the number of discrete times.
# Set `filename` to None to skip writing the figure to OUTPUT_DIR.
filename = 'fitting_time_comparison.png'

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
# Title follows the simulation settings instead of repeating them as literals.
ax.set_title(f'n={n_patients}, p={n_cov}', fontsize=16)
flierprops = dict(marker='.', markersize=4)
lee_boxprops = dict(color='darkgreen')
lee_medianprops = dict(color='darkgreen')
two_step_boxprops = dict(color='navy')
two_step_medianprops = dict(color='navy')

# One box per column, placed at the x position equal to the column's d_times value.
lee_boxes = ax.boxplot(lee_results_df, positions=lee_results_df.columns, whis=1.5,
                       flierprops=flierprops, widths=8, boxprops=lee_boxprops,
                       medianprops=lee_medianprops)
two_step_boxes = ax.boxplot(two_step_results_df, positions=two_step_results_df.columns, whis=1.5,
                            flierprops=flierprops, widths=8, boxprops=two_step_boxprops,
                            medianprops=two_step_medianprops)
ax.set_xlabel('Number of Discrete Times', fontsize=16)
ax.set_ylabel('Fitting Time [seconds]', fontsize=16)
ax.set_xticks(d_times_list)
ax.set_xticklabels(d_times_list)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)
ax.set_xlim([20, 155])

# Pass explicit handles (one box per fitter) rather than relying on artist order.
# The handles are hidden (handlelength=0); the coloured label text identifies each fitter.
leg = ax.legend([lee_boxes['boxes'][0], two_step_boxes['boxes'][0]],
                ['Lee et al.', 'two-step'], handlelength=0, handletextpad=0)
color_l = ['darkgreen', 'navy']
for text, color in zip(leg.get_texts(), color_l):
    text.set_color(color)

# Grid is drawn once, independently of the legend entries.
ax.grid(alpha=0.5)

fig.tight_layout()
if filename is not None:
    fig.savefig(os.path.join(OUTPUT_DIR, filename), dpi=300)