In [None]:
import pandas as pd
import numpy as np
import os
from time import time
from sklearn.model_selection import train_test_split
from pydts.examples_utils.generate_simulations_data import generate_quick_start_df
from pydts.examples_utils.plots import plot_example_pred_output
from pydts.examples_utils.plots import add_panel_text
from pydts.fitters import TwoStagesFitter, DataExpansionFitter

from pydts.data_generation import EventTimesSampler
from matplotlib import pyplot as plt
import warnings
import pickle
# Allow up to 500 rows to be shown when a pandas object is displayed.
pd.set_option("display.max_rows", 500)
# NOTE(review): this silences every warning for the whole session, including
# any raised while the fitters below are running.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Shorthand used below to build MultiIndex selections inside .loc[...] lookups.
slicer = pd.IndexSlice

In [None]:
# Directory used by the cells below for the figures, CSV files and pickled
# fit times they write. Set the PYDTS_OUTPUT_DIR environment variable to
# redirect the outputs without editing the notebook; the original location
# stays the default so existing runs are unaffected.
OUTPUT_DIR = os.environ.get('PYDTS_OUTPUT_DIR',
                            '/home/tomer.me/DiscreteTimeSurvivalPenalization/output')

# Sampling data

In [None]:
# Number of covariates; every beta vector below has this many entries.
n_cov = 5

# Per-event values v such that beta = -log(v), keyed by event type.
_event_exp_neg_betas = {
    1: [1.2, 0.9, 1.2, 0.8, 1.2],
    2: [1.2, 0.8, 1.1, 1, 1.1],
    3: [1.3, 0.9, 1.1, 1, 1.2],
}

# Parameters handed to the event-times sampler: for each event type j,
# "alpha" maps a time t to an intercept (log-linear in t) and "beta" holds
# the covariate coefficient vector.
real_coef_dict = {
    "alpha": {
        1: lambda t: -2.9 + 0.3 * np.log(t),
        2: lambda t: -3 + 0.5 * np.log(t),
        3: lambda t: -3.2 + 0.4 * np.log(t),
    },
    "beta": {
        event: -np.log(values) for event, values in _event_exp_neg_betas.items()
    },
}

# Same layout for the censoring hazard, stored under key 0; its beta vector
# uses the same values as event 1.
censoring_hazard_coefs = {
    "alpha": {0: lambda t: -4.5 + 0.5 * np.log(t)},
    "beta": {0: -np.log([1.2, 0.9, 1.2, 0.8, 1.2])},
}

# Cohort size, number of discrete time points and number of event types.
n_patients, d_times, j_events = 25000, 15, 3

# Sampler configured with the time grid and event types defined above.
ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)

# Seed passed to the sampler calls further down.
seed = 0
# NOTE(review): clip_value is not referenced anywhere in the visible cells.
clip_value = 1.5

# Covariate distribution: zero mean, variance 0.4 on the diagonal
# (off-diagonal entries are filled in by the next cell).
means_vector = np.zeros(n_cov)
covariance_matrix = 0.4 * np.identity(n_cov)

# Covariate column names Z1..Z{n_cov}.
covariates = [f'Z{position}' for position in range(1, n_cov + 1)]

In [None]:
# Symmetric off-diagonal covariances, keyed by (row, column) position.
for (row, col), pair_cov in {(1, 3): 0.2, (2, 4): 0.3}.items():
    covariance_matrix[row, col] = pair_cov
    covariance_matrix[col, row] = pair_cov

covariance_matrix

In [None]:
# LaTeX table of the covariance matrix, labelled with the covariate names on
# both axes.
covariance_table = pd.DataFrame(covariance_matrix, index=covariates, columns=covariates)
print(covariance_table.to_latex())

In [None]:
# Column labels used below to pick the coefficient and standard-error columns
# out of the DataExpansionFitter outputs. The surrounding spaces are part of
# the label and must be kept exactly.
# NOTE(review): presumably these match the header padding of the underlying
# model summary tables — confirm against the fitter output.
COEF_COL = '   coef   '
STDERR_COL = ' std err '

In [None]:
# Plot helper used only by this cell.
from pydts.examples_utils.plots import plot_events_occurrence

# Draw the covariates from a dedicated generator seeded with `seed`, so the
# simulated cohort is identical on every run and independent of whatever
# state the global numpy generator is in when this cell executes.
covariates_rng = np.random.RandomState(seed)
patients_df = pd.DataFrame(
    data=covariates_rng.multivariate_normal(means_vector, covariance_matrix, size=n_patients),
    columns=covariates,
)

# Sample event times from the event hazards, then loss-to-follow-up censoring
# from the censoring hazard (a different seed is used for the censoring
# draw), and finally combine the two into the observed outcome columns.
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_hazard_lof_censoring(patients_df,
                                              censoring_hazard_coefs=censoring_hazard_coefs,
                                              seed=seed+1,
                                              events=[0])
patients_df = ets.update_event_or_lof(patients_df)

# Expose the row index as an explicit patient id column named 'pid'.
patients_df.index.name = 'pid'
patients_df = patients_df.reset_index()

plot_events_occurrence(patients_df, fname=os.path.join(OUTPUT_DIR, 'competingevents3.png'))

In [None]:
# Number of patients for every (X, J) combination.
pid_counts_by_x_and_j = patients_df.groupby(['X', 'J'])['pid'].count()
pid_counts_by_x_and_j

In [None]:
# Fit both estimators `k_runs` times per case, time each fit, and store the
# estimated alpha/beta values (one column per successful run) plus the fit
# times under OUTPUT_DIR.
k_runs = 1

# NOTE(review): `n_patients` only names the case here; the loop always fits the
# `patients_df` sampled earlier and does not resample it per size or per run.
for n_patients in [25_000]: # 5_000, 20_000, 50_000, 100_000
    print('**************************************')
    case = f'Sample_size_{n_patients}_3_events'
    # One entry per *successful* run; their length is the number of result
    # columns accumulated so far.
    two_step_fit_times = []
    lee_fit_times = []

    for k in range(k_runs):
        try:
            # Re-seed the global numpy generator for this run.
            np.random.seed(seed+k)

            # Two step fitter
            new_fitter = TwoStagesFitter()
            print(case)
            print(f'Starting two-step: {k}')
            two_step_start = time()
            new_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1), nb_workers=1)
            two_step_end = time()
            print(f'Finished two-step: {k}, {two_step_end-two_step_start}sec')

            # Lee et al fitter
            lee_fitter = DataExpansionFitter()
            print(f'Starting Lee: {k}')
            lee_start = time()
            lee_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1))
            lee_end = time()
            print(f'Finished lee: {k}, {lee_end-lee_start}sec')

            # Keep only the coefficient and standard-error columns, flattened
            # into a sorted Series.
            lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()
            lee_beta_ser = lee_fitter.get_beta_SE().loc[:, slicer[:, [COEF_COL, STDERR_COL] ]].unstack().sort_index()

            # Build all four updated tables before rebinding any of them, so a
            # failure part-way through cannot leave them with different numbers
            # of run columns. "First successful run" is decided from the
            # recorded fit times rather than from k == 0, so a failed first
            # run does not break the following ones.
            if not two_step_fit_times:
                updated_results = (
                    new_fitter.alpha_df[['J', 'X', 'alpha_jt']],
                    new_fitter.get_beta_SE().unstack().to_frame(),
                    lee_alpha_ser.to_frame(),
                    lee_beta_ser.to_frame(),
                )
            else:
                updated_results = (
                    pd.concat([two_step_alpha_k_results, new_fitter.alpha_df['alpha_jt']], axis=1),
                    pd.concat([two_step_beta_k_results, new_fitter.get_beta_SE().unstack()], axis=1),
                    pd.concat([lee_alpha_k_results, lee_alpha_ser], axis=1),
                    pd.concat([lee_beta_k_results, lee_beta_ser], axis=1),
                )
            (two_step_alpha_k_results, two_step_beta_k_results,
             lee_alpha_k_results, lee_beta_k_results) = updated_results

            # Record the timings only once both fitters and the accumulation
            # above have succeeded.
            two_step_fit_times.append(two_step_end - two_step_start)
            lee_fit_times.append(lee_end-lee_start)

            # Cache results (overwritten after every successful run)
            two_step_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'two_step_alpha_run_{k_runs}.csv'))
            two_step_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'two_step_beta_run_{k_runs}.csv'))
            lee_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'lee_alpha_run_{k_runs}.csv'))
            lee_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'lee_beta_run_{k_runs}.csv'))

            with open(os.path.join(OUTPUT_DIR, f"two_step_fit_times_{k_runs}"), "wb") as fp:
                pickle.dump(two_step_fit_times, fp)

            with open(os.path.join(OUTPUT_DIR, f"lee_fit_times_{k_runs}"), "wb") as fp:
                pickle.dump(lee_fit_times, fp)

        except Exception as e:
            # Best effort: report the failure and move on to the next run.
            print(f'Failed during trial {k}')
            print(e)

    n_successful_runs = len(two_step_fit_times)
    if n_successful_runs == 0:
        # Nothing was accumulated for this case, so there is nothing to
        # relabel or save.
        print(f'No successful runs for {case}; nothing to save')
        continue

    # Label the result columns 1..number of successful runs (equal to
    # 1..k_runs when no run failed).
    run_labels = list(range(1, 1 + n_successful_runs))
    two_step_alpha_k_results = two_step_alpha_k_results.set_index(['J', 'X'])
    two_step_alpha_k_results.columns = run_labels
    two_step_beta_k_results.columns = run_labels
    lee_alpha_k_results.columns = run_labels
    lee_beta_k_results.columns = run_labels


    # Save results
    two_step_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha_run_{k_runs}.csv'))
    two_step_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_beta_run_{k_runs}.csv'))
    lee_alpha_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_alpha_run_{k_runs}.csv'))
    lee_beta_k_results.to_csv(os.path.join(OUTPUT_DIR, f'{case}_lee_beta_run_{k_runs}.csv'))

    with open(os.path.join(OUTPUT_DIR, f"{case}_two_step_fit_times_{k_runs}"), "wb") as fp:
        pickle.dump(two_step_fit_times, fp)

    with open(os.path.join(OUTPUT_DIR, f"{case}_lee_fit_times_{k_runs}"), "wb") as fp:
        pickle.dump(lee_fit_times, fp)

In [None]:
# True beta coefficients stacked event by event (j = 1..j_events), taken from
# the same dictionary that generated the data so the table cannot drift from
# the simulation settings.
true_col = pd.Series(
    np.concatenate([real_coef_dict['beta'][j] for j in range(1, j_events + 1)]),
    name='True',
)

# Lee et al. coefficient estimates of run 1.
lee_beta_run1 = lee_beta_k_results.unstack(1)[[(1, COEF_COL)]]

# Two-step coefficient estimates of run 1 (the j*_params rows), matching the
# run shown for Lee et al.
tmp = two_step_beta_k_results.loc[[f'j{j}_params' for j in range(1, j_events + 1)], [1]]

# Align the true values and the two-step estimates positionally on the Lee
# index so the three columns line up.
tmp.index = lee_beta_run1.index
true_col.index = lee_beta_run1.index
tmp2 = pd.concat([true_col, lee_beta_run1, tmp], axis=1)
tmp2.columns = ['True', 'Lee et al. Estimate', 'two-step Estimate']
tmp2

In [None]:
# LaTeX version of the comparison table, rounded to three decimals.
estimates_latex = tmp2.astype(float).round(3).to_latex(escape=False)
print(estimates_latex)

In [None]:
# One panel per event type: estimated alpha values from both fitters against
# the true curve, with the number of observed events as bars on a second axis.
filename = 'alpha_3_events.png'

first_model_name = 'Lee et al.'
second_model_name = 'two-step'
times = range(1, d_times+1)
# Patient counts per time (rows) and event type (columns); rows containing a
# missing count are dropped.
counts = patients_df.groupby(['J', 'X'])['pid'].count().unstack('J').dropna()

lee_colors = ['tab:blue'] * 3
two_step_colors = ['k'] * 3
true_colors = ['tab:green'] * 3


def _set_tick_label_sizes(target_ax):
    """Set the tick label sizes used in this figure: 15 on y, 13 on x."""
    for axis_name, label_size in (('y', 15), ('x', 13)):
        for tick_kind in ('major', 'minor'):
            target_ax.tick_params(axis=axis_name, which=tick_kind, labelsize=label_size)


fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for j in range(1, j_events+1):
    ax = axes[j-1]
    _set_tick_label_sizes(ax)

    # Lee et al. estimates: the time is the integer found between ')[' and ']'
    # in each index label.
    lee_alpha_j = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]]
    lee_alpha_j.index = [int(label.split(')[')[1].split(']')[0]) for label in lee_alpha_j.index]
    lee_alpha_j = pd.Series(lee_alpha_j.values.squeeze().astype(float), index=lee_alpha_j.index)
    ax.scatter(
        lee_alpha_j.index,
        lee_alpha_j.values,
        label=f'J={j} ({first_model_name})',
        color=lee_colors[j-1],
        marker='o',
        alpha=0.4,
        s=40,
    )

    # Two-step estimates for the same event type.
    two_step_alpha_j = two_step_alpha_k_results.loc[slicer[j, :]]
    ax.scatter(
        two_step_alpha_j.index,
        two_step_alpha_j.values.squeeze(),
        label=f'J={j} ({second_model_name})',
        color=two_step_colors[j-1],
        marker='*',
        alpha=0.7,
        s=20,
    )

    # True alpha curve evaluated on the time grid.
    true_values = [real_coef_dict['alpha'][j](t) for t in times]
    ax.plot(times, true_values, label=f'J={j} (True)', ls='--', color=true_colors[j-1])

    ax.set_xlabel(r'Time', fontsize=18)
    ax.set_xticks(times)
    ax.set_xticklabels([f'{t}' for t in times])
    if j == 1:
        # Only the left-most panel carries the alpha axis label.
        ax.set_ylabel(r'$\alpha_{t}$', fontsize=18)
    ax.legend(loc='upper left', fontsize=13)
    ax.set_ylim([-3.5, -1])

    # Second y axis: observed event counts per time.
    ax2 = ax.twinx()
    _set_tick_label_sizes(ax2)

    ax2.bar(times, counts.loc[:, j].values, label=f'J={j}', color='tab:red', alpha=0.4, width=0.5)
    ax2.legend(loc='upper right', fontsize=13)
    if j == 3:
        # Only the right-most panel carries the counts axis label.
        ax2.set_ylabel('Number of observed events', fontsize=16, color='red')
    ax2.tick_params(axis='y', colors='red')
    ax2.set_ylim([0, 2500])

fig.tight_layout()

if filename is not None:
    figure_path = os.path.join(OUTPUT_DIR, filename)
    fig.savefig(figure_path, dpi=300)