In [1]:
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.statistics import proportional_hazard_test

from src.helper_methods import *
from src.pipe_store import *
from src.constants import *

%matplotlib inline 
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

# Data Loading

In [2]:
# Cohort selection and location of the pickled source table.
path = '~/Repos/STRIDE/STRIDE-Analytics/data/20210614-mismatch_ep_db-extended.pickle'
donor_type = 'Deceased'
status = 'All'
# Numeric covariates; defined here but not used in this cell (no scaling step is applied below).
num_col = ['DonorAge_NOTR', 'DialysisYears', 'RecipientAge_NOTR', 'CIPHour_DBD', 'CIPHour_DCD']

# Columns handed to `keeping_features`, in the order they are passed.
kept_features = (
    'Failure',
    'Survival[Y]',
    'EpvsHLA_Donor',
    '#DESA',
    'DESA',
    'DonorAge_NOTR',
    'DialysisYears',
    'RecipientAge_NOTR',
    'CIPHour_DBD',
    'CIPHour_DCD',
    'TypeCadaveric_NOTR',
    'Donor_Type',
    'IL2rMoAb_T0',
)

# Build the analysis frame as a single pipeline over the project helpers.
df = (
    data_loading(path)
    .pipe(start_pipeline, status, donor_type)
    .pipe(keeping_features, *kept_features)
    .pipe(integer_encoder, 'TypeCadaveric_NOTR', 'IL2rMoAb_T0')
    .pipe(features_from_antibody_epitopes)
    # Presumably derives the 'E' / 'T' columns used by the Cox fits below -- confirm in the helper.
    .pipe(set_time_event_label, E='Failure', T='Survival[Y]')
    .pipe(censoring_deaths)
    .pipe(setting_prediction_horizon, 15)
    .rename(columns={'IL2rMoAb_T0': 'IL2rMoAb'})
)

Step: data_loading | Shape: (4690, 24) | Computation Time: 0.018283s
- selceted cohort --> Donor Type: Deceased, Epitope Antibody Presence: All
Step: start_pipeline | Shape: (3235, 23) | Computation Time: 0.008829s
Step: integer_encoder | Shape: (3235, 13) | Computation Time: 0.001923s
Step: set_time_event_label | Shape: (3235, 17) | Computation Time: 0.002525s
Step: censoring_deaths | Shape: (3235, 17) | Computation Time: 0.001148s
Step: setting_prediction_horizon | Shape: (3235, 17) | Computation Time: 0.00377s


# Checking the Cox Proportional Hazard Assumptions

https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html?highlight=cox%20assumptions

The proportional hazard assumption is that all individuals share the same baseline hazard function, each multiplied by an individual-specific scaling factor in front. So the shape of the hazard function is the same for all individuals, and only a scalar multiple changes per individual.

## Finding Inverse Probability Weights (IPW)

In [4]:
# Weighted Cox model for the RELEVANT_DESA_BAD treatment grouping.
# `CoxPHFitter` is already imported in the notebook's first cell.

# Work on a copy so the shared `df` from the data-loading cell is left untouched.
df_features = df.copy(deep=True)
# Class I x Class II interaction column. It is not among the model covariates
# selected in `cols` below.
# NOTE(review): kept only because the downstream helpers receive this frame -- confirm it is unused.
df_features['Class_I_II'] = df_features.Class_I * df_features.Class_II

# Terms handed to `find_ipw` as confounders (strings are passed through verbatim).
confounders = [
    'RecipientAge_NOTR', 'RecipientAge_NOTR*RecipientAge_NOTR',
    'DonorAge_NOTR', 'DonorAge_NOTR*DonorAge_NOTR', 'IL2rMoAb',
    'CIPHour_DBD', 'CIPHour_DCD',
]
treatments = ['No_DESA', 'Other_DESA', 'Specific_DESA']

df_treat_group = create_treatment_grups(df_features, [RELEVANT_DESA_BAD])
df_weight = find_ipw(df_treat_group, confounders, treatments, verbose=False)

# Covariates plus event ('E'), duration ('T') and weight ('w') columns used by the fit.
cols = [
    'DonorAge_NOTR', 'RecipientAge_NOTR',
    'DialysisYears', 'IL2rMoAb',
    'CIPHour_DBD', 'CIPHour_DCD',
    'No_DESA',
    'Specific_DESA',
    'Class_I',
    'Class_II',
    'E', 'T', 'w',
]
# `df_new` is the model frame that the assumption-check cells below reuse.
df_new = df_weight[cols]

cph = CoxPHFitter()
# robust=True requests the sandwich variance estimator for the weighted fit.
cph.fit(df_new, weights_col='w', duration_col='T', event_col='E', robust=True)
cph.print_summary()


  return summary_df[columns].to_latex(float_format="%." + str(self.decimals) + "f")


0,1
model,lifelines.CoxPHFitter
duration col,'T'
event col,'E'
weights col,'w'
robust variance,True
baseline estimation,breslow
number of observations,9688.73
number of events observed,3858.78
partial log-likelihood,-32980.25
time fit was run,2022-03-24 13:59:22 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
DonorAge_NOTR,0.21,1.23,0.08,0.04,0.37,1.04,1.45,0.0,2.46,0.01,6.18
RecipientAge_NOTR,-0.21,0.81,0.07,-0.35,-0.07,0.71,0.94,0.0,-2.88,<0.005,7.98
DialysisYears,0.01,1.01,0.02,-0.02,0.05,0.98,1.05,0.0,0.71,0.48,1.07
IL2rMoAb,0.02,1.02,0.19,-0.36,0.4,0.7,1.49,0.0,0.1,0.92,0.12
CIPHour_DBD,0.14,1.15,0.11,-0.08,0.35,0.92,1.42,0.0,1.24,0.22,2.21
CIPHour_DCD,0.32,1.38,0.11,0.11,0.54,1.12,1.71,0.0,2.98,<0.005,8.42
No_DESA,-1.07,0.34,0.35,-1.75,-0.38,0.17,0.68,0.0,-3.05,<0.005,8.75
Specific_DESA,0.98,2.68,0.17,0.65,1.32,1.92,3.73,0.0,5.79,<0.005,27.06
Class_I,-0.86,0.42,0.33,-1.5,-0.22,0.22,0.8,0.0,-2.64,0.01,6.91
Class_II,-0.66,0.52,0.32,-1.29,-0.02,0.27,0.98,0.0,-2.02,0.04,4.53

0,1
Concordance,0.64
Partial AIC,65980.49
log-likelihood ratio test,1679.67 on 10 df
-log2(p) of ll-ratio test,inf


In [5]:
# Check the proportional-hazards assumption of the fitted model on its training
# frame at the 0.05 level, with diagnostic plots enabled. The trailing semicolon
# keeps the notebook from echoing the call's return value.
cph.check_assumptions(df_new, p_value_threshold=0.05, show_plots=True);

  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  


Proportional hazard assumption looks okay.


  for variable in self.params_.index & (columns or self.params_.index):


[]

In [6]:
# Per-covariate proportional-hazards test for the fitted model, using the rank
# transform of time; the summary is printed with three decimals.
results = proportional_hazard_test(cph, df_new, time_transform='rank')
results.print_summary(decimals=3, model="untransformed variables")

  return self.summary.to_latex()


0,1
time_transform,rank
null_distribution,chi squared
degrees_of_freedom,1
model,<lifelines.CoxPHFitter: fitted with 9688.73 to...
test_name,proportional_hazard_test

Unnamed: 0,test_statistic,p,-log2(p)
CIPHour_DBD,0.11,0.74,0.43
CIPHour_DCD,0.12,0.73,0.46
Class_I,0.01,0.91,0.13
Class_II,0.0,0.94,0.08
DialysisYears,0.03,0.86,0.22
DonorAge_NOTR,0.0,0.99,0.01
IL2rMoAb,0.0,0.96,0.05
No_DESA,0.03,0.87,0.2
RecipientAge_NOTR,0.28,0.59,0.75
Specific_DESA,0.02,0.89,0.18


# Relevant DESA Good

In [7]:
# Weighted Cox model for the RELEVANT_DESA_GOOD treatment grouping.

# Work on a copy so the shared `df` from the data-loading cell is left untouched.
df_features = df.copy(deep=True)

# Terms handed to `find_ipw` as confounders (strings are passed through verbatim).
confounders = [
    'RecipientAge_NOTR', 'RecipientAge_NOTR*RecipientAge_NOTR',
    'DonorAge_NOTR', 'DonorAge_NOTR*DonorAge_NOTR', 'IL2rMoAb',
    'CIPHour_DBD', 'CIPHour_DCD',
]
treatments = ['No_DESA', 'Other_DESA', 'Specific_DESA']

df_treat_group = create_treatment_grups(df_features, [RELEVANT_DESA_GOOD])
df_weight = find_ipw(df_treat_group, confounders, treatments, verbose=False)

# Covariates plus event ('E'), duration ('T') and weight ('w') columns used by the fit.
cols = [
    'DonorAge_NOTR', 'RecipientAge_NOTR',
    'DialysisYears', 'IL2rMoAb',
    'CIPHour_DBD', 'CIPHour_DCD',
    'No_DESA',
    'Specific_DESA',
    'Class_I',
    'Class_II',
    'E', 'T', 'w',
]
# `df_new` is the model frame that the assumption-check cells below reuse.
df_new = df_weight[cols]

cph = CoxPHFitter()
# robust=True requests the sandwich variance estimator for the weighted fit.
cph.fit(df_new, weights_col='w', duration_col='T', event_col='E', robust=True)
cph.print_summary()

  return summary_df[columns].to_latex(float_format="%." + str(self.decimals) + "f")


0,1
model,lifelines.CoxPHFitter
duration col,'T'
event col,'E'
weights col,'w'
robust variance,True
baseline estimation,breslow
number of observations,9688.9
number of events observed,3578.04
partial log-likelihood,-30694.93
time fit was run,2022-03-24 13:59:24 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
DonorAge_NOTR,0.27,1.31,0.08,0.11,0.42,1.12,1.52,0.0,3.38,<0.005,10.45
RecipientAge_NOTR,-0.22,0.8,0.07,-0.35,-0.08,0.7,0.92,0.0,-3.13,<0.005,9.18
DialysisYears,0.03,1.03,0.02,-0.01,0.07,0.99,1.07,0.0,1.32,0.19,2.42
IL2rMoAb,-0.08,0.92,0.17,-0.42,0.25,0.66,1.29,0.0,-0.48,0.63,0.66
CIPHour_DBD,0.18,1.2,0.11,-0.03,0.39,0.97,1.48,0.0,1.7,0.09,3.48
CIPHour_DCD,0.38,1.47,0.1,0.19,0.58,1.21,1.78,0.0,3.84,<0.005,12.96
No_DESA,-1.46,0.23,0.36,-2.16,-0.75,0.12,0.47,0.0,-4.07,<0.005,14.35
Specific_DESA,-0.73,0.48,0.21,-1.14,-0.32,0.32,0.73,0.0,-3.49,<0.005,11.01
Class_I,-0.46,0.63,0.35,-1.15,0.22,0.32,1.25,0.0,-1.32,0.19,2.43
Class_II,-0.44,0.64,0.34,-1.1,0.22,0.33,1.24,0.0,-1.32,0.19,2.42

0,1
Concordance,0.64
Partial AIC,61409.86
log-likelihood ratio test,1340.28 on 10 df
-log2(p) of ll-ratio test,933.83


In [8]:
# Check the proportional-hazards assumption of the fitted model on its training
# frame at the 0.05 level, with diagnostic plots enabled. The trailing semicolon
# keeps the notebook from echoing the call's return value.
cph.check_assumptions(df_new, p_value_threshold=0.05, show_plots=True);

Proportional hazard assumption looks okay.


  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  
  for variable in self.params_.index & (columns or self.params_.index):


[]

In [9]:
# Per-covariate proportional-hazards test for the fitted model, using the rank
# transform of time; the summary is printed with three decimals.
results = proportional_hazard_test(cph, df_new, time_transform='rank')
results.print_summary(decimals=3, model="untransformed variables")

  return self.summary.to_latex()


0,1
time_transform,rank
null_distribution,chi squared
degrees_of_freedom,1
model,<lifelines.CoxPHFitter: fitted with 9688.9 tot...
test_name,proportional_hazard_test

Unnamed: 0,test_statistic,p,-log2(p)
CIPHour_DBD,0.09,0.76,0.39
CIPHour_DCD,0.11,0.74,0.43
Class_I,0.01,0.94,0.08
Class_II,0.0,0.97,0.04
DialysisYears,0.03,0.87,0.2
DonorAge_NOTR,0.0,0.97,0.04
IL2rMoAb,0.0,0.97,0.05
No_DESA,0.02,0.89,0.16
RecipientAge_NOTR,0.27,0.6,0.73
Specific_DESA,0.0,0.96,0.06


# Table Hazard Ratios

## Deceased Donors

In [32]:
# Cohort selection and location of the pickled source table.
path = '~/Repos/STRIDE/STRIDE-Analytics/data/20210614-mismatch_ep_db-extended.pickle'
donor_type = 'Deceased'
status = 'All'
antibody_epitope = True
# Numeric covariates passed to `feature_scaler` below.
num_col = ['DonorAge_NOTR', 'DialysisYears', 'RecipientAge_NOTR', 'CIPHour_DBD', 'CIPHour_DCD']

# Build the deceased-donor frame in one chain so that re-running the cell
# always starts from the raw data and yields the same result.
df_deceased = (
    data_loading(path)
    .pipe(start_pipeline, status, donor_type)
    .pipe(keeping_features,
        'Failure',
        'Survival[Y]',
        'EpvsHLA_Donor',
        '#DESA',
        'DESA',
        'DonorAge_NOTR',
        'DialysisYears',
        'RecipientAge_NOTR',
        'CIPHour_DBD',
        'CIPHour_DCD',
        'Donor_Type',
        'IL2rMoAb_T0'
    )
    .pipe(integer_encoder, 'IL2rMoAb_T0')
    .pipe(eng_immunological_features, antibody_epitope=antibody_epitope)
    # Presumably derives the 'E' / 'T' columns used by the Cox fits below -- confirm in the helper.
    .pipe(set_time_event_label, E='Failure', T='Survival[Y]')
    .pipe(censoring_deaths)
    .pipe(feature_scaler, num_col, scaler='standard')
    .rename(columns={'IL2rMoAb_T0': 'IL2rMoAb'})
    # 0/1 indicators on the DESA count, computed column-wise:
    #   DESA_12    -> 1 when 1 <= #DESA <= 2 (both bounds inclusive)
    #   DESA_3more -> 1 when #DESA > 2
    # Missing counts compare False and therefore map to 0.
    .assign(
        DESA_12=lambda d: d['#DESA'].between(1, 2).astype('int64'),
        DESA_3more=lambda d: d['#DESA'].gt(2).astype('int64'),
    )
)

Step: data_loading | Shape: (4690, 24) | Computation Time: 0.015422s
- selceted cohort --> Donor Type: Deceased, Epitope Antibody Presence: All
Step: start_pipeline | Shape: (3235, 23) | Computation Time: 0.010392s
Step: integer_encoder | Shape: (3235, 12) | Computation Time: 0.001803s
Step: eng_immunological_features | Shape: (3235, 16) | Computation Time: 0.018604s
Step: set_time_event_label | Shape: (3235, 16) | Computation Time: 0.003112s
Step: censoring_deaths | Shape: (3235, 16) | Computation Time: 0.001876s


In [33]:
# Inputs for `find_ipw`: confounder terms, numeric columns to scale, treatment arms.
confounders = [
    'RecipientAge_NOTR',
    'RecipientAge_NOTR*RecipientAge_NOTR',
    'DonorAge_NOTR',
    'DonorAge_NOTR*DonorAge_NOTR',
    'IL2rMoAb',
]
num_col = ['DonorAge_NOTR', 'RecipientAge_NOTR', 'DialysisYears', 'CIPHour_DBD', 'CIPHour_DCD']
treatments = ['No_DESA', 'Other_DESA', 'Specific_DESA']

# NOTE(review): this overwrites `df_deceased` with its own transformed version, so
# re-running the cell feeds the already-grouped frame back into the helper -- confirm
# that `create_treatment_grups` is safe to apply twice.
df_deceased = create_treatment_grups(df_deceased, [RELEVANT_DESA_BAD])
df_weight = find_ipw(df_deceased, confounders, treatments, scaler='standard', num_col=num_col)

# Covariates plus duration ('T'), event ('E') and weight ('w') columns used by the fit.
model_cols = [
    'DonorAge_NOTR', 'RecipientAge_NOTR',
    'CIPHour_DBD', 'CIPHour_DCD',
    'IL2rMoAb',
    'No_DESA',
    'Relevant_DESA_Bad',
    'DialysisYears',
    'T', 'E',
    'w',
]
df_weight = df_weight[model_cols]

cph = CoxPHFitter()
cph.fit(df_weight, weights_col='w', duration_col='T', event_col='E', robust=True)
cph.print_summary()

  return summary_df[columns].to_latex(float_format="%." + str(self.decimals) + "f")


0,1
model,lifelines.CoxPHFitter
duration col,'T'
event col,'E'
weights col,'w'
robust variance,True
baseline estimation,breslow
number of observations,9704.34
number of events observed,3971.15
partial log-likelihood,-33770.80
time fit was run,2022-03-24 15:30:27 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
DonorAge_NOTR,0.22,1.25,0.08,0.07,0.38,1.07,1.46,0.0,2.81,<0.005,7.66
RecipientAge_NOTR,-0.22,0.8,0.07,-0.36,-0.09,0.7,0.92,0.0,-3.24,<0.005,9.7
CIPHour_DBD,0.11,1.12,0.11,-0.1,0.32,0.9,1.38,0.0,1.02,0.31,1.7
CIPHour_DCD,0.3,1.34,0.1,0.09,0.5,1.09,1.65,0.0,2.82,<0.005,7.69
IL2rMoAb,0.04,1.04,0.19,-0.33,0.4,0.72,1.5,0.0,0.19,0.85,0.24
No_DESA,-0.29,0.75,0.13,-0.55,-0.03,0.58,0.97,0.0,-2.19,0.03,5.14
Relevant_DESA_Bad,0.9,2.46,0.17,0.58,1.23,1.78,3.41,0.0,5.44,<0.005,24.16
DialysisYears,0.0,1.0,0.05,-0.1,0.1,0.9,1.11,0.0,0.01,0.99,0.01

0,1
Concordance,0.64
Partial AIC,67557.60
log-likelihood ratio test,1574.93 on 8 df
-log2(p) of ll-ratio test,inf


In [31]:
# Same weighted Cox fit as the full deceased-donor model, restricted to rows with T < 1.

# Inputs for `find_ipw`: confounder terms and the numeric columns to scale.
confounders = [
    'RecipientAge_NOTR',
    'RecipientAge_NOTR*RecipientAge_NOTR',
    'DonorAge_NOTR',
    'DonorAge_NOTR*DonorAge_NOTR',
    'IL2rMoAb',
]
num_col = ['DonorAge_NOTR', 'RecipientAge_NOTR',
            'DialysisYears', 'CIPHour_DBD', 'CIPHour_DCD']

# Take an explicit copy of the filtered rows so the helper below operates on an
# independent frame rather than on a slice of `df_deceased` (avoids pandas'
# SettingWithCopy ambiguity should the helper add columns in place).
# The bound is strict: rows with T == 1 are excluded.
# NOTE(review): T is presumably in years (derived from 'Survival[Y]') -- confirm.
df_deceased_early = df_deceased[df_deceased['T'] < 1].copy()
df_deceased_early = create_treatment_grups(df_deceased_early, [RELEVANT_DESA_BAD])
treatments = ['No_DESA', 'Other_DESA', 'Specific_DESA']
df_weight = find_ipw(df_deceased_early, confounders, treatments, scaler='standard', num_col=num_col)

# Covariates plus duration ('T'), event ('E') and weight ('w') columns used by the fit.
df_weight = df_weight[[
    'DonorAge_NOTR', 'RecipientAge_NOTR',
    'CIPHour_DBD', 'CIPHour_DCD',
    'IL2rMoAb',
    'No_DESA',
    'Relevant_DESA_Bad',
    'DialysisYears',
    'T', 'E',
    'w',
]]
cph = CoxPHFitter()
# robust=True requests the sandwich variance estimator for the weighted fit.
cph.fit(df_weight,
        weights_col='w',
        duration_col='T', event_col='E', robust=True)
cph.print_summary()

SyntaxError: invalid syntax (2628478294.py, line 11)