# Flatiron Health aNSCLC: Survival metrics for strict elgibility criteria
**Background: Calculate survival metrics for emulated trials involving patients meeting strict elgibliity criteria. Hazard ratio for the full cohort is calculated from a Cox-IPTW model. Restricted mean survival time and median overall survival are calculated for phenotypes using an IPTW-adjusted KM curve.** 

## Part 1: Identify patients with exclusion criteria

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

In [3]:
train = pd.read_csv('train_full.csv')
row_ID(train)

(54786, 54786)

In [4]:
test = pd.read_csv('test_full.csv')
row_ID(test)

(13697, 13697)

In [5]:
df = pd.concat([train, test], ignore_index = True)
row_ID(df)

(68483, 68483)

### 1. Autoimmune diseases in the year preceding advanced diagnosis 
* Type 1 diabetes
* Rheumatoid arthritis
* Lupus
* Systemic sclerosis 
* Dermatomyositis
* Polymyositis
* Crohn's disease
* Ulcerative colitis 
* Psoriasis 
* Multiple sclerosis 

In [6]:
diagnosis = pd.read_csv('Diagnosis.csv')

In [7]:
diagnosis = diagnosis[diagnosis['PatientID'].isin(df['PatientID'])]       

In [8]:
diagnosis.loc[:, 'DiagnosisDate'] = pd.to_datetime(diagnosis['DiagnosisDate'])

In [9]:
enhanced_adv = pd.read_csv('Enhanced_AdvancedNSCLC.csv', low_memory = False)

In [10]:
enhanced_adv.loc[:, 'AdvancedDiagnosisDate'] = pd.to_datetime(enhanced_adv['AdvancedDiagnosisDate'])

In [11]:
row_ID(diagnosis)

(1499292, 68483)

In [12]:
diagnosis = pd.merge(diagnosis, enhanced_adv[['PatientID', 'AdvancedDiagnosisDate']], on = 'PatientID', how = 'left')

In [13]:
row_ID(diagnosis)

(1499292, 68483)

In [14]:
diagnosis.loc[:, 'date_diff'] = (diagnosis['DiagnosisDate'] - diagnosis['AdvancedDiagnosisDate']).dt.days

In [15]:
diagnosis.loc[:, 'diagnosis_code'] = diagnosis['DiagnosisCode'].replace('\.', '', regex = True)

In [16]:
# ICD-9 dataframe with unique codes for each patient. 
diagnosis_9 = (
    diagnosis
    .query('date_diff <= 0 and date_diff > -365')
    .query('DiagnosisCodeSystem == "ICD-9-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [17]:
auto_ID_9 = (
    diagnosis_9[diagnosis_9['diagnosis_code'].str.match('250(0[13]|1[13]|2[13]|3[13]|4[13]|5[13]|6[13]|7[13]|8[13]|9[13])|'
                                                        '714|'
                                                        '710[0134]|'
                                                        '55[56]|'
                                                        '696|'
                                                        '340')].PatientID.unique())

In [18]:
len(auto_ID_9)

87

In [19]:
# ICD-9 dataframe with unique codes for each patient. 
diagnosis_10 = (
    diagnosis
    .query('date_diff <= 0 and date_diff > -365')
    .query('DiagnosisCodeSystem == "ICD-10-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [20]:
auto_ID_10 = (
    diagnosis_10[diagnosis_10['diagnosis_code'].str.match('E10|'
                                                          'M05|'
                                                          'M32|'
                                                          'M33|'
                                                          'M34|'
                                                          'K50|'
                                                          'K51|'
                                                          'L40|'
                                                          'G35')].PatientID.unique())

In [21]:
len(auto_ID_10)

238

In [22]:
auto_IDs = np.unique(np.concatenate([auto_ID_9, auto_ID_10]))

In [23]:
len(auto_IDs)

322

### 2. Other relevant comorbidities in the year preceding advanced diagnosis 
* Interstitial lung disease 
* HIV
* Hep C
* Hep B
* Psychosis and other significant psychiatric disorders 
* Drug use disorders

In [24]:
other_comorb_9 = (
    diagnosis_9[diagnosis_9['diagnosis_code'].str.match('516|'
                                                        '042|'
                                                        '070[234567]|'
                                                        '29[5789]|'
                                                        '30[34]')].PatientID.unique())

In [25]:
len(other_comorb_9)

69

In [26]:
other_comorb_10 = (
    diagnosis_10[diagnosis_10['diagnosis_code'].str.match('J84|'
                                                          'B20|'
                                                          'B18|'
                                                          'F2[024589]|'
                                                          'F1[0145]')].PatientID.unique())

In [27]:
len(other_comorb_10)

474

In [28]:
other_comorb_IDs = np.unique(np.concatenate([other_comorb_9, other_comorb_10]))

In [29]:
len(other_comorb_IDs)

537

### 3. CNS metastasis at start of treatment 

#### First line treatment

In [30]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [31]:
line_therapy = line_therapy[line_therapy['PatientID'].isin(df['PatientID'])]       

In [32]:
line_therapy.loc[:, 'StartDate'] = pd.to_datetime(line_therapy['StartDate'])

In [33]:
therapy_fl = line_therapy.query('LineNumber == 1').query('IsMaintenanceTherapy == False')[['PatientID', 'StartDate']]

In [34]:
row_ID(diagnosis)

(1499292, 68483)

In [35]:
cns_fl = pd.merge(diagnosis, therapy_fl, on = 'PatientID', how = 'left')

In [36]:
row_ID(diagnosis)

(1499292, 68483)

In [37]:
cns_fl.loc[:, 'fl_date_diff'] = (cns_fl['DiagnosisDate'] - cns_fl['StartDate']).dt.days

In [38]:
# ICD-9 dataframe with unique codes for each patient. 
cns_fl_9 = (
    cns_fl
    .query('fl_date_diff <= 0 and fl_date_diff > -90')
    .query('DiagnosisCodeSystem == "ICD-9-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [39]:
cns_fl_9_ids = (
    cns_fl_9[cns_fl_9['diagnosis_code'].str.match('198[34]')].PatientID.unique()
)

In [40]:
len(cns_fl_9_ids)

1065

In [41]:
# ICD-9 dataframe with unique codes for each patient. 
cns_fl_10 = (
    cns_fl
    .query('fl_date_diff <= 0 and fl_date_diff > -90')
    .query('DiagnosisCodeSystem == "ICD-10-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [42]:
cns_fl_10_ids = (
    cns_fl_10[cns_fl_10['diagnosis_code'].str.match('C79[34]')].PatientID.unique()
)

In [43]:
len(cns_fl_10_ids)

2829

In [44]:
cns_fl_IDs = np.unique(np.concatenate([cns_fl_9_ids, cns_fl_10_ids]))

In [45]:
len(cns_fl_IDs)

3876

#### Second line treatment

In [46]:
therapy_sec = line_therapy.query('LineNumber == 2')[['PatientID', 'StartDate']]

In [47]:
row_ID(diagnosis)

(1499292, 68483)

In [48]:
cns_sec = pd.merge(diagnosis, therapy_sec, on = 'PatientID', how = 'left')

In [49]:
row_ID(diagnosis)

(1499292, 68483)

In [50]:
cns_sec.loc[:, 'sec_date_diff'] = (cns_sec['DiagnosisDate'] - cns_sec['StartDate']).dt.days

In [51]:
# ICD-9 dataframe with unique codes for each patient. 
cns_sec_9 = (
    cns_sec
    .query('sec_date_diff <= 0 and sec_date_diff > -90')
    .query('DiagnosisCodeSystem == "ICD-9-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [52]:
cns_sec_9_ids = (
    cns_sec_9[cns_sec_9['diagnosis_code'].str.match('198[34]')].PatientID.unique()
)

In [53]:
len(cns_sec_9_ids)

252

In [54]:
# ICD-9 dataframe with unique codes for each patient. 
cns_sec_10 = (
    cns_sec
    .query('sec_date_diff <= 0 and sec_date_diff > -90')
    .query('DiagnosisCodeSystem == "ICD-10-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [55]:
cns_sec_10_ids = (
    cns_sec_10[cns_sec_10['diagnosis_code'].str.match('C79[34]')].PatientID.unique()
)

In [56]:
len(cns_sec_10_ids)

765

In [57]:
cns_sec_IDs = np.unique(np.concatenate([cns_sec_9_ids, cns_sec_10_ids]))

In [58]:
len(cns_sec_IDs)

1000

### 4. ECOG >1 at start of treatment 

In [59]:
base_ecog = pd.read_csv('BaselineECOG.csv')

In [60]:
base_ecog = base_ecog[base_ecog['PatientID'].isin(df['PatientID'])]       

In [61]:
ecog_fl_IDs = (
    base_ecog
    .query('LineNumber == 1')
    .query('ECOGValue == "2" or ECOGValue == "3" or ECOGValue == "4"')
    .PatientID.unique())

In [62]:
len(ecog_fl_IDs)

8927

In [63]:
ecog_sec_IDs = (
    base_ecog
    .query('LineNumber == 2')
    .query('ECOGValue == "2" or ECOGValue == "3" or ECOGValue == "4"')
    .PatientID.unique())

In [64]:
len(ecog_sec_IDs)

4022

### 5. Abnormal organ function at start of treatment 
* Hemoglobin >9
* Creatinine <2
* Total bilirubin <3

In [65]:
lab = pd.read_csv('Lab.csv')

In [66]:
lab = lab[lab['PatientID'].isin(df['PatientID'])]

In [67]:
lab.loc[:, 'ResultDate'] = pd.to_datetime(lab['ResultDate'], errors = 'coerce') 

In [68]:
row_ID(lab)

(39492037, 64852)

In [69]:
lab = pd.merge(lab, therapy_fl[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [70]:
row_ID(lab)

(39492037, 64852)

In [71]:
# Select rows with clinically relevant labs.
lab_core = (
    lab[
    (lab['LOINC'] == "2160-0") |
    (lab['LOINC'] == "38483-4") | 
    (lab['LOINC'] == "718-7") |
    (lab['LOINC'] == "20509-6") |
    (lab['LOINC'] == "42719-5") |
    (lab['LOINC'] == "1975-2")]
    .filter(items = ['PatientID', 
                     'ResultDate', 
                     'LOINC', 
                     'LabComponent', 
                     'TestUnits', 
                     'TestUnitsCleaned', 
                     'TestResult', 
                     'TestResultCleaned', 
                     'StartDate'])
)

In [72]:
conditions = [
    ((lab_core['LOINC'] == '2160-0') | (lab_core['LOINC'] == '38483-4')),
    ((lab_core['LOINC'] == '718-7') | (lab_core['LOINC'] == '20509-6')),
    ((lab_core['LOINC'] == '42719-5') | (lab_core['LOINC'] == '1975-2'))]

choices = ['creatinine', 
           'hemoglobin', 
           'total_bilirubin']

lab_core.loc[:, 'lab_name'] = np.select(conditions, choices)

In [73]:
row_ID(lab_core)

(3835679, 64616)

In [74]:
conditions = [
    (lab_core['lab_name'] == 'hemoglobin') & (lab_core['TestUnits'] == 'g/uL')]

choices = [lab_core['TestResultCleaned'] / 100000]

lab_core.loc[:, 'test_result_cleaned'] = np.select(conditions, choices, default = lab_core['TestResultCleaned'])

In [75]:
lab_f = (
    lab_core
    .assign(lab_date_diff = (lab_core['ResultDate'] - lab_core['StartDate']).dt.days)
    .query('lab_date_diff <= 0 and lab_date_diff > -90')
    .sort_values(by = ['PatientID', 'lab_name', 'lab_date_diff'], ascending = [True, True, False])
    .drop_duplicates(subset = ['PatientID', 'lab_name'], keep = 'first' )
)

In [76]:
# Select lab closest to date of advanced diagnosis and pivot to a wide table. 
lab_wide = (
    lab_f
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index())

lab_wide.columns.name = None

In [77]:
lab_wide.sample(3)

Unnamed: 0,PatientID,creatinine,hemoglobin,total_bilirubin
31333,FAF2376460B38,0.5,,0.4
10375,F39FA833F0BD8,0.8,14.2,0.3
7403,F298B7E539B55,0.9,10.3,0.5


In [78]:
ab_organ_IDs = lab_wide.query('creatinine > 2 or hemoglobin < 9 or total_bilirubin > 3').PatientID

In [79]:
len(ab_organ_IDs)

2256

In [80]:
ab_organ_IDs.to_csv('ab_organ_IDs.csv', index = False)

In [81]:
del diagnosis
del diagnosis_10
del diagnosis_9
del lab
del lab_core
del lab_f
del lab_wide 

## Part 2: In-silico trials 

### Import packages and create necessary functions

In [82]:
from scipy import stats

from sksurv.nonparametric import kaplan_meier_estimator
from survive import KaplanMeier, SurvivalData

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.utils import median_survival_times, restricted_mean_survival_time
from lifelines.statistics import logrank_test

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

import warnings

In [83]:
# Find index for value closest to input value. 
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [84]:
# Calculates median overeall suvival for risk groups. 
def mos(low, med, high, comp):
    low_os = low.median_survival_time_
    med_os = med.median_survival_time_
    high_os = high.median_survival_time_
    comp_os = comp.median_survival_time_
    mos = [low_os, med_os, high_os, comp_os]
    return (mos)

In [85]:
def rmst_mos_95ci(df, num_samples, drug, event, items_list, numerical_features, rmst_time):
    
    """
    Estimate the 95% confidence interval for RMST and mOS using bootstrap resampling.

    Parameters:
    - df: DataFrame containing survival data
    - num_samples: Number of bootstrap samples
    - drug: Treatment indicator variable
    - event: Event type ('death' or 'progression')
    - items_list: Feature list for IPTW 
    - numerical_features: List of numerical features
    - rmst_time: Time to calculate RMST 

    Returns:
    - mos_A_95: mOS 95% CI for treatment
    - mos_B_95: mOS 95% CI for control
    - rmst_A_95: RMST 95% CI for treatment
    - rmst_B_95: RMST 95% CI for control
    - difference_rmst_95: RMST 95% CI for difference between treatment and control 
    """
    
    np.random.seed(42)
    mos_A = []
    mos_B = []
    rmst_A_list = []
    rmst_B_list = []
    differences_rmst = []
    
    # Define variables based on the event type
    if event == 'death':
        time_column = 'timerisk_treatment'
        status_column = 'death_status'
        
    else:
        time_column = 'time_prog_treatment'
        status_column = 'pfs_status'
        
    # Set up preprocessor for logistical regression which will be for IPTW  
    numerical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('std_scaler', StandardScaler())])
        
    categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')
    categorical_features = list(df.select_dtypes(include = ['category']).columns)
        
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder = 'passthrough')
    
    # Boostrap HR 
    for _ in range(num_samples):
        
        # Resample data with replacement
        resampled_df = resample(df).drop(columns = ['ps', 'weight'])
        
        # Calculated IPTW for the resampled group 
        df_x = preprocessor.fit_transform(resampled_df.filter(items = items_list))
                                           
        df_lr = LogisticRegression(max_iter = 1000)
        df_lr.fit(df_x, resampled_df[drug])
        warnings.filterwarnings("ignore")
                                           
        pred = df_lr.predict_proba(df_x)        
        resampled_df['ps'] = pred[:, 1]                          
        resampled_df['weight'] = (
                np.where(resampled_df[drug] == 1, 1/resampled_df['ps'], 1/(1 - resampled_df['ps'])))
    
        # mOS from IPTW-KM
        kmf_A = KaplanMeierFitter()
        kmf_A.fit(resampled_df.query(f'{drug} == 1')[time_column]/30,
                  resampled_df.query(f'{drug} == 1')[status_column], 
                  weights = resampled_df.query(f'{drug} == 1')['weight'])

        kmf_B = KaplanMeierFitter()
        kmf_B.fit(resampled_df.query(f'{drug} == 0')[time_column]/30,
                  resampled_df.query(f'{drug} == 0')[status_column], 
                  weights = resampled_df.query(f'{drug} == 0')['weight'])
    
        mos_A.append(kmf_A.median_survival_time_)
        mos_B.append(kmf_B.median_survival_time_)
        
        # RMST from IPTW-KM
        rmst_A = restricted_mean_survival_time(kmf_A, rmst_time)
        rmst_B = restricted_mean_survival_time(kmf_B, rmst_time)
        
        rmst_A_list.append(rmst_A)
        rmst_B_list.append(rmst_B)
        differences_rmst.append(rmst_A - rmst_B)

    # Calculate the 95% confidence interval
    results = pd.Series({
    'mos_A_95': np.percentile(mos_A, [2.5, 97.5]),
    'mos_B_95': np.percentile(mos_B, [2.5, 97.5]),
    'rmst_A_95': np.percentile(rmst_A_list, [2.5, 97.5]),
    'rmst_B_95': np.percentile(rmst_B_list, [2.5, 97.5]),
    'difference_rmst_95': np.percentile(differences_rmst, [2.5, 97.5])
    })
    
    return results

In [86]:
cutoff = pd.read_csv('risk_cutoff_lung.csv', index_col = 0)

### KEYNOTE-042: First-line pembrolizumab vs. platinum-based chemotherapy in those with PDL1 >=1%

**INCLUSION CRITERIA**
* Untreated locally advanced or metastatic NSCLC
* Received first line pemobrolizumab or platinum-based chemotherapy 
* PDL1 >= 1% and status known within (-inf, +30] days of first-line treatment
* EGFR and ALK negative
* No autoimmune diseases in the year preceding metastatic diagnosis 
* No history of ILD, HIV, Hep C, Hep B, severe psychiatric history, or drug use disorder in the year preceding metastatic diagnosis 
* No CNS metastasis at start of treatment 
* ECOG is not 2, 3, or 4 at start of treatment 
* Adeuquate organ function at start of treatment 

#### Pembrolizumab

In [87]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [88]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [89]:
key042_pembro = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Pembrolizumab"')
    [['PatientID', 'StartDate']]
)

In [90]:
key042_pembro.loc[:, 'pembro'] = 1

In [91]:
row_ID(key042_pembro)

(3648, 3648)

In [92]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_042 = (
    line_therapy[line_therapy['PatientID'].isin(key042_pembro.PatientID)])

In [93]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [94]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_042_xcross = (
    line_therapy_pembro_042[line_therapy_pembro_042['LineName'].str.contains('|'.join(targeted))].PatientID)

In [95]:
# Select patients who don't receive targeted therapy in future lines.
key042_pembro = key042_pembro[~key042_pembro['PatientID'].isin(pembro_042_xcross)]

In [96]:
row_ID(key042_pembro)

(3582, 3582)

In [97]:
row_ID(key042_pembro)

(3582, 3582)

#### Platinum-based chemotherapy 

In [98]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [99]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [100]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [101]:
key042_plat = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [102]:
key042_plat.loc[:, 'pembro'] = 0

In [103]:
row_ID(key042_plat)

(25861, 25861)

In [104]:
# Dataframe of all therapies received for those receiving first line platinum regimen   
line_therapy_plat_042 = (
    line_therapy[line_therapy['PatientID'].isin(key042_plat.PatientID)])

In [105]:
# Patients receiving platinum therapy who later recieve targeted therapy. 
plat_042_xcross = (
    line_therapy_plat_042[line_therapy_plat_042['LineName'].str.contains('|'.join(targeted))].PatientID)

In [106]:
# Select patients who don't receive targeted therapy in future lines 
key042_plat = key042_plat[~key042_plat['PatientID'].isin(plat_042_xcross)]

In [107]:
row_ID(key042_plat)

(23800, 23800)

In [108]:
key_042 = pd.concat([key042_pembro, key042_plat])

In [109]:
row_ID(key_042)

(27382, 27382)

In [110]:
key_042 = pd.merge(key_042, df_full, on = 'PatientID', how = 'left')

In [111]:
row_ID(key_042)

(27382, 27382)

In [112]:
key_042['StartDate'] = pd.to_datetime(key_042['StartDate'])

#### PDL1 >=1%

In [113]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [114]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(key_042['PatientID'])]

In [115]:
biomarkers = pd.merge(biomarkers, key_042[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [116]:
row_ID(biomarkers)

(94603, 19875)

In [117]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [118]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [119]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [120]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [121]:
pdl1_value = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining != "0%" and PercentStaining != "< 1%" and PercentStaining.notnull()', engine = 'python')
    .sort_values(by = ['PatientID', 'PercentStaining'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'PercentStaining']]
)

In [122]:
pdl1_ids = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining != "0%" and PercentStaining != "< 1%" and PercentStaining.notnull()', engine = 'python')
    .PatientID
    .unique()
)

In [123]:
key_042 = key_042[key_042.PatientID.isin(pdl1_ids)]

In [124]:
row_ID(key_042)

(5671, 5671)

#### Time from treatment to death or censor

In [125]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [126]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [127]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [128]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [129]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [130]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [131]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [132]:
key_042 = pd.merge(key_042, mortality, on = 'PatientID', how = 'left')

In [133]:
row_ID(key_042)

(5671, 5671)

In [134]:
conditions = [
    (key_042['death_status'] == 1),
    (key_042['death_status'] == 0)]

choices = [
    (key_042['death_date'] - key_042['StartDate']).dt.days,
    (key_042['last_activity'] - key_042['StartDate']).dt.days]

key_042.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [135]:
key_042 = key_042.query('timerisk_treatment >= 0')

#### Patient count

In [136]:
key_042 = (
    key_042
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [137]:
row_ID(key_042)

(5548, 5548)

In [138]:
# Exclude those with autoimmune conditions. 
key_042 = key_042[~key_042['PatientID'].isin(auto_IDs)]

In [139]:
# Exclude those with other relevant comorbidities
key_042 = key_042[~key_042['PatientID'].isin(other_comorb_IDs)]

In [140]:
# Exlcude those with CNS metastasis 
key_042 = key_042[~key_042['PatientID'].isin(cns_fl_IDs)]

In [141]:
# Exclude those with ECOG 2, 3, or 4
key_042 = key_042[~key_042['PatientID'].isin(ecog_fl_IDs)]

In [142]:
# Exclude those with abnormal organ function
key_042 = key_042[~key_042['PatientID'].isin(ab_organ_IDs)]

In [143]:
row_ID(key_042)

(3727, 3727)

In [144]:
low_cutoff_042 = cutoff.loc['keynote_042'].low

In [145]:
high_cutoff_042 = cutoff.loc['keynote_042'].high

In [146]:
print('Pembro total:', key_042.query('pembro == 1').shape[0])
print('High risk:', key_042.query('pembro == 1').query('risk_score >= @high_cutoff_042').shape[0])
print('Med risk:', key_042.query('pembro == 1').query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0])
print('Low risk:', key_042.query('pembro == 1').query('risk_score <= @low_cutoff_042').shape[0])

Pembro total: 1818
High risk: 475
Med risk: 651
Low risk: 692


In [147]:
print('Platinum total:',  key_042.query('pembro == 0').shape[0])
print('High risk:', key_042.query('pembro == 0').query('risk_score >= @high_cutoff_042').shape[0])
print('Med risk:', key_042.query('pembro == 0').query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0])
print('Low risk:', key_042.query('pembro == 0').query('risk_score <= @low_cutoff_042').shape[0])

Platinum total: 1909
High risk: 450
Med risk: 653
Low risk: 806


#### Survival curves with covariate balancing 

In [148]:
row_ID(key_042)

(3727, 3727)

In [149]:
key_042 = pd.merge(key_042, pdl1_value, on = 'PatientID', how = 'left')

In [150]:
row_ID(key_042)

(3727, 3727)

In [151]:
conditions = [
    (key_042['PercentStaining'] == "1%") | 
    (key_042['PercentStaining'] == "2% - 4%") |
    (key_042['PercentStaining'] == "5% - 9%") |
    (key_042['PercentStaining'] == "10% - 19%") |
    (key_042['PercentStaining'] == "20% - 29%") |
    (key_042['PercentStaining'] == "30% - 39%") |
    (key_042['PercentStaining'] == "40% - 49%")
]

choices = ['lt50']

key_042['pdl1_det'] = np.select(conditions, choices, default = 'gte50')

In [152]:
key_042 = key_042.set_index('PatientID')

In [153]:
key_042_iptw = key_042.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1_det',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [154]:
key_042_iptw['met_cat'] = pd.cut(key_042_iptw['adv_year'],
                                 bins = [2010, 2016, float('inf')],
                                 labels = ['11-16', '17-21'])

In [155]:
conditions = [
    ((key_042_iptw['ecog_diagnosis'] == "1.0") | (key_042_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_042_iptw['ecog_diagnosis'] == "2.0") | (key_042_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_042_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [156]:
key_042_iptw.dtypes

death_status               bool
timerisk_treatment      float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1_det                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [157]:
to_be_categorical = list(key_042_iptw.select_dtypes(include = ['object']).columns)

In [158]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1_det',
 'ecog_2']

In [159]:
to_be_categorical.append('met_cat')

In [160]:
to_be_categorical.remove('ecog_diagnosis')

In [161]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_042_iptw[x] = key_042_iptw[x].astype('category')

In [162]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [163]:
# List of categorical features.
categorical_features = list(key_042_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [164]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [165]:
key_042_iptw_low = (
    key_042_iptw
    .query('risk_score <= @low_cutoff_042'))

key_042_iptw_med = (
    key_042_iptw
    .query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042'))

key_042_iptw_high = (
    key_042_iptw
    .query('risk_score >= @high_cutoff_042'))

key_042_iptw_all = key_042_iptw

In [166]:
key_042_low_x = preprocessor.fit_transform(key_042_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_042_med_x = preprocessor.fit_transform(key_042_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_042_high_x = preprocessor.fit_transform(key_042_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'delta_adv_diagnosis',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'pdl1_det', 
                                                                              'albumin_diag',
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_042_all_x = preprocessor.fit_transform(key_042_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det', 
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [167]:
lr_042_low = LogisticRegression(max_iter = 1000)
lr_042_low.fit(key_042_low_x, key_042_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [168]:
lr_042_med = LogisticRegression(max_iter = 1000)
lr_042_med.fit(key_042_med_x, key_042_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [169]:
lr_042_high = LogisticRegression(max_iter = 1000)
lr_042_high.fit(key_042_high_x, key_042_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [170]:
lr_042_all = LogisticRegression(max_iter = 1000)
lr_042_all.fit(key_042_all_x, key_042_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [171]:
pred_low = lr_042_low.predict_proba(key_042_low_x)
pred_med = lr_042_med.predict_proba(key_042_med_x)
pred_high = lr_042_high.predict_proba(key_042_high_x)
pred_all = lr_042_all.predict_proba(key_042_all_x)

In [172]:
key_042_iptw_low['ps'] = pred_low[:, 1]
key_042_iptw_med['ps'] = pred_med[:, 1]
key_042_iptw_high['ps'] = pred_high[:, 1]
key_042_iptw_all['ps'] = pred_all[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [173]:
key_042_iptw_low['weight'] = (
    np.where(key_042_iptw_low['pembro'] == 1, 1/key_042_iptw_low['ps'], 1/(1 - key_042_iptw_low['ps'])))

key_042_iptw_med['weight'] = (
    np.where(key_042_iptw_med['pembro'] == 1, 1/key_042_iptw_med['ps'], 1/(1 - key_042_iptw_med['ps'])))

key_042_iptw_high['weight'] = (
    np.where(key_042_iptw_high['pembro'] == 1, 1/key_042_iptw_high['ps'], 1/(1 - key_042_iptw_high['ps'])))

key_042_iptw_all['weight'] = (
    np.where(key_042_iptw_all['pembro'] == 1, 1/key_042_iptw_all['ps'], 1/(1 - key_042_iptw_all['ps'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [174]:
# Low KM curves
kmf_low_pembro_042_iptw = KaplanMeierFitter()
kmf_low_plat_042_iptw = KaplanMeierFitter()

kmf_low_pembro_042_iptw.fit(
    key_042_iptw_low.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_low.query('pembro == 1').death_status,
    weights = key_042_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_042_iptw.fit(
    key_042_iptw_low.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_low.query('pembro == 0').death_status,
    weights = key_042_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_042_iptw = KaplanMeierFitter()
kmf_med_plat_042_iptw = KaplanMeierFitter()

kmf_med_pembro_042_iptw.fit(
    key_042_iptw_med.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_med.query('pembro == 1').death_status,
    weights = key_042_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_042_iptw.fit(
    key_042_iptw_med.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_med.query('pembro == 0').death_status,
    weights = key_042_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_042_iptw = KaplanMeierFitter()
kmf_high_plat_042_iptw = KaplanMeierFitter()

kmf_high_pembro_042_iptw.fit(
    key_042_iptw_high.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_high.query('pembro == 1').death_status,
    weights = key_042_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_042_iptw.fit(
    key_042_iptw_high.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_high.query('pembro == 0').death_status,
    weights = key_042_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_042_iptw = KaplanMeierFitter()
kmf_all_plat_042_iptw = KaplanMeierFitter()

kmf_all_pembro_042_iptw.fit(
    key_042_iptw_all.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_all.query('pembro == 1').death_status,
    weights = key_042_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_042_iptw.fit(
    key_042_iptw_all.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_all.query('pembro == 0').death_status,
    weights = key_042_iptw_all.query('pembro == 0')['weight'])

  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 3897.08 total observations, 1623.18 right-censored observations>

#### Calculating survival metrics 

In [175]:
pembro_042_median_os = mos(kmf_low_pembro_042_iptw,
                           kmf_med_pembro_042_iptw,
                           kmf_high_pembro_042_iptw,
                           kmf_all_pembro_042_iptw)

plat_042_median_os = mos(kmf_low_plat_042_iptw,
                         kmf_med_plat_042_iptw,
                         kmf_high_plat_042_iptw,
                         kmf_all_plat_042_iptw)

In [176]:
key_042_iptw_all_imputed = key_042_iptw_all.copy()
key_042_iptw_all_imputed['albumin_diag'] = key_042_iptw_all_imputed['albumin_diag'].fillna(key_042_iptw_all_imputed['albumin_diag'].median())
key_042_iptw_all_imputed['weight_pct_change'] = key_042_iptw_all_imputed['weight_pct_change'].fillna(key_042_iptw_all_imputed['weight_pct_change'].median())

In [177]:
key042_hr_all = CoxPHFitter()
key042_hr_all.fit(key_042_iptw_all_imputed,
                  duration_col = 'timerisk_treatment',
                  event_col = 'death_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_det + albumin_diag + weight_pct_change', 
                  weights_col = 'weight', 
                  robust = True)

<lifelines.CoxPHFitter: fitted with 7604.64 total observations, 3402 right-censored observations>

In [178]:
key042_all_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_all,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [179]:
key042_low_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_low,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [180]:
key042_med_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_med,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [181]:
key042_high_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_high,
                                        1000,
                                        'pembro',
                                        'death',
                                        ['age',
                                         'gender',
                                         'race',
                                         'PracticeType',
                                         'Histology',
                                         'met_cat',
                                         'delta_adv_diagnosis',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ecog_2',
                                         'pdl1_det', 
                                         'albumin_diag',
                                         'weight_pct_change',
                                         'risk_score'],
                                        ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [182]:
keynote_042_data = [
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'low', 
     's_trt_mos': pembro_042_median_os[0],
     's_trt_mos_95': key042_low_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_042_median_os[0],
     's_cont_mos_95': key042_low_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_042_median_os[0] - plat_042_median_os[0], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     's_trt_rmst': restricted_mean_survival_time(kmf_low_pembro_042_iptw, 36),
     's_trt_rmst_95': key042_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_plat_042_iptw, 36),
     's_cont_rmst_95': key042_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_low_plat_042_iptw, 36),
     's_diff_rmst_95': key042_low_rmst_mos_95.difference_rmst_95,
     'scount': key_042.query('risk_score <= @low_cutoff_042').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'medium', 
     's_trt_mos': pembro_042_median_os[1],
     's_trt_mos_95': key042_med_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_042_median_os[1],
     's_cont_mos_95': key042_med_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_042_median_os[1] - plat_042_median_os[1], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     's_trt_rmst': restricted_mean_survival_time(kmf_med_pembro_042_iptw, 36),
     's_trt_rmst_95': key042_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_plat_042_iptw, 36),
     's_cont_rmst_95': key042_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_med_plat_042_iptw, 36),
     's_diff_rmst_95': key042_med_rmst_mos_95.difference_rmst_95,
     'scount': key_042.query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'high', 
     's_trt_mos': pembro_042_median_os[2],
     's_trt_mos_95': key042_high_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_042_median_os[2],
     's_cont_mos_95': key042_high_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_042_median_os[2] - plat_042_median_os[2], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     's_trt_rmst': restricted_mean_survival_time(kmf_high_pembro_042_iptw, 36),
     's_trt_rmst_95': key042_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_plat_042_iptw, 36),
     's_cont_rmst_95': key042_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_high_plat_042_iptw, 36),
     's_diff_rmst_95': key042_high_rmst_mos_95.difference_rmst_95,
     'scount': key_042.query('risk_score >= @high_cutoff_042').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'all', 
     's_hr': key042_hr_all.hazard_ratios_['pembro'],
     's_hr_95': [key042_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key042_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     's_trt_mos': pembro_042_median_os[3],
     's_trt_mos_95': key042_all_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_042_median_os[3],
     's_cont_mos_95': key042_all_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_042_median_os[3] - plat_042_median_os[3], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'scount': key_042.shape[0]}
]

### KEYNOTE-024: First-line pembrolizumab vs. platinum-based chemotherapy in those with high PDL1 

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line pemobrolizumab or platinum-based chemotherapy
* PDL1 >= 50% and status known within (-inf, +30] days of start of first-line treatment 
* EGFR and ALK negatve
* No autoimmune diseases in the year preceding metastatic diagnosis 
* No history of ILD, HIV, Hep C, Hep B, severe psychiatric history, or drug use disorder in the year preceding metastatic diagnosis 
* No CNS metastasis at start of treatment 
* ECOG is not 2, 3, or 4 at start of treatment 
* Adeuquate organ function at start of treatment 

#### Pembrolizumab

In [183]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [184]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [185]:
key024_pembro = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Pembrolizumab"')
    [['PatientID', 'StartDate']]
)

In [186]:
key024_pembro.loc[:, 'pembro'] = 1

In [187]:
row_ID(key024_pembro)

(3648, 3648)

In [188]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_024 = (
    line_therapy[line_therapy['PatientID'].isin(key024_pembro.PatientID)])

In [189]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [190]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_024_xcross = (
    line_therapy_pembro_024[line_therapy_pembro_024['LineName'].str.contains('|'.join(targeted))].PatientID)

In [191]:
# Select patients who don't receive targeted therapy in future lines.
key024_pembro = key024_pembro[~key024_pembro['PatientID'].isin(pembro_024_xcross)]

In [192]:
row_ID(key024_pembro)

(3582, 3582)

#### Platinum-based chemotherapy 

In [193]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [194]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [195]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [196]:
key024_plat = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [197]:
key024_plat.loc[:, 'pembro'] = 0

In [198]:
row_ID(key024_plat)

(25861, 25861)

In [199]:
# Dataframe of all therapies received for those receiving first line platinum regimen  
line_therapy_plat_024 = (
    line_therapy[line_therapy['PatientID'].isin(key024_plat.PatientID)])

In [200]:
# Patients receiving platinum therapy who later recieve targeted therapy. 
plat_024_xcross = (
    line_therapy_plat_024[line_therapy_plat_024['LineName'].str.contains('|'.join(targeted))].PatientID)

In [201]:
# Select patients who don't receive targeted therapy in future lines.
key024_plat = key024_plat[~key024_plat['PatientID'].isin(plat_024_xcross)]

In [202]:
row_ID(key024_plat)

(23800, 23800)

In [203]:
key_024 = pd.concat([key024_pembro, key024_plat])

In [204]:
row_ID(key_024)

(27382, 27382)

In [205]:
key_024 = pd.merge(key_024, df_full, on = 'PatientID', how = 'left')

In [206]:
row_ID(key_024)

(27382, 27382)

In [207]:
key_024['StartDate'] = pd.to_datetime(key_024['StartDate'])

#### High PDL1

In [208]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [209]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(key_024['PatientID'])]

In [210]:
biomarkers = pd.merge(biomarkers, key_024[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [211]:
row_ID(biomarkers)

(94603, 19875)

In [212]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [213]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [214]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [215]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [216]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [217]:
lst = ["50% - 59%", "60% - 69%", "70% - 79%", "80% - 89%", "90% - 99%", "100%"]

pdl1_ids = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining == @lst')
    .PatientID
    .unique()
)

In [218]:
key_024 = key_024[key_024.PatientID.isin(pdl1_ids)]

In [219]:
row_ID(key_024)

(3519, 3519)

#### Time from treatment to death/progression or censor 

In [220]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [221]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [222]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [223]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [224]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(68483, 68483)

In [225]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [226]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [227]:
row_ID(mortality)

(68483, 68483)

In [228]:
key_024 = pd.merge(key_024, mortality, on = 'PatientID', how = 'left')

In [229]:
row_ID(key_024)

(3519, 3519)

In [230]:
progression = pd.read_csv('Enhanced_AdvNSCLCProgression.csv')

In [231]:
progression = progression[progression.PatientID.isin(key_024.PatientID)][['PatientID', 'ProgressionDate']]

In [232]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [233]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [234]:
row_ID(progression)

(3517, 3517)

In [235]:
key_024 = pd.merge(key_024, progression, on = 'PatientID', how = 'left')

In [236]:
row_ID(key_024)

(3519, 3519)

In [237]:
# Percent without progression date in Flaura trial
len(key_024.query('ProgressionDate.isna()', engine = 'python'))/len(key_024)

0.51974992895709

In [238]:
conditions = [
    (key_024.ProgressionDate.notna()),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 1)),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 0))]

choices = [
    (key_024['ProgressionDate'] - key_024['StartDate']).dt.days,
    (key_024['death_date'] - key_024['StartDate']).dt.days,
    (key_024['last_activity'] - key_024['StartDate']).dt.days]

key_024.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [239]:
key_024 = key_024.query('time_prog_treatment >= 0')

In [240]:
row_ID(key_024)

(3209, 3209)

In [241]:
conditions = [
    (key_024.ProgressionDate.notna()),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 1)),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 0))]

choices = [1, 1, 0]

key_024.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient counts

In [242]:
key_024 = (
    key_024
    .query('stage == "IV"')
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [243]:
row_ID(key_024)

(2120, 2120)

In [244]:
# Exclude those with autoimmune conditions. 
key_024 = key_024[~key_024['PatientID'].isin(auto_IDs)]

In [245]:
# Exclude those with other relevant comorbidities
key_024 = key_024[~key_024['PatientID'].isin(other_comorb_IDs)]

In [246]:
# Exlcude those with CNS metastasis 
key_024 = key_024[~key_024['PatientID'].isin(cns_fl_IDs)]

In [247]:
# Include those with ECOG 2, 3, or 4 
key_024 = key_024[~key_024['PatientID'].isin(ecog_fl_IDs)]

In [248]:
# Exclude those with abnormal organ function
key_024 = key_024[~key_024['PatientID'].isin(ab_organ_IDs)]

In [249]:
row_ID(key_024)

(1330, 1330)

In [250]:
low_cutoff_024 = cutoff.loc['keynote_024'].low

In [251]:
high_cutoff_024 = cutoff.loc['keynote_024'].high

In [252]:
print('Pembro total:',  key_024.query('pembro == 1').shape[0])
print('High risk:', key_024.query('pembro == 1').query('risk_score >= @high_cutoff_024').shape[0])
print('Med risk:', key_024.query('pembro == 1').query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0])
print('Low risk:', key_024.query('pembro == 1').query('risk_score <= @low_cutoff_024').shape[0])

Pembro total: 1016
High risk: 230
Med risk: 372
Low risk: 414


In [253]:
print('Platinum total:',  key_024.query('pembro == 0').shape[0])
print('High risk:', key_024.query('pembro == 0').query('risk_score >= @high_cutoff_024').shape[0])
print('Med risk:', key_024.query('pembro == 0').query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0])
print('Low risk:', key_024.query('pembro == 0').query('risk_score <= @low_cutoff_024').shape[0])

Platinum total: 314
High risk: 89
Med risk: 106
Low risk: 119


#### PFS with covariate balancing 

In [254]:
key_024 = key_024.set_index('PatientID')

In [255]:
key_024_iptw = key_024.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [256]:
key_024_iptw['met_cat'] = pd.cut(key_024_iptw['adv_year'],
                                 bins = [2010, 2016, float('inf')],
                                 labels = ['11-16', '17-21'])

In [257]:
conditions = [
    ((key_024_iptw['ecog_diagnosis'] == "1.0") | (key_024_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_024_iptw['ecog_diagnosis'] == "2.0") | (key_024_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_024_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [258]:
key_024_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [259]:
to_be_categorical = list(key_024_iptw.select_dtypes(include = ['object']).columns)

In [260]:
to_be_categorical

['gender', 'race', 'PracticeType', 'Histology', 'ecog_diagnosis', 'ecog_2']

In [261]:
to_be_categorical.append('met_cat')

In [262]:
to_be_categorical.remove('ecog_diagnosis')

In [263]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_024_iptw[x] = key_024_iptw[x].astype('category')

In [264]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [265]:
# List of categorical features.
categorical_features = list(key_024_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [266]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [267]:
key_024_iptw_low = (
    key_024_iptw
    .query('risk_score <= @low_cutoff_024'))

key_024_iptw_med = (
    key_024_iptw
    .query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024'))

key_024_iptw_high = (
    key_024_iptw
    .query('risk_score >= @high_cutoff_024'))

key_024_iptw_all = key_024_iptw

In [268]:
key_024_low_x = preprocessor.fit_transform(key_024_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_024_med_x = preprocessor.fit_transform(key_024_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_024_high_x = preprocessor.fit_transform(key_024_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'albumin_diag',
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_024_all_x = preprocessor.fit_transform(key_024_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [269]:
lr_024_low = LogisticRegression(max_iter = 1000)
lr_024_low.fit(key_024_low_x, key_024_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [270]:
lr_024_med = LogisticRegression(max_iter = 1000)
lr_024_med.fit(key_024_med_x, key_024_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [271]:
lr_024_high = LogisticRegression(max_iter = 1000)
lr_024_high.fit(key_024_high_x, key_024_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [272]:
lr_024_all = LogisticRegression(max_iter = 1000)
lr_024_all.fit(key_024_all_x, key_024_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [273]:
pred_low = lr_024_low.predict_proba(key_024_low_x)
pred_med = lr_024_med.predict_proba(key_024_med_x)
pred_high = lr_024_high.predict_proba(key_024_high_x)
pred_all = lr_024_all.predict_proba(key_024_all_x)

In [274]:
key_024_iptw_low['ps'] = pred_low[:, 1]
key_024_iptw_med['ps'] = pred_med[:, 1]
key_024_iptw_high['ps'] = pred_high[:, 1]
key_024_iptw_all['ps'] = pred_all[:, 1]

In [275]:
key_024_iptw_low['weight'] = (
    np.where(key_024_iptw_low['pembro'] == 1, 1/key_024_iptw_low['ps'], 1/(1 - key_024_iptw_low['ps'])))

key_024_iptw_med['weight'] = (
    np.where(key_024_iptw_med['pembro'] == 1, 1/key_024_iptw_med['ps'], 1/(1 - key_024_iptw_med['ps'])))

key_024_iptw_high['weight'] = (
    np.where(key_024_iptw_high['pembro'] == 1, 1/key_024_iptw_high['ps'], 1/(1 - key_024_iptw_high['ps'])))

key_024_iptw_all['weight'] = (
    np.where(key_024_iptw_all['pembro'] == 1, 1/key_024_iptw_all['ps'], 1/(1 - key_024_iptw_all['ps'])))

In [276]:
# Low KM curves
kmf_low_pembro_024_iptw = KaplanMeierFitter()
kmf_low_plat_024_iptw = KaplanMeierFitter()

kmf_low_pembro_024_iptw.fit(
    key_024_iptw_low.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_low.query('pembro == 1').pfs_status,
    weights = key_024_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_024_iptw.fit(
    key_024_iptw_low.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_low.query('pembro == 0').pfs_status,
    weights = key_024_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_024_iptw = KaplanMeierFitter()
kmf_med_plat_024_iptw = KaplanMeierFitter()

kmf_med_pembro_024_iptw.fit(
    key_024_iptw_med.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_med.query('pembro == 1').pfs_status,
    weights = key_024_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_024_iptw.fit(
    key_024_iptw_med.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_med.query('pembro == 0').pfs_status,
    weights = key_024_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_024_iptw = KaplanMeierFitter()
kmf_high_plat_024_iptw = KaplanMeierFitter()

kmf_high_pembro_024_iptw.fit(
    key_024_iptw_high.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_high.query('pembro == 1').pfs_status,
    weights = key_024_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_024_iptw.fit(
    key_024_iptw_high.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_high.query('pembro == 0').pfs_status,
    weights = key_024_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_024_iptw = KaplanMeierFitter()
kmf_all_plat_024_iptw = KaplanMeierFitter()

kmf_all_pembro_024_iptw.fit(
    key_024_iptw_all.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_all.query('pembro == 1').pfs_status,
    weights = key_024_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_024_iptw.fit(
    key_024_iptw_all.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_all.query('pembro == 0').pfs_status,
    weights = key_024_iptw_all.query('pembro == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 1316.02 total observations, 313.729 right-censored observations>

In [277]:
pembro_024_median_os = mos(kmf_low_pembro_024_iptw,
                           kmf_med_pembro_024_iptw,
                           kmf_high_pembro_024_iptw,
                           kmf_all_pembro_024_iptw)

plat_024_median_os = mos(kmf_low_plat_024_iptw,
                         kmf_med_plat_024_iptw,
                         kmf_high_plat_024_iptw,
                         kmf_all_plat_024_iptw)

In [278]:
key_024_iptw_all_imputed = key_024_iptw_all.copy()
key_024_iptw_all_imputed['albumin_diag'] = key_024_iptw_all_imputed['albumin_diag'].fillna(key_024_iptw_all_imputed['albumin_diag'].median())
key_024_iptw_all_imputed['weight_pct_change'] = key_024_iptw_all_imputed['weight_pct_change'].fillna(key_024_iptw_all_imputed['weight_pct_change'].median())

In [279]:
key024_hr_all = CoxPHFitter()
key024_hr_all.fit(key_024_iptw_all_imputed,
                  duration_col = 'time_prog_treatment',
                  event_col = 'pfs_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + commercial + medicare + medicaid + ecog_2 + albumin_diag + weight_pct_change + risk_score', 
                  weights_col = 'weight', 
                  robust = True)

<lifelines.CoxPHFitter: fitted with 2646.05 total observations, 639.756 right-censored observations>

In [280]:
key024_all_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_all,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [281]:
key024_low_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_low,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [282]:
key024_med_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_med,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [283]:
key024_high_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_high,
                                        1000,
                                        'pembro',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'PracticeType',
                                         'Histology',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ecog_2',
                                         'albumin_diag',
                                         'weight_pct_change',
                                         'risk_score'],
                                        ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        18)

In [284]:
keynote_024_data = [
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'low', 
     's_trt_mos': pembro_024_median_os[0],
     's_trt_mos_95': key024_low_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_024_median_os[0],
     's_cont_mos_95': key024_low_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_024_median_os[0] - plat_024_median_os[0], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     's_trt_rmst': restricted_mean_survival_time(kmf_low_pembro_024_iptw, 18),
     's_trt_rmst_95': key024_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_plat_024_iptw, 18),
     's_cont_rmst_95': key024_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_low_plat_024_iptw, 18),
     's_diff_rmst_95': key024_low_rmst_mos_95.difference_rmst_95,
     'scount': key_024.query('risk_score <= @low_cutoff_024').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'medium', 
     's_trt_mos': pembro_024_median_os[1],
     's_trt_mos_95': key024_med_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_024_median_os[1],
     's_cont_mos_95': key024_med_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_024_median_os[1] - plat_024_median_os[1], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     's_trt_rmst': restricted_mean_survival_time(kmf_med_pembro_024_iptw, 18),
     's_trt_rmst_95': key024_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_plat_024_iptw, 18),
     's_cont_rmst_95': key024_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_med_plat_024_iptw, 18),
     's_diff_rmst_95': key024_med_rmst_mos_95.difference_rmst_95,
     'scount': key_024.query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'high', 
     's_trt_mos': pembro_024_median_os[2],
     's_trt_mos_95': key024_high_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_024_median_os[2],
     's_cont_mos_95': key024_high_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_024_median_os[2] - plat_024_median_os[2], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     's_trt_rmst': restricted_mean_survival_time(kmf_high_pembro_024_iptw, 18),
     's_trt_rmst_95': key024_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_plat_024_iptw, 18),
     's_cont_rmst_95': key024_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_high_plat_024_iptw, 18),
     's_diff_rmst_95': key024_high_rmst_mos_95.difference_rmst_95,
     'scount': key_024.query('risk_score >= @high_cutoff_024').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'all', 
     's_hr': key024_hr_all.hazard_ratios_['pembro'],
     's_hr_95': [key024_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key024_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     's_trt_mos': pembro_024_median_os[3],
     's_trt_mos_95': key024_all_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_024_median_os[3],
     's_cont_mos_95': key024_all_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_024_median_os[3] - plat_024_median_os[3], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'scount': key_024.shape[0]}
]

### KEYNOTE-189: First-line pembrolizumab plus chemotherapy vs. chemotherapy

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line pemobrolizumab plus platinum-based chemotherapy or platinum-based chemotherapy
* EGFR and ALK negative
* No autoimmune diseases in the year preceding metastatic diagnosis 
* No history of ILD, HIV, Hep C, Hep B, severe psychiatric history, or drug use disorder in the year preceding metastatic diagnosis 
* No CNS metastasis at start of treatment 
* ECOG is not 2, 3, or 4 at start of treatment 
* Adeuquate organ function at start of treatment 

#### Pembrolizumab + chemotherapy 

In [285]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [286]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [287]:
line_therapy[line_therapy['LineName'].str.contains('Pemetrexed')].LineName.value_counts().head(10)

Carboplatin,Pemetrexed                              6700
Carboplatin,Pembrolizumab,Pemetrexed                5113
Pemetrexed                                          4711
Bevacizumab,Carboplatin,Pemetrexed                  3459
Bevacizumab,Pemetrexed                              1569
Pembrolizumab,Pemetrexed                            1554
Cisplatin,Pemetrexed                                 811
Bevacizumab,Cisplatin,Pemetrexed                     174
Abiraterone,Carboplatin,Pembrolizumab,Pemetrexed     100
Bevacizumab-Awwb,Carboplatin,Pemetrexed               82
Name: LineName, dtype: int64

In [288]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [289]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno_wout_pembro = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab']

targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [290]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Pembrolizumab,Pemetrexed                     4275
Carboplatin,Paclitaxel,Pembrolizumab                      803
Carboplatin,Paclitaxel Protein-Bound,Pembrolizumab        534
Abiraterone,Carboplatin,Pembrolizumab,Pemetrexed           79
Carboplatin,Cyclophosphamide,Pembrolizumab,Pemetrexed      31
Cisplatin,Pembrolizumab,Pemetrexed                         27
Carboplatin,Pembrolizumab                                  19
Carboplatin,Docetaxel,Pembrolizumab                        18
Carboplatin,Paclitaxel,Pembrolizumab,Pemetrexed            18
Bevacizumab,Carboplatin,Pembrolizumab,Pemetrexed           10
Name: LineName, dtype: int64

In [291]:
key189_pembro = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [292]:
key189_pembro.loc[:, 'pembro'] = 1

In [293]:
row_ID(key189_pembro)

(5878, 5878)

In [294]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_189 = (
    line_therapy[line_therapy['PatientID'].isin(key189_pembro.PatientID)])

In [295]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_189_xcross = (
    line_therapy_pembro_189[line_therapy_pembro_189['LineName'].str.contains('|'.join(targeted))].PatientID)

In [296]:
# Select patients who don't receive targeted therapy in future lines.
key189_pembro = key189_pembro[~key189_pembro['PatientID'].isin(pembro_189_xcross)]

In [297]:
row_ID(key189_pembro)

(5673, 5673)

#### Platinum-based chemotherapy

In [298]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [299]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [300]:
key189_plat = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [301]:
key189_plat.loc[:, 'pembro'] = 0

In [302]:
row_ID(key189_plat)

(25861, 25861)

In [303]:
# Dataframe of all therapies received for those receiving first line platinum regimen 
line_therapy_plat_189 = (
    line_therapy[line_therapy['PatientID'].isin(key189_plat.PatientID)])

In [304]:
# Patients receiving platinum therapy who later recieve targeted therapy
plat_189_xcross = (
    line_therapy_plat_189[line_therapy_plat_189['LineName'].str.contains('|'.join(targeted))].PatientID)

In [305]:
# Select patients who don't receive targeted therapy
key189_plat = key189_plat[~key189_plat['PatientID'].isin(plat_189_xcross)]

In [306]:
row_ID(key189_plat)

(23800, 23800)

In [307]:
key_189 = pd.concat([key189_pembro, key189_plat])

In [308]:
row_ID(key_189)

(29473, 29473)

In [309]:
key_189 = pd.merge(key_189, df_full, on = 'PatientID', how = 'left')

In [310]:
row_ID(key_189)

(29473, 29473)

In [311]:
key_189['StartDate'] = pd.to_datetime(key_189['StartDate'])

#### Time from treatment to death or censor

In [312]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [313]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [314]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [315]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [316]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [317]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [318]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [319]:
key_189 = pd.merge(key_189, mortality, on = 'PatientID', how = 'left')

In [320]:
len(key_189)

29473

In [321]:
conditions = [
    (key_189['death_status'] == 1),
    (key_189['death_status'] == 0)]

choices = [
    (key_189['death_date'] - key_189['StartDate']).dt.days,
    (key_189['last_activity'] - key_189['StartDate']).dt.days]

key_189.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [322]:
key_189 = key_189.query('timerisk_treatment >= 0')

#### Patient count

In [323]:
key_189 = (
    key_189
    .query('Histology == "Non-squamous cell carcinoma"')
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [324]:
row_ID(key_189)

(19124, 19124)

In [325]:
# Exclude those with autoimmune conditions. 
key_189 = key_189[~key_189['PatientID'].isin(auto_IDs)]

In [326]:
# Exclude those with other relevant comorbidities
key_189 = key_189[~key_189['PatientID'].isin(other_comorb_IDs)]

In [327]:
# Exlcude those with CNS metastasis 
key_189 = key_189[~key_189['PatientID'].isin(cns_fl_IDs)]

In [328]:
# Exclude those with ECOG 2, 3, or 4
key_189 = key_189[~key_189['PatientID'].isin(ecog_fl_IDs)]

In [329]:
# Exclude those with abnormal organ function
key_189 = key_189[~key_189['PatientID'].isin(ab_organ_IDs)]

In [330]:
row_ID(key_189)

(13761, 13761)

In [331]:
low_cutoff_189 = cutoff.loc['keynote_189'].low

In [332]:
high_cutoff_189 = cutoff.loc['keynote_189'].high

In [333]:
print('Pembro + chemo total:',  key_189.query('pembro == 1').shape[0])
print('High risk:', key_189.query('pembro == 1').query('risk_score >= @high_cutoff_189').shape[0])
print('Med risk:', key_189.query('pembro == 1').query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0])
print('Low risk:', key_189.query('pembro == 1').query('risk_score <= @low_cutoff_189').shape[0])

Pembro + chemo total: 2794
High risk: 749
Med risk: 964
Low risk: 1081


In [334]:
print('Platinum total:',  key_189.query('pembro == 0').shape[0])
print('High risk:', key_189.query('pembro == 0').query('risk_score >= @high_cutoff_189').shape[0])
print('Med risk:', key_189.query('pembro == 0').query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0])
print('Low risk:', key_189.query('pembro == 0').query('risk_score <= @low_cutoff_189').shape[0])

Platinum total: 10967
High risk: 2694
Med risk: 3637
Low risk: 4636


#### Survival curves with covariate balancing 

In [335]:
key_189 = key_189.set_index('PatientID')

In [336]:
key_189_iptw = key_189.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [337]:
key_189_iptw['met_cat'] = pd.cut(key_189_iptw['adv_year'],
                                 bins = [2010, 2018, float('inf')],
                                 labels = ['11-18', '19-22'])

In [338]:
conditions = [
    ((key_189_iptw['pdl1'] == "1-49%") | (key_189_iptw['pdl1'] == "50-100%"))]

choices = ['>0%']

key_189_iptw['pdl1_cat'] = np.select(conditions, choices, default = key_189_iptw['pdl1'])

In [339]:
conditions = [
    ((key_189_iptw['ecog_diagnosis'] == "1.0") | (key_189_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_189_iptw['ecog_diagnosis'] == "2.0") | (key_189_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_189_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [340]:
key_189_iptw.dtypes

death_status               bool
timerisk_treatment      float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1                     object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
pdl1_cat                 object
ecog_2                   object
dtype: object

In [341]:
to_be_categorical = list(key_189_iptw.select_dtypes(include = ['object']).columns)

In [342]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1',
 'pdl1_cat',
 'ecog_2']

In [343]:
to_be_categorical.append('met_cat')

In [344]:
to_be_categorical.remove('pdl1')

In [345]:
to_be_categorical.remove('ecog_diagnosis')

In [346]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_189_iptw[x] = key_189_iptw[x].astype('category')

In [347]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [348]:
# List of categorical features.
categorical_features = list(key_189_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [349]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [350]:
key_189_iptw_low = (
    key_189_iptw
    .query('risk_score <= @low_cutoff_189'))

key_189_iptw_med = (
    key_189_iptw
    .query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189'))

key_189_iptw_high = (
    key_189_iptw
    .query('risk_score >= @high_cutoff_189'))

key_189_iptw_all = key_189_iptw

In [351]:
key_189_low_x = preprocessor.fit_transform(key_189_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

key_189_med_x = preprocessor.fit_transform(key_189_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

key_189_high_x = preprocessor.fit_transform(key_189_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'delta_adv_diagnosis',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'pdl1_cat', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_189_all_x = preprocessor.fit_transform(key_189_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [352]:
lr_189_low = LogisticRegression(max_iter = 1000)
lr_189_low.fit(key_189_low_x, key_189_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [353]:
lr_189_med = LogisticRegression(max_iter = 1000)
lr_189_med.fit(key_189_med_x, key_189_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [354]:
lr_189_high = LogisticRegression(max_iter = 1000)
lr_189_high.fit(key_189_high_x, key_189_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [355]:
lr_189_all = LogisticRegression(max_iter = 1000)
lr_189_all.fit(key_189_all_x, key_189_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [356]:
pred_low = lr_189_low.predict_proba(key_189_low_x)
pred_med = lr_189_med.predict_proba(key_189_med_x)
pred_high = lr_189_high.predict_proba(key_189_high_x)
pred_all = lr_189_all.predict_proba(key_189_all_x)

In [357]:
key_189_iptw_low['ps'] = pred_low[:, 1]
key_189_iptw_med['ps'] = pred_med[:, 1]
key_189_iptw_high['ps'] = pred_high[:, 1]
key_189_iptw_all['ps'] = pred_all[:, 1]

In [358]:
key_189_iptw_low['weight'] = (
    np.where(key_189_iptw_low['pembro'] == 1, 1/key_189_iptw_low['ps'], 1/(1 - key_189_iptw_low['ps'])))

key_189_iptw_med['weight'] = (
    np.where(key_189_iptw_med['pembro'] == 1, 1/key_189_iptw_med['ps'], 1/(1 - key_189_iptw_med['ps'])))

key_189_iptw_high['weight'] = (
    np.where(key_189_iptw_high['pembro'] == 1, 1/key_189_iptw_high['ps'], 1/(1 - key_189_iptw_high['ps'])))

key_189_iptw_all['weight'] = (
    np.where(key_189_iptw_all['pembro'] == 1, 1/key_189_iptw_all['ps'], 1/(1 - key_189_iptw_all['ps'])))

In [359]:
# Low KM curves
kmf_low_pembro_189_iptw = KaplanMeierFitter()
kmf_low_plat_189_iptw = KaplanMeierFitter()

kmf_low_pembro_189_iptw.fit(
    key_189_iptw_low.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_low.query('pembro == 1').death_status,
    weights = key_189_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_189_iptw.fit(
    key_189_iptw_low.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_low.query('pembro == 0').death_status,
    weights = key_189_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_189_iptw = KaplanMeierFitter()
kmf_med_plat_189_iptw = KaplanMeierFitter()

kmf_med_pembro_189_iptw.fit(
    key_189_iptw_med.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_med.query('pembro == 1').death_status,
    weights = key_189_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_189_iptw.fit(
    key_189_iptw_med.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_med.query('pembro == 0').death_status,
    weights = key_189_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_189_iptw = KaplanMeierFitter()
kmf_high_plat_189_iptw = KaplanMeierFitter()

kmf_high_pembro_189_iptw.fit(
    key_189_iptw_high.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_high.query('pembro == 1').death_status,
    weights = key_189_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_189_iptw.fit(
    key_189_iptw_high.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_high.query('pembro == 0').death_status,
    weights = key_189_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_189_iptw = KaplanMeierFitter()
kmf_all_plat_189_iptw = KaplanMeierFitter()

kmf_all_pembro_189_iptw.fit(
    key_189_iptw_all.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_all.query('pembro == 1').death_status,
    weights = key_189_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_189_iptw.fit(
    key_189_iptw_all.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_all.query('pembro == 0').death_status,
    weights = key_189_iptw_all.query('pembro == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 14128.8 total observations, 4227.21 right-censored observations>

In [360]:
pembro_189_median_os = mos(kmf_low_pembro_189_iptw,
                           kmf_med_pembro_189_iptw,
                           kmf_high_pembro_189_iptw,
                           kmf_all_pembro_189_iptw)

plat_189_median_os = mos(kmf_low_plat_189_iptw,
                         kmf_med_plat_189_iptw,
                         kmf_high_plat_189_iptw,
                         kmf_all_plat_189_iptw)

In [361]:
key_189_iptw_all_imputed = key_189_iptw_all.copy()
key_189_iptw_all_imputed['albumin_diag'] = key_189_iptw_all_imputed['albumin_diag'].fillna(key_189_iptw_all_imputed['albumin_diag'].median())
key_189_iptw_all_imputed['weight_pct_change'] = key_189_iptw_all_imputed['weight_pct_change'].fillna(key_189_iptw_all_imputed['weight_pct_change'].median())

In [362]:
key189_hr_all = CoxPHFitter()
key189_hr_all.fit(key_189_iptw_all_imputed,
                  duration_col = 'timerisk_treatment',
                  event_col = 'death_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_cat + albumin_diag + weight_pct_change + risk_score',
                  weights_col = 'weight',
                  robust = True)

<lifelines.CoxPHFitter: fitted with 26750.9 total observations, 9453.38 right-censored observations>

In [363]:
key189_all_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_all,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [364]:
key189_low_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_low,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [365]:
key189_med_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_med,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [366]:
key189_high_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_high,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [367]:
keynote_189_data = [
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'low', 
     's_trt_mos': pembro_189_median_os[0],
     's_trt_mos_95': key189_low_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_189_median_os[0],
     's_cont_mos_95': key189_low_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_189_median_os[0] - plat_189_median_os[0], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     's_trt_rmst': restricted_mean_survival_time(kmf_low_pembro_189_iptw, 36),
     's_trt_rmst_95': key189_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_plat_189_iptw, 36),
     's_cont_rmst_95': key189_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_low_plat_189_iptw, 36),
     's_diff_rmst_95': key189_low_rmst_mos_95.difference_rmst_95,
     'scount': key_189.query('risk_score <= @low_cutoff_189').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'medium', 
     's_trt_mos': pembro_189_median_os[1],
     's_trt_mos_95': key189_med_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_189_median_os[1],
     's_cont_mos_95': key189_med_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_189_median_os[1] - plat_189_median_os[1], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     's_trt_rmst': restricted_mean_survival_time(kmf_med_pembro_189_iptw, 36),
     's_trt_rmst_95': key189_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_plat_189_iptw, 36),
     's_cont_rmst_95': key189_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_med_plat_189_iptw, 36),
     's_diff_rmst_95': key189_med_rmst_mos_95.difference_rmst_95,
     'scount': key_189.query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'high', 
     's_trt_mos': pembro_189_median_os[2],
     's_trt_mos_95': key189_high_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_189_median_os[2],
     's_cont_mos_95': key189_high_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_189_median_os[2] - plat_189_median_os[2], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     's_trt_rmst': restricted_mean_survival_time(kmf_high_pembro_189_iptw, 36),
     's_trt_rmst_95': key189_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_plat_189_iptw, 36),
     's_cont_rmst_95': key189_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_high_plat_189_iptw, 36),
     's_diff_rmst_95': key189_high_rmst_mos_95.difference_rmst_95,
     'scount': key_189.query('risk_score >= @high_cutoff_189').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'all', 
     's_hr': key189_hr_all.hazard_ratios_['pembro'],
     's_hr_95': [key189_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key189_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     's_trt_mos': pembro_189_median_os[3],
     's_trt_mos_95': key189_all_rmst_mos_95.mos_A_95,
     's_cont_mos': plat_189_median_os[3],
     's_cont_mos_95': key189_all_rmst_mos_95.mos_B_95,
     's_mos_diff': pembro_189_median_os[3] - plat_189_median_os[3], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'scount': key_189.shape[0]}
]

### CHECKMATE-078: Second-line nivolumab vs. docetaxel

**INCLUSION CRITERIA**
* Advanced or metastatic NSCLC 
* Progressed on first line platinum-based chemotherapy 
* Received second line nivolumab or docetaxel
* Prior treatments with docetaxel or immunotherapy contraindicated
* EGFR and ALK negative 
* No autoimmune diseases in the year preceding metastatic diagnosis 
* No history of ILD, HIV, Hep C, Hep B, severe psychiatric history, or drug use disorder in the year preceding metastatic diagnosis 
* No CNS metastasis at start of treatment 
* ECOG is not 2, 3, or 4 at start of treatment 

#### Nivolumab 

In [368]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [369]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [370]:
line_therapy_fl = (
    line_therapy[line_therapy.PatientID.isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [371]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [372]:
immunotherapy = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [373]:
fl_plat = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('Carboplatin|Cisplatin')
     & ~line_therapy_fl['LineName'].str.contains('Docetaxel')
     & ~line_therapy_fl['LineName'].str.contains('|'.join(targeted))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(immunotherapy))]
    .PatientID
)

In [374]:
checkmate_nivo = (
    line_therapy[line_therapy.PatientID.isin(fl_plat)]
    .query('LineNumber == 2')
    .query('LineName == "Nivolumab"')
    [['PatientID', 'StartDate']]
)

In [375]:
checkmate_nivo.loc[:, 'nivo'] = 1

In [376]:
row_ID(checkmate_nivo)

(3397, 3397)

#### Docetaxel

In [377]:
checkmate_dotx = (
    line_therapy[line_therapy.PatientID.isin(fl_plat)]
    .query('LineNumber == 2')
    .query('LineName == "Docetaxel"')
    [['PatientID', 'StartDate']]
)

In [378]:
checkmate_dotx.loc[:, 'nivo'] = 0

In [379]:
row_ID(checkmate_dotx)

(745, 745)

In [380]:
checkmate = pd.concat([checkmate_nivo, checkmate_dotx])

In [381]:
row_ID(checkmate)

(4142, 4142)

In [382]:
checkmate = pd.merge(checkmate, df_full, on = 'PatientID', how = 'left')

In [383]:
row_ID(checkmate)

(4142, 4142)

In [384]:
checkmate['StartDate'] = pd.to_datetime(checkmate['StartDate'])

#### Time from treatment to death or censor 

In [385]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [386]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [387]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [388]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [389]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [390]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [391]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [392]:
checkmate = pd.merge(checkmate, mortality, on = 'PatientID', how = 'left')

In [393]:
row_ID(checkmate)

(4142, 4142)

In [394]:
conditions = [
    (checkmate['death_status'] == 1),
    (checkmate['death_status'] == 0)]

choices = [
    (checkmate['death_date'] - checkmate['StartDate']).dt.days,
    (checkmate['last_activity'] - checkmate['StartDate']).dt.days]

checkmate.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [395]:
checkmate = checkmate.query('timerisk_treatment >= 0')

#### Patient count 

In [396]:
checkmate = (
    checkmate
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [397]:
row_ID(checkmate)

(4073, 4073)

In [398]:
# Exclude those with autoimmune conditions. 
checkmate = checkmate[~checkmate['PatientID'].isin(auto_IDs)]

In [399]:
# Exclude those with other relevant comorbidities
checkmate = checkmate[~checkmate['PatientID'].isin(other_comorb_IDs)]

In [400]:
# Exlcude those with CNS metastasis 
checkmate = checkmate[~checkmate['PatientID'].isin(cns_sec_IDs)]

In [401]:
# Include only those with ECOG 2, 3, or 4 
checkmate = checkmate[~checkmate['PatientID'].isin(ecog_sec_IDs)]

In [402]:
row_ID(checkmate)

(3045, 3045)

In [403]:
low_cutoff_078 = cutoff.loc['checkmate_078'].low

In [404]:
high_cutoff_078 = cutoff.loc['checkmate_078'].high

In [405]:
print('Nivolumab total:',  checkmate.query('nivo == 1').shape[0])
print('High risk:', checkmate.query('nivo == 1').query('risk_score >= @high_cutoff_078').shape[0])
print('Med risk:', checkmate.query('nivo == 1').query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0])
print('Low risk:', checkmate.query('nivo == 1').query('risk_score <= @low_cutoff_078').shape[0])

Nivolumab total: 2460
High risk: 703
Med risk: 867
Low risk: 890


In [406]:
print('Docetaxel total:',  checkmate.query('nivo == 0').shape[0])
print('High risk:', checkmate.query('nivo == 0').query('risk_score >= @high_cutoff_078').shape[0])
print('Med risk:', checkmate.query('nivo == 0').query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0])
print('Low risk:', checkmate.query('nivo == 0').query('risk_score <= @low_cutoff_078').shape[0])

Docetaxel total: 585
High risk: 174
Med risk: 202
Low risk: 209


#### Survival curves with covariate balancing 

In [407]:
checkmate = checkmate.set_index('PatientID')

In [408]:
check_iptw = checkmate.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'nivo',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [409]:
check_iptw['met_cat'] = pd.cut(check_iptw['adv_year'],
                               bins = [2010, 2015, float('inf')],
                               labels = ['11-15', '16-20'])

In [410]:
conditions = [
    ((check_iptw['pdl1'] == "1-49%") | (check_iptw['pdl1'] == "50-100%"))]

choices = ['>0%']

check_iptw['pdl1_cat'] = np.select(conditions, choices, default = check_iptw['pdl1'])

In [411]:
conditions = [
    ((check_iptw['ecog_diagnosis'] == "1.0") | (check_iptw['ecog_diagnosis'] == "0.0")),  
    ((check_iptw['ecog_diagnosis'] == "2.0") | (check_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

check_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [412]:
check_iptw.dtypes

death_status               bool
timerisk_treatment      float64
nivo                      int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1                     object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
pdl1_cat                 object
ecog_2                   object
dtype: object

In [413]:
to_be_categorical = list(check_iptw.select_dtypes(include = ['object']).columns)

In [414]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1',
 'pdl1_cat',
 'ecog_2']

In [415]:
to_be_categorical.append('met_cat')

In [416]:
to_be_categorical.remove('pdl1')

In [417]:
to_be_categorical.remove('ecog_diagnosis')

In [418]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    check_iptw[x] = check_iptw[x].astype('category')

In [419]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [420]:
# List of categorical features.
categorical_features = list(check_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [421]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [422]:
check_iptw_low = (
    check_iptw
    .query('risk_score <= @low_cutoff_078'))

check_iptw_med = (
    check_iptw
    .query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078'))

check_iptw_high = (
    check_iptw
    .query('risk_score >= @high_cutoff_078'))

check_iptw_all = check_iptw

In [423]:
check_low_x = preprocessor.fit_transform(check_iptw_low.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

check_med_x = preprocessor.fit_transform(check_iptw_med.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

check_high_x = preprocessor.fit_transform(check_iptw_high.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'Histology',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2',
                                                                          'pdl1_cat',
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

check_all_x = preprocessor.fit_transform(check_iptw_all.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

In [424]:
lr_check_low = LogisticRegression(max_iter = 1000)
lr_check_low.fit(check_low_x, check_iptw_low['nivo'])

LogisticRegression(max_iter=1000)

In [425]:
lr_check_med = LogisticRegression(max_iter = 1000)
lr_check_med.fit(check_med_x, check_iptw_med['nivo'])

LogisticRegression(max_iter=1000)

In [426]:
lr_check_high = LogisticRegression(max_iter = 1000)
lr_check_high.fit(check_high_x, check_iptw_high['nivo'])

LogisticRegression(max_iter=1000)

In [427]:
lr_check_all = LogisticRegression(max_iter = 1000)
lr_check_all.fit(check_all_x, check_iptw_all['nivo'])

LogisticRegression(max_iter=1000)

In [428]:
pred_low = lr_check_low.predict_proba(check_low_x)
pred_med = lr_check_med.predict_proba(check_med_x)
pred_high = lr_check_high.predict_proba(check_high_x)
pred_all = lr_check_all.predict_proba(check_all_x)

In [429]:
check_iptw_low['ps'] = pred_low[:, 1]
check_iptw_med['ps'] = pred_med[:, 1]
check_iptw_high['ps'] = pred_high[:, 1]
check_iptw_all['ps'] = pred_all[:, 1]

In [430]:
check_iptw_low['weight'] = (
    np.where(check_iptw_low['nivo'] == 1, 1/check_iptw_low['ps'], 1/(1 - check_iptw_low['ps'])))

check_iptw_med['weight'] = (
    np.where(check_iptw_med['nivo'] == 1, 1/check_iptw_med['ps'], 1/(1 - check_iptw_med['ps'])))

check_iptw_high['weight'] = (
    np.where(check_iptw_high['nivo'] == 1, 1/check_iptw_high['ps'], 1/(1 - check_iptw_high['ps'])))

check_iptw_all['weight'] = (
    np.where(check_iptw_all['nivo'] == 1, 1/check_iptw_all['ps'], 1/(1 - check_iptw_all['ps'])))

In [431]:
# Low KM curves
kmf_low_nivo_check_iptw = KaplanMeierFitter()
kmf_low_dotx_check_iptw = KaplanMeierFitter()

kmf_low_nivo_check_iptw.fit(
    check_iptw_low.query('nivo == 1').timerisk_treatment/30,
    check_iptw_low.query('nivo == 1').death_status,
    weights = check_iptw_low.query('nivo == 1')['weight'])

kmf_low_dotx_check_iptw.fit(
    check_iptw_low.query('nivo == 0').timerisk_treatment/30,
    check_iptw_low.query('nivo == 0').death_status,
    weights = check_iptw_low.query('nivo == 0')['weight'])

# Med KM curves
kmf_med_nivo_check_iptw = KaplanMeierFitter()
kmf_med_dotx_check_iptw = KaplanMeierFitter()

kmf_med_nivo_check_iptw.fit(
    check_iptw_med.query('nivo == 1').timerisk_treatment/30,
    check_iptw_med.query('nivo == 1').death_status,
    weights = check_iptw_med.query('nivo == 1')['weight'])

kmf_med_dotx_check_iptw.fit(
    check_iptw_med.query('nivo == 0').timerisk_treatment/30,
    check_iptw_med.query('nivo == 0').death_status,
    weights = check_iptw_med.query('nivo == 0')['weight'])

# High KM curves 
kmf_high_nivo_check_iptw = KaplanMeierFitter()
kmf_high_dotx_check_iptw = KaplanMeierFitter()

kmf_high_nivo_check_iptw.fit(
    check_iptw_high.query('nivo == 1').timerisk_treatment/30,
    check_iptw_high.query('nivo == 1').death_status,
    weights = check_iptw_high.query('nivo == 1')['weight'])

kmf_high_dotx_check_iptw.fit(
    check_iptw_high.query('nivo == 0').timerisk_treatment/30,
    check_iptw_high.query('nivo == 0').death_status,
    weights = check_iptw_high.query('nivo == 0')['weight'])

# All KM curves 
kmf_all_nivo_check_iptw = KaplanMeierFitter()
kmf_all_dotx_check_iptw = KaplanMeierFitter()

kmf_all_nivo_check_iptw.fit(
    check_iptw_all.query('nivo == 1').timerisk_treatment/30,
    check_iptw_all.query('nivo == 1').death_status,
    weights = check_iptw_all.query('nivo == 1')['weight'])

kmf_all_dotx_check_iptw.fit(
    check_iptw_all.query('nivo == 0').timerisk_treatment/30,
    check_iptw_all.query('nivo == 0').death_status,
    weights = check_iptw_all.query('nivo == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 3367.23 total observations, 698.864 right-censored observations>

#### Calculating survival metrics 

In [432]:
nivo_check_median_os = mos(kmf_low_nivo_check_iptw, 
                           kmf_med_nivo_check_iptw,
                           kmf_high_nivo_check_iptw,
                           kmf_all_nivo_check_iptw)

dotx_check_median_os = mos(kmf_low_dotx_check_iptw,
                           kmf_med_dotx_check_iptw,
                           kmf_high_dotx_check_iptw,
                           kmf_all_dotx_check_iptw)

In [433]:
check_iptw_all_imputed = check_iptw_all.copy()
check_iptw_all_imputed['albumin_diag'] = check_iptw_all_imputed['albumin_diag'].fillna(check_iptw_all_imputed['albumin_diag'].median())
check_iptw_all_imputed['weight_pct_change'] = check_iptw_all_imputed['weight_pct_change'].fillna(check_iptw_all_imputed['weight_pct_change'].median())

In [434]:
check_hr_all = CoxPHFitter()
check_hr_all.fit(check_iptw_all_imputed,
                 duration_col = 'timerisk_treatment',
                 event_col = 'death_status',
                 formula = 'nivo + age + gender + race + PracticeType + Histology + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_cat + albumin_diag + weight_pct_change + risk_score',
                 weights_col = 'weight',
                 robust = True)

<lifelines.CoxPHFitter: fitted with 6401.28 total observations, 1319.56 right-censored observations>

In [435]:
check_all_rmst_mos_95 = rmst_mos_95ci(check_iptw_all,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [436]:
check_low_rmst_mos_95 = rmst_mos_95ci(check_iptw_low,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [437]:
check_med_rmst_mos_95 = rmst_mos_95ci(check_iptw_med,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [438]:
check_high_rmst_mos_95 = rmst_mos_95ci(check_iptw_high,
                                       1000,
                                       'nivo',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [439]:
check_data = [
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'low', 
     's_trt_mos': nivo_check_median_os[0],
     's_trt_mos_95': check_low_rmst_mos_95.mos_A_95,
     's_cont_mos': dotx_check_median_os[0],
     's_cont_mos_95': check_low_rmst_mos_95.mos_B_95,
     's_mos_diff': nivo_check_median_os[0] - dotx_check_median_os[0], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     's_trt_rmst': restricted_mean_survival_time(kmf_low_nivo_check_iptw, 36),
     's_trt_rmst_95': check_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_dotx_check_iptw, 36),
     's_cont_rmst_95': check_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_low_dotx_check_iptw, 36),
     's_diff_rmst_95': check_low_rmst_mos_95.difference_rmst_95,
     'scount': checkmate.query('risk_score <= @low_cutoff_078').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'medium', 
     's_trt_mos': nivo_check_median_os[1],
     's_trt_mos_95': check_med_rmst_mos_95.mos_A_95,
     's_cont_mos': dotx_check_median_os[1],
     's_cont_mos_95': check_med_rmst_mos_95.mos_B_95,
     's_mos_diff': nivo_check_median_os[1] - dotx_check_median_os[1], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     's_trt_rmst': restricted_mean_survival_time(kmf_med_nivo_check_iptw, 36),
     's_trt_rmst_95': check_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_dotx_check_iptw, 36),
     's_cont_rmst_95': check_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_med_dotx_check_iptw, 36),
     's_diff_rmst_95': check_med_rmst_mos_95.difference_rmst_95,
     'scount': checkmate.query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'high', 
     's_trt_mos': nivo_check_median_os[2],
     's_trt_mos_95': check_high_rmst_mos_95.mos_A_95,
     's_cont_mos': dotx_check_median_os[2],
     's_cont_mos_95': check_high_rmst_mos_95.mos_B_95,
     's_mos_diff': nivo_check_median_os[2] - dotx_check_median_os[2], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     's_trt_rmst': restricted_mean_survival_time(kmf_high_nivo_check_iptw, 36),
     's_trt_rmst_95': check_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_dotx_check_iptw, 36),
     's_cont_rmst_95': check_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_high_dotx_check_iptw, 36),
     's_diff_rmst_95': check_high_rmst_mos_95.difference_rmst_95,
     'scount': checkmate.query('risk_score >= @high_cutoff_078').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'all', 
     's_hr': check_hr_all.hazard_ratios_['nivo'],
     's_hr_95': [check_hr_all.summary.loc['nivo']['exp(coef) lower 95%'], check_hr_all.summary.loc['nivo']['exp(coef) upper 95%']],
     's_trt_mos': nivo_check_median_os[3],
     's_trt_mos_95': check_all_rmst_mos_95.mos_A_95,
     's_cont_mos': dotx_check_median_os[3],
     's_cont_mos_95': check_all_rmst_mos_95.mos_B_95,
     's_mos_diff': nivo_check_median_os[3] - dotx_check_median_os[3], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'scount': checkmate.shape[0]}
]

## FLAURA: osimertinib vs. gefitinib or erlotinib

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line osimertinib or gefitinib or erlotinib
* No autoimmune diseases in the year preceding metastatic diagnosis 
* No history of ILD, HIV, Hep C, Hep B, severe psychiatric history, or drug use disorder in the year preceding metastatic diagnosis 
* No CNS metastasis at start of treatment 
* ECOG is not 2, 3, or 4 at start of treatment 
* Adeuquate organ function at start of treatment 

#### Osimertinib

In [440]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [441]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [442]:
flaura_osim = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Osimertinib"')
    [['PatientID', 'StartDate']]
)

In [443]:
flaura_osim.loc[:, 'osim'] = 1

In [444]:
row_ID(flaura_osim)

(1241, 1241)

#### Gefitinib or Erlotinib

In [445]:
flaura_gefer = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Gefitinib" or LineName == "Erlotinib"')
    [['PatientID', 'StartDate']]
)

In [446]:
flaura_gefer.loc[:, 'osim'] = 0

In [447]:
row_ID(flaura_gefer)

(3003, 3003)

In [448]:
flaura = pd.concat([flaura_osim, flaura_gefer])

In [449]:
row_ID(flaura)

(4244, 4244)

In [450]:
flaura = pd.merge(flaura, df_full, on = 'PatientID', how = 'left')

In [451]:
row_ID(flaura)

(4244, 4244)

In [452]:
flaura['StartDate'] = pd.to_datetime(flaura['StartDate'])

#### Time from treatment to death/progression or censor 

In [453]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [454]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [455]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [456]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [457]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(68483, 68483)

In [458]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [459]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [460]:
row_ID(mortality)

(68483, 68483)

In [461]:
flaura = pd.merge(flaura, mortality, on = 'PatientID', how = 'left')

In [462]:
row_ID(flaura)

(4244, 4244)

In [463]:
progression = pd.read_csv('Enhanced_AdvNSCLCProgression.csv')

In [464]:
progression = progression[progression.PatientID.isin(flaura.PatientID)][['PatientID', 'ProgressionDate']]

In [465]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [466]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [467]:
row_ID(progression)

(4243, 4243)

In [468]:
flaura = pd.merge(flaura, progression, on = 'PatientID', how = 'left')

In [469]:
row_ID(flaura)

(4244, 4244)

In [470]:
# Percent without progression date in Flaura trial
len(flaura.query('ProgressionDate.isna()', engine = 'python'))/len(flaura)

0.3437794533459001

In [471]:
conditions = [
    (flaura.ProgressionDate.notna()),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 1)),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 0))]

choices = [
    (flaura['ProgressionDate'] - flaura['StartDate']).dt.days,
    (flaura['death_date'] - flaura['StartDate']).dt.days,
    (flaura['last_activity'] - flaura['StartDate']).dt.days]

flaura.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [472]:
flaura = flaura.query('time_prog_treatment >= 0')

In [473]:
row_ID(flaura)

(3622, 3622)

In [474]:
conditions = [
    (flaura.ProgressionDate.notna()),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 1)),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 0))]

choices = [1, 1, 0]

flaura.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient count

In [475]:
row_ID(flaura)

(3622, 3622)

In [476]:
# Exclude those with autoimmune conditions. 
flaura = flaura[~flaura['PatientID'].isin(auto_IDs)]

In [477]:
# Exclude those with other relevant comorbidities
flaura = flaura[~flaura['PatientID'].isin(other_comorb_IDs)]

In [478]:
# Exlcude those with CNS metastasis 
flaura = flaura[~flaura['PatientID'].isin(cns_fl_IDs)]

In [479]:
# Exclude those with ECOG 2, 3, or 4
flaura = flaura[~flaura['PatientID'].isin(ecog_fl_IDs)]

In [480]:
# Exclude those with abnormal organ function
flaura = flaura[~flaura['PatientID'].isin(ab_organ_IDs)]

In [481]:
row_ID(flaura)

(2725, 2725)

In [482]:
low_cutoff_fl = cutoff.loc['flaura'].low

In [483]:
high_cutoff_fl = cutoff.loc['flaura'].high

In [484]:
print('Osimertinib total:', flaura.query('osim == 1').shape[0])
print('High risk:', flaura.query('osim == 1').query('risk_score >= @high_cutoff_fl').shape[0])
print('Med risk:', flaura.query('osim == 1').query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl').shape[0])
print('Low risk:', flaura.query('osim == 1').query('risk_score <= @low_cutoff_fl').shape[0])

Osimertinib total: 751
High risk: 159
Med risk: 234
Low risk: 358


In [485]:
print('Gefitinib or Erlotinib total:', flaura.query('osim == 0').shape[0])
print('High risk:', flaura.query('osim == 0').query('risk_score >= @high_cutoff_fl').shape[0])
print('Med risk:', flaura.query('osim == 0').query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl').shape[0])
print('Low risk:', flaura.query('osim == 0').query('risk_score <= @low_cutoff_fl').shape[0])

Gefitinib or Erlotinib total: 1974
High risk: 579
Med risk: 725
Low risk: 670


#### PFS with covariate balancing 

In [486]:
flaura = flaura.set_index('PatientID')

In [487]:
flaura['pfs_status'] = flaura['pfs_status'].astype('bool')

In [488]:
flaura_iptw = flaura.filter(items = ['pfs_status',
                                     'time_prog_treatment',
                                     'osim',
                                     'age',
                                     'gender',
                                     'race',
                                     'PracticeType',
                                     'adv_year',
                                     'delta_adv_diagnosis',
                                     'commercial',
                                     'medicare',
                                     'medicaid',
                                     'ecog_diagnosis',
                                     'albumin_diag',
                                     'weight_pct_change',
                                     'risk_score'])

In [489]:
flaura_iptw['met_cat'] = pd.cut(flaura_iptw['adv_year'],
                                bins = [2010, 2018, float('inf')],
                                labels = ['11-18', '19-21'])

In [490]:
conditions = [
    ((flaura_iptw['ecog_diagnosis'] == "1.0") | (flaura_iptw['ecog_diagnosis'] == "0.0")),  
    ((flaura_iptw['ecog_diagnosis'] == "2.0") | (flaura_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

flaura_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [491]:
flaura_iptw.dtypes

pfs_status                 bool
time_prog_treatment     float64
osim                      int64
age                       int64
gender                   object
race                     object
PracticeType             object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [492]:
to_be_categorical = list(flaura_iptw.select_dtypes(include = ['object']).columns)

In [493]:
to_be_categorical

['gender', 'race', 'PracticeType', 'ecog_diagnosis', 'ecog_2']

In [494]:
to_be_categorical.append('met_cat')

In [495]:
to_be_categorical.remove('ecog_diagnosis')

In [496]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    flaura_iptw[x] = flaura_iptw[x].astype('category')

In [497]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [498]:
# List of categorical features.
categorical_features = list(flaura_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [499]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [500]:
flaura_iptw_low = (
    flaura_iptw
    .query('risk_score <= @low_cutoff_fl'))

flaura_iptw_med = (
    flaura_iptw
    .query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl'))

flaura_iptw_high = (
    flaura_iptw
    .query('risk_score >= @high_cutoff_fl'))

flaura_iptw_all = flaura_iptw

In [501]:
flaura_low_x = preprocessor.fit_transform(flaura_iptw_low.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2', 
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

flaura_med_x = preprocessor.fit_transform(flaura_iptw_med.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2', 
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

flaura_high_x = preprocessor.fit_transform(flaura_iptw_high.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2', 
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

flaura_all_x = preprocessor.fit_transform(flaura_iptw_all.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2', 
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

In [502]:
lr_flaura_low = LogisticRegression(max_iter = 1000)
lr_flaura_low.fit(flaura_low_x, flaura_iptw_low['osim'])

LogisticRegression(max_iter=1000)

In [503]:
lr_flaura_med = LogisticRegression(max_iter = 1000)
lr_flaura_med.fit(flaura_med_x, flaura_iptw_med['osim'])

LogisticRegression(max_iter=1000)

In [504]:
lr_flaura_high = LogisticRegression(max_iter = 1000)
lr_flaura_high.fit(flaura_high_x, flaura_iptw_high['osim'])

LogisticRegression(max_iter=1000)

In [505]:
lr_flaura_all = LogisticRegression(max_iter = 1000)
lr_flaura_all.fit(flaura_all_x, flaura_iptw_all['osim'])

LogisticRegression(max_iter=1000)

In [506]:
pred_low = lr_flaura_low.predict_proba(flaura_low_x)
pred_med = lr_flaura_med.predict_proba(flaura_med_x)
pred_high = lr_flaura_high.predict_proba(flaura_high_x)
pred_all = lr_flaura_all.predict_proba(flaura_all_x)

In [507]:
flaura_iptw_low['ps'] = pred_low[:, 1]
flaura_iptw_med['ps'] = pred_med[:, 1]
flaura_iptw_high['ps'] = pred_high[:, 1]
flaura_iptw_all['ps'] = pred_all[:, 1]

In [508]:
flaura_iptw_low['weight'] = (
    np.where(flaura_iptw_low['osim'] == 1, 1/flaura_iptw_low['ps'], 1/(1 - flaura_iptw_low['ps'])))

flaura_iptw_med['weight'] = (
    np.where(flaura_iptw_med['osim'] == 1, 1/flaura_iptw_med['ps'], 1/(1 - flaura_iptw_med['ps'])))

flaura_iptw_high['weight'] = (
    np.where(flaura_iptw_high['osim'] == 1, 1/flaura_iptw_high['ps'], 1/(1 - flaura_iptw_high['ps'])))

flaura_iptw_all['weight'] = (
    np.where(flaura_iptw_all['osim'] == 1, 1/flaura_iptw_all['ps'], 1/(1 - flaura_iptw_all['ps'])))

In [509]:
# Low KM curves
kmf_low_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_low_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_low_osim_flaura_iptw_pfs.fit(
    flaura_iptw_low.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_low.query('osim == 1').pfs_status,
    weights = flaura_iptw_low.query('osim == 1')['weight'])

kmf_low_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_low.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_low.query('osim == 0').pfs_status,
    weights = flaura_iptw_low.query('osim == 0')['weight'])

# Med KM curves
kmf_med_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_med_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_med_osim_flaura_iptw_pfs.fit(
    flaura_iptw_med.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_med.query('osim == 1').pfs_status,
    weights = flaura_iptw_med.query('osim == 1')['weight'])

kmf_med_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_med.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_med.query('osim == 0').pfs_status,
    weights = flaura_iptw_med.query('osim == 0')['weight'])

# High KM curves 
kmf_high_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_high_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_high_osim_flaura_iptw_pfs.fit(
    flaura_iptw_high.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_high.query('osim == 1').pfs_status,
    weights = flaura_iptw_high.query('osim == 1')['weight'])

kmf_high_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_high.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_high.query('osim == 0').pfs_status,
    weights = flaura_iptw_high.query('osim == 0')['weight'])

# All KM curves 
kmf_all_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_all_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_all_osim_flaura_iptw_pfs.fit(
    flaura_iptw_all.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_all.query('osim == 1').pfs_status,
    weights = flaura_iptw_all.query('osim == 1')['weight'])

kmf_all_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_all.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_all.query('osim == 0').pfs_status,
    weights = flaura_iptw_all.query('osim == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 2700.42 total observations, 416.461 right-censored observations>

#### Calculate survival metrics 

In [510]:
osim_flaura_median_pfs = mos(kmf_low_osim_flaura_iptw_pfs,
                             kmf_med_osim_flaura_iptw_pfs,
                             kmf_high_osim_flaura_iptw_pfs,
                             kmf_all_osim_flaura_iptw_pfs)

gefer_flaura_median_pfs = mos(kmf_low_gefer_flaura_iptw_pfs,
                              kmf_med_gefer_flaura_iptw_pfs,
                              kmf_high_gefer_flaura_iptw_pfs,
                              kmf_all_gefer_flaura_iptw_pfs)

In [511]:
flaura_iptw_all_imputed = flaura_iptw_all.copy()
flaura_iptw_all_imputed['albumin_diag'] = flaura_iptw_all_imputed['albumin_diag'].fillna(flaura_iptw_all_imputed['albumin_diag'].median())
flaura_iptw_all_imputed['weight_pct_change'] = flaura_iptw_all_imputed['weight_pct_change'].fillna(flaura_iptw_all_imputed['weight_pct_change'].median())

In [512]:
flaura_hr_all = CoxPHFitter()
flaura_hr_all.fit(flaura_iptw_all_imputed,
                  duration_col = 'time_prog_treatment',
                  event_col = 'pfs_status',
                  formula = 'osim + age + gender + race + PracticeType + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + albumin_diag + weight_pct_change + risk_score',
                  weights_col = 'weight',
                  robust = True)

<lifelines.CoxPHFitter: fitted with 5296.8 total observations, 1262.85 right-censored observations>

In [513]:
flaura_all_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_all,
                                       1000,
                                       'osim',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [514]:
flaura_low_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_low,
                                       1000,
                                       'osim',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [515]:
flaura_med_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_med,
                                       1000,
                                       'osim',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [516]:
flaura_high_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_high,
                                        1000,
                                        'osim',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'PracticeType',
                                         'met_cat',
                                         'delta_adv_diagnosis',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ecog_2', 
                                         'albumin_diag',
                                         'weight_pct_change',
                                         'risk_score'],
                                        ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [517]:
flaura_data = [
    {'trial_name': 'FLAURA', 
     'risk_group': 'low', 
     's_trt_mos': osim_flaura_median_pfs[0],
     's_trt_mos_95': flaura_low_rmst_mos_95.mos_A_95,
     's_cont_mos': gefer_flaura_median_pfs[0],
     's_cont_mos_95': flaura_low_rmst_mos_95.mos_B_95,
     's_mos_diff': osim_flaura_median_pfs[0] - gefer_flaura_median_pfs[0], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     's_trt_rmst': restricted_mean_survival_time(kmf_low_osim_flaura_iptw_pfs, 36),
     's_trt_rmst_95': flaura_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_gefer_flaura_iptw_pfs, 36),
     's_cont_rmst_95': flaura_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_osim_flaura_iptw_pfs, 36) - restricted_mean_survival_time(kmf_low_gefer_flaura_iptw_pfs, 36),
     's_diff_rmst_95': flaura_low_rmst_mos_95.difference_rmst_95,
     'scount': flaura.query('risk_score <= @low_cutoff_fl').shape[0]},
    
    {'trial_name': 'FLAURA', 
     'risk_group': 'medium', 
     's_trt_mos': osim_flaura_median_pfs[1],
     's_trt_mos_95': flaura_med_rmst_mos_95.mos_A_95,
     's_cont_mos': gefer_flaura_median_pfs[1],
     's_cont_mos_95': flaura_med_rmst_mos_95.mos_B_95,
     's_mos_diff': osim_flaura_median_pfs[1] - gefer_flaura_median_pfs[1], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     's_trt_rmst': restricted_mean_survival_time(kmf_med_osim_flaura_iptw_pfs, 36),
     's_trt_rmst_95': flaura_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_gefer_flaura_iptw_pfs, 36),
     's_cont_rmst_95': flaura_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_osim_flaura_iptw_pfs, 36) - restricted_mean_survival_time(kmf_med_gefer_flaura_iptw_pfs, 36),
     's_diff_rmst_95': flaura_med_rmst_mos_95.difference_rmst_95,
     'scount': flaura.query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl').shape[0]},
    
    {'trial_name': 'FLAURA', 
     'risk_group': 'high', 
     's_trt_mos': osim_flaura_median_pfs[2],
     's_trt_mos_95': flaura_high_rmst_mos_95.mos_A_95,
     's_cont_mos': gefer_flaura_median_pfs[2],
     's_cont_mos_95': flaura_high_rmst_mos_95.mos_B_95,
     's_mos_diff': osim_flaura_median_pfs[2] - gefer_flaura_median_pfs[2], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     's_trt_rmst': restricted_mean_survival_time(kmf_high_osim_flaura_iptw_pfs, 36),
     's_trt_rmst_95': flaura_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_gefer_flaura_iptw_pfs, 36),
     's_cont_rmst_95': flaura_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_osim_flaura_iptw_pfs, 36) - restricted_mean_survival_time(kmf_high_gefer_flaura_iptw_pfs, 36),
     's_diff_rmst_95': flaura_high_rmst_mos_95.difference_rmst_95,
     'scount': flaura.query('risk_score >= @high_cutoff_fl').shape[0]},
    
    {'trial_name': 'FLAURA', 
     'risk_group': 'all', 
     's_hr': flaura_hr_all.hazard_ratios_['osim'],
     's_hr_95': [flaura_hr_all.summary.loc['osim']['exp(coef) lower 95%'], flaura_hr_all.summary.loc['osim']['exp(coef) upper 95%']],
     's_trt_mos': osim_flaura_median_pfs[3],
     's_trt_mos_95': flaura_all_rmst_mos_95.mos_A_95,
     's_cont_mos': gefer_flaura_median_pfs[3],
     's_cont_mos_95': flaura_all_rmst_mos_95.mos_B_95,
     's_mos_diff': osim_flaura_median_pfs[3] - gefer_flaura_median_pfs[3], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     'scount': flaura.shape[0]}
]

## Part 3. Combining dictionaries

In [518]:
data_combined = keynote_042_data + keynote_024_data + keynote_189_data + check_data + flaura_data

In [519]:
strials_mos_rmst_boot = pd.DataFrame(data_combined)

In [520]:
strials_mos_rmst_boot

Unnamed: 0,trial_name,risk_group,s_trt_mos,s_trt_mos_95,s_cont_mos,s_cont_mos_95,s_mos_diff,rct_trt_arm,rct_cont_arm,rct_mos_diff,s_trt_rmst,s_trt_rmst_95,s_cont_rmst,s_cont_rmst_95,s_diff_rmst,s_diff_rmst_95,scount,s_hr,s_hr_95
0,KEYNOTE-042,low,25.666667,"[20.399166666666666, 31.333333333333332]",26.333333,"[23.030833333333334, 32.0]",-0.666667,16.7,12.1,4.6,22.804925,"[21.15009571576442, 24.374985415059662]",23.947788,"[22.39097263195355, 25.29163534803153]",-1.142864,"[-3.2634503621334545, 1.058860639374799]",1498,,
1,KEYNOTE-042,medium,20.066667,"[14.266666666666667, 23.901666666666664]",15.333333,"[14.233333333333333, 17.633333333333333]",4.733333,16.7,12.1,4.6,19.955144,"[18.185066650155633, 21.82123783695513]",18.33514,"[17.05531462451536, 19.733829491306302]",1.620004,"[-0.5590386517538245, 3.876899830902065]",1304,,
2,KEYNOTE-042,high,5.033333,"[3.966666666666667, 7.133333333333334]",6.366667,"[5.2, 8.0]",-1.333333,16.7,12.1,4.6,11.765965,"[10.289605815068583, 13.213814186296789]",11.027729,"[9.577650025489794, 12.485302319352973]",0.738236,"[-1.3127516556938899, 2.8256016916523032]",925,,
3,KEYNOTE-042,all,17.733333,"[15.495833333333334, 20.366666666666667]",15.0,"[10.999166666666667, 17.0]",2.733333,16.7,12.1,4.6,,,,,,,3727,0.882152,"[0.7840489735613151, 0.9925296092856004]"
4,KEYNOTE-024,low,9.6,"[8.131666666666666, 12.866666666666667]",8.8,"[6.133333333333334, 13.533333333333333]",0.8,10.3,6.0,4.3,10.340478,"[9.672921996527739, 11.085497715485708]",9.720394,"[8.250485129287448, 11.342706831360267]",0.620084,"[-1.1976751871761009, 2.310585990613227]",533,,
5,KEYNOTE-024,medium,5.433333,"[3.966666666666667, 6.8]",5.933333,"[4.266666666666667, 7.868333333333331]",-0.5,10.3,6.0,4.3,7.754938,"[7.094701755379255, 8.417592729327138]",8.125335,"[6.498564385723091, 9.771670223218822]",-0.370397,"[-2.1535170734950584, 1.4276673606024326]",478,,
6,KEYNOTE-024,high,2.166667,"[1.6666666666666667, 2.6666666666666665]",2.733333,"[1.7, 3.966666666666667]",-0.566667,10.3,6.0,4.3,4.924184,"[4.100860986665681, 5.847181570133066]",4.526616,"[3.249954077479059, 5.943830606846119]",0.397569,"[-1.2660116016826817, 1.9673183622945305]",319,,
7,KEYNOTE-024,all,5.566667,"[4.833333333333333, 6.266666666666667]",5.933333,"[4.6, 7.366666666666666]",-0.366667,10.3,6.0,4.3,,,,,,,1330,0.957244,"[0.8190678687865103, 1.118731499674572]"
8,KEYNOTE-189,low,27.066667,"[22.933333333333334, 31.1]",22.1,"[20.866666666666667, 22.966666666666665]",4.966667,22.0,10.6,11.4,23.789954,"[22.687519216426253, 24.950273463133705]",22.080383,"[21.672311511379686, 22.455459029406676]",1.709571,"[0.5558566503819004, 2.9280653088569872]",5717,,
9,KEYNOTE-189,medium,12.4,"[10.366666666666667, 13.866666666666667]",11.3,"[10.6, 11.934166666666666]",1.1,22.0,10.6,11.4,16.076878,"[14.783314306364062, 17.26092190151365]",15.488129,"[14.974468438878093, 15.985396462885008]",0.588749,"[-0.7933309154756211, 1.840567089043534]",4601,,


In [521]:
strials_mos_rmst_boot.to_csv('strials_mos_rmst_boot.csv', index = False)