# Flatiron Health aNSCLC: Survival metrics for appropriate chemo dosing 
**Background: Calculate survival metrics for emulated trials involving patients who receive appropriate upfront dosing of chemotherapeutics. Hazard ratio for the full cohort is calculated from a Cox-IPTW model. Restricted mean survival time and median overall survival are calculated for phenotypes using an IPTW-adjusted KM curve.**

## Part 1: Preprocessing

### 1.1 Import packages and create necessary functions

In [1]:
import numpy as np
import pandas as pd

from scipy import stats

from sksurv.nonparametric import kaplan_meier_estimator
from survive import KaplanMeier, SurvivalData

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.utils import median_survival_times, restricted_mean_survival_time
from lifelines.statistics import logrank_test

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

import warnings

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

In [3]:
# Find index for value closest to input value. 
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [4]:
# Calculates median overeall suvival for risk groups. 
def mos(low, med, high, comp):
    low_os = low.median_survival_time_
    med_os = med.median_survival_time_
    high_os = high.median_survival_time_
    comp_os = comp.median_survival_time_
    mos = [low_os, med_os, high_os, comp_os]
    return (mos)

In [5]:
def rmst_mos_95ci(df, num_samples, drug, event, items_list, numerical_features, rmst_time):
    
    """
    Estimate the 95% confidence interval for RMST and mOS using bootstrap resampling.

    Parameters:
    - df: DataFrame containing survival data
    - num_samples: Number of bootstrap samples
    - drug: Treatment indicator variable
    - event: Event type ('death' or 'progression')
    - items_list: Feature list for IPTW 
    - numerical_features: List of numerical features
    - rmst_time: Time to calculate RMST 

    Returns:
    - mos_A_95: mOS 95% CI for treatment
    - mos_B_95: mOS 95% CI for control
    - rmst_A_95: RMST 95% CI for treatment
    - rmst_B_95: RMST 95% CI for control
    - difference_rmst_95: RMST 95% CI for difference between treatment and control 
    """
    
    np.random.seed(42)
    mos_A = []
    mos_B = []
    rmst_A_list = []
    rmst_B_list = []
    differences_rmst = []
    
    # Define variables based on the event type
    if event == 'death':
        time_column = 'timerisk_treatment'
        status_column = 'death_status'
        
    else:
        time_column = 'time_prog_treatment'
        status_column = 'pfs_status'
        
    # Set up preprocessor for logistical regression which will be for IPTW  
    numerical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('std_scaler', StandardScaler())])
        
    categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')
    categorical_features = list(df.select_dtypes(include = ['category']).columns)
        
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder = 'passthrough')
    
    # Boostrap 
    for _ in range(num_samples):
        
        # Resample data with replacement
        resampled_df = resample(df).drop(columns = ['ps', 'weight'])
        
        # Calculated IPTW for the resampled group 
        df_x = preprocessor.fit_transform(resampled_df.filter(items = items_list))
                                           
        df_lr = LogisticRegression(max_iter = 1000)
        df_lr.fit(df_x, resampled_df[drug])
        warnings.filterwarnings("ignore")
                                           
        pred = df_lr.predict_proba(df_x)        
        resampled_df['ps'] = pred[:, 1]                          
        resampled_df['weight'] = (
                np.where(resampled_df[drug] == 1, 1/resampled_df['ps'], 1/(1 - resampled_df['ps'])))
    
        # mOS from IPTW-KM
        kmf_A = KaplanMeierFitter()
        kmf_A.fit(resampled_df.query(f'{drug} == 1')[time_column]/30,
                  resampled_df.query(f'{drug} == 1')[status_column], 
                  weights = resampled_df.query(f'{drug} == 1')['weight'])

        kmf_B = KaplanMeierFitter()
        kmf_B.fit(resampled_df.query(f'{drug} == 0')[time_column]/30,
                  resampled_df.query(f'{drug} == 0')[status_column], 
                  weights = resampled_df.query(f'{drug} == 0')['weight'])
    
        mos_A.append(kmf_A.median_survival_time_)
        mos_B.append(kmf_B.median_survival_time_)
        
        # RMST from IPTW-KM
        rmst_A = restricted_mean_survival_time(kmf_A, rmst_time)
        rmst_B = restricted_mean_survival_time(kmf_B, rmst_time)
        
        rmst_A_list.append(rmst_A)
        rmst_B_list.append(rmst_B)
        differences_rmst.append(rmst_A - rmst_B)

    # Calculate the 95% confidence interval
    results = pd.Series({
    'mos_A_95': np.percentile(mos_A, [2.5, 97.5]),
    'mos_B_95': np.percentile(mos_B, [2.5, 97.5]),
    'rmst_A_95': np.percentile(rmst_A_list, [2.5, 97.5]),
    'rmst_B_95': np.percentile(rmst_B_list, [2.5, 97.5]),
    'difference_rmst_95': np.percentile(differences_rmst, [2.5, 97.5])
    })
    
    return results

In [6]:
cutoff = pd.read_csv('risk_cutoff_lung.csv', index_col = 0)

## Part 2: In-silico trials 

### KEYNOTE-042: First-line pembrolizumab vs. platinum-based chemotherapy in those with PDL1 >=1%

**INCLUSION CRITERIA**
* Untreated locally advanced or metastatic NSCLC
* Received first line pemobrolizumab or platinum-based chemotherapy
* Received appropriate dose of platinum-based chemotherapy
* PDL1 >= 1% and status known within (-inf, +30] days of first-line treatment
* EGFR and ALK negative

#### Pembrolizumab

In [7]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [8]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [9]:
key042_pembro = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Pembrolizumab"')
    [['PatientID', 'StartDate']]
)

In [10]:
key042_pembro.loc[:, 'pembro'] = 1

In [11]:
row_ID(key042_pembro)

(3648, 3648)

In [12]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_042 = (
    line_therapy[line_therapy['PatientID'].isin(key042_pembro.PatientID)])

In [13]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [14]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_042_xcross = (
    line_therapy_pembro_042[line_therapy_pembro_042['LineName'].str.contains('|'.join(targeted))].PatientID)

In [15]:
# Select patients who don't receive targeted therapy in future lines.
key042_pembro = key042_pembro[~key042_pembro['PatientID'].isin(pembro_042_xcross)]

In [16]:
row_ID(key042_pembro)

(3582, 3582)

In [17]:
row_ID(key042_pembro)

(3582, 3582)

#### Platinum-based chemotherapy 

In [18]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [19]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [20]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [21]:
key042_carb = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Carboplatin') & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [22]:
key042_carb.loc[:, 'carb'] = 1

In [23]:
key042_cis = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Cisplatin') & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [24]:
key042_cis.loc[:, 'carb'] = 0

In [25]:
key042_plat = pd.concat([key042_carb, key042_cis])

In [26]:
key042_plat.loc[:, 'pembro'] = 0

In [27]:
row_ID(key042_plat)

(25861, 25861)

In [28]:
# Dataframe of all therapies received for those receiving first line platinum regimen   
line_therapy_plat_042 = (
    line_therapy[line_therapy['PatientID'].isin(key042_plat.PatientID)])

In [29]:
# Patients receiving platinum therapy who later recieve targeted therapy. 
plat_042_xcross = (
    line_therapy_plat_042[line_therapy_plat_042['LineName'].str.contains('|'.join(targeted))].PatientID)

In [30]:
# Select patients who don't receive targeted therapy in future lines 
key042_plat = key042_plat[~key042_plat['PatientID'].isin(plat_042_xcross)]

In [31]:
row_ID(key042_plat)

(23800, 23800)

#### Platinum-based chemotherapy dosing

#### Carboplatin

In [32]:
med_order = pd.read_csv('MedicationOrder.csv', low_memory = False)

In [33]:
med_order['ExpectedStartDate'] = np.where(med_order['ExpectedStartDate'].isna(), 
                                          med_order['OrderedDate'], 
                                          med_order['ExpectedStartDate'])

In [34]:
med_order.loc[:, 'ExpectedStartDate'] = pd.to_datetime(med_order['ExpectedStartDate'])

In [35]:
key042_plat.loc[:, 'StartDate'] = pd.to_datetime(key042_plat['StartDate'])

In [36]:
med_order_carb = (
    med_order[med_order['PatientID'].isin(key042_plat.query('carb == 1').PatientID)]
    .query('CommonDrugName == "carboplatin"')
)

In [37]:
med_order_carb.shape

(147262, 18)

In [38]:
med_order_carb = pd.merge(med_order_carb, 
                          key042_plat.query('carb == 1')[['PatientID', 'StartDate']], 
                          on = 'PatientID', 
                          how = 'left')

In [39]:
med_order_carb.shape

(147262, 19)

In [40]:
med_order_carb.loc[:, 'date_diff'] = (med_order_carb['ExpectedStartDate'] - med_order_carb['StartDate']).dt.days.abs()

In [41]:
med_order_carb = med_order_carb.query('date_diff <= 14')

In [42]:
carb_index = med_order_carb.groupby('PatientID')['date_diff'].idxmin()

In [43]:
carb_dose = med_order_carb.loc[carb_index].query('RelativeOrderedUnits == "AUC"')[['PatientID', 'RelativeOrderedAmount']]

In [44]:
carb_dose = carb_dose.rename(columns = {'RelativeOrderedAmount': 'carb_dose_auc'})

In [45]:
carb_IDs = carb_dose.query('carb_dose_auc >= 5').PatientID

#### Cisplatin

In [46]:
med_order_cis = (
    med_order[med_order['PatientID'].isin(key042_plat.query('carb == 0').PatientID)]
    .query('CommonDrugName == "cisplatin"')
)

In [47]:
med_order_cis.shape

(10521, 18)

In [48]:
med_order_cis = pd.merge(med_order_cis,
                         key042_plat.query('carb == 0')[['PatientID', 'StartDate']], 
                         on = 'PatientID', 
                         how = 'left')

In [49]:
med_order_cis.shape

(10521, 19)

In [50]:
med_order_cis.loc[:, 'date_diff'] = (med_order_cis['ExpectedStartDate'] - med_order_cis['StartDate']).dt.days.abs()

In [51]:
med_order_cis = med_order_cis.query('date_diff <= 14')

In [52]:
cis_index = med_order_cis.groupby('PatientID')['date_diff'].idxmin()

In [53]:
cis_dose = med_order_cis.loc[cis_index].query('RelativeOrderedUnits == "mg/m2"')[['PatientID', 'RelativeOrderedAmount']]

In [54]:
cis_dose = cis_dose.rename(columns = {'RelativeOrderedAmount': 'cis_dose_mgm2'})

In [55]:
cis_IDs = cis_dose.query('cis_dose_mgm2 >= 75').PatientID

#### Combine chemotherapy and pembrolizumab dataframes 

In [56]:
key042_plat = key042_plat[key042_plat['PatientID'].isin(pd.concat([carb_IDs, cis_IDs]))]

In [57]:
key042_plat = key042_plat.drop(columns = ['carb'])

In [58]:
key_042 = pd.concat([key042_pembro, key042_plat])

In [59]:
row_ID(key_042)

(15321, 15321)

In [60]:
key_042 = pd.merge(key_042, df_full, on = 'PatientID', how = 'left')

In [61]:
row_ID(key_042)

(15321, 15321)

In [62]:
key_042['StartDate'] = pd.to_datetime(key_042['StartDate'])

#### PDL1 >=1%

In [63]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [64]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(key_042['PatientID'])]

In [65]:
biomarkers = pd.merge(biomarkers, key_042[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [66]:
row_ID(biomarkers)

(58666, 12087)

In [67]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [68]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [69]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [70]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [71]:
pdl1_value = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining != "0%" and PercentStaining != "< 1%" and PercentStaining.notnull()', engine = 'python')
    .sort_values(by = ['PatientID', 'PercentStaining'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'PercentStaining']]
)

In [72]:
pdl1_ids = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining != "0%" and PercentStaining != "< 1%" and PercentStaining.notnull()', engine = 'python')
    .PatientID
    .unique()
)

In [73]:
key_042 = key_042[key_042.PatientID.isin(pdl1_ids)]

In [74]:
row_ID(key_042)

(4086, 4086)

#### Time from treatment to death or censor

In [75]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [76]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [77]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [78]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [79]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [80]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [81]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [82]:
key_042 = pd.merge(key_042, mortality, on = 'PatientID', how = 'left')

In [83]:
row_ID(key_042)

(4086, 4086)

In [84]:
conditions = [
    (key_042['death_status'] == 1),
    (key_042['death_status'] == 0)]

choices = [
    (key_042['death_date'] - key_042['StartDate']).dt.days,
    (key_042['last_activity'] - key_042['StartDate']).dt.days]

key_042.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [85]:
key_042 = key_042.query('timerisk_treatment >= 0')

#### Patient count

In [86]:
key_042 = (
    key_042
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [87]:
low_cutoff_042 = cutoff.loc['keynote_042'].low

In [88]:
high_cutoff_042 = cutoff.loc['keynote_042'].high

In [89]:
print('Pembro total:',  key_042.query('pembro == 1').shape[0])
print('High risk:', key_042.query('pembro == 1').query('risk_score >= @high_cutoff_042').shape[0])
print('Med risk:', key_042.query('pembro == 1').query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0])
print('Low risk:', key_042.query('pembro == 1').query('risk_score <= @low_cutoff_042').shape[0])

Pembro total: 2914
High risk: 1035
Med risk: 983
Low risk: 896


In [90]:
print('Platinum total:',  key_042.query('pembro == 0').shape[0])
print('High risk:', key_042.query('pembro == 0').query('risk_score >= @high_cutoff_042').shape[0])
print('Med risk:', key_042.query('pembro == 0').query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0])
print('Low risk:', key_042.query('pembro == 0').query('risk_score <= @low_cutoff_042').shape[0])

Platinum total: 1077
High risk: 372
Med risk: 370
Low risk: 335


#### Survival curves with covariate balancing

In [91]:
row_ID(key_042)

(3991, 3991)

In [92]:
key_042 = pd.merge(key_042, pdl1_value, on = 'PatientID', how = 'left')

In [93]:
row_ID(key_042)

(3991, 3991)

In [94]:
conditions = [
    (key_042['PercentStaining'] == "1%") | 
    (key_042['PercentStaining'] == "2% - 4%") |
    (key_042['PercentStaining'] == "5% - 9%") |
    (key_042['PercentStaining'] == "10% - 19%") |
    (key_042['PercentStaining'] == "20% - 29%") |
    (key_042['PercentStaining'] == "30% - 39%") |
    (key_042['PercentStaining'] == "40% - 49%")
]

choices = ['lt50']

key_042['pdl1_det'] = np.select(conditions, choices, default = 'gte50')

In [95]:
key_042 = key_042.set_index('PatientID')

In [96]:
key_042_iptw = key_042.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1_det',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [97]:
key_042_iptw['met_cat'] = pd.cut(key_042_iptw['adv_year'],
                                 bins = [2010, 2016, float('inf')],
                                 labels = ['11-16', '17-21'])

In [98]:
conditions = [
    ((key_042_iptw['ecog_diagnosis'] == "1.0") | (key_042_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_042_iptw['ecog_diagnosis'] == "2.0") | (key_042_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_042_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [99]:
key_042_iptw.dtypes

death_status               bool
timerisk_treatment      float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1_det                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [100]:
to_be_categorical = list(key_042_iptw.select_dtypes(include = ['object']).columns)

In [101]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1_det',
 'ecog_2']

In [102]:
to_be_categorical.append('met_cat')

In [103]:
to_be_categorical.remove('ecog_diagnosis')

In [104]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_042_iptw[x] = key_042_iptw[x].astype('category')

In [105]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [106]:
# List of categorical features.
categorical_features = list(key_042_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [107]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [108]:
key_042_iptw_low = (
    key_042_iptw
    .query('risk_score <= @low_cutoff_042'))

key_042_iptw_med = (
    key_042_iptw
    .query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042'))

key_042_iptw_high = (
    key_042_iptw
    .query('risk_score >= @high_cutoff_042'))

key_042_iptw_all = key_042_iptw

In [109]:
key_042_low_x = preprocessor.fit_transform(key_042_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_042_med_x = preprocessor.fit_transform(key_042_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_042_high_x = preprocessor.fit_transform(key_042_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'delta_adv_diagnosis',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'pdl1_det', 
                                                                              'albumin_diag',
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_042_all_x = preprocessor.fit_transform(key_042_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det', 
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [110]:
lr_042_low = LogisticRegression(max_iter = 1000)
lr_042_low.fit(key_042_low_x, key_042_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [111]:
lr_042_med = LogisticRegression(max_iter = 1000)
lr_042_med.fit(key_042_med_x, key_042_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [112]:
lr_042_high = LogisticRegression(max_iter = 1000)
lr_042_high.fit(key_042_high_x, key_042_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [113]:
lr_042_all = LogisticRegression(max_iter = 1000)
lr_042_all.fit(key_042_all_x, key_042_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [114]:
pred_low = lr_042_low.predict_proba(key_042_low_x)
pred_med = lr_042_med.predict_proba(key_042_med_x)
pred_high = lr_042_high.predict_proba(key_042_high_x)
pred_all = lr_042_all.predict_proba(key_042_all_x)

In [115]:
key_042_iptw_low['ps'] = pred_low[:, 1]
key_042_iptw_med['ps'] = pred_med[:, 1]
key_042_iptw_high['ps'] = pred_high[:, 1]
key_042_iptw_all['ps'] = pred_all[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [116]:
key_042_iptw_low['weight'] = (
    np.where(key_042_iptw_low['pembro'] == 1, 1/key_042_iptw_low['ps'], 1/(1 - key_042_iptw_low['ps'])))

key_042_iptw_med['weight'] = (
    np.where(key_042_iptw_med['pembro'] == 1, 1/key_042_iptw_med['ps'], 1/(1 - key_042_iptw_med['ps'])))

key_042_iptw_high['weight'] = (
    np.where(key_042_iptw_high['pembro'] == 1, 1/key_042_iptw_high['ps'], 1/(1 - key_042_iptw_high['ps'])))

key_042_iptw_all['weight'] = (
    np.where(key_042_iptw_all['pembro'] == 1, 1/key_042_iptw_all['ps'], 1/(1 - key_042_iptw_all['ps'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [117]:
# Low KM curves
kmf_low_pembro_042_iptw = KaplanMeierFitter()
kmf_low_plat_042_iptw = KaplanMeierFitter()

kmf_low_pembro_042_iptw.fit(
    key_042_iptw_low.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_low.query('pembro == 1').death_status,
    weights = key_042_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_042_iptw.fit(
    key_042_iptw_low.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_low.query('pembro == 0').death_status,
    weights = key_042_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_042_iptw = KaplanMeierFitter()
kmf_med_plat_042_iptw = KaplanMeierFitter()

kmf_med_pembro_042_iptw.fit(
    key_042_iptw_med.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_med.query('pembro == 1').death_status,
    weights = key_042_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_042_iptw.fit(
    key_042_iptw_med.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_med.query('pembro == 0').death_status,
    weights = key_042_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_042_iptw = KaplanMeierFitter()
kmf_high_plat_042_iptw = KaplanMeierFitter()

kmf_high_pembro_042_iptw.fit(
    key_042_iptw_high.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_high.query('pembro == 1').death_status,
    weights = key_042_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_042_iptw.fit(
    key_042_iptw_high.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_high.query('pembro == 0').death_status,
    weights = key_042_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_042_iptw = KaplanMeierFitter()
kmf_all_plat_042_iptw = KaplanMeierFitter()

kmf_all_pembro_042_iptw.fit(
    key_042_iptw_all.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_all.query('pembro == 1').death_status,
    weights = key_042_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_042_iptw.fit(
    key_042_iptw_all.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_all.query('pembro == 0').death_status,
    weights = key_042_iptw_all.query('pembro == 0')['weight'])

  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 3767.58 total observations, 1446.87 right-censored observations>

#### Calculating survival metrics 

In [118]:
pembro_042_median_os = mos(kmf_low_pembro_042_iptw,
                           kmf_med_pembro_042_iptw,
                           kmf_high_pembro_042_iptw,
                           kmf_all_pembro_042_iptw)

plat_042_median_os = mos(kmf_low_plat_042_iptw,
                         kmf_med_plat_042_iptw,
                         kmf_high_plat_042_iptw,
                         kmf_all_plat_042_iptw)

In [119]:
key_042_iptw_all_imputed = key_042_iptw_all.copy()
key_042_iptw_all_imputed['albumin_diag'] = key_042_iptw_all_imputed['albumin_diag'].fillna(key_042_iptw_all_imputed['albumin_diag'].median())
key_042_iptw_all_imputed['weight_pct_change'] = key_042_iptw_all_imputed['weight_pct_change'].fillna(key_042_iptw_all_imputed['weight_pct_change'].median())

In [120]:
key042_hr_all = CoxPHFitter()
key042_hr_all.fit(key_042_iptw_all_imputed,
                  duration_col = 'timerisk_treatment',
                  event_col = 'death_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_det + albumin_diag + weight_pct_change', 
                  weights_col = 'weight', 
                  robust = True)

<lifelines.CoxPHFitter: fitted with 7755.04 total observations, 3137.8 right-censored observations>

In [121]:
key042_all_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_all,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [122]:
key042_low_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_low,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [123]:
key042_med_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_med,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [124]:
key042_high_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_high,
                                        1000,
                                        'pembro',
                                        'death',
                                        ['age',
                                         'gender',
                                         'race',
                                         'PracticeType',
                                         'Histology',
                                         'met_cat',
                                         'delta_adv_diagnosis',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ecog_2',
                                         'pdl1_det', 
                                         'albumin_diag',
                                         'weight_pct_change',
                                         'risk_score'],
                                        ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [125]:
keynote_042_data = [
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'low', 
     'r_trt_mos': pembro_042_median_os[0],
     'r_trt_mos_95': key042_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[0],
     'r_cont_mos_95': key042_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[0] - plat_042_median_os[0], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'trt_rmst': restricted_mean_survival_time(kmf_low_pembro_042_iptw, 36),
     'trt_rmst_95': key042_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_plat_042_iptw, 36),
     'cont_rmst_95': key042_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_low_plat_042_iptw, 36),
     'diff_rmst_95': key042_low_rmst_mos_95.difference_rmst_95,
     'rcount': key_042.query('risk_score <= @low_cutoff_042').shape[0],
     'rcount_chemo': key_042.query('risk_score <= @low_cutoff_042').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'medium', 
     'r_trt_mos': pembro_042_median_os[1],
     'r_trt_mos_95': key042_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[1],
     'r_cont_mos_95': key042_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[1] - plat_042_median_os[1], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'trt_rmst': restricted_mean_survival_time(kmf_med_pembro_042_iptw, 36),
     'trt_rmst_95': key042_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_plat_042_iptw, 36),
     'cont_rmst_95': key042_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_med_plat_042_iptw, 36),
     'diff_rmst_95': key042_med_rmst_mos_95.difference_rmst_95,
     'rcount': key_042.query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0],
     'rcount_chemo': key_042.query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'high', 
     'r_trt_mos': pembro_042_median_os[2],
     'r_trt_mos_95': key042_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[2],
     'r_cont_mos_95': key042_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[2] - plat_042_median_os[2], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'trt_rmst': restricted_mean_survival_time(kmf_high_pembro_042_iptw, 36),
     'trt_rmst_95': key042_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_plat_042_iptw, 36),
     'cont_rmst_95': key042_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_high_plat_042_iptw, 36),
     'diff_rmst_95': key042_high_rmst_mos_95.difference_rmst_95,
     'rcount': key_042.query('risk_score >= @high_cutoff_042').shape[0],
     'rcount_chemo': key_042.query('risk_score >= @high_cutoff_042').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'all', 
     'r_hr': key042_hr_all.hazard_ratios_['pembro'],
     'r_hr_95': [key042_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key042_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     'r_trt_mos': pembro_042_median_os[3],
     'r_trt_mos_95': key042_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[3],
     'r_cont_mos_95': key042_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[3] - plat_042_median_os[3], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'rcount': key_042.shape[0], 
     'rcount_chemo': key_042.query('pembro == 0').shape[0]}
]

### KEYNOTE-024: First-line pembrolizumab vs. platinum-based chemotherapy in those with high PDL1 

**INCLUSION CRITERIA**
* Untreated aNSCLC
* Received first line pemobrolizumab or platinum-based chemotherapy 
* Received appropriate dose of platinum-based chemotherapy
* PDL1 >= 50% and status known within (-inf, +30] days of start of first-line treatment 
* EGFR and ALK negatve

#### Pembrolizumab

In [126]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [127]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [128]:
key024_pembro = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Pembrolizumab"')
    [['PatientID', 'StartDate']]
)

In [129]:
key024_pembro.loc[:, 'pembro'] = 1

In [130]:
row_ID(key024_pembro)

(3648, 3648)

In [131]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_024 = (
    line_therapy[line_therapy['PatientID'].isin(key024_pembro.PatientID)])

In [132]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [133]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_024_xcross = (
    line_therapy_pembro_024[line_therapy_pembro_024['LineName'].str.contains('|'.join(targeted))].PatientID)

In [134]:
# Select patients who don't receive targeted therapy in future lines.
key024_pembro = key024_pembro[~key024_pembro['PatientID'].isin(pembro_024_xcross)]

In [135]:
row_ID(key024_pembro)

(3582, 3582)

#### Platinum-based chemotherapy 

In [136]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [137]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [138]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [139]:
key024_carb = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Carboplatin') & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [140]:
key024_carb.loc[:, 'carb'] = 1

In [141]:
key024_cis = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Cisplatin') & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [142]:
key024_cis.loc[:, 'carb'] = 0

In [143]:
key024_plat = pd.concat([key024_carb, key024_cis])

In [144]:
key024_plat.loc[:, 'pembro'] = 0

In [145]:
row_ID(key024_plat)

(25861, 25861)

In [146]:
# Dataframe of all therapies received for those receiving first line platinum regimen   
line_therapy_plat_024 = (
    line_therapy[line_therapy['PatientID'].isin(key024_plat.PatientID)])

In [147]:
# Patients receiving platinum therapy who later recieve targeted therapy. 
plat_024_xcross = (
    line_therapy_plat_024[line_therapy_plat_024['LineName'].str.contains('|'.join(targeted))].PatientID)

In [148]:
# Select patients who don't receive targeted therapy in future lines 
key024_plat = key024_plat[~key024_plat['PatientID'].isin(plat_024_xcross)]

In [149]:
row_ID(key024_plat)

(23800, 23800)

#### Platinum-based chemotherapy dosing

#### Carboplatin

In [150]:
med_order = pd.read_csv('MedicationOrder.csv', low_memory = False)

In [151]:
med_order['ExpectedStartDate'] = np.where(med_order['ExpectedStartDate'].isna(), 
                                          med_order['OrderedDate'], 
                                          med_order['ExpectedStartDate'])

In [152]:
med_order.loc[:, 'ExpectedStartDate'] = pd.to_datetime(med_order['ExpectedStartDate'])

In [153]:
key024_plat.loc[:, 'StartDate'] = pd.to_datetime(key024_plat['StartDate'])

In [154]:
med_order_carb = (
    med_order[med_order['PatientID'].isin(key024_plat.query('carb == 1').PatientID)]
    .query('CommonDrugName == "carboplatin"')
)

In [155]:
med_order_carb.shape

(147262, 18)

In [156]:
med_order_carb = pd.merge(med_order_carb, 
                          key024_plat.query('carb == 1')[['PatientID', 'StartDate']], 
                          on = 'PatientID', 
                          how = 'left')

In [157]:
med_order_carb.shape

(147262, 19)

In [158]:
med_order_carb.loc[:, 'date_diff'] = (med_order_carb['ExpectedStartDate'] - med_order_carb['StartDate']).dt.days.abs()

In [159]:
med_order_carb = med_order_carb.query('date_diff <= 14')

In [160]:
carb_index = med_order_carb.groupby('PatientID')['date_diff'].idxmin()

In [161]:
carb_dose = med_order_carb.loc[carb_index].query('RelativeOrderedUnits == "AUC"')[['PatientID', 'RelativeOrderedAmount']]

In [162]:
carb_dose = carb_dose.rename(columns = {'RelativeOrderedAmount': 'carb_dose_auc'})

In [163]:
carb_IDs = carb_dose.query('carb_dose_auc >= 5').PatientID

#### Cisplatin

In [164]:
med_order_cis = (
    med_order[med_order['PatientID'].isin(key024_plat.query('carb == 0').PatientID)]
    .query('CommonDrugName == "cisplatin"')
)

In [165]:
med_order_cis.shape

(10521, 18)

In [166]:
med_order_cis = pd.merge(med_order_cis,
                         key024_plat.query('carb == 0')[['PatientID', 'StartDate']], 
                         on = 'PatientID', 
                         how = 'left')

In [167]:
med_order_cis.shape

(10521, 19)

In [168]:
med_order_cis.loc[:, 'date_diff'] = (med_order_cis['ExpectedStartDate'] - med_order_cis['StartDate']).dt.days.abs()

In [169]:
med_order_cis = med_order_cis.query('date_diff <= 14')

In [170]:
cis_index = med_order_cis.groupby('PatientID')['date_diff'].idxmin()

In [171]:
cis_dose = med_order_cis.loc[cis_index].query('RelativeOrderedUnits == "mg/m2"')[['PatientID', 'RelativeOrderedAmount']]

In [172]:
cis_dose = cis_dose.rename(columns = {'RelativeOrderedAmount': 'cis_dose_mgm2'})

In [173]:
cis_IDs = cis_dose.query('cis_dose_mgm2 >= 75').PatientID

#### Combine chemotherapy and pembrolizumab dataframes 

In [174]:
key024_plat = key024_plat[key024_plat['PatientID'].isin(pd.concat([carb_IDs, cis_IDs]))]

In [175]:
key024_plat = key024_plat.drop(columns = ['carb'])

In [176]:
key_024 = pd.concat([key024_pembro, key024_plat])

In [177]:
row_ID(key_024)

(15321, 15321)

In [178]:
key_024 = pd.merge(key_024, df_full, on = 'PatientID', how = 'left')

In [179]:
row_ID(key_024)

(15321, 15321)

In [180]:
key_024['StartDate'] = pd.to_datetime(key_024['StartDate'])

#### High PDL1

In [181]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [182]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(key_024['PatientID'])]

In [183]:
biomarkers = pd.merge(biomarkers, key_024[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [184]:
row_ID(biomarkers)

(58666, 12087)

In [185]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [186]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [187]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [188]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [189]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [190]:
lst = ["50% - 59%", "60% - 69%", "70% - 79%", "80% - 89%", "90% - 99%", "100%"]

pdl1_ids = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining == @lst')
    .PatientID
    .unique()
)

In [191]:
key_024 = key_024[key_024.PatientID.isin(pdl1_ids)]

In [192]:
row_ID(key_024)

(2868, 2868)

#### Time from treatment to death/progression or censor 

In [193]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [194]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [195]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [196]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [197]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(68483, 68483)

In [198]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [199]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [200]:
row_ID(mortality)

(68483, 68483)

In [201]:
key_024 = pd.merge(key_024, mortality, on = 'PatientID', how = 'left')

In [202]:
row_ID(key_024)

(2868, 2868)

In [203]:
progression = pd.read_csv('Enhanced_AdvNSCLCProgression.csv')

In [204]:
progression = progression[progression.PatientID.isin(key_024.PatientID)][['PatientID', 'ProgressionDate']]

In [205]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [206]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [207]:
row_ID(progression)

(2866, 2866)

In [208]:
key_024 = pd.merge(key_024, progression, on = 'PatientID', how = 'left')

In [209]:
row_ID(key_024)

(2868, 2868)

In [210]:
# Percent without progression date in Flaura trial
len(key_024.query('ProgressionDate.isna()', engine = 'python'))/len(key_024)

0.5188284518828452

In [211]:
conditions = [
    (key_024.ProgressionDate.notna()),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 1)),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 0))]

choices = [
    (key_024['ProgressionDate'] - key_024['StartDate']).dt.days,
    (key_024['death_date'] - key_024['StartDate']).dt.days,
    (key_024['last_activity'] - key_024['StartDate']).dt.days]

key_024.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [212]:
key_024 = key_024.query('time_prog_treatment >= 0')

In [213]:
row_ID(key_024)

(2584, 2584)

In [214]:
conditions = [
    (key_024.ProgressionDate.notna()),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 1)),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 0))]

choices = [1, 1, 0]

key_024.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient counts

In [215]:
key_024 = (
    key_024
    .query('stage == "IV"')
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [216]:
low_cutoff_024 = cutoff.loc['keynote_024'].low

In [217]:
high_cutoff_024 = cutoff.loc['keynote_024'].high

In [218]:
print('Pembro total:',  key_024.query('pembro == 1').shape[0])
print('High risk:', key_024.query('pembro == 1').query('risk_score >= @high_cutoff_024').shape[0])
print('Med risk:', key_024.query('pembro == 1').query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0])
print('Low risk:', key_024.query('pembro == 1').query('risk_score <= @low_cutoff_024').shape[0])

Pembro total: 1656
High risk: 546
Med risk: 563
Low risk: 547


In [219]:
print('Platinum total:',  key_024.query('pembro == 0').shape[0])
print('High risk:', key_024.query('pembro == 0').query('risk_score >= @high_cutoff_024').shape[0])
print('Med risk:', key_024.query('pembro == 0').query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0])
print('Low risk:', key_024.query('pembro == 0').query('risk_score <= @low_cutoff_024').shape[0])

Platinum total: 240
High risk: 76
Med risk: 81
Low risk: 83


#### PFS with covariate balancing 

In [220]:
key_024 = key_024.set_index('PatientID')

In [221]:
key_024_iptw = key_024.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [222]:
key_024_iptw['met_cat'] = pd.cut(key_024_iptw['adv_year'],
                                 bins = [2010, 2016, float('inf')],
                                 labels = ['11-16', '17-21'])

In [223]:
conditions = [
    ((key_024_iptw['ecog_diagnosis'] == "1.0") | (key_024_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_024_iptw['ecog_diagnosis'] == "2.0") | (key_024_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_024_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [224]:
key_024_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [225]:
to_be_categorical = list(key_024_iptw.select_dtypes(include = ['object']).columns)

In [226]:
to_be_categorical

['gender', 'race', 'PracticeType', 'Histology', 'ecog_diagnosis', 'ecog_2']

In [227]:
to_be_categorical.append('met_cat')

In [228]:
to_be_categorical.remove('ecog_diagnosis')

In [229]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_024_iptw[x] = key_024_iptw[x].astype('category')

In [230]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [231]:
# List of categorical features.
categorical_features = list(key_024_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [232]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [233]:
key_024_iptw_low = (
    key_024_iptw
    .query('risk_score <= @low_cutoff_024'))

key_024_iptw_med = (
    key_024_iptw
    .query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024'))

key_024_iptw_high = (
    key_024_iptw
    .query('risk_score >= @high_cutoff_024'))

key_024_iptw_all = key_024_iptw

In [234]:
key_024_low_x = preprocessor.fit_transform(key_024_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_024_med_x = preprocessor.fit_transform(key_024_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_024_high_x = preprocessor.fit_transform(key_024_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'albumin_diag',
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_024_all_x = preprocessor.fit_transform(key_024_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [235]:
lr_024_low = LogisticRegression(max_iter = 1000)
lr_024_low.fit(key_024_low_x, key_024_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [236]:
lr_024_med = LogisticRegression(max_iter = 1000)
lr_024_med.fit(key_024_med_x, key_024_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [237]:
lr_024_high = LogisticRegression(max_iter = 1000)
lr_024_high.fit(key_024_high_x, key_024_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [238]:
lr_024_all = LogisticRegression(max_iter = 1000)
lr_024_all.fit(key_024_all_x, key_024_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [239]:
pred_low = lr_024_low.predict_proba(key_024_low_x)
pred_med = lr_024_med.predict_proba(key_024_med_x)
pred_high = lr_024_high.predict_proba(key_024_high_x)
pred_all = lr_024_all.predict_proba(key_024_all_x)

In [240]:
key_024_iptw_low['ps'] = pred_low[:, 1]
key_024_iptw_med['ps'] = pred_med[:, 1]
key_024_iptw_high['ps'] = pred_high[:, 1]
key_024_iptw_all['ps'] = pred_all[:, 1]

In [241]:
key_024_iptw_low['weight'] = (
    np.where(key_024_iptw_low['pembro'] == 1, 1/key_024_iptw_low['ps'], 1/(1 - key_024_iptw_low['ps'])))

key_024_iptw_med['weight'] = (
    np.where(key_024_iptw_med['pembro'] == 1, 1/key_024_iptw_med['ps'], 1/(1 - key_024_iptw_med['ps'])))

key_024_iptw_high['weight'] = (
    np.where(key_024_iptw_high['pembro'] == 1, 1/key_024_iptw_high['ps'], 1/(1 - key_024_iptw_high['ps'])))

key_024_iptw_all['weight'] = (
    np.where(key_024_iptw_all['pembro'] == 1, 1/key_024_iptw_all['ps'], 1/(1 - key_024_iptw_all['ps'])))

In [242]:
# Low KM curves
kmf_low_pembro_024_iptw = KaplanMeierFitter()
kmf_low_plat_024_iptw = KaplanMeierFitter()

kmf_low_pembro_024_iptw.fit(
    key_024_iptw_low.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_low.query('pembro == 1').pfs_status,
    weights = key_024_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_024_iptw.fit(
    key_024_iptw_low.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_low.query('pembro == 0').pfs_status,
    weights = key_024_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_024_iptw = KaplanMeierFitter()
kmf_med_plat_024_iptw = KaplanMeierFitter()

kmf_med_pembro_024_iptw.fit(
    key_024_iptw_med.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_med.query('pembro == 1').pfs_status,
    weights = key_024_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_024_iptw.fit(
    key_024_iptw_med.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_med.query('pembro == 0').pfs_status,
    weights = key_024_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_024_iptw = KaplanMeierFitter()
kmf_high_plat_024_iptw = KaplanMeierFitter()

kmf_high_pembro_024_iptw.fit(
    key_024_iptw_high.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_high.query('pembro == 1').pfs_status,
    weights = key_024_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_024_iptw.fit(
    key_024_iptw_high.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_high.query('pembro == 0').pfs_status,
    weights = key_024_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_024_iptw = KaplanMeierFitter()
kmf_all_plat_024_iptw = KaplanMeierFitter()

kmf_all_pembro_024_iptw.fit(
    key_024_iptw_all.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_all.query('pembro == 1').pfs_status,
    weights = key_024_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_024_iptw.fit(
    key_024_iptw_all.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_all.query('pembro == 0').pfs_status,
    weights = key_024_iptw_all.query('pembro == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 1652.53 total observations, 326.856 right-censored observations>

#### Calculating survival metrics 

In [243]:
pembro_024_median_os = mos(kmf_low_pembro_024_iptw,
                           kmf_med_pembro_024_iptw,
                           kmf_high_pembro_024_iptw,
                           kmf_all_pembro_024_iptw)

plat_024_median_os = mos(kmf_low_plat_024_iptw,
                         kmf_med_plat_024_iptw,
                         kmf_high_plat_024_iptw,
                         kmf_all_plat_024_iptw)

In [244]:
key_024_iptw_all_imputed = key_024_iptw_all.copy()
key_024_iptw_all_imputed['albumin_diag'] = key_024_iptw_all_imputed['albumin_diag'].fillna(key_024_iptw_all_imputed['albumin_diag'].median())
key_024_iptw_all_imputed['weight_pct_change'] = key_024_iptw_all_imputed['weight_pct_change'].fillna(key_024_iptw_all_imputed['weight_pct_change'].median())

In [245]:
key024_hr_all = CoxPHFitter()
key024_hr_all.fit(key_024_iptw_all_imputed,
                  duration_col = 'time_prog_treatment',
                  event_col = 'pfs_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + commercial + medicare + medicaid + ecog_2 + albumin_diag + weight_pct_change + risk_score', 
                  weights_col = 'weight', 
                  robust = True)

<lifelines.CoxPHFitter: fitted with 3556.29 total observations, 776.792 right-censored observations>

In [246]:
key024_all_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_all,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [247]:
key024_low_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_low,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [248]:
key024_med_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_med,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [249]:
key024_high_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_high,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [250]:
keynote_024_data = [
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'low', 
     'r_trt_mos': pembro_024_median_os[0],
     'r_trt_mos_95': key024_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[0],
     'r_cont_mos_95': key024_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[0] - plat_024_median_os[0], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'trt_rmst': restricted_mean_survival_time(kmf_low_pembro_024_iptw, 18),
     'trt_rmst_95': key024_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_plat_024_iptw, 18),
     'cont_rmst_95': key024_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_low_plat_024_iptw, 18),
     'diff_rmst_95': key024_low_rmst_mos_95.difference_rmst_95,
     'rcount': key_024.query('risk_score <= @low_cutoff_024').shape[0],
     'rcount_chemo': key_024.query('risk_score <= @low_cutoff_024').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'medium', 
     'r_trt_mos': pembro_024_median_os[1],
     'r_trt_mos_95': key024_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[1],
     'r_cont_mos_95': key024_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[1] - plat_024_median_os[1], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'trt_rmst': restricted_mean_survival_time(kmf_med_pembro_024_iptw, 18),
     'trt_rmst_95': key024_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_plat_024_iptw, 18),
     'cont_rmst_95': key024_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_med_plat_024_iptw, 18),
     'diff_rmst_95': key024_med_rmst_mos_95.difference_rmst_95,
     'rcount': key_024.query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0],
     'rcount_chemo': key_024.query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'high', 
     'r_trt_mos': pembro_024_median_os[2],
     'r_trt_mos_95': key024_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[2],
     'r_cont_mos_95': key024_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[2] - plat_024_median_os[2], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'trt_rmst': restricted_mean_survival_time(kmf_high_pembro_024_iptw, 18),
     'trt_rmst_95': key024_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_plat_024_iptw, 18),
     'cont_rmst_95': key024_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_high_plat_024_iptw, 18),
     'diff_rmst_95': key024_high_rmst_mos_95.difference_rmst_95,
     'rcount': key_024.query('risk_score >= @high_cutoff_024').shape[0],
     'rcount_chemo': key_024.query('risk_score >= @high_cutoff_024').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'all', 
     'r_hr': key024_hr_all.hazard_ratios_['pembro'],
     'r_hr_95': [key024_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key024_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     'r_trt_mos': pembro_024_median_os[3],
     'r_trt_mos_95': key024_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[3],
     'r_cont_mos_95': key024_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[3] - plat_024_median_os[3], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'rcount': key_024.shape[0], 
     'rcount_chemo': key_024.query('pembro == 0').shape[0]}
]

### KEYNOTE-189: First-line pembrolizumab plus chemotherapy vs. chemotherapy

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line pemobrolizumab plus platinum-based chemotherapy or platinum-based chemotherapy
* Received appropriate dose of platinum-based chemotherapy
* EGFR and ALK negative

#### Pembrolizumab + chemotherapy 

In [251]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [252]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [253]:
line_therapy[line_therapy['LineName'].str.contains('Pemetrexed')].LineName.value_counts().head(10)

Carboplatin,Pemetrexed                              6700
Carboplatin,Pembrolizumab,Pemetrexed                5113
Pemetrexed                                          4711
Bevacizumab,Carboplatin,Pemetrexed                  3459
Bevacizumab,Pemetrexed                              1569
Pembrolizumab,Pemetrexed                            1554
Cisplatin,Pemetrexed                                 811
Bevacizumab,Cisplatin,Pemetrexed                     174
Abiraterone,Carboplatin,Pembrolizumab,Pemetrexed     100
Bevacizumab-Awwb,Carboplatin,Pemetrexed               82
Name: LineName, dtype: int64

In [254]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [255]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno_wout_pembro = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab']

targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [256]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Pembrolizumab,Pemetrexed                     4275
Carboplatin,Paclitaxel,Pembrolizumab                      803
Carboplatin,Paclitaxel Protein-Bound,Pembrolizumab        534
Abiraterone,Carboplatin,Pembrolizumab,Pemetrexed           79
Carboplatin,Cyclophosphamide,Pembrolizumab,Pemetrexed      31
Cisplatin,Pembrolizumab,Pemetrexed                         27
Carboplatin,Pembrolizumab                                  19
Carboplatin,Docetaxel,Pembrolizumab                        18
Carboplatin,Paclitaxel,Pembrolizumab,Pemetrexed            18
Bevacizumab,Carboplatin,Pembrolizumab,Pemetrexed           10
Name: LineName, dtype: int64

In [257]:
key189_carb_pembro = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Carboplatin') & 
                    line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [258]:
key189_carb_pembro.loc[:, 'carb'] = 1

In [259]:
key189_cis_pembro = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Cisplatin') & 
                    line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [260]:
key189_cis_pembro.loc[:, 'carb'] = 0

In [261]:
key189_pembro = pd.concat([key189_carb_pembro, key189_cis_pembro])

In [262]:
key189_pembro.loc[:, 'pembro'] = 1

In [263]:
row_ID(key189_pembro)

(5878, 5878)

In [264]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_189 = (
    line_therapy[line_therapy['PatientID'].isin(key189_pembro.PatientID)])

In [265]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_189_xcross = (
    line_therapy_pembro_189[line_therapy_pembro_189['LineName'].str.contains('|'.join(targeted))].PatientID)

In [266]:
# Select patients who don't receive targeted therapy in future lines.
key189_pembro = key189_pembro[~key189_pembro['PatientID'].isin(pembro_189_xcross)]

In [267]:
row_ID(key189_pembro)

(5673, 5673)

#### Platinum-based chemotherapy

In [268]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [269]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [270]:
key189_carb = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Carboplatin') & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [271]:
key189_carb.loc[:, 'carb'] = 1

In [272]:
key189_cis = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('Cisplatin') & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [273]:
key189_cis.loc[:, 'carb'] = 0

In [274]:
key189_plat = pd.concat([key189_carb, key189_cis])

In [275]:
key189_plat.loc[:, 'pembro'] = 0

In [276]:
row_ID(key189_plat)

(25861, 25861)

In [277]:
# Dataframe of all therapies received for those receiving first line platinum regimen 
line_therapy_plat_189 = (
    line_therapy[line_therapy['PatientID'].isin(key189_plat.PatientID)])

In [278]:
# Patients receiving platinum therapy who later recieve targeted therapy
plat_189_xcross = (
    line_therapy_plat_189[line_therapy_plat_189['LineName'].str.contains('|'.join(targeted))].PatientID)

In [279]:
# Select patients who don't receive targeted therapy
key189_plat = key189_plat[~key189_plat['PatientID'].isin(plat_189_xcross)]

In [280]:
row_ID(key189_plat)

(23800, 23800)

In [281]:
key_189 = pd.concat([key189_pembro, key189_plat])

In [282]:
row_ID(key_189)

(29473, 29473)

#### Platinum-based chemotherapy dosing

#### Carboplatin

In [283]:
med_order = pd.read_csv('MedicationOrder.csv', low_memory = False)

In [284]:
med_order['ExpectedStartDate'] = np.where(med_order['ExpectedStartDate'].isna(), 
                                          med_order['OrderedDate'], 
                                          med_order['ExpectedStartDate'])

In [285]:
med_order.loc[:, 'ExpectedStartDate'] = pd.to_datetime(med_order['ExpectedStartDate'])

In [286]:
key_189.loc[:, 'StartDate'] = pd.to_datetime(key_189['StartDate'])

In [287]:
med_order_carb = (
    med_order[med_order['PatientID'].isin(key_189.query('carb == 1').PatientID)]
    .query('CommonDrugName == "carboplatin"')
)

In [288]:
med_order_carb.shape

(176309, 18)

In [289]:
med_order_carb = pd.merge(med_order_carb, 
                          key_189.query('carb == 1')[['PatientID', 'StartDate']], 
                          on = 'PatientID', 
                          how = 'left')

In [290]:
med_order_carb.shape

(176309, 19)

In [291]:
med_order_carb.loc[:, 'date_diff'] = (med_order_carb['ExpectedStartDate'] - med_order_carb['StartDate']).dt.days.abs()

In [292]:
med_order_carb = med_order_carb.query('date_diff <= 14')

In [293]:
carb_index = med_order_carb.groupby('PatientID')['date_diff'].idxmin()

In [294]:
carb_dose = med_order_carb.loc[carb_index].query('RelativeOrderedUnits == "AUC"')[['PatientID', 'RelativeOrderedAmount']]

In [295]:
carb_dose = carb_dose.rename(columns = {'RelativeOrderedAmount': 'carb_dose_auc'})

In [296]:
carb_IDs = carb_dose.query('carb_dose_auc >= 5').PatientID

#### Cisplatin

In [297]:
med_order_cis = (
    med_order[med_order['PatientID'].isin(key_189.query('carb == 0').PatientID)]
    .query('CommonDrugName == "cisplatin"')
)

In [298]:
med_order_cis.shape

(10685, 18)

In [299]:
med_order_cis = pd.merge(med_order_cis,
                         key_189.query('carb == 0')[['PatientID', 'StartDate']], 
                         on = 'PatientID', 
                         how = 'left')

In [300]:
med_order_cis.shape

(10685, 19)

In [301]:
med_order_cis.loc[:, 'date_diff'] = (med_order_cis['ExpectedStartDate'] - med_order_cis['StartDate']).dt.days.abs()

In [302]:
med_order_cis = med_order_cis.query('date_diff <= 14')

In [303]:
cis_index = med_order_cis.groupby('PatientID')['date_diff'].idxmin()

In [304]:
cis_dose = med_order_cis.loc[cis_index].query('RelativeOrderedUnits == "mg/m2"')[['PatientID', 'RelativeOrderedAmount']]

In [305]:
cis_dose = cis_dose.rename(columns = {'RelativeOrderedAmount': 'cis_dose_mgm2'})

In [306]:
cis_IDs = cis_dose.query('cis_dose_mgm2 >= 75').PatientID

In [307]:
key_189 = key_189[key_189['PatientID'].isin(pd.concat([carb_IDs, cis_IDs]))]

In [308]:
key_189 = key_189.drop(columns = ['carb'])

In [309]:
row_ID(key_189)

(15880, 15880)

In [310]:
key_189 = pd.merge(key_189, df_full, on = 'PatientID', how = 'left')

In [311]:
row_ID(key_189)

(15880, 15880)

#### Time from treatment to death or censor

In [312]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [313]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [314]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [315]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [316]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [317]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [318]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [319]:
key_189 = pd.merge(key_189, mortality, on = 'PatientID', how = 'left')

In [320]:
len(key_189)

15880

In [321]:
conditions = [
    (key_189['death_status'] == 1),
    (key_189['death_status'] == 0)]

choices = [
    (key_189['death_date'] - key_189['StartDate']).dt.days,
    (key_189['last_activity'] - key_189['StartDate']).dt.days]

key_189.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [322]:
key_189 = key_189.query('timerisk_treatment >= 0')

#### Patient count

In [323]:
key_189 = (
    key_189
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [324]:
low_cutoff_189 = cutoff.loc['keynote_189'].low

In [325]:
high_cutoff_189 = cutoff.loc['keynote_189'].high

In [326]:
print('Pembro + chemo total:',  key_189.query('pembro == 1').shape[0])
print('High risk:', key_189.query('pembro == 1').query('risk_score >= @high_cutoff_189').shape[0])
print('Med risk:', key_189.query('pembro == 1').query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0])
print('Low risk:', key_189.query('pembro == 1').query('risk_score <= @low_cutoff_189').shape[0])

Pembro + chemo total: 4048
High risk: 1435
Med risk: 1312
Low risk: 1301


In [327]:
print('Platinum total:',  key_189.query('pembro == 0').shape[0])
print('High risk:', key_189.query('pembro == 0').query('risk_score >= @high_cutoff_189').shape[0])
print('Med risk:', key_189.query('pembro == 0').query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0])
print('Low risk:', key_189.query('pembro == 0').query('risk_score <= @low_cutoff_189').shape[0])

Platinum total: 11581
High risk: 4106
Med risk: 3951
Low risk: 3524


#### Survival curves with covariate balancing

In [328]:
key_189 = key_189.set_index('PatientID')

In [329]:
key_189_iptw = key_189.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [330]:
key_189_iptw['met_cat'] = pd.cut(key_189_iptw['adv_year'],
                                 bins = [2010, 2018, float('inf')],
                                 labels = ['11-18', '19-22'])

In [331]:
conditions = [
    ((key_189_iptw['pdl1'] == "1-49%") | (key_189_iptw['pdl1'] == "50-100%"))]

choices = ['>0%']

key_189_iptw['pdl1_cat'] = np.select(conditions, choices, default = key_189_iptw['pdl1'])

In [332]:
conditions = [
    ((key_189_iptw['ecog_diagnosis'] == "1.0") | (key_189_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_189_iptw['ecog_diagnosis'] == "2.0") | (key_189_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_189_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [333]:
key_189_iptw.dtypes

death_status               bool
timerisk_treatment      float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1                     object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
pdl1_cat                 object
ecog_2                   object
dtype: object

In [334]:
to_be_categorical = list(key_189_iptw.select_dtypes(include = ['object']).columns)

In [335]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1',
 'pdl1_cat',
 'ecog_2']

In [336]:
to_be_categorical.append('met_cat')

In [337]:
to_be_categorical.remove('pdl1')

In [338]:
to_be_categorical.remove('ecog_diagnosis')

In [339]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_189_iptw[x] = key_189_iptw[x].astype('category')

In [340]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [341]:
# List of categorical features.
categorical_features = list(key_189_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [342]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [343]:
key_189_iptw_low = (
    key_189_iptw
    .query('risk_score <= @low_cutoff_189'))

key_189_iptw_med = (
    key_189_iptw
    .query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189'))

key_189_iptw_high = (
    key_189_iptw
    .query('risk_score >= @high_cutoff_189'))

key_189_iptw_all = key_189_iptw

In [344]:
key_189_low_x = preprocessor.fit_transform(key_189_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

key_189_med_x = preprocessor.fit_transform(key_189_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

key_189_high_x = preprocessor.fit_transform(key_189_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'delta_adv_diagnosis',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'pdl1_cat', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_189_all_x = preprocessor.fit_transform(key_189_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [345]:
lr_189_low = LogisticRegression(max_iter = 1000)
lr_189_low.fit(key_189_low_x, key_189_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [346]:
lr_189_med = LogisticRegression(max_iter = 1000)
lr_189_med.fit(key_189_med_x, key_189_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [347]:
lr_189_high = LogisticRegression(max_iter = 1000)
lr_189_high.fit(key_189_high_x, key_189_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [348]:
lr_189_all = LogisticRegression(max_iter = 1000)
lr_189_all.fit(key_189_all_x, key_189_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [349]:
pred_low = lr_189_low.predict_proba(key_189_low_x)
pred_med = lr_189_med.predict_proba(key_189_med_x)
pred_high = lr_189_high.predict_proba(key_189_high_x)
pred_all = lr_189_all.predict_proba(key_189_all_x)

In [350]:
key_189_iptw_low['ps'] = pred_low[:, 1]
key_189_iptw_med['ps'] = pred_med[:, 1]
key_189_iptw_high['ps'] = pred_high[:, 1]
key_189_iptw_all['ps'] = pred_all[:, 1]

In [351]:
key_189_iptw_low['weight'] = (
    np.where(key_189_iptw_low['pembro'] == 1, 1/key_189_iptw_low['ps'], 1/(1 - key_189_iptw_low['ps'])))

key_189_iptw_med['weight'] = (
    np.where(key_189_iptw_med['pembro'] == 1, 1/key_189_iptw_med['ps'], 1/(1 - key_189_iptw_med['ps'])))

key_189_iptw_high['weight'] = (
    np.where(key_189_iptw_high['pembro'] == 1, 1/key_189_iptw_high['ps'], 1/(1 - key_189_iptw_high['ps'])))

key_189_iptw_all['weight'] = (
    np.where(key_189_iptw_all['pembro'] == 1, 1/key_189_iptw_all['ps'], 1/(1 - key_189_iptw_all['ps'])))

In [352]:
# Low KM curves
kmf_low_pembro_189_iptw = KaplanMeierFitter()
kmf_low_plat_189_iptw = KaplanMeierFitter()

kmf_low_pembro_189_iptw.fit(
    key_189_iptw_low.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_low.query('pembro == 1').death_status,
    weights = key_189_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_189_iptw.fit(
    key_189_iptw_low.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_low.query('pembro == 0').death_status,
    weights = key_189_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_189_iptw = KaplanMeierFitter()
kmf_med_plat_189_iptw = KaplanMeierFitter()

kmf_med_pembro_189_iptw.fit(
    key_189_iptw_med.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_med.query('pembro == 1').death_status,
    weights = key_189_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_189_iptw.fit(
    key_189_iptw_med.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_med.query('pembro == 0').death_status,
    weights = key_189_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_189_iptw = KaplanMeierFitter()
kmf_high_plat_189_iptw = KaplanMeierFitter()

kmf_high_pembro_189_iptw.fit(
    key_189_iptw_high.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_high.query('pembro == 1').death_status,
    weights = key_189_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_189_iptw.fit(
    key_189_iptw_high.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_high.query('pembro == 0').death_status,
    weights = key_189_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_189_iptw = KaplanMeierFitter()
kmf_all_plat_189_iptw = KaplanMeierFitter()

kmf_all_pembro_189_iptw.fit(
    key_189_iptw_all.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_all.query('pembro == 1').death_status,
    weights = key_189_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_189_iptw.fit(
    key_189_iptw_all.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_all.query('pembro == 0').death_status,
    weights = key_189_iptw_all.query('pembro == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 17164.6 total observations, 4765.47 right-censored observations>

#### Calculating survival metrics

In [353]:
pembro_189_median_os = mos(kmf_low_pembro_189_iptw,
                           kmf_med_pembro_189_iptw,
                           kmf_high_pembro_189_iptw,
                           kmf_all_pembro_189_iptw)

plat_189_median_os = mos(kmf_low_plat_189_iptw,
                         kmf_med_plat_189_iptw,
                         kmf_high_plat_189_iptw,
                         kmf_all_plat_189_iptw)

In [354]:
key_189_iptw_all_imputed = key_189_iptw_all.copy()
key_189_iptw_all_imputed['albumin_diag'] = key_189_iptw_all_imputed['albumin_diag'].fillna(key_189_iptw_all_imputed['albumin_diag'].median())
key_189_iptw_all_imputed['weight_pct_change'] = key_189_iptw_all_imputed['weight_pct_change'].fillna(key_189_iptw_all_imputed['weight_pct_change'].median())

In [355]:
key189_hr_all = CoxPHFitter()
key189_hr_all.fit(key_189_iptw_all_imputed,
                  duration_col = 'timerisk_treatment',
                  event_col = 'death_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_cat + albumin_diag + weight_pct_change + risk_score',
                  weights_col = 'weight',
                  robust = True)

<lifelines.CoxPHFitter: fitted with 31131.6 total observations, 9999.32 right-censored observations>

In [356]:
key189_all_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_all,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [357]:
key189_low_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_low,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [358]:
key189_med_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_med,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [359]:
key189_high_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_high,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [360]:
keynote_189_data = [
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'low', 
     'r_trt_mos': pembro_189_median_os[0],
     'r_trt_mos_95': key189_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[0],
     'r_cont_mos_95': key189_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[0] - plat_189_median_os[0], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'trt_rmst': restricted_mean_survival_time(kmf_low_pembro_189_iptw, 36),
     'trt_rmst_95': key189_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_plat_189_iptw, 36),
     'cont_rmst_95': key189_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_low_plat_189_iptw, 36),
     'diff_rmst_95': key189_low_rmst_mos_95.difference_rmst_95,
     'rcount': key_189.query('risk_score <= @low_cutoff_189').shape[0],
     'rcount_chemo': key_189.query('risk_score <= @low_cutoff_189').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'medium', 
     'r_trt_mos': pembro_189_median_os[1],
     'r_trt_mos_95': key189_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[1],
     'r_cont_mos_95': key189_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[1] - plat_189_median_os[1], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'trt_rmst': restricted_mean_survival_time(kmf_med_pembro_189_iptw, 36),
     'trt_rmst_95': key189_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_plat_189_iptw, 36),
     'cont_rmst_95': key189_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_med_plat_189_iptw, 36),
     'diff_rmst_95': key189_med_rmst_mos_95.difference_rmst_95,
     'rcount': key_189.query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0],
     'rcount_chemo': key_189.query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'high', 
     'r_trt_mos': pembro_189_median_os[2],
     'r_trt_mos_95': key189_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[2],
     'r_cont_mos_95': key189_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[2] - plat_189_median_os[2], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'trt_rmst': restricted_mean_survival_time(kmf_high_pembro_189_iptw, 36),
     'trt_rmst_95': key189_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_plat_189_iptw, 36),
     'cont_rmst_95': key189_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_high_plat_189_iptw, 36),
     'diff_rmst_95': key189_high_rmst_mos_95.difference_rmst_95,
     'rcount': key_189.query('risk_score >= @high_cutoff_189').shape[0],
     'rcount_chemo': key_189.query('risk_score >= @high_cutoff_189').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'all', 
     'r_hr': key189_hr_all.hazard_ratios_['pembro'],
     'r_hr_95': [key189_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key189_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     'r_trt_mos': pembro_189_median_os[3],
     'r_trt_mos_95': key189_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[3],
     'r_cont_mos_95': key189_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[3] - plat_189_median_os[3], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'rcount': key_189.shape[0], 
     'rcount_chemo': key_189.query('pembro == 0').shape[0]}
]

### CHECKMATE-078: Second-line nivolumab vs. docetaxel

**INCLUSION CRITERIA**
* Advanced or metastatic NSCLC 
* Progressed on first line platinum-based chemotherapy 
* Received second line nivolumab or docetaxel
* Received appropriate dose of docetaxel
* Prior treatments with docetaxel or immunotherapy contraindicated
* EGFR and ALK negative 

#### Nivolumab 

In [361]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [362]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [363]:
line_therapy_fl = (
    line_therapy[line_therapy.PatientID.isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [364]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [365]:
immunotherapy = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [366]:
fl_plat = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('Carboplatin|Cisplatin')
     & ~line_therapy_fl['LineName'].str.contains('Docetaxel')
     & ~line_therapy_fl['LineName'].str.contains('|'.join(targeted))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(immunotherapy))]
    .PatientID
)

In [367]:
checkmate_nivo = (
    line_therapy[line_therapy.PatientID.isin(fl_plat)]
    .query('LineNumber == 2')
    .query('LineName == "Nivolumab"')
    [['PatientID', 'StartDate']]
)

In [368]:
checkmate_nivo.loc[:, 'nivo'] = 1

In [369]:
row_ID(checkmate_nivo)

(3397, 3397)

#### Docetaxel

In [370]:
checkmate_dotx = (
    line_therapy[line_therapy.PatientID.isin(fl_plat)]
    .query('LineNumber == 2')
    .query('LineName == "Docetaxel"')
    [['PatientID', 'StartDate']]
)

In [371]:
checkmate_dotx.loc[:, 'nivo'] = 0

In [372]:
row_ID(checkmate_dotx)

(745, 745)

#### Docetaxel dosing

In [373]:
med_order = pd.read_csv('MedicationOrder.csv', low_memory = False)

In [374]:
med_order['ExpectedStartDate'] = np.where(med_order['ExpectedStartDate'].isna(), 
                                          med_order['OrderedDate'], 
                                          med_order['ExpectedStartDate'])

In [375]:
med_order.loc[:, 'ExpectedStartDate'] = pd.to_datetime(med_order['ExpectedStartDate'])

In [376]:
checkmate_dotx.loc[:, 'StartDate'] = pd.to_datetime(checkmate_dotx['StartDate'])

In [377]:
med_order_dotx = (
    med_order[med_order['PatientID'].isin(checkmate_dotx.PatientID)]
    .query('CommonDrugName == "docetaxel"')
)

In [378]:
med_order_dotx.shape

(4537, 18)

In [379]:
med_order_dotx = pd.merge(med_order_dotx, 
                          checkmate_dotx[['PatientID', 'StartDate']], 
                          on = 'PatientID', 
                          how = 'left')

In [380]:
med_order_dotx.shape

(4537, 19)

In [381]:
med_order_dotx.loc[:, 'date_diff'] = (med_order_dotx['ExpectedStartDate'] - med_order_dotx['StartDate']).dt.days.abs()

In [382]:
med_order_dotx = med_order_dotx.query('date_diff <= 14')

In [383]:
dotx_index = med_order_dotx.groupby('PatientID')['date_diff'].idxmin()

In [384]:
dotx_dose = med_order_dotx.loc[dotx_index].query('RelativeOrderedUnits == "mg/m2"')[['PatientID', 'RelativeOrderedAmount']]

In [385]:
dotx_dose = dotx_dose.rename(columns = {'RelativeOrderedAmount': 'dotx_dose_mgm2'})

In [386]:
dotx_IDs = dotx_dose.query('dotx_dose_mgm2 >= 75').PatientID

In [387]:
checkmate_dotx = checkmate_dotx[checkmate_dotx['PatientID'].isin(dotx_IDs)]

#### Combining docetaxel and nivolumab dataframes 

In [388]:
checkmate = pd.concat([checkmate_nivo, checkmate_dotx])

In [389]:
row_ID(checkmate)

(3854, 3854)

In [390]:
checkmate = pd.merge(checkmate, df_full, on = 'PatientID', how = 'left')

In [391]:
row_ID(checkmate)

(3854, 3854)

In [392]:
checkmate['StartDate'] = pd.to_datetime(checkmate['StartDate'])

#### Time from treatment to death or censor 

In [393]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [394]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [395]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [396]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [397]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [398]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [399]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [400]:
checkmate = pd.merge(checkmate, mortality, on = 'PatientID', how = 'left')

In [401]:
row_ID(checkmate)

(3854, 3854)

In [402]:
conditions = [
    (checkmate['death_status'] == 1),
    (checkmate['death_status'] == 0)]

choices = [
    (checkmate['death_date'] - checkmate['StartDate']).dt.days,
    (checkmate['last_activity'] - checkmate['StartDate']).dt.days]

checkmate.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [403]:
checkmate = checkmate.query('timerisk_treatment >= 0')

#### Patient count 

In [404]:
checkmate = (
    checkmate
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [405]:
low_cutoff_078 = cutoff.loc['checkmate_078'].low

In [406]:
high_cutoff_078 = cutoff.loc['checkmate_078'].high

In [407]:
print('Nivolumab total:',  checkmate.query('nivo == 1').shape[0])
print('High risk:', checkmate.query('nivo == 1').query('risk_score >= @high_cutoff_078').shape[0])
print('Med risk:', checkmate.query('nivo == 1').query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0])
print('Low risk:', checkmate.query('nivo == 1').query('risk_score <= @low_cutoff_078').shape[0])

Nivolumab total: 3339
High risk: 1112
Med risk: 1114
Low risk: 1113


In [408]:
print('Docetaxel total:',  checkmate.query('nivo == 0').shape[0])
print('High risk:', checkmate.query('nivo == 0').query('risk_score >= @high_cutoff_078').shape[0])
print('Med risk:', checkmate.query('nivo == 0').query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0])
print('Low risk:', checkmate.query('nivo == 0').query('risk_score <= @low_cutoff_078').shape[0])

Docetaxel total: 450
High risk: 145
Med risk: 155
Low risk: 150


#### Survival curves with covariate balancing

In [409]:
checkmate = checkmate.set_index('PatientID')

In [410]:
check_iptw = checkmate.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'nivo',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [411]:
check_iptw['met_cat'] = pd.cut(check_iptw['adv_year'],
                               bins = [2010, 2015, float('inf')],
                               labels = ['11-15', '16-20'])

In [412]:
conditions = [
    ((check_iptw['pdl1'] == "1-49%") | (check_iptw['pdl1'] == "50-100%"))]

choices = ['>0%']

check_iptw['pdl1_cat'] = np.select(conditions, choices, default = check_iptw['pdl1'])

In [413]:
conditions = [
    ((check_iptw['ecog_diagnosis'] == "1.0") | (check_iptw['ecog_diagnosis'] == "0.0")),  
    ((check_iptw['ecog_diagnosis'] == "2.0") | (check_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

check_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [414]:
check_iptw.dtypes

death_status               bool
timerisk_treatment      float64
nivo                      int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1                     object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
pdl1_cat                 object
ecog_2                   object
dtype: object

In [415]:
to_be_categorical = list(check_iptw.select_dtypes(include = ['object']).columns)

In [416]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1',
 'pdl1_cat',
 'ecog_2']

In [417]:
to_be_categorical.append('met_cat')

In [418]:
to_be_categorical.remove('pdl1')

In [419]:
to_be_categorical.remove('ecog_diagnosis')

In [420]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    check_iptw[x] = check_iptw[x].astype('category')

In [421]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [422]:
# List of categorical features.
categorical_features = list(check_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [423]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [424]:
check_iptw_low = (
    check_iptw
    .query('risk_score <= @low_cutoff_078'))

check_iptw_med = (
    check_iptw
    .query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078'))

check_iptw_high = (
    check_iptw
    .query('risk_score >= @high_cutoff_078'))

check_iptw_all = check_iptw

In [425]:
check_low_x = preprocessor.fit_transform(check_iptw_low.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

check_med_x = preprocessor.fit_transform(check_iptw_med.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

check_high_x = preprocessor.fit_transform(check_iptw_high.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'Histology',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2',
                                                                          'pdl1_cat',
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

check_all_x = preprocessor.fit_transform(check_iptw_all.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

In [426]:
lr_check_low = LogisticRegression(max_iter = 1000)
lr_check_low.fit(check_low_x, check_iptw_low['nivo'])

LogisticRegression(max_iter=1000)

In [427]:
lr_check_med = LogisticRegression(max_iter = 1000)
lr_check_med.fit(check_med_x, check_iptw_med['nivo'])

LogisticRegression(max_iter=1000)

In [428]:
lr_check_high = LogisticRegression(max_iter = 1000)
lr_check_high.fit(check_high_x, check_iptw_high['nivo'])

LogisticRegression(max_iter=1000)

In [429]:
lr_check_all = LogisticRegression(max_iter = 1000)
lr_check_all.fit(check_all_x, check_iptw_all['nivo'])

LogisticRegression(max_iter=1000)

In [430]:
pred_low = lr_check_low.predict_proba(check_low_x)
pred_med = lr_check_med.predict_proba(check_med_x)
pred_high = lr_check_high.predict_proba(check_high_x)
pred_all = lr_check_all.predict_proba(check_all_x)

In [431]:
check_iptw_low['ps'] = pred_low[:, 1]
check_iptw_med['ps'] = pred_med[:, 1]
check_iptw_high['ps'] = pred_high[:, 1]
check_iptw_all['ps'] = pred_all[:, 1]

In [432]:
check_iptw_low['weight'] = (
    np.where(check_iptw_low['nivo'] == 1, 1/check_iptw_low['ps'], 1/(1 - check_iptw_low['ps'])))

check_iptw_med['weight'] = (
    np.where(check_iptw_med['nivo'] == 1, 1/check_iptw_med['ps'], 1/(1 - check_iptw_med['ps'])))

check_iptw_high['weight'] = (
    np.where(check_iptw_high['nivo'] == 1, 1/check_iptw_high['ps'], 1/(1 - check_iptw_high['ps'])))

check_iptw_all['weight'] = (
    np.where(check_iptw_all['nivo'] == 1, 1/check_iptw_all['ps'], 1/(1 - check_iptw_all['ps'])))

In [433]:
# Low KM curves
kmf_low_nivo_check_iptw = KaplanMeierFitter()
kmf_low_dotx_check_iptw = KaplanMeierFitter()

kmf_low_nivo_check_iptw.fit(
    check_iptw_low.query('nivo == 1').timerisk_treatment/30,
    check_iptw_low.query('nivo == 1').death_status,
    weights = check_iptw_low.query('nivo == 1')['weight'])

kmf_low_dotx_check_iptw.fit(
    check_iptw_low.query('nivo == 0').timerisk_treatment/30,
    check_iptw_low.query('nivo == 0').death_status,
    weights = check_iptw_low.query('nivo == 0')['weight'])

# Med KM curves
kmf_med_nivo_check_iptw = KaplanMeierFitter()
kmf_med_dotx_check_iptw = KaplanMeierFitter()

kmf_med_nivo_check_iptw.fit(
    check_iptw_med.query('nivo == 1').timerisk_treatment/30,
    check_iptw_med.query('nivo == 1').death_status,
    weights = check_iptw_med.query('nivo == 1')['weight'])

kmf_med_dotx_check_iptw.fit(
    check_iptw_med.query('nivo == 0').timerisk_treatment/30,
    check_iptw_med.query('nivo == 0').death_status,
    weights = check_iptw_med.query('nivo == 0')['weight'])

# High KM curves 
kmf_high_nivo_check_iptw = KaplanMeierFitter()
kmf_high_dotx_check_iptw = KaplanMeierFitter()

kmf_high_nivo_check_iptw.fit(
    check_iptw_high.query('nivo == 1').timerisk_treatment/30,
    check_iptw_high.query('nivo == 1').death_status,
    weights = check_iptw_high.query('nivo == 1')['weight'])

kmf_high_dotx_check_iptw.fit(
    check_iptw_high.query('nivo == 0').timerisk_treatment/30,
    check_iptw_high.query('nivo == 0').death_status,
    weights = check_iptw_high.query('nivo == 0')['weight'])

# All KM curves 
kmf_all_nivo_check_iptw = KaplanMeierFitter()
kmf_all_dotx_check_iptw = KaplanMeierFitter()

kmf_all_nivo_check_iptw.fit(
    check_iptw_all.query('nivo == 1').timerisk_treatment/30,
    check_iptw_all.query('nivo == 1').death_status,
    weights = check_iptw_all.query('nivo == 1')['weight'])

kmf_all_dotx_check_iptw.fit(
    check_iptw_all.query('nivo == 0').timerisk_treatment/30,
    check_iptw_all.query('nivo == 0').death_status,
    weights = check_iptw_all.query('nivo == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 4034.13 total observations, 575.862 right-censored observations>

#### Calculating survival metrics 

In [434]:
nivo_check_median_os = mos(kmf_low_nivo_check_iptw, 
                           kmf_med_nivo_check_iptw,
                           kmf_high_nivo_check_iptw,
                           kmf_all_nivo_check_iptw)

dotx_check_median_os = mos(kmf_low_dotx_check_iptw,
                           kmf_med_dotx_check_iptw,
                           kmf_high_dotx_check_iptw,
                           kmf_all_dotx_check_iptw)

In [435]:
check_iptw_all_imputed = check_iptw_all.copy()
check_iptw_all_imputed['albumin_diag'] = check_iptw_all_imputed['albumin_diag'].fillna(check_iptw_all_imputed['albumin_diag'].median())
check_iptw_all_imputed['weight_pct_change'] = check_iptw_all_imputed['weight_pct_change'].fillna(check_iptw_all_imputed['weight_pct_change'].median())

In [436]:
check_hr_all = CoxPHFitter()
check_hr_all.fit(check_iptw_all_imputed,
                 duration_col = 'timerisk_treatment',
                 event_col = 'death_status',
                 formula = 'nivo + age + gender + race + PracticeType + Histology + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_cat + albumin_diag + weight_pct_change + risk_score',
                 weights_col = 'weight',
                 robust = True)

<lifelines.CoxPHFitter: fitted with 7817.87 total observations, 1276.35 right-censored observations>

In [437]:
check_all_rmst_mos_95 = rmst_mos_95ci(check_iptw_all,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [438]:
check_low_rmst_mos_95 = rmst_mos_95ci(check_iptw_low,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [439]:
check_med_rmst_mos_95 = rmst_mos_95ci(check_iptw_med,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [440]:
check_high_rmst_mos_95 = rmst_mos_95ci(check_iptw_high,
                                       1000,
                                       'nivo',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [441]:
check_data = [
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'low', 
     'r_trt_mos': nivo_check_median_os[0],
     'r_trt_mos_95': check_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[0],
     'r_cont_mos_95': check_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[0] - dotx_check_median_os[0], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'trt_rmst': restricted_mean_survival_time(kmf_low_nivo_check_iptw, 36),
     'trt_rmst_95': check_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_dotx_check_iptw, 36),
     'cont_rmst_95': check_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_low_dotx_check_iptw, 36),
     'diff_rmst_95': check_low_rmst_mos_95.difference_rmst_95,
     'rcount': checkmate.query('risk_score <= @low_cutoff_078').shape[0],
     'rcount_chemo': checkmate.query('risk_score <= @low_cutoff_078').query('nivo == 0').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'medium', 
     'r_trt_mos': nivo_check_median_os[1],
     'r_trt_mos_95': check_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[1],
     'r_cont_mos_95': check_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[1] - dotx_check_median_os[1], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'trt_rmst': restricted_mean_survival_time(kmf_med_nivo_check_iptw, 36),
     'trt_rmst_95': check_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_dotx_check_iptw, 36),
     'cont_rmst_95': check_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_med_dotx_check_iptw, 36),
     'diff_rmst_95': check_med_rmst_mos_95.difference_rmst_95,
     'rcount': checkmate.query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0],
     'rcount_chemo': checkmate.query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').query('nivo == 0').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'high', 
     'r_trt_mos': nivo_check_median_os[2],
     'r_trt_mos_95': check_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[2],
     'r_cont_mos_95': check_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[2] - dotx_check_median_os[2], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'trt_rmst': restricted_mean_survival_time(kmf_high_nivo_check_iptw, 36),
     'trt_rmst_95': check_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_dotx_check_iptw, 36),
     'cont_rmst_95': check_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_high_dotx_check_iptw, 36),
     'diff_rmst_95': check_high_rmst_mos_95.difference_rmst_95,
     'rcount': checkmate.query('risk_score >= @high_cutoff_078').shape[0],
     'rcount_chemo': checkmate.query('risk_score >= @high_cutoff_078').query('nivo == 0').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'all', 
     'r_hr': check_hr_all.hazard_ratios_['nivo'],
     'r_hr_95': [check_hr_all.summary.loc['nivo']['exp(coef) lower 95%'], check_hr_all.summary.loc['nivo']['exp(coef) upper 95%']],
     'r_trt_mos': nivo_check_median_os[3],
     'r_trt_mos_95': check_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[3],
     'r_cont_mos_95': check_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[3] - dotx_check_median_os[3], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'rcount': checkmate.shape[0], 
     'rcount_chemo': checkmate.query('nivo == 0').shape[0]}
]

## Part 3. Combining dictionaries

In [442]:
data_combined = keynote_042_data + keynote_024_data + keynote_189_data + check_data

In [443]:
rtrials_dc_mos = pd.DataFrame(data_combined)

In [444]:
rtrials_dc_mos

Unnamed: 0,trial_name,risk_group,r_trt_mos,r_trt_mos_95,r_cont_mos,r_cont_mos_95,r_mos_diff,rct_trt_arm,rct_cont_arm,rct_mos_diff,trt_rmst,trt_rmst_95,cont_rmst,cont_rmst_95,diff_rmst,diff_rmst_95,rcount,rcount_chemo,r_hr,r_hr_95
0,KEYNOTE-042,low,25.933333,"[22.7, 32.63333333333333]",23.166667,"[17.733333333333334, 27.433333333333334]",2.766667,16.7,12.1,4.6,22.885616,"[21.745064649363883, 24.008568671036507]",22.215395,"[19.074306747125835, 24.48511725015636]",0.670221,"[-1.9333075858791289, 3.8200100972148214]",1231,335,,
1,KEYNOTE-042,medium,14.533333,"[12.1, 18.066666666666666]",17.533333,"[11.133333333333333, 19.5]",-3.0,16.7,12.1,4.6,18.209362,"[16.975288037394186, 19.334500992671614]",17.985655,"[15.150986697134792, 20.31861500951813]",0.223707,"[-2.5591840204995857, 3.089148108280846]",1353,370,,
2,KEYNOTE-042,high,4.166667,"[3.4991666666666665, 4.866666666666666]",6.066667,"[4.329166666666667, 7.733333333333333]",-1.9,16.7,12.1,4.6,10.801809,"[9.871310661086603, 11.705978321942231]",10.241543,"[7.88274591118686, 12.694509533805606]",0.560267,"[-2.158321340616336, 3.0563082417678653]",1407,372,,
3,KEYNOTE-042,all,12.966667,"[11.6, 14.433333333333334]",12.5,"[10.965833333333334, 15.166666666666666]",0.466667,16.7,12.1,4.6,,,,,,,3991,1077,0.975164,"[0.8503047899525819, 1.1183572145368943]"
4,KEYNOTE-024,low,9.233333,"[7.7, 11.133333333333333]",5.833333,"[2.7, 9.7]",3.4,10.3,6.0,4.3,9.962931,"[9.384983391372538, 10.61093499679533]",8.140591,"[6.326049089881067, 10.017809795699154]",1.82234,"[-0.10107844848126439, 3.7305269952761444]",630,83,,
5,KEYNOTE-024,medium,5.466667,"[4.133333333333334, 6.4]",6.766667,"[4.265, 10.866666666666667]",-1.3,10.3,6.0,4.3,7.834143,"[7.247676746754834, 8.406502218941895]",8.847136,"[6.435105843681587, 11.181060579205107]",-1.012993,"[-3.388098756672839, 1.498227799152481]",644,81,,
6,KEYNOTE-024,high,2.233333,"[2.0, 2.4]",2.233333,"[1.4, 3.1333333333333333]",0.0,10.3,6.0,4.3,4.697469,"[4.216802184362788, 5.180623730712141]",3.149833,"[2.2991383622546797, 5.089488272920225]",1.547636,"[-0.4577096500220877, 2.505793031532148]",622,76,,
7,KEYNOTE-024,all,4.3,"[3.9, 5.166666666666667]",4.833333,"[3.1333333333333333, 6.133333333333334]",-0.533333,10.3,6.0,4.3,,,,,,,1896,240,0.852015,"[0.7241494406497687, 1.0024586250318155]"
8,KEYNOTE-189,low,23.866667,"[19.833333333333332, 28.366666666666667]",18.833333,"[17.933333333333334, 19.966666666666665]",5.033333,22.0,10.6,11.4,22.637211,"[21.544869832885926, 23.88071282856943]",20.535516,"[19.978771189111225, 21.10710706948151]",2.101695,"[0.8337600603596916, 3.5185753746409056]",4825,3524,,
9,KEYNOTE-189,medium,12.2,"[10.766666666666667, 14.133333333333333]",11.166667,"[10.433333333333334, 11.966666666666667]",1.033333,22.0,10.6,11.4,16.044271,"[14.916544719546808, 17.2613484743129]",15.224627,"[14.63087081644326, 15.802212537130716]",0.819644,"[-0.3526890544288134, 2.2082971504959144]",5263,3951,,


In [445]:
rtrials_dc_mos.to_csv('rtrials_dc_mos.csv', index = False)