# Flatiron Health aNSCLC: Survival metrics for key elgibility criteria
**Background: Calculate survival metrics for emulated trials involving patients meeting key elgibliity criteria. Hazard ratio for the full cohort is calculated from a Cox-IPTW model. Restricted mean survival time and median overall survival are calculated for phenotypes using an IPTW-adjusted KM curve.** 

## Part 1: Preprocessing

### 1.1 Import packages and create necessary functions

In [1]:
import numpy as np
import pandas as pd

from scipy import stats

from sksurv.nonparametric import kaplan_meier_estimator
from survive import KaplanMeier, SurvivalData

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.utils import median_survival_times, restricted_mean_survival_time
from lifelines.statistics import logrank_test

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

import warnings

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

In [3]:
# Find index for value closest to input value. 
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [4]:
# Calculates median overeall suvival for risk groups. 
def mos(low, med, high, comp):
    low_os = low.median_survival_time_
    med_os = med.median_survival_time_
    high_os = high.median_survival_time_
    comp_os = comp.median_survival_time_
    mos = [low_os, med_os, high_os, comp_os]
    return (mos)

In [5]:
def rmst_mos_95ci(df, num_samples, drug, event, items_list, numerical_features, rmst_time):
    
    """
    Estimate the 95% confidence interval for RMST and mOS using bootstrap resampling.

    Parameters:
    - df: DataFrame containing survival data
    - num_samples: Number of bootstrap samples
    - drug: Treatment indicator variable
    - event: Event type ('death' or 'progression')
    - items_list: Feature list for IPTW 
    - numerical_features: List of numerical features
    - rmst_time: Time to calculate RMST 

    Returns:
    - mos_A_95: mOS 95% CI for treatment
    - mos_B_95: mOS 95% CI for control
    - rmst_A_95: RMST 95% CI for treatment
    - rmst_B_95: RMST 95% CI for control
    - difference_rmst_95: RMST 95% CI for difference between treatment and control 
    """
    
    np.random.seed(42)
    mos_A = []
    mos_B = []
    rmst_A_list = []
    rmst_B_list = []
    differences_rmst = []
    
    # Define variables based on the event type
    if event == 'death':
        time_column = 'timerisk_treatment'
        status_column = 'death_status'
        
    else:
        time_column = 'time_prog_treatment'
        status_column = 'pfs_status'
        
    # Set up preprocessor for logistical regression which will be for IPTW  
    numerical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('std_scaler', StandardScaler())])
        
    categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')
    categorical_features = list(df.select_dtypes(include = ['category']).columns)
        
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder = 'passthrough')
    
    # Boostrap 
    for _ in range(num_samples):
        
        # Resample data with replacement
        resampled_df = resample(df).drop(columns = ['ps', 'weight'])
        
        # Calculated IPTW for the resampled group 
        df_x = preprocessor.fit_transform(resampled_df.filter(items = items_list))
                                           
        df_lr = LogisticRegression(max_iter = 1000)
        df_lr.fit(df_x, resampled_df[drug])
        warnings.filterwarnings("ignore")
                                           
        pred = df_lr.predict_proba(df_x)        
        resampled_df['ps'] = pred[:, 1]                          
        resampled_df['weight'] = (
                np.where(resampled_df[drug] == 1, 1/resampled_df['ps'], 1/(1 - resampled_df['ps'])))
    
        # mOS from IPTW-KM
        kmf_A = KaplanMeierFitter()
        kmf_A.fit(resampled_df.query(f'{drug} == 1')[time_column]/30,
                  resampled_df.query(f'{drug} == 1')[status_column], 
                  weights = resampled_df.query(f'{drug} == 1')['weight'])

        kmf_B = KaplanMeierFitter()
        kmf_B.fit(resampled_df.query(f'{drug} == 0')[time_column]/30,
                  resampled_df.query(f'{drug} == 0')[status_column], 
                  weights = resampled_df.query(f'{drug} == 0')['weight'])
    
        mos_A.append(kmf_A.median_survival_time_)
        mos_B.append(kmf_B.median_survival_time_)
        
        # RMST from IPTW-KM
        rmst_A = restricted_mean_survival_time(kmf_A, rmst_time)
        rmst_B = restricted_mean_survival_time(kmf_B, rmst_time)
        
        rmst_A_list.append(rmst_A)
        rmst_B_list.append(rmst_B)
        differences_rmst.append(rmst_A - rmst_B)

    # Calculate the 95% confidence interval
    results = pd.Series({
    'mos_A_95': np.percentile(mos_A, [2.5, 97.5]),
    'mos_B_95': np.percentile(mos_B, [2.5, 97.5]),
    'rmst_A_95': np.percentile(rmst_A_list, [2.5, 97.5]),
    'rmst_B_95': np.percentile(rmst_B_list, [2.5, 97.5]),
    'difference_rmst_95': np.percentile(differences_rmst, [2.5, 97.5])
    })
    
    return results

## Part 2: In-silico trials 

### KEYNOTE-042: First-line pembrolizumab vs. platinum-based chemotherapy in those with PDL1 >=1%

**INCLUSION CRITERIA**
* Untreated locally advanced or metastatic NSCLC
* Received first line pemobrolizumab or platinum-based chemotherapy 
* PDL1 >= 1% and status known within (-inf, +30] days of first-line treatment
* EGFR and ALK negative

#### Pembrolizumab

In [6]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [7]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [8]:
key042_pembro = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Pembrolizumab"')
    [['PatientID', 'StartDate']]
)

In [9]:
key042_pembro.loc[:, 'pembro'] = 1

In [10]:
row_ID(key042_pembro)

(3648, 3648)

In [11]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_042 = (
    line_therapy[line_therapy['PatientID'].isin(key042_pembro.PatientID)])

In [12]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [13]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_042_xcross = (
    line_therapy_pembro_042[line_therapy_pembro_042['LineName'].str.contains('|'.join(targeted))].PatientID)

In [14]:
# Select patients who don't receive targeted therapy in future lines.
key042_pembro = key042_pembro[~key042_pembro['PatientID'].isin(pembro_042_xcross)]

In [15]:
row_ID(key042_pembro)

(3582, 3582)

In [16]:
row_ID(key042_pembro)

(3582, 3582)

#### Platinum-based chemotherapy 

In [17]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [18]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [19]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [20]:
key042_plat = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [21]:
key042_plat.loc[:, 'pembro'] = 0

In [22]:
row_ID(key042_plat)

(25861, 25861)

In [23]:
# Dataframe of all therapies received for those receiving first line platinum regimen   
line_therapy_plat_042 = (
    line_therapy[line_therapy['PatientID'].isin(key042_plat.PatientID)])

In [24]:
# Patients receiving platinum therapy who later recieve targeted therapy. 
plat_042_xcross = (
    line_therapy_plat_042[line_therapy_plat_042['LineName'].str.contains('|'.join(targeted))].PatientID)

In [25]:
# Select patients who don't receive targeted therapy in future lines 
key042_plat = key042_plat[~key042_plat['PatientID'].isin(plat_042_xcross)]

In [26]:
row_ID(key042_plat)

(23800, 23800)

In [27]:
key_042 = pd.concat([key042_pembro, key042_plat])

In [28]:
row_ID(key_042)

(27382, 27382)

In [29]:
key_042 = pd.merge(key_042, df_full, on = 'PatientID', how = 'left')

In [30]:
row_ID(key_042)

(27382, 27382)

In [31]:
key_042['StartDate'] = pd.to_datetime(key_042['StartDate'])

#### PDL1 >=1%

In [32]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [33]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(key_042['PatientID'])]

In [34]:
biomarkers = pd.merge(biomarkers, key_042[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [35]:
row_ID(biomarkers)

(94603, 19875)

In [36]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [37]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [38]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [39]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [40]:
pdl1_value = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining != "0%" and PercentStaining != "< 1%" and PercentStaining.notnull()', engine = 'python')
    .sort_values(by = ['PatientID', 'PercentStaining'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'PercentStaining']]
)

In [41]:
pdl1_ids = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining != "0%" and PercentStaining != "< 1%" and PercentStaining.notnull()', engine = 'python')
    .PatientID
    .unique()
)

In [42]:
key_042 = key_042[key_042.PatientID.isin(pdl1_ids)]

In [43]:
row_ID(key_042)

(5671, 5671)

#### Time from treatment to death or censor

In [44]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [45]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [46]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [47]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [48]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [49]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [50]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [51]:
key_042 = pd.merge(key_042, mortality, on = 'PatientID', how = 'left')

In [52]:
row_ID(key_042)

(5671, 5671)

In [53]:
conditions = [
    (key_042['death_status'] == 1),
    (key_042['death_status'] == 0)]

choices = [
    (key_042['death_date'] - key_042['StartDate']).dt.days,
    (key_042['last_activity'] - key_042['StartDate']).dt.days]

key_042.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [54]:
key_042 = key_042.query('timerisk_treatment >= 0')

#### Patient count

In [55]:
key_042 = (
    key_042
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [56]:
low_cutoff_042 = key_042.risk_score.quantile(1/3)

In [57]:
high_cutoff_042 = key_042.risk_score.quantile(2/3)

In [58]:
print('Pembro total:',  key_042.query('pembro == 1').shape[0])
print('High risk:', key_042.query('pembro == 1').query('risk_score >= @high_cutoff_042').shape[0])
print('Med risk:', key_042.query('pembro == 1').query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0])
print('Low risk:', key_042.query('pembro == 1').query('risk_score <= @low_cutoff_042').shape[0])

Pembro total: 2914
High risk: 1035
Med risk: 984
Low risk: 895


In [59]:
print('Platinum total:',  key_042.query('pembro == 0').shape[0])
print('High risk:', key_042.query('pembro == 0').query('risk_score >= @high_cutoff_042').shape[0])
print('Med risk:', key_042.query('pembro == 0').query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0])
print('Low risk:', key_042.query('pembro == 0').query('risk_score <= @low_cutoff_042').shape[0])

Platinum total: 2634
High risk: 815
Med risk: 865
Low risk: 954


#### Survival curves with covariate balancing 

In [60]:
row_ID(key_042)

(5548, 5548)

In [61]:
key_042 = pd.merge(key_042, pdl1_value, on = 'PatientID', how = 'left')

In [62]:
row_ID(key_042)

(5548, 5548)

In [63]:
conditions = [
    (key_042['PercentStaining'] == "1%") | 
    (key_042['PercentStaining'] == "2% - 4%") |
    (key_042['PercentStaining'] == "5% - 9%") |
    (key_042['PercentStaining'] == "10% - 19%") |
    (key_042['PercentStaining'] == "20% - 29%") |
    (key_042['PercentStaining'] == "30% - 39%") |
    (key_042['PercentStaining'] == "40% - 49%")
]

choices = ['lt50']

key_042['pdl1_det'] = np.select(conditions, choices, default = 'gte50')

In [64]:
key_042 = key_042.set_index('PatientID')

In [65]:
key_042_iptw = key_042.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1_det',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [66]:
key_042_iptw['met_cat'] = pd.cut(key_042_iptw['adv_year'],
                                 bins = [2010, 2016, float('inf')],
                                 labels = ['11-16', '17-21'])

In [67]:
conditions = [
    ((key_042_iptw['ecog_diagnosis'] == "1.0") | (key_042_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_042_iptw['ecog_diagnosis'] == "2.0") | (key_042_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_042_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [68]:
key_042_iptw.dtypes

death_status               bool
timerisk_treatment      float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1_det                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [69]:
to_be_categorical = list(key_042_iptw.select_dtypes(include = ['object']).columns)

In [70]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1_det',
 'ecog_2']

In [71]:
to_be_categorical.append('met_cat')

In [72]:
to_be_categorical.remove('ecog_diagnosis')

In [73]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_042_iptw[x] = key_042_iptw[x].astype('category')

In [74]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [75]:
# List of categorical features.
categorical_features = list(key_042_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [76]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [77]:
key_042_iptw_low = (
    key_042_iptw
    .query('risk_score <= @low_cutoff_042'))

key_042_iptw_med = (
    key_042_iptw
    .query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042'))

key_042_iptw_high = (
    key_042_iptw
    .query('risk_score >= @high_cutoff_042'))

key_042_iptw_all = key_042_iptw

In [78]:
key_042_low_x = preprocessor.fit_transform(key_042_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_042_med_x = preprocessor.fit_transform(key_042_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_042_high_x = preprocessor.fit_transform(key_042_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'delta_adv_diagnosis',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'pdl1_det', 
                                                                              'albumin_diag',
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_042_all_x = preprocessor.fit_transform(key_042_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_det', 
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [79]:
lr_042_low = LogisticRegression(max_iter = 1000)
lr_042_low.fit(key_042_low_x, key_042_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [80]:
lr_042_med = LogisticRegression(max_iter = 1000)
lr_042_med.fit(key_042_med_x, key_042_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [81]:
lr_042_high = LogisticRegression(max_iter = 1000)
lr_042_high.fit(key_042_high_x, key_042_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [82]:
lr_042_all = LogisticRegression(max_iter = 1000)
lr_042_all.fit(key_042_all_x, key_042_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [83]:
pred_low = lr_042_low.predict_proba(key_042_low_x)
pred_med = lr_042_med.predict_proba(key_042_med_x)
pred_high = lr_042_high.predict_proba(key_042_high_x)
pred_all = lr_042_all.predict_proba(key_042_all_x)

In [84]:
key_042_iptw_low['ps'] = pred_low[:, 1]
key_042_iptw_med['ps'] = pred_med[:, 1]
key_042_iptw_high['ps'] = pred_high[:, 1]
key_042_iptw_all['ps'] = pred_all[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [85]:
key_042_iptw_low['weight'] = (
    np.where(key_042_iptw_low['pembro'] == 1, 1/key_042_iptw_low['ps'], 1/(1 - key_042_iptw_low['ps'])))

key_042_iptw_med['weight'] = (
    np.where(key_042_iptw_med['pembro'] == 1, 1/key_042_iptw_med['ps'], 1/(1 - key_042_iptw_med['ps'])))

key_042_iptw_high['weight'] = (
    np.where(key_042_iptw_high['pembro'] == 1, 1/key_042_iptw_high['ps'], 1/(1 - key_042_iptw_high['ps'])))

key_042_iptw_all['weight'] = (
    np.where(key_042_iptw_all['pembro'] == 1, 1/key_042_iptw_all['ps'], 1/(1 - key_042_iptw_all['ps'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [86]:
# Low KM curves
kmf_low_pembro_042_iptw = KaplanMeierFitter()
kmf_low_plat_042_iptw = KaplanMeierFitter()

kmf_low_pembro_042_iptw.fit(
    key_042_iptw_low.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_low.query('pembro == 1').death_status,
    weights = key_042_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_042_iptw.fit(
    key_042_iptw_low.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_low.query('pembro == 0').death_status,
    weights = key_042_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_042_iptw = KaplanMeierFitter()
kmf_med_plat_042_iptw = KaplanMeierFitter()

kmf_med_pembro_042_iptw.fit(
    key_042_iptw_med.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_med.query('pembro == 1').death_status,
    weights = key_042_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_042_iptw.fit(
    key_042_iptw_med.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_med.query('pembro == 0').death_status,
    weights = key_042_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_042_iptw = KaplanMeierFitter()
kmf_high_plat_042_iptw = KaplanMeierFitter()

kmf_high_pembro_042_iptw.fit(
    key_042_iptw_high.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_high.query('pembro == 1').death_status,
    weights = key_042_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_042_iptw.fit(
    key_042_iptw_high.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_high.query('pembro == 0').death_status,
    weights = key_042_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_042_iptw = KaplanMeierFitter()
kmf_all_plat_042_iptw = KaplanMeierFitter()

kmf_all_pembro_042_iptw.fit(
    key_042_iptw_all.query('pembro == 1').timerisk_treatment/30,
    key_042_iptw_all.query('pembro == 1').death_status,
    weights = key_042_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_042_iptw.fit(
    key_042_iptw_all.query('pembro == 0').timerisk_treatment/30,
    key_042_iptw_all.query('pembro == 0').death_status,
    weights = key_042_iptw_all.query('pembro == 0')['weight'])

  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 5620.26 total observations, 2231.02 right-censored observations>

#### Calculating survival metrics 

In [87]:
pembro_042_median_os = mos(kmf_low_pembro_042_iptw,
                           kmf_med_pembro_042_iptw,
                           kmf_high_pembro_042_iptw,
                           kmf_all_pembro_042_iptw)

plat_042_median_os = mos(kmf_low_plat_042_iptw,
                         kmf_med_plat_042_iptw,
                         kmf_high_plat_042_iptw,
                         kmf_all_plat_042_iptw)

In [88]:
key_042_iptw_all_imputed = key_042_iptw_all.copy()
key_042_iptw_all_imputed['albumin_diag'] = key_042_iptw_all_imputed['albumin_diag'].fillna(key_042_iptw_all_imputed['albumin_diag'].median())
key_042_iptw_all_imputed['weight_pct_change'] = key_042_iptw_all_imputed['weight_pct_change'].fillna(key_042_iptw_all_imputed['weight_pct_change'].median())

In [89]:
key042_hr_all = CoxPHFitter()
key042_hr_all.fit(key_042_iptw_all_imputed,
                  duration_col = 'timerisk_treatment',
                  event_col = 'death_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_det + albumin_diag + weight_pct_change + risk_score', 
                  weights_col = 'weight', 
                  robust = True)

<lifelines.CoxPHFitter: fitted with 11182.7 total observations, 4641.88 right-censored observations>

In [90]:
key042_all_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_all,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [91]:
key042_low_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_low,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [92]:
key042_med_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_med,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_det', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [93]:
key042_high_rmst_mos_95 = rmst_mos_95ci(key_042_iptw_high,
                                        1000,
                                        'pembro',
                                        'death',
                                        ['age',
                                         'gender',
                                         'race',
                                         'PracticeType',
                                         'Histology',
                                         'met_cat',
                                         'delta_adv_diagnosis',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ecog_2',
                                         'pdl1_det', 
                                         'albumin_diag',
                                         'weight_pct_change',
                                         'risk_score'],
                                        ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [94]:
keynote_042_data = [
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'low', 
     'r_trt_mos': pembro_042_median_os[0],
     'r_trt_mos_95': key042_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[0],
     'r_cont_mos_95': key042_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[0] - plat_042_median_os[0], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'trt_rmst': restricted_mean_survival_time(kmf_low_pembro_042_iptw, 36),
     'trt_rmst_95': key042_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_plat_042_iptw, 36),
     'cont_rmst_95': key042_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_low_plat_042_iptw, 36),
     'diff_rmst_95': key042_low_rmst_mos_95.difference_rmst_95,
     'rcount': key_042.query('risk_score <= @low_cutoff_042').shape[0],
     'rcount_chemo': key_042.query('risk_score <= @low_cutoff_042').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'medium', 
     'r_trt_mos': pembro_042_median_os[1],
     'r_trt_mos_95': key042_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[1],
     'r_cont_mos_95': key042_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[1] - plat_042_median_os[1], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'trt_rmst': restricted_mean_survival_time(kmf_med_pembro_042_iptw, 36),
     'trt_rmst_95': key042_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_plat_042_iptw, 36),
     'cont_rmst_95': key042_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_med_plat_042_iptw, 36),
     'diff_rmst_95': key042_med_rmst_mos_95.difference_rmst_95,
     'rcount': key_042.query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').shape[0],
     'rcount_chemo': key_042.query('risk_score < @high_cutoff_042 and risk_score > @low_cutoff_042').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'high', 
     'r_trt_mos': pembro_042_median_os[2],
     'r_trt_mos_95': key042_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[2],
     'r_cont_mos_95': key042_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[2] - plat_042_median_os[2], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'trt_rmst': restricted_mean_survival_time(kmf_high_pembro_042_iptw, 36),
     'trt_rmst_95': key042_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_plat_042_iptw, 36),
     'cont_rmst_95': key042_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pembro_042_iptw, 36) - restricted_mean_survival_time(kmf_high_plat_042_iptw, 36),
     'diff_rmst_95': key042_high_rmst_mos_95.difference_rmst_95,
     'rcount': key_042.query('risk_score >= @high_cutoff_042').shape[0],
     'rcount_chemo': key_042.query('risk_score >= @high_cutoff_042').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-042', 
     'risk_group': 'all', 
     'r_hr': key042_hr_all.hazard_ratios_['pembro'],
     'r_hr_95': [key042_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key042_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     'r_trt_mos': pembro_042_median_os[3],
     'r_trt_mos_95': key042_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_042_median_os[3],
     'r_cont_mos_95': key042_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_042_median_os[3] - plat_042_median_os[3], 
     'rct_trt_arm': 16.7, 
     'rct_cont_arm': 12.1, 
     'rct_mos_diff': 16.7-12.1,
     'rcount': key_042.shape[0], 
     'rcount_chemo': key_042.query('pembro == 0').shape[0]}
]

### KEYNOTE-024: First-line pembrolizumab vs. platinum-based chemotherapy in those with high PDL1 

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line pemobrolizumab or platinum-based chemotherapy
* PDL1 >= 50% and status known within (-inf, +30] days of start of first-line treatment 
* EGFR and ALK negatve

#### Pembrolizumab

In [95]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [96]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [97]:
key024_pembro = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Pembrolizumab"')
    [['PatientID', 'StartDate']]
)

In [98]:
key024_pembro.loc[:, 'pembro'] = 1

In [99]:
row_ID(key024_pembro)

(3648, 3648)

In [100]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_024 = (
    line_therapy[line_therapy['PatientID'].isin(key024_pembro.PatientID)])

In [101]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [102]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_024_xcross = (
    line_therapy_pembro_024[line_therapy_pembro_024['LineName'].str.contains('|'.join(targeted))].PatientID)

In [103]:
# Select patients who don't receive targeted therapy in future lines.
key024_pembro = key024_pembro[~key024_pembro['PatientID'].isin(pembro_024_xcross)]

In [104]:
row_ID(key024_pembro)

(3582, 3582)

#### Platinum-based chemotherapy 

In [105]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [106]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [107]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [108]:
key024_plat = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [109]:
key024_plat.loc[:, 'pembro'] = 0

In [110]:
row_ID(key024_plat)

(25861, 25861)

In [111]:
# Dataframe of all therapies received for those receiving first line platinum regimen  
line_therapy_plat_024 = (
    line_therapy[line_therapy['PatientID'].isin(key024_plat.PatientID)])

In [112]:
# Patients receiving platinum therapy who later recieve targeted therapy. 
plat_024_xcross = (
    line_therapy_plat_024[line_therapy_plat_024['LineName'].str.contains('|'.join(targeted))].PatientID)

In [113]:
# Select patients who don't receive targeted therapy in future lines.
key024_plat = key024_plat[~key024_plat['PatientID'].isin(plat_024_xcross)]

In [114]:
row_ID(key024_plat)

(23800, 23800)

In [115]:
key_024 = pd.concat([key024_pembro, key024_plat])

In [116]:
row_ID(key_024)

(27382, 27382)

In [117]:
key_024 = pd.merge(key_024, df_full, on = 'PatientID', how = 'left')

In [118]:
row_ID(key_024)

(27382, 27382)

In [119]:
key_024['StartDate'] = pd.to_datetime(key_024['StartDate'])

#### High PDL1

In [120]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [121]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(key_024['PatientID'])]

In [122]:
biomarkers = pd.merge(biomarkers, key_024[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [123]:
row_ID(biomarkers)

(94603, 19875)

In [124]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [125]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [126]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [127]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [128]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [129]:
lst = ["50% - 59%", "60% - 69%", "70% - 79%", "80% - 89%", "90% - 99%", "100%"]

pdl1_ids = (
    biomarkers
    .query('BiomarkerName == "PDL1"')
    .query('date_diff <=30')
    .query('PercentStaining == @lst')
    .PatientID
    .unique()
)

In [130]:
key_024 = key_024[key_024.PatientID.isin(pdl1_ids)]

In [131]:
row_ID(key_024)

(3519, 3519)

#### Time from treatment to death/progression or censor 

In [132]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [133]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [134]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [135]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [136]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(68483, 68483)

In [137]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [138]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [139]:
row_ID(mortality)

(68483, 68483)

In [140]:
key_024 = pd.merge(key_024, mortality, on = 'PatientID', how = 'left')

In [141]:
row_ID(key_024)

(3519, 3519)

In [142]:
progression = pd.read_csv('Enhanced_AdvNSCLCProgression.csv')

In [143]:
progression = progression[progression.PatientID.isin(key_024.PatientID)][['PatientID', 'ProgressionDate']]

In [144]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [145]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [146]:
row_ID(progression)

(3517, 3517)

In [147]:
key_024 = pd.merge(key_024, progression, on = 'PatientID', how = 'left')

In [148]:
row_ID(key_024)

(3519, 3519)

In [149]:
# Percent without progression date in Flaura trial
len(key_024.query('ProgressionDate.isna()', engine = 'python'))/len(key_024)

0.51974992895709

In [150]:
conditions = [
    (key_024.ProgressionDate.notna()),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 1)),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 0))]

choices = [
    (key_024['ProgressionDate'] - key_024['StartDate']).dt.days,
    (key_024['death_date'] - key_024['StartDate']).dt.days,
    (key_024['last_activity'] - key_024['StartDate']).dt.days]

key_024.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [151]:
key_024 = key_024.query('time_prog_treatment >= 0')

In [152]:
row_ID(key_024)

(3209, 3209)

In [153]:
conditions = [
    (key_024.ProgressionDate.notna()),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 1)),
    ((key_024.ProgressionDate.isna()) & (key_024['death_status'] == 0))]

choices = [1, 1, 0]

key_024.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient counts

In [154]:
key_024 = (
    key_024
    .query('stage == "IV"')
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [155]:
low_cutoff_024 = key_024.risk_score.quantile(1/3)

In [156]:
high_cutoff_024 = key_024.risk_score.quantile(2/3)

In [157]:
print('Pembro total:',  key_024.query('pembro == 1').shape[0])
print('High risk:', key_024.query('pembro == 1').query('risk_score >= @high_cutoff_024').shape[0])
print('Med risk:', key_024.query('pembro == 1').query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0])
print('Low risk:', key_024.query('pembro == 1').query('risk_score <= @low_cutoff_024').shape[0])

Pembro total: 1656
High risk: 546
Med risk: 563
Low risk: 547


In [158]:
print('Platinum total:',  key_024.query('pembro == 0').shape[0])
print('High risk:', key_024.query('pembro == 0').query('risk_score >= @high_cutoff_024').shape[0])
print('Med risk:', key_024.query('pembro == 0').query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0])
print('Low risk:', key_024.query('pembro == 0').query('risk_score <= @low_cutoff_024').shape[0])

Platinum total: 464
High risk: 161
Med risk: 143
Low risk: 160


#### PFS with covariate balancing 

In [159]:
key_024 = key_024.set_index('PatientID')

In [160]:
key_024_iptw = key_024.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [161]:
key_024_iptw['met_cat'] = pd.cut(key_024_iptw['adv_year'],
                                 bins = [2010, 2016, float('inf')],
                                 labels = ['11-16', '17-21'])

In [162]:
conditions = [
    ((key_024_iptw['ecog_diagnosis'] == "1.0") | (key_024_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_024_iptw['ecog_diagnosis'] == "2.0") | (key_024_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_024_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [163]:
key_024_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [164]:
to_be_categorical = list(key_024_iptw.select_dtypes(include = ['object']).columns)

In [165]:
to_be_categorical

['gender', 'race', 'PracticeType', 'Histology', 'ecog_diagnosis', 'ecog_2']

In [166]:
to_be_categorical.append('met_cat')

In [167]:
to_be_categorical.remove('ecog_diagnosis')

In [168]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_024_iptw[x] = key_024_iptw[x].astype('category')

In [169]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [170]:
# List of categorical features.
categorical_features = list(key_024_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [171]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [172]:
key_024_iptw_low = (
    key_024_iptw
    .query('risk_score <= @low_cutoff_024'))

key_024_iptw_med = (
    key_024_iptw
    .query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024'))

key_024_iptw_high = (
    key_024_iptw
    .query('risk_score >= @high_cutoff_024'))

key_024_iptw_all = key_024_iptw

In [173]:
key_024_low_x = preprocessor.fit_transform(key_024_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_024_med_x = preprocessor.fit_transform(key_024_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

key_024_high_x = preprocessor.fit_transform(key_024_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'albumin_diag',
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_024_all_x = preprocessor.fit_transform(key_024_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [174]:
lr_024_low = LogisticRegression(max_iter = 1000)
lr_024_low.fit(key_024_low_x, key_024_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [175]:
lr_024_med = LogisticRegression(max_iter = 1000)
lr_024_med.fit(key_024_med_x, key_024_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [176]:
lr_024_high = LogisticRegression(max_iter = 1000)
lr_024_high.fit(key_024_high_x, key_024_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [177]:
lr_024_all = LogisticRegression(max_iter = 1000)
lr_024_all.fit(key_024_all_x, key_024_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [178]:
pred_low = lr_024_low.predict_proba(key_024_low_x)
pred_med = lr_024_med.predict_proba(key_024_med_x)
pred_high = lr_024_high.predict_proba(key_024_high_x)
pred_all = lr_024_all.predict_proba(key_024_all_x)

In [179]:
key_024_iptw_low['ps'] = pred_low[:, 1]
key_024_iptw_med['ps'] = pred_med[:, 1]
key_024_iptw_high['ps'] = pred_high[:, 1]
key_024_iptw_all['ps'] = pred_all[:, 1]

In [180]:
key_024_iptw_low['weight'] = (
    np.where(key_024_iptw_low['pembro'] == 1, 1/key_024_iptw_low['ps'], 1/(1 - key_024_iptw_low['ps'])))

key_024_iptw_med['weight'] = (
    np.where(key_024_iptw_med['pembro'] == 1, 1/key_024_iptw_med['ps'], 1/(1 - key_024_iptw_med['ps'])))

key_024_iptw_high['weight'] = (
    np.where(key_024_iptw_high['pembro'] == 1, 1/key_024_iptw_high['ps'], 1/(1 - key_024_iptw_high['ps'])))

key_024_iptw_all['weight'] = (
    np.where(key_024_iptw_all['pembro'] == 1, 1/key_024_iptw_all['ps'], 1/(1 - key_024_iptw_all['ps'])))

In [181]:
# Low KM curves
kmf_low_pembro_024_iptw = KaplanMeierFitter()
kmf_low_plat_024_iptw = KaplanMeierFitter()

kmf_low_pembro_024_iptw.fit(
    key_024_iptw_low.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_low.query('pembro == 1').pfs_status,
    weights = key_024_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_024_iptw.fit(
    key_024_iptw_low.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_low.query('pembro == 0').pfs_status,
    weights = key_024_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_024_iptw = KaplanMeierFitter()
kmf_med_plat_024_iptw = KaplanMeierFitter()

kmf_med_pembro_024_iptw.fit(
    key_024_iptw_med.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_med.query('pembro == 1').pfs_status,
    weights = key_024_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_024_iptw.fit(
    key_024_iptw_med.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_med.query('pembro == 0').pfs_status,
    weights = key_024_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_024_iptw = KaplanMeierFitter()
kmf_high_plat_024_iptw = KaplanMeierFitter()

kmf_high_pembro_024_iptw.fit(
    key_024_iptw_high.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_high.query('pembro == 1').pfs_status,
    weights = key_024_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_024_iptw.fit(
    key_024_iptw_high.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_high.query('pembro == 0').pfs_status,
    weights = key_024_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_024_iptw = KaplanMeierFitter()
kmf_all_plat_024_iptw = KaplanMeierFitter()

kmf_all_pembro_024_iptw.fit(
    key_024_iptw_all.query('pembro == 1').time_prog_treatment/30,
    key_024_iptw_all.query('pembro == 1').pfs_status,
    weights = key_024_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_024_iptw.fit(
    key_024_iptw_all.query('pembro == 0').time_prog_treatment/30,
    key_024_iptw_all.query('pembro == 0').pfs_status,
    weights = key_024_iptw_all.query('pembro == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 2060.45 total observations, 442.703 right-censored observations>

#### Calculating survival metrics 

In [182]:
pembro_024_median_os = mos(kmf_low_pembro_024_iptw,
                           kmf_med_pembro_024_iptw,
                           kmf_high_pembro_024_iptw,
                           kmf_all_pembro_024_iptw)

plat_024_median_os = mos(kmf_low_plat_024_iptw,
                         kmf_med_plat_024_iptw,
                         kmf_high_plat_024_iptw,
                         kmf_all_plat_024_iptw)

In [183]:
key_024_iptw_all_imputed = key_024_iptw_all.copy()
key_024_iptw_all_imputed['albumin_diag'] = key_024_iptw_all_imputed['albumin_diag'].fillna(key_024_iptw_all_imputed['albumin_diag'].median())
key_024_iptw_all_imputed['weight_pct_change'] = key_024_iptw_all_imputed['weight_pct_change'].fillna(key_024_iptw_all_imputed['weight_pct_change'].median())

In [184]:
key024_hr_all = CoxPHFitter()
key024_hr_all.fit(key_024_iptw_all_imputed,
                  duration_col = 'time_prog_treatment',
                  event_col = 'pfs_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + commercial + medicare + medicaid + ecog_2 + albumin_diag + weight_pct_change + risk_score', 
                  weights_col = 'weight', 
                  robust = True)

<lifelines.CoxPHFitter: fitted with 4189.6 total observations, 942.168 right-censored observations>

In [185]:
key024_all_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_all,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [186]:
key024_low_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_low,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [187]:
key024_med_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_med,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [188]:
key024_high_rmst_mos_95 = rmst_mos_95ci(key_024_iptw_high,
                                       1000,
                                       'pembro',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       18)

In [189]:
keynote_024_data = [
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'low', 
     'r_trt_mos': pembro_024_median_os[0],
     'r_trt_mos_95': key024_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[0],
     'r_cont_mos_95': key024_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[0] - plat_024_median_os[0], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'trt_rmst': restricted_mean_survival_time(kmf_low_pembro_024_iptw, 18),
     'trt_rmst_95': key024_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_plat_024_iptw, 18),
     'cont_rmst_95': key024_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_low_plat_024_iptw, 18),
     'diff_rmst_95': key024_low_rmst_mos_95.difference_rmst_95,
     'rcount': key_024.query('risk_score <= @low_cutoff_024').shape[0],
     'rcount_chemo': key_024.query('risk_score <= @low_cutoff_024').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'medium', 
     'r_trt_mos': pembro_024_median_os[1],
     'r_trt_mos_95': key024_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[1],
     'r_cont_mos_95': key024_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[1] - plat_024_median_os[1], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'trt_rmst': restricted_mean_survival_time(kmf_med_pembro_024_iptw, 18),
     'trt_rmst_95': key024_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_plat_024_iptw, 18),
     'cont_rmst_95': key024_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_med_plat_024_iptw, 18),
     'diff_rmst_95': key024_med_rmst_mos_95.difference_rmst_95,
     'rcount': key_024.query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').shape[0],
     'rcount_chemo': key_024.query('risk_score < @high_cutoff_024 and risk_score > @low_cutoff_024').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'high', 
     'r_trt_mos': pembro_024_median_os[2],
     'r_trt_mos_95': key024_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[2],
     'r_cont_mos_95': key024_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[2] - plat_024_median_os[2], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'trt_rmst': restricted_mean_survival_time(kmf_high_pembro_024_iptw, 18),
     'trt_rmst_95': key024_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_plat_024_iptw, 18),
     'cont_rmst_95': key024_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pembro_024_iptw, 18) - restricted_mean_survival_time(kmf_high_plat_024_iptw, 18),
     'diff_rmst_95': key024_high_rmst_mos_95.difference_rmst_95,
     'rcount': key_024.query('risk_score >= @high_cutoff_024').shape[0],
     'rcount_chemo': key_024.query('risk_score >= @high_cutoff_024').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-024', 
     'risk_group': 'all', 
     'r_hr': key024_hr_all.hazard_ratios_['pembro'],
     'r_hr_95': [key024_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key024_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     'r_trt_mos': pembro_024_median_os[3],
     'r_trt_mos_95': key024_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_024_median_os[3],
     'r_cont_mos_95': key024_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_024_median_os[3] - plat_024_median_os[3], 
     'rct_trt_arm': 10.3, 
     'rct_cont_arm': 6.0,
     'rct_mos_diff': 10.3-6.0, 
     'rcount': key_024.shape[0], 
     'rcount_chemo': key_024.query('pembro == 0').shape[0]}
]

### KEYNOTE-189: First-line pembrolizumab plus chemotherapy vs. chemotherapy

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line pemobrolizumab plus platinum-based chemotherapy or platinum-based chemotherapy
* EGFR and ALK negative

#### Pembrolizumab + chemotherapy 

In [190]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [191]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [192]:
line_therapy[line_therapy['LineName'].str.contains('Pemetrexed')].LineName.value_counts().head(10)

Carboplatin,Pemetrexed                              6700
Carboplatin,Pembrolizumab,Pemetrexed                5113
Pemetrexed                                          4711
Bevacizumab,Carboplatin,Pemetrexed                  3459
Bevacizumab,Pemetrexed                              1569
Pembrolizumab,Pemetrexed                            1554
Cisplatin,Pemetrexed                                 811
Bevacizumab,Cisplatin,Pemetrexed                     174
Abiraterone,Carboplatin,Pembrolizumab,Pemetrexed     100
Bevacizumab-Awwb,Carboplatin,Pemetrexed               82
Name: LineName, dtype: int64

In [193]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [194]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno_wout_pembro = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab']

targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [195]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Pembrolizumab,Pemetrexed                     4275
Carboplatin,Paclitaxel,Pembrolizumab                      803
Carboplatin,Paclitaxel Protein-Bound,Pembrolizumab        534
Abiraterone,Carboplatin,Pembrolizumab,Pemetrexed           79
Carboplatin,Cyclophosphamide,Pembrolizumab,Pemetrexed      31
Cisplatin,Pembrolizumab,Pemetrexed                         27
Carboplatin,Pembrolizumab                                  19
Carboplatin,Docetaxel,Pembrolizumab                        18
Carboplatin,Paclitaxel,Pembrolizumab,Pemetrexed            18
Bevacizumab,Carboplatin,Pembrolizumab,Pemetrexed           10
Name: LineName, dtype: int64

In [196]:
key189_pembro = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    line_therapy_fl['LineName'].str.contains('Pembrolizumab') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno_wout_pembro)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [197]:
key189_pembro.loc[:, 'pembro'] = 1

In [198]:
row_ID(key189_pembro)

(5878, 5878)

In [199]:
# Dataframe of all therapies received for those receiving first line pembrolizumab only. 
line_therapy_pembro_189 = (
    line_therapy[line_therapy['PatientID'].isin(key189_pembro.PatientID)])

In [200]:
# Patients receiving pembrolizumab therapy who later recieve targeted therapy. 
pembro_189_xcross = (
    line_therapy_pembro_189[line_therapy_pembro_189['LineName'].str.contains('|'.join(targeted))].PatientID)

In [201]:
# Select patients who don't receive targeted therapy in future lines.
key189_pembro = key189_pembro[~key189_pembro['PatientID'].isin(pembro_189_xcross)]

In [202]:
row_ID(key189_pembro)

(5673, 5673)

#### Platinum-based chemotherapy

In [203]:
plat_chemo = [
    'Carboplatin',
    'Cisplatin']

immuno = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [204]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')].LineName.value_counts().head(10)

Carboplatin,Paclitaxel                  8524
Carboplatin,Pemetrexed                  5417
Bevacizumab,Carboplatin,Pemetrexed      2825
Carboplatin,Paclitaxel Protein-Bound    1826
Bevacizumab,Carboplatin,Paclitaxel      1591
Carboplatin,Gemcitabine                 1224
Cisplatin,Etoposide                      793
Carboplatin,Docetaxel                    780
Cisplatin,Pemetrexed                     684
Carboplatin,Etoposide                    363
Name: LineName, dtype: int64

In [205]:
key189_plat = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(plat_chemo)) & 
                    ~line_therapy_fl['LineName'].str.contains('|'.join(immuno)) &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(targeted)) &
                    ~line_therapy_fl['LineName'].str.contains('Clinical Study Drug')]
    [['PatientID', 'StartDate']]
)

In [206]:
key189_plat.loc[:, 'pembro'] = 0

In [207]:
row_ID(key189_plat)

(25861, 25861)

In [208]:
# Dataframe of all therapies received for those receiving first line platinum regimen 
line_therapy_plat_189 = (
    line_therapy[line_therapy['PatientID'].isin(key189_plat.PatientID)])

In [209]:
# Patients receiving platinum therapy who later recieve targeted therapy
plat_189_xcross = (
    line_therapy_plat_189[line_therapy_plat_189['LineName'].str.contains('|'.join(targeted))].PatientID)

In [210]:
# Select patients who don't receive targeted therapy
key189_plat = key189_plat[~key189_plat['PatientID'].isin(plat_189_xcross)]

In [211]:
row_ID(key189_plat)

(23800, 23800)

In [212]:
key_189 = pd.concat([key189_pembro, key189_plat])

In [213]:
row_ID(key_189)

(29473, 29473)

In [214]:
key_189 = pd.merge(key_189, df_full, on = 'PatientID', how = 'left')

In [215]:
row_ID(key_189)

(29473, 29473)

In [216]:
key_189['StartDate'] = pd.to_datetime(key_189['StartDate'])

#### Time from treatment to death or censor

In [217]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [218]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [219]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [220]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [221]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [222]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [223]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [224]:
key_189 = pd.merge(key_189, mortality, on = 'PatientID', how = 'left')

In [225]:
len(key_189)

29473

In [226]:
conditions = [
    (key_189['death_status'] == 1),
    (key_189['death_status'] == 0)]

choices = [
    (key_189['death_date'] - key_189['StartDate']).dt.days,
    (key_189['last_activity'] - key_189['StartDate']).dt.days]

key_189.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [227]:
key_189 = key_189.query('timerisk_treatment >= 0')

#### Patient count

In [228]:
key_189 = (
    key_189
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [229]:
low_cutoff_189 = key_189.risk_score.quantile(1/3)

In [230]:
high_cutoff_189 = key_189.risk_score.quantile(2/3)

In [231]:
print('Pembro + chemo total:',  key_189.query('pembro == 1').shape[0])
print('High risk:', key_189.query('pembro == 1').query('risk_score >= @high_cutoff_189').shape[0])
print('Med risk:', key_189.query('pembro == 1').query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0])
print('Low risk:', key_189.query('pembro == 1').query('risk_score <= @low_cutoff_189').shape[0])

Pembro + chemo total: 5560
High risk: 2010
Med risk: 1842
Low risk: 1708


In [232]:
print('Platinum total:',  key_189.query('pembro == 0').shape[0])
print('High risk:', key_189.query('pembro == 0').query('risk_score >= @high_cutoff_189').shape[0])
print('Med risk:', key_189.query('pembro == 0').query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0])
print('Low risk:', key_189.query('pembro == 0').query('risk_score <= @low_cutoff_189').shape[0])

Platinum total: 23480
High risk: 7670
Med risk: 7838
Low risk: 7972


#### Survival curves with covariate balancing 

In [233]:
key_189 = key_189.set_index('PatientID')

In [234]:
key_189_iptw = key_189.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'pembro',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [235]:
key_189_iptw['met_cat'] = pd.cut(key_189_iptw['adv_year'],
                                 bins = [2010, 2018, float('inf')],
                                 labels = ['11-18', '19-22'])

In [236]:
conditions = [
    ((key_189_iptw['pdl1'] == "1-49%") | (key_189_iptw['pdl1'] == "50-100%"))]

choices = ['>0%']

key_189_iptw['pdl1_cat'] = np.select(conditions, choices, default = key_189_iptw['pdl1'])

In [237]:
conditions = [
    ((key_189_iptw['ecog_diagnosis'] == "1.0") | (key_189_iptw['ecog_diagnosis'] == "0.0")),  
    ((key_189_iptw['ecog_diagnosis'] == "2.0") | (key_189_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

key_189_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [238]:
key_189_iptw.dtypes

death_status               bool
timerisk_treatment      float64
pembro                    int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1                     object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
pdl1_cat                 object
ecog_2                   object
dtype: object

In [239]:
to_be_categorical = list(key_189_iptw.select_dtypes(include = ['object']).columns)

In [240]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1',
 'pdl1_cat',
 'ecog_2']

In [241]:
to_be_categorical.append('met_cat')

In [242]:
to_be_categorical.remove('pdl1')

In [243]:
to_be_categorical.remove('ecog_diagnosis')

In [244]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    key_189_iptw[x] = key_189_iptw[x].astype('category')

In [245]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [246]:
# List of categorical features.
categorical_features = list(key_189_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [247]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [248]:
key_189_iptw_low = (
    key_189_iptw
    .query('risk_score <= @low_cutoff_189'))

key_189_iptw_med = (
    key_189_iptw
    .query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189'))

key_189_iptw_high = (
    key_189_iptw
    .query('risk_score >= @high_cutoff_189'))

key_189_iptw_all = key_189_iptw

In [249]:
key_189_low_x = preprocessor.fit_transform(key_189_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

key_189_med_x = preprocessor.fit_transform(key_189_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

key_189_high_x = preprocessor.fit_transform(key_189_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'PracticeType',
                                                                              'Histology',
                                                                              'met_cat',
                                                                              'delta_adv_diagnosis',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ecog_2',
                                                                              'pdl1_cat', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change',
                                                                              'risk_score']))

key_189_all_x = preprocessor.fit_transform(key_189_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'Histology',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2',
                                                                            'pdl1_cat', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [250]:
lr_189_low = LogisticRegression(max_iter = 1000)
lr_189_low.fit(key_189_low_x, key_189_iptw_low['pembro'])

LogisticRegression(max_iter=1000)

In [251]:
lr_189_med = LogisticRegression(max_iter = 1000)
lr_189_med.fit(key_189_med_x, key_189_iptw_med['pembro'])

LogisticRegression(max_iter=1000)

In [252]:
lr_189_high = LogisticRegression(max_iter = 1000)
lr_189_high.fit(key_189_high_x, key_189_iptw_high['pembro'])

LogisticRegression(max_iter=1000)

In [253]:
lr_189_all = LogisticRegression(max_iter = 1000)
lr_189_all.fit(key_189_all_x, key_189_iptw_all['pembro'])

LogisticRegression(max_iter=1000)

In [254]:
pred_low = lr_189_low.predict_proba(key_189_low_x)
pred_med = lr_189_med.predict_proba(key_189_med_x)
pred_high = lr_189_high.predict_proba(key_189_high_x)
pred_all = lr_189_all.predict_proba(key_189_all_x)

In [255]:
key_189_iptw_low['ps'] = pred_low[:, 1]
key_189_iptw_med['ps'] = pred_med[:, 1]
key_189_iptw_high['ps'] = pred_high[:, 1]
key_189_iptw_all['ps'] = pred_all[:, 1]

In [256]:
key_189_iptw_low['weight'] = (
    np.where(key_189_iptw_low['pembro'] == 1, 1/key_189_iptw_low['ps'], 1/(1 - key_189_iptw_low['ps'])))

key_189_iptw_med['weight'] = (
    np.where(key_189_iptw_med['pembro'] == 1, 1/key_189_iptw_med['ps'], 1/(1 - key_189_iptw_med['ps'])))

key_189_iptw_high['weight'] = (
    np.where(key_189_iptw_high['pembro'] == 1, 1/key_189_iptw_high['ps'], 1/(1 - key_189_iptw_high['ps'])))

key_189_iptw_all['weight'] = (
    np.where(key_189_iptw_all['pembro'] == 1, 1/key_189_iptw_all['ps'], 1/(1 - key_189_iptw_all['ps'])))

In [257]:
# Low KM curves
kmf_low_pembro_189_iptw = KaplanMeierFitter()
kmf_low_plat_189_iptw = KaplanMeierFitter()

kmf_low_pembro_189_iptw.fit(
    key_189_iptw_low.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_low.query('pembro == 1').death_status,
    weights = key_189_iptw_low.query('pembro == 1')['weight'])

kmf_low_plat_189_iptw.fit(
    key_189_iptw_low.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_low.query('pembro == 0').death_status,
    weights = key_189_iptw_low.query('pembro == 0')['weight'])

# Med KM curves
kmf_med_pembro_189_iptw = KaplanMeierFitter()
kmf_med_plat_189_iptw = KaplanMeierFitter()

kmf_med_pembro_189_iptw.fit(
    key_189_iptw_med.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_med.query('pembro == 1').death_status,
    weights = key_189_iptw_med.query('pembro == 1')['weight'])

kmf_med_plat_189_iptw.fit(
    key_189_iptw_med.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_med.query('pembro == 0').death_status,
    weights = key_189_iptw_med.query('pembro == 0')['weight'])

# High KM curves 
kmf_high_pembro_189_iptw = KaplanMeierFitter()
kmf_high_plat_189_iptw = KaplanMeierFitter()

kmf_high_pembro_189_iptw.fit(
    key_189_iptw_high.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_high.query('pembro == 1').death_status,
    weights = key_189_iptw_high.query('pembro == 1')['weight'])

kmf_high_plat_189_iptw.fit(
    key_189_iptw_high.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_high.query('pembro == 0').death_status,
    weights = key_189_iptw_high.query('pembro == 0')['weight'])

# All KM curves 
kmf_all_pembro_189_iptw = KaplanMeierFitter()
kmf_all_plat_189_iptw = KaplanMeierFitter()

kmf_all_pembro_189_iptw.fit(
    key_189_iptw_all.query('pembro == 1').timerisk_treatment/30,
    key_189_iptw_all.query('pembro == 1').death_status,
    weights = key_189_iptw_all.query('pembro == 1')['weight'])

kmf_all_plat_189_iptw.fit(
    key_189_iptw_all.query('pembro == 0').timerisk_treatment/30,
    key_189_iptw_all.query('pembro == 0').death_status,
    weights = key_189_iptw_all.query('pembro == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 29804.6 total observations, 8081.93 right-censored observations>

#### Calculating survival metrics

In [258]:
pembro_189_median_os = mos(kmf_low_pembro_189_iptw,
                           kmf_med_pembro_189_iptw,
                           kmf_high_pembro_189_iptw,
                           kmf_all_pembro_189_iptw)

plat_189_median_os = mos(kmf_low_plat_189_iptw,
                         kmf_med_plat_189_iptw,
                         kmf_high_plat_189_iptw,
                         kmf_all_plat_189_iptw)

In [259]:
key_189_iptw_all_imputed = key_189_iptw_all.copy()
key_189_iptw_all_imputed['albumin_diag'] = key_189_iptw_all_imputed['albumin_diag'].fillna(key_189_iptw_all_imputed['albumin_diag'].median())
key_189_iptw_all_imputed['weight_pct_change'] = key_189_iptw_all_imputed['weight_pct_change'].fillna(key_189_iptw_all_imputed['weight_pct_change'].median())

In [260]:
key189_hr_all = CoxPHFitter()
key189_hr_all.fit(key_189_iptw_all_imputed,
                  duration_col = 'timerisk_treatment',
                  event_col = 'death_status',
                  formula = 'pembro + age + gender + race + PracticeType + Histology + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_cat + albumin_diag + weight_pct_change + risk_score',
                  weights_col = 'weight',
                  robust = True)

<lifelines.CoxPHFitter: fitted with 55736.7 total observations, 18130.3 right-censored observations>

In [261]:
key189_all_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_all,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [262]:
key189_low_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_low,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [263]:
key189_med_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_med,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [264]:
key189_high_rmst_mos_95 = rmst_mos_95ci(key_189_iptw_high,
                                       1000,
                                       'pembro',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat', 
                                        'albumin_diag', 
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [265]:
keynote_189_data = [
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'low', 
     'r_trt_mos': pembro_189_median_os[0],
     'r_trt_mos_95': key189_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[0],
     'r_cont_mos_95': key189_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[0] - plat_189_median_os[0], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'trt_rmst': restricted_mean_survival_time(kmf_low_pembro_189_iptw, 36),
     'trt_rmst_95': key189_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_plat_189_iptw, 36),
     'cont_rmst_95': key189_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_low_plat_189_iptw, 36),
     'diff_rmst_95': key189_low_rmst_mos_95.difference_rmst_95,
     'rcount': key_189.query('risk_score <= @low_cutoff_189').shape[0],
     'rcount_chemo': key_189.query('risk_score <= @low_cutoff_189').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'medium', 
     'r_trt_mos': pembro_189_median_os[1],
     'r_trt_mos_95': key189_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[1],
     'r_cont_mos_95': key189_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[1] - plat_189_median_os[1], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'trt_rmst': restricted_mean_survival_time(kmf_med_pembro_189_iptw, 36),
     'trt_rmst_95': key189_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_plat_189_iptw, 36),
     'cont_rmst_95': key189_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_med_plat_189_iptw, 36),
     'diff_rmst_95': key189_med_rmst_mos_95.difference_rmst_95,
     'rcount': key_189.query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').shape[0],
     'rcount_chemo': key_189.query('risk_score < @high_cutoff_189 and risk_score > @low_cutoff_189').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'high', 
     'r_trt_mos': pembro_189_median_os[2],
     'r_trt_mos_95': key189_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[2],
     'r_cont_mos_95': key189_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[2] - plat_189_median_os[2], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'trt_rmst': restricted_mean_survival_time(kmf_high_pembro_189_iptw, 36),
     'trt_rmst_95': key189_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_plat_189_iptw, 36),
     'cont_rmst_95': key189_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pembro_189_iptw, 36) - restricted_mean_survival_time(kmf_high_plat_189_iptw, 36),
     'diff_rmst_95': key189_high_rmst_mos_95.difference_rmst_95,
     'rcount': key_189.query('risk_score >= @high_cutoff_189').shape[0],
     'rcount_chemo': key_189.query('risk_score >= @high_cutoff_189').query('pembro == 0').shape[0]},
    
    {'trial_name': 'KEYNOTE-189', 
     'risk_group': 'all', 
     'r_hr': key189_hr_all.hazard_ratios_['pembro'],
     'r_hr_95': [key189_hr_all.summary.loc['pembro']['exp(coef) lower 95%'], key189_hr_all.summary.loc['pembro']['exp(coef) upper 95%']],
     'r_trt_mos': pembro_189_median_os[3],
     'r_trt_mos_95': key189_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': plat_189_median_os[3],
     'r_cont_mos_95': key189_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pembro_189_median_os[3] - plat_189_median_os[3], 
     'rct_trt_arm': 22.0, 
     'rct_cont_arm': 10.6,
     'rct_mos_diff': 22.0-10.6, 
     'rcount': key_189.shape[0], 
     'rcount_chemo': key_189.query('pembro == 0').shape[0]}
]

### CHECKMATE-078: Second-line nivolumab vs. docetaxel

**INCLUSION CRITERIA**
* Advanced or metastatic NSCLC 
* Progressed on first line platinum-based chemotherapy 
* Received second line nivolumab or docetaxel
* Prior treatments with docetaxel or immunotherapy contraindicated
* EGFR and ALK negative 

#### Nivolumab 

In [266]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [267]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [268]:
line_therapy_fl = (
    line_therapy[line_therapy.PatientID.isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
)

In [269]:
targeted = [
    'Afatinib',
    'Alectinib',
    'Brigatinib',
    'Cabozantinib',
    'Capmatinib',
    'Ceritinib',
    'Crizotinib',
    'Dabrafenib',
    'Dacomitinib',
    'Entrectinib',
    'Erlotinib',
    'Gefitinib',
    'Lorlatinib',
    'Osimertinib',
    'Pralsetinib',
    'Selpercatinib',
    'Sotorasib',
    'Tepotinib',
    'Trametinib',
    'Vandetanib']

In [270]:
immunotherapy = [
    'Atezolizumab',
    'Cemiplimab',
    'Durvalumab',
    'Ipilimumab',
    'Nivolumab',
    'Pembrolizumab'
]

In [271]:
fl_plat = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('Carboplatin|Cisplatin')
     & ~line_therapy_fl['LineName'].str.contains('Docetaxel')
     & ~line_therapy_fl['LineName'].str.contains('|'.join(targeted))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(immunotherapy))]
    .PatientID
)

In [272]:
checkmate_nivo = (
    line_therapy[line_therapy.PatientID.isin(fl_plat)]
    .query('LineNumber == 2')
    .query('LineName == "Nivolumab"')
    [['PatientID', 'StartDate']]
)

In [273]:
pembro_189_median_os[3]

12.9

In [274]:
checkmate_nivo.loc[:, 'nivo'] = 1

In [275]:
row_ID(checkmate_nivo)

(3397, 3397)

#### Docetaxel

In [276]:
checkmate_dotx = (
    line_therapy[line_therapy.PatientID.isin(fl_plat)]
    .query('LineNumber == 2')
    .query('LineName == "Docetaxel"')
    [['PatientID', 'StartDate']]
)

In [277]:
checkmate_dotx.loc[:, 'nivo'] = 0

In [278]:
row_ID(checkmate_dotx)

(745, 745)

In [279]:
checkmate = pd.concat([checkmate_nivo, checkmate_dotx])

In [280]:
row_ID(checkmate)

(4142, 4142)

In [281]:
checkmate = pd.merge(checkmate, df_full, on = 'PatientID', how = 'left')

In [282]:
row_ID(checkmate)

(4142, 4142)

In [283]:
checkmate['StartDate'] = pd.to_datetime(checkmate['StartDate'])

#### Time from treatment to death or censor 

In [284]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [285]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [286]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [287]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [288]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
print(len(mortality), mortality.PatientID.is_unique)

68483 True


In [289]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [290]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [291]:
checkmate = pd.merge(checkmate, mortality, on = 'PatientID', how = 'left')

In [292]:
row_ID(checkmate)

(4142, 4142)

In [293]:
conditions = [
    (checkmate['death_status'] == 1),
    (checkmate['death_status'] == 0)]

choices = [
    (checkmate['death_date'] - checkmate['StartDate']).dt.days,
    (checkmate['last_activity'] - checkmate['StartDate']).dt.days]

checkmate.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [294]:
checkmate = checkmate.query('timerisk_treatment >= 0')

#### Patient count 

In [295]:
checkmate = (
    checkmate
    .query('EGFR != "positive"')
    .query('ALK != "positive"')
)

In [296]:
low_cutoff_078 = checkmate.risk_score.quantile(1/3)

In [297]:
high_cutoff_078 = checkmate.risk_score.quantile(2/3)

In [298]:
print('Nivolumab total:',  checkmate.query('nivo == 1').shape[0])
print('High risk:', checkmate.query('nivo == 1').query('risk_score >= @high_cutoff_078').shape[0])
print('Med risk:', checkmate.query('nivo == 1').query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0])
print('Low risk:', checkmate.query('nivo == 1').query('risk_score <= @low_cutoff_078').shape[0])

Nivolumab total: 3339
High risk: 1112
Med risk: 1114
Low risk: 1113


In [299]:
print('Docetaxel total:',  checkmate.query('nivo == 0').shape[0])
print('High risk:', checkmate.query('nivo == 0').query('risk_score >= @high_cutoff_078').shape[0])
print('Med risk:', checkmate.query('nivo == 0').query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0])
print('Low risk:', checkmate.query('nivo == 0').query('risk_score <= @low_cutoff_078').shape[0])

Docetaxel total: 734
High risk: 246
Med risk: 243
Low risk: 245


#### Survival curves with covariate balancing 

In [300]:
checkmate = checkmate.set_index('PatientID')

In [301]:
check_iptw = checkmate.filter(items = ['death_status',
                                       'timerisk_treatment',
                                       'nivo',
                                       'age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'adv_year',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_diagnosis',
                                       'pdl1',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [302]:
check_iptw['met_cat'] = pd.cut(check_iptw['adv_year'],
                               bins = [2010, 2015, float('inf')],
                               labels = ['11-15', '16-20'])

In [303]:
conditions = [
    ((check_iptw['pdl1'] == "1-49%") | (check_iptw['pdl1'] == "50-100%"))]

choices = ['>0%']

check_iptw['pdl1_cat'] = np.select(conditions, choices, default = check_iptw['pdl1'])

In [304]:
conditions = [
    ((check_iptw['ecog_diagnosis'] == "1.0") | (check_iptw['ecog_diagnosis'] == "0.0")),  
    ((check_iptw['ecog_diagnosis'] == "2.0") | (check_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

check_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [305]:
check_iptw.dtypes

death_status               bool
timerisk_treatment      float64
nivo                      int64
age                       int64
gender                   object
race                     object
PracticeType             object
Histology                object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
pdl1                     object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
pdl1_cat                 object
ecog_2                   object
dtype: object

In [306]:
to_be_categorical = list(check_iptw.select_dtypes(include = ['object']).columns)

In [307]:
to_be_categorical

['gender',
 'race',
 'PracticeType',
 'Histology',
 'ecog_diagnosis',
 'pdl1',
 'pdl1_cat',
 'ecog_2']

In [308]:
to_be_categorical.append('met_cat')

In [309]:
to_be_categorical.remove('pdl1')

In [310]:
to_be_categorical.remove('ecog_diagnosis')

In [311]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    check_iptw[x] = check_iptw[x].astype('category')

In [312]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [313]:
# List of categorical features.
categorical_features = list(check_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [314]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [315]:
check_iptw_low = (
    check_iptw
    .query('risk_score <= @low_cutoff_078'))

check_iptw_med = (
    check_iptw
    .query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078'))

check_iptw_high = (
    check_iptw
    .query('risk_score >= @high_cutoff_078'))

check_iptw_all = check_iptw

In [316]:
check_low_x = preprocessor.fit_transform(check_iptw_low.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

check_med_x = preprocessor.fit_transform(check_iptw_med.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

check_high_x = preprocessor.fit_transform(check_iptw_high.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'Histology',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2',
                                                                          'pdl1_cat',
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

check_all_x = preprocessor.fit_transform(check_iptw_all.filter(items = ['age',
                                                                        'gender',
                                                                        'race',
                                                                        'PracticeType',
                                                                        'Histology',
                                                                        'met_cat',
                                                                        'delta_adv_diagnosis',
                                                                        'commercial',
                                                                        'medicare',
                                                                        'medicaid',
                                                                        'ecog_2',
                                                                        'pdl1_cat',
                                                                        'albumin_diag',
                                                                        'weight_pct_change',
                                                                        'risk_score']))

In [317]:
lr_check_low = LogisticRegression(max_iter = 1000)
lr_check_low.fit(check_low_x, check_iptw_low['nivo'])

LogisticRegression(max_iter=1000)

In [318]:
lr_check_med = LogisticRegression(max_iter = 1000)
lr_check_med.fit(check_med_x, check_iptw_med['nivo'])

LogisticRegression(max_iter=1000)

In [319]:
lr_check_high = LogisticRegression(max_iter = 1000)
lr_check_high.fit(check_high_x, check_iptw_high['nivo'])

LogisticRegression(max_iter=1000)

In [320]:
lr_check_all = LogisticRegression(max_iter = 1000)
lr_check_all.fit(check_all_x, check_iptw_all['nivo'])

LogisticRegression(max_iter=1000)

In [321]:
pred_low = lr_check_low.predict_proba(check_low_x)
pred_med = lr_check_med.predict_proba(check_med_x)
pred_high = lr_check_high.predict_proba(check_high_x)
pred_all = lr_check_all.predict_proba(check_all_x)

In [322]:
check_iptw_low['ps'] = pred_low[:, 1]
check_iptw_med['ps'] = pred_med[:, 1]
check_iptw_high['ps'] = pred_high[:, 1]
check_iptw_all['ps'] = pred_all[:, 1]

In [323]:
check_iptw_low['weight'] = (
    np.where(check_iptw_low['nivo'] == 1, 1/check_iptw_low['ps'], 1/(1 - check_iptw_low['ps'])))

check_iptw_med['weight'] = (
    np.where(check_iptw_med['nivo'] == 1, 1/check_iptw_med['ps'], 1/(1 - check_iptw_med['ps'])))

check_iptw_high['weight'] = (
    np.where(check_iptw_high['nivo'] == 1, 1/check_iptw_high['ps'], 1/(1 - check_iptw_high['ps'])))

check_iptw_all['weight'] = (
    np.where(check_iptw_all['nivo'] == 1, 1/check_iptw_all['ps'], 1/(1 - check_iptw_all['ps'])))

In [324]:
# Low KM curves
kmf_low_nivo_check_iptw = KaplanMeierFitter()
kmf_low_dotx_check_iptw = KaplanMeierFitter()

kmf_low_nivo_check_iptw.fit(
    check_iptw_low.query('nivo == 1').timerisk_treatment/30,
    check_iptw_low.query('nivo == 1').death_status,
    weights = check_iptw_low.query('nivo == 1')['weight'])

kmf_low_dotx_check_iptw.fit(
    check_iptw_low.query('nivo == 0').timerisk_treatment/30,
    check_iptw_low.query('nivo == 0').death_status,
    weights = check_iptw_low.query('nivo == 0')['weight'])

# Med KM curves
kmf_med_nivo_check_iptw = KaplanMeierFitter()
kmf_med_dotx_check_iptw = KaplanMeierFitter()

kmf_med_nivo_check_iptw.fit(
    check_iptw_med.query('nivo == 1').timerisk_treatment/30,
    check_iptw_med.query('nivo == 1').death_status,
    weights = check_iptw_med.query('nivo == 1')['weight'])

kmf_med_dotx_check_iptw.fit(
    check_iptw_med.query('nivo == 0').timerisk_treatment/30,
    check_iptw_med.query('nivo == 0').death_status,
    weights = check_iptw_med.query('nivo == 0')['weight'])

# High KM curves 
kmf_high_nivo_check_iptw = KaplanMeierFitter()
kmf_high_dotx_check_iptw = KaplanMeierFitter()

kmf_high_nivo_check_iptw.fit(
    check_iptw_high.query('nivo == 1').timerisk_treatment/30,
    check_iptw_high.query('nivo == 1').death_status,
    weights = check_iptw_high.query('nivo == 1')['weight'])

kmf_high_dotx_check_iptw.fit(
    check_iptw_high.query('nivo == 0').timerisk_treatment/30,
    check_iptw_high.query('nivo == 0').death_status,
    weights = check_iptw_high.query('nivo == 0')['weight'])

# All KM curves 
kmf_all_nivo_check_iptw = KaplanMeierFitter()
kmf_all_dotx_check_iptw = KaplanMeierFitter()

kmf_all_nivo_check_iptw.fit(
    check_iptw_all.query('nivo == 1').timerisk_treatment/30,
    check_iptw_all.query('nivo == 1').death_status,
    weights = check_iptw_all.query('nivo == 1')['weight'])

kmf_all_dotx_check_iptw.fit(
    check_iptw_all.query('nivo == 0').timerisk_treatment/30,
    check_iptw_all.query('nivo == 0').death_status,
    weights = check_iptw_all.query('nivo == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 4510.31 total observations, 768.868 right-censored observations>

#### Calculating survival metrics 

In [325]:
nivo_check_median_os = mos(kmf_low_nivo_check_iptw, 
                           kmf_med_nivo_check_iptw,
                           kmf_high_nivo_check_iptw,
                           kmf_all_nivo_check_iptw)

dotx_check_median_os = mos(kmf_low_dotx_check_iptw,
                           kmf_med_dotx_check_iptw,
                           kmf_high_dotx_check_iptw,
                           kmf_all_dotx_check_iptw)

In [326]:
check_iptw_all_imputed = check_iptw_all.copy()
check_iptw_all_imputed['albumin_diag'] = check_iptw_all_imputed['albumin_diag'].fillna(check_iptw_all_imputed['albumin_diag'].median())
check_iptw_all_imputed['weight_pct_change'] = check_iptw_all_imputed['weight_pct_change'].fillna(check_iptw_all_imputed['weight_pct_change'].median())

In [327]:
check_hr_all = CoxPHFitter()
check_hr_all.fit(check_iptw_all_imputed,
                 duration_col = 'timerisk_treatment',
                 event_col = 'death_status',
                 formula = 'nivo + age + gender + race + PracticeType + Histology + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + pdl1_cat + albumin_diag + weight_pct_change + risk_score',
                 weights_col = 'weight',
                 robust = True)

<lifelines.CoxPHFitter: fitted with 8573.55 total observations, 1516.25 right-censored observations>

In [328]:
check_all_rmst_mos_95 = rmst_mos_95ci(check_iptw_all,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [329]:
check_low_rmst_mos_95 = rmst_mos_95ci(check_iptw_low,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [330]:
check_med_rmst_mos_95 = rmst_mos_95ci(check_iptw_med,
                                      1000,
                                      'nivo',
                                      'death',
                                      ['age',
                                       'gender',
                                       'race',
                                       'PracticeType',
                                       'Histology',
                                       'met_cat',
                                       'delta_adv_diagnosis',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ecog_2',
                                       'pdl1_cat',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'],
                                      ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [331]:
check_high_rmst_mos_95 = rmst_mos_95ci(check_iptw_high,
                                       1000,
                                       'nivo',
                                       'death',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'Histology',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2',
                                        'pdl1_cat',
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [332]:
check_data = [
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'low', 
     'r_trt_mos': nivo_check_median_os[0],
     'r_trt_mos_95': check_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[0],
     'r_cont_mos_95': check_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[0] - dotx_check_median_os[0], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'trt_rmst': restricted_mean_survival_time(kmf_low_nivo_check_iptw, 36),
     'trt_rmst_95': check_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_dotx_check_iptw, 36),
     'cont_rmst_95': check_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_low_dotx_check_iptw, 36),
     'diff_rmst_95': check_low_rmst_mos_95.difference_rmst_95,
     'rcount': checkmate.query('risk_score <= @low_cutoff_078').shape[0],
     'rcount_chemo': checkmate.query('risk_score <= @low_cutoff_078').query('nivo == 0').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'medium', 
     'r_trt_mos': nivo_check_median_os[1],
     'r_trt_mos_95': check_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[1],
     'r_cont_mos_95': check_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[1] - dotx_check_median_os[1], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'trt_rmst': restricted_mean_survival_time(kmf_med_nivo_check_iptw, 36),
     'trt_rmst_95': check_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_dotx_check_iptw, 36),
     'cont_rmst_95': check_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_med_dotx_check_iptw, 36),
     'diff_rmst_95': check_med_rmst_mos_95.difference_rmst_95,
     'rcount': checkmate.query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').shape[0],
     'rcount_chemo': checkmate.query('risk_score < @high_cutoff_078 and risk_score > @low_cutoff_078').query('nivo == 0').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'high', 
     'r_trt_mos': nivo_check_median_os[2],
     'r_trt_mos_95': check_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[2],
     'r_cont_mos_95': check_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[2] - dotx_check_median_os[2], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'trt_rmst': restricted_mean_survival_time(kmf_high_nivo_check_iptw, 36),
     'trt_rmst_95': check_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_dotx_check_iptw, 36),
     'cont_rmst_95': check_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_nivo_check_iptw, 36) - restricted_mean_survival_time(kmf_high_dotx_check_iptw, 36),
     'diff_rmst_95': check_high_rmst_mos_95.difference_rmst_95,
     'rcount': checkmate.query('risk_score >= @high_cutoff_078').shape[0],
     'rcount_chemo': checkmate.query('risk_score >= @high_cutoff_078').query('nivo == 0').shape[0]},
    
    {'trial_name': 'CHECKMATE-078', 
     'risk_group': 'all', 
     'r_hr': check_hr_all.hazard_ratios_['nivo'],
     'r_hr_95': [check_hr_all.summary.loc['nivo']['exp(coef) lower 95%'], check_hr_all.summary.loc['nivo']['exp(coef) upper 95%']],
     'r_trt_mos': nivo_check_median_os[3],
     'r_trt_mos_95': check_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': dotx_check_median_os[3],
     'r_cont_mos_95': check_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': nivo_check_median_os[3] - dotx_check_median_os[3], 
     'rct_trt_arm': 11.9, 
     'rct_cont_arm': 9.5,
     'rct_mos_diff': 11.9-9.5,
     'rcount': checkmate.shape[0], 
     'rcount_chemo': checkmate.query('nivo == 0').shape[0]}
]

### FLAURA: osimertinib vs. gefitinib or erlotinib

**INCLUSION CRITERIA**
* Untreated stage IV NSCLC
* Received first line osimertinib or gefitinib or erlotinib

#### Osimertinib

In [333]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

68483

In [334]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [335]:
flaura_osim = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Osimertinib"')
    [['PatientID', 'StartDate']]
)

In [336]:
flaura_osim.loc[:, 'osim'] = 1

In [337]:
row_ID(flaura_osim)

(1241, 1241)

#### Gefitinib or Erlotinib

In [338]:
flaura_gefer = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('IsMaintenanceTherapy == False')
    .query('LineName == "Gefitinib" or LineName == "Erlotinib"')
    [['PatientID', 'StartDate']]
)

In [339]:
flaura_gefer.loc[:, 'osim'] = 0

In [340]:
row_ID(flaura_gefer)

(3003, 3003)

In [341]:
flaura = pd.concat([flaura_osim, flaura_gefer])

In [342]:
row_ID(flaura)

(4244, 4244)

In [343]:
flaura = pd.merge(flaura, df_full, on = 'PatientID', how = 'left')

In [344]:
row_ID(flaura)

(4244, 4244)

In [345]:
flaura['StartDate'] = pd.to_datetime(flaura['StartDate'])

#### Time from treatment to death/progression or censor 

In [346]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [347]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [348]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [349]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [350]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(68483, 68483)

In [351]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [352]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [353]:
row_ID(mortality)

(68483, 68483)

In [354]:
flaura = pd.merge(flaura, mortality, on = 'PatientID', how = 'left')

In [355]:
row_ID(flaura)

(4244, 4244)

In [356]:
progression = pd.read_csv('Enhanced_AdvNSCLCProgression.csv')

In [357]:
progression = progression[progression.PatientID.isin(flaura.PatientID)][['PatientID', 'ProgressionDate']]

In [358]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [359]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [360]:
row_ID(progression)

(4243, 4243)

In [361]:
flaura = pd.merge(flaura, progression, on = 'PatientID', how = 'left')

In [362]:
row_ID(flaura)

(4244, 4244)

In [363]:
# Percent without progression date in Flaura trial
len(flaura.query('ProgressionDate.isna()', engine = 'python'))/len(flaura)

0.3437794533459001

In [364]:
conditions = [
    (flaura.ProgressionDate.notna()),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 1)),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 0))]

choices = [
    (flaura['ProgressionDate'] - flaura['StartDate']).dt.days,
    (flaura['death_date'] - flaura['StartDate']).dt.days,
    (flaura['last_activity'] - flaura['StartDate']).dt.days]

flaura.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [365]:
flaura = flaura.query('time_prog_treatment >= 0')

In [366]:
row_ID(flaura)

(3622, 3622)

In [367]:
conditions = [
    (flaura.ProgressionDate.notna()),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 1)),
    ((flaura.ProgressionDate.isna()) & (flaura['death_status'] == 0))]

choices = [1, 1, 0]

flaura.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient count 

In [368]:
low_cutoff_fl = flaura.risk_score.quantile(1/3)

In [369]:
high_cutoff_fl = flaura.risk_score.quantile(2/3)

In [370]:
print('Osimertinib total:', flaura.query('osim == 1').shape[0])
print('High risk:', flaura.query('osim == 1').query('risk_score >= @high_cutoff_fl').shape[0])
print('Med risk:', flaura.query('osim == 1').query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl').shape[0])
print('Low risk:', flaura.query('osim == 1').query('risk_score <= @low_cutoff_fl').shape[0])

Osimertinib total: 1096
High risk: 310
Med risk: 343
Low risk: 443


In [371]:
print('Gefitinib or Erlotinib total:', flaura.query('osim == 0').shape[0])
print('High risk:', flaura.query('osim == 0').query('risk_score >= @high_cutoff_fl').shape[0])
print('Med risk:', flaura.query('osim == 0').query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl').shape[0])
print('Low risk:', flaura.query('osim == 0').query('risk_score <= @low_cutoff_fl').shape[0])

Gefitinib or Erlotinib total: 2526
High risk: 898
Med risk: 863
Low risk: 765


#### PFS with covariate balancing 

In [372]:
flaura = flaura.set_index('PatientID')

In [373]:
flaura['pfs_status'] = flaura['pfs_status'].astype('bool')

In [374]:
flaura_iptw = flaura.filter(items = ['pfs_status',
                                     'time_prog_treatment',
                                     'osim',
                                     'age',
                                     'gender',
                                     'race',
                                     'PracticeType',
                                     'adv_year',
                                     'delta_adv_diagnosis',
                                     'commercial',
                                     'medicare',
                                     'medicaid',
                                     'ecog_diagnosis',
                                     'albumin_diag',
                                     'weight_pct_change',
                                     'risk_score'])

In [375]:
flaura_iptw['met_cat'] = pd.cut(flaura_iptw['adv_year'],
                                bins = [2010, 2018, float('inf')],
                                labels = ['11-18', '19-21'])

In [376]:
conditions = [
    ((flaura_iptw['ecog_diagnosis'] == "1.0") | (flaura_iptw['ecog_diagnosis'] == "0.0")),  
    ((flaura_iptw['ecog_diagnosis'] == "2.0") | (flaura_iptw['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

flaura_iptw['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [377]:
flaura_iptw.dtypes

pfs_status                 bool
time_prog_treatment     float64
osim                      int64
age                       int64
gender                   object
race                     object
PracticeType             object
adv_year                  int64
delta_adv_diagnosis       int64
commercial              float64
medicare                float64
medicaid                float64
ecog_diagnosis           object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
met_cat                category
ecog_2                   object
dtype: object

In [378]:
to_be_categorical = list(flaura_iptw.select_dtypes(include = ['object']).columns)

In [379]:
to_be_categorical

['gender', 'race', 'PracticeType', 'ecog_diagnosis', 'ecog_2']

In [380]:
to_be_categorical.append('met_cat')

In [381]:
to_be_categorical.remove('ecog_diagnosis')

In [382]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    flaura_iptw[x] = flaura_iptw[x].astype('category')

In [383]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [384]:
# List of categorical features.
categorical_features = list(flaura_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [385]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [386]:
flaura_iptw_low = (
    flaura_iptw
    .query('risk_score <= @low_cutoff_fl'))

flaura_iptw_med = (
    flaura_iptw
    .query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl'))

flaura_iptw_high = (
    flaura_iptw
    .query('risk_score >= @high_cutoff_fl'))

flaura_iptw_all = flaura_iptw

In [387]:
flaura_low_x = preprocessor.fit_transform(flaura_iptw_low.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2', 
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

flaura_med_x = preprocessor.fit_transform(flaura_iptw_med.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2', 
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

flaura_high_x = preprocessor.fit_transform(flaura_iptw_high.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'PracticeType',
                                                                            'met_cat',
                                                                            'delta_adv_diagnosis',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ecog_2', 
                                                                            'albumin_diag',
                                                                            'weight_pct_change',
                                                                            'risk_score']))

flaura_all_x = preprocessor.fit_transform(flaura_iptw_all.filter(items = ['age',
                                                                          'gender',
                                                                          'race',
                                                                          'PracticeType',
                                                                          'met_cat',
                                                                          'delta_adv_diagnosis',
                                                                          'commercial',
                                                                          'medicare',
                                                                          'medicaid',
                                                                          'ecog_2', 
                                                                          'albumin_diag',
                                                                          'weight_pct_change',
                                                                          'risk_score']))

In [388]:
lr_flaura_low = LogisticRegression(max_iter = 1000)
lr_flaura_low.fit(flaura_low_x, flaura_iptw_low['osim'])

LogisticRegression(max_iter=1000)

In [389]:
lr_flaura_med = LogisticRegression(max_iter = 1000)
lr_flaura_med.fit(flaura_med_x, flaura_iptw_med['osim'])

LogisticRegression(max_iter=1000)

In [390]:
lr_flaura_high = LogisticRegression(max_iter = 1000)
lr_flaura_high.fit(flaura_high_x, flaura_iptw_high['osim'])

LogisticRegression(max_iter=1000)

In [391]:
lr_flaura_all = LogisticRegression(max_iter = 1000)
lr_flaura_all.fit(flaura_all_x, flaura_iptw_all['osim'])

LogisticRegression(max_iter=1000)

In [392]:
pred_low = lr_flaura_low.predict_proba(flaura_low_x)
pred_med = lr_flaura_med.predict_proba(flaura_med_x)
pred_high = lr_flaura_high.predict_proba(flaura_high_x)
pred_all = lr_flaura_all.predict_proba(flaura_all_x)

In [393]:
flaura_iptw_low['ps'] = pred_low[:, 1]
flaura_iptw_med['ps'] = pred_med[:, 1]
flaura_iptw_high['ps'] = pred_high[:, 1]
flaura_iptw_all['ps'] = pred_all[:, 1]

In [394]:
flaura_iptw_low['weight'] = (
    np.where(flaura_iptw_low['osim'] == 1, 1/flaura_iptw_low['ps'], 1/(1 - flaura_iptw_low['ps'])))

flaura_iptw_med['weight'] = (
    np.where(flaura_iptw_med['osim'] == 1, 1/flaura_iptw_med['ps'], 1/(1 - flaura_iptw_med['ps'])))

flaura_iptw_high['weight'] = (
    np.where(flaura_iptw_high['osim'] == 1, 1/flaura_iptw_high['ps'], 1/(1 - flaura_iptw_high['ps'])))

flaura_iptw_all['weight'] = (
    np.where(flaura_iptw_all['osim'] == 1, 1/flaura_iptw_all['ps'], 1/(1 - flaura_iptw_all['ps'])))

In [395]:
# Low KM curves
kmf_low_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_low_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_low_osim_flaura_iptw_pfs.fit(
    flaura_iptw_low.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_low.query('osim == 1').pfs_status,
    weights = flaura_iptw_low.query('osim == 1')['weight'])

kmf_low_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_low.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_low.query('osim == 0').pfs_status,
    weights = flaura_iptw_low.query('osim == 0')['weight'])

# Med KM curves
kmf_med_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_med_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_med_osim_flaura_iptw_pfs.fit(
    flaura_iptw_med.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_med.query('osim == 1').pfs_status,
    weights = flaura_iptw_med.query('osim == 1')['weight'])

kmf_med_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_med.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_med.query('osim == 0').pfs_status,
    weights = flaura_iptw_med.query('osim == 0')['weight'])

# High KM curves 
kmf_high_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_high_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_high_osim_flaura_iptw_pfs.fit(
    flaura_iptw_high.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_high.query('osim == 1').pfs_status,
    weights = flaura_iptw_high.query('osim == 1')['weight'])

kmf_high_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_high.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_high.query('osim == 0').pfs_status,
    weights = flaura_iptw_high.query('osim == 0')['weight'])

# All KM curves 
kmf_all_osim_flaura_iptw_pfs = KaplanMeierFitter()
kmf_all_gefer_flaura_iptw_pfs = KaplanMeierFitter()

kmf_all_osim_flaura_iptw_pfs.fit(
    flaura_iptw_all.query('osim == 1').time_prog_treatment/30,
    flaura_iptw_all.query('osim == 1').pfs_status,
    weights = flaura_iptw_all.query('osim == 1')['weight'])

kmf_all_gefer_flaura_iptw_pfs.fit(
    flaura_iptw_all.query('osim == 0').time_prog_treatment/30,
    flaura_iptw_all.query('osim == 0').pfs_status,
    weights = flaura_iptw_all.query('osim == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 3578.92 total observations, 482.603 right-censored observations>

#### Calculate survival metrics 

In [396]:
osim_flaura_median_pfs = mos(kmf_low_osim_flaura_iptw_pfs,
                             kmf_med_osim_flaura_iptw_pfs,
                             kmf_high_osim_flaura_iptw_pfs,
                             kmf_all_osim_flaura_iptw_pfs)

gefer_flaura_median_pfs = mos(kmf_low_gefer_flaura_iptw_pfs,
                              kmf_med_gefer_flaura_iptw_pfs,
                              kmf_high_gefer_flaura_iptw_pfs,
                              kmf_all_gefer_flaura_iptw_pfs)

In [397]:
flaura_iptw_all_imputed = flaura_iptw_all.copy()
flaura_iptw_all_imputed['albumin_diag'] = flaura_iptw_all_imputed['albumin_diag'].fillna(flaura_iptw_all_imputed['albumin_diag'].median())
flaura_iptw_all_imputed['weight_pct_change'] = flaura_iptw_all_imputed['weight_pct_change'].fillna(flaura_iptw_all_imputed['weight_pct_change'].median())

In [398]:
flaura_hr_all = CoxPHFitter()
flaura_hr_all.fit(flaura_iptw_all_imputed,
                  duration_col = 'time_prog_treatment',
                  event_col = 'pfs_status',
                  formula = 'osim + age + gender + race + PracticeType + met_cat + delta_adv_diagnosis + commercial + medicare + medicaid + ecog_2 + albumin_diag + weight_pct_change + risk_score',
                  weights_col = 'weight',
                  robust = True)

<lifelines.CoxPHFitter: fitted with 7115.6 total observations, 1554.47 right-censored observations>

In [399]:
flaura_all_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_all,
                                       1000,
                                       'osim',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [400]:
flaura_low_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_low,
                                       1000,
                                       'osim',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [401]:
flaura_med_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_med,
                                       1000,
                                       'osim',
                                       'progression',
                                       ['age',
                                        'gender',
                                        'race',
                                        'PracticeType',
                                        'met_cat',
                                        'delta_adv_diagnosis',
                                        'commercial',
                                        'medicare',
                                        'medicaid',
                                        'ecog_2', 
                                        'albumin_diag',
                                        'weight_pct_change',
                                        'risk_score'],
                                       ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                       36)

In [402]:
flaura_high_rmst_mos_95 = rmst_mos_95ci(flaura_iptw_high,
                                        1000,
                                        'osim',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'PracticeType',
                                         'met_cat',
                                         'delta_adv_diagnosis',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ecog_2', 
                                         'albumin_diag',
                                         'weight_pct_change',
                                         'risk_score'],
                                        ['age', 'delta_adv_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        36)

In [403]:
flaura_data = [
    {'trial_name': 'FLAURA', 
     'risk_group': 'low', 
     'r_trt_mos': osim_flaura_median_pfs[0],
     'r_trt_mos_95': flaura_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': gefer_flaura_median_pfs[0],
     'r_cont_mos_95': flaura_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': osim_flaura_median_pfs[0] - gefer_flaura_median_pfs[0], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     'trt_rmst': restricted_mean_survival_time(kmf_low_osim_flaura_iptw_pfs, 36),
     'trt_rmst_95': flaura_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_gefer_flaura_iptw_pfs, 36),
     'cont_rmst_95': flaura_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_osim_flaura_iptw_pfs, 36) - restricted_mean_survival_time(kmf_low_gefer_flaura_iptw_pfs, 36),
     'diff_rmst_95': flaura_low_rmst_mos_95.difference_rmst_95,
     'rcount': flaura.query('risk_score <= @low_cutoff_fl').shape[0]},
    
    {'trial_name': 'FLAURA', 
     'risk_group': 'medium', 
     'r_trt_mos': osim_flaura_median_pfs[1],
     'r_trt_mos_95': flaura_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': gefer_flaura_median_pfs[1],
     'r_cont_mos_95': flaura_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': osim_flaura_median_pfs[1] - gefer_flaura_median_pfs[1], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     'trt_rmst': restricted_mean_survival_time(kmf_med_osim_flaura_iptw_pfs, 36),
     'trt_rmst_95': flaura_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_gefer_flaura_iptw_pfs, 36),
     'cont_rmst_95': flaura_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_osim_flaura_iptw_pfs, 36) - restricted_mean_survival_time(kmf_med_gefer_flaura_iptw_pfs, 36),
     'diff_rmst_95': flaura_med_rmst_mos_95.difference_rmst_95,
     'rcount': flaura.query('risk_score < @high_cutoff_fl and risk_score > @low_cutoff_fl').shape[0]},
    
    {'trial_name': 'FLAURA', 
     'risk_group': 'high', 
     'r_trt_mos': osim_flaura_median_pfs[2],
     'r_trt_mos_95': flaura_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': gefer_flaura_median_pfs[2],
     'r_cont_mos_95': flaura_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': osim_flaura_median_pfs[2] - gefer_flaura_median_pfs[2], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     'trt_rmst': restricted_mean_survival_time(kmf_high_osim_flaura_iptw_pfs, 36),
     'trt_rmst_95': flaura_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_gefer_flaura_iptw_pfs, 36),
     'cont_rmst_95': flaura_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_osim_flaura_iptw_pfs, 36) - restricted_mean_survival_time(kmf_high_gefer_flaura_iptw_pfs, 36),
     'diff_rmst_95': flaura_high_rmst_mos_95.difference_rmst_95,
     'rcount': flaura.query('risk_score >= @high_cutoff_fl').shape[0]},
    
    {'trial_name': 'FLAURA', 
     'risk_group': 'all', 
     'r_hr': flaura_hr_all.hazard_ratios_['osim'],
     'r_hr_95': [flaura_hr_all.summary.loc['osim']['exp(coef) lower 95%'], flaura_hr_all.summary.loc['osim']['exp(coef) upper 95%']],
     'r_trt_mos': osim_flaura_median_pfs[3],
     'r_trt_mos_95': flaura_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': gefer_flaura_median_pfs[3],
     'r_cont_mos_95': flaura_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': osim_flaura_median_pfs[3] - gefer_flaura_median_pfs[3], 
     'rct_trt_arm': 18.9, 
     'rct_cont_arm': 10.2,
     'rct_mos_diff': 18.9-10.2,
     'rcount': flaura.shape[0]}
]

## Part 3. Combining dictionaries

In [404]:
data_combined = keynote_042_data + keynote_024_data + keynote_189_data + check_data + flaura_data

In [405]:
rtrials_mos_rmst_boot = pd.DataFrame(data_combined)

In [406]:
rtrials_mos_rmst_boot

Unnamed: 0,trial_name,risk_group,r_trt_mos,r_trt_mos_95,r_cont_mos,r_cont_mos_95,r_mos_diff,rct_trt_arm,rct_cont_arm,rct_mos_diff,trt_rmst,trt_rmst_95,cont_rmst,cont_rmst_95,diff_rmst,diff_rmst_95,rcount,rcount_chemo,r_hr,r_hr_95
0,KEYNOTE-042,low,24.166667,"[20.4, 29.43583333333333]",24.766667,"[21.530833333333334, 29.00333333333333]",-0.6,16.7,12.1,4.6,22.239172,"[20.960000241759417, 23.677822262317378]",23.344637,"[21.95403578463139, 24.578117882776628]",-1.105465,"[-2.862508262657949, 0.7294561044346117]",1849,954.0,,
1,KEYNOTE-042,medium,15.4,"[12.933333333333334, 19.9]",15.033333,"[12.966666666666667, 17.733333333333334]",0.366667,16.7,12.1,4.6,18.410023,"[17.110196613225717, 19.703239652373597]",18.044604,"[16.66171233443578, 19.61827783589982]",0.365418,"[-1.6152399731520042, 2.193670062130715]",1849,865.0,,
2,KEYNOTE-042,high,4.3,"[3.6991666666666667, 4.966666666666667]",5.4,"[4.7, 6.266666666666667]",-1.1,16.7,12.1,4.6,10.891993,"[9.881386317957233, 11.943817389368826]",9.724514,"[8.755005246089425, 10.732635328865777]",1.167479,"[-0.16827551382957134, 2.5872902965460725]",1850,815.0,,
3,KEYNOTE-042,all,13.5,"[11.966666666666667, 15.3]",12.933333,"[12.0325, 14.500833333333333]",0.566667,16.7,12.1,4.6,,,,,,,5548,2634.0,1.014601,"[0.9230721104756876, 1.115205225953882]"
4,KEYNOTE-024,low,9.233333,"[7.7, 11.0]",8.4,"[5.5, 9.833333333333334]",0.833333,10.3,6.0,4.3,9.956798,"[9.346125487300682, 10.566334015081466]",9.139877,"[7.885692856285285, 10.550728439867227]",0.81692,"[-0.6706356081985381, 2.1848193081110816]",707,160.0,,
5,KEYNOTE-024,medium,5.333333,"[3.966666666666667, 6.2033333333333305]",5.266667,"[4.2, 7.702499999999998]",0.066667,10.3,6.0,4.3,7.789754,"[7.169883760611351, 8.388688062312188]",7.779783,"[6.423730314265886, 9.381445343418036]",0.009971,"[-1.7634501658420163, 1.5994456516255215]",706,143.0,,
6,KEYNOTE-024,high,2.233333,"[2.0, 2.433333333333333]",2.8,"[2.1325, 3.433333333333333]",-0.566667,10.3,6.0,4.3,4.757815,"[4.246486165082353, 5.293930436019774]",3.970017,"[3.2240106011566763, 4.710977286004878]",0.787798,"[-0.10930832033262405, 1.6702056836108747]",707,161.0,,
7,KEYNOTE-024,all,4.266667,"[3.8, 4.967499999999999]",4.566667,"[3.765, 5.266666666666667]",-0.3,10.3,6.0,4.3,,,,,,,2120,464.0,0.911244,"[0.8044985854177044, 1.0321539839177478]"
8,KEYNOTE-189,low,23.866667,"[20.791666666666668, 27.436666666666664]",19.833333,"[19.266666666666666, 20.666666666666668]",4.033333,22.0,10.6,11.4,22.724544,"[21.681313312045813, 23.776956423578415]",21.115921,"[20.794783933429645, 21.439350399415684]",1.608622,"[0.5351315115205044, 2.715588040783103]",9680,7972.0,,
9,KEYNOTE-189,medium,12.4,"[10.666666666666666, 14.1]",11.033333,"[10.633333333333333, 11.4]",1.366667,22.0,10.6,11.4,16.028986,"[14.960477864420433, 17.11580883873035]",15.238015,"[14.93332873958801, 15.553760974269519]",0.790971,"[-0.3740074119288269, 1.9250885375657658]",9680,7838.0,,


In [407]:
rtrials_mos_rmst_boot.to_csv('rtrials_mos_rmst_boot.csv', index = False)