# Flatiron Health mBC: Survival metrics for strict elgibility criteria
**Background: Calculate survival metrics for emulated trials involving patients meeting strict elgibliity criteria. Hazard ratio for the full cohort is calculated from a Cox-IPTW model. Restricted mean survival time and median overall survival are calculated for phenotypes using an IPTW-adjusted KM curve.** 

## Part 1: Identify patients with exclusion criteria

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

In [3]:
train = pd.read_csv('train_full.csv')
row_ID(train)

(25341, 25341)

In [4]:
test = pd.read_csv('test_full.csv')
row_ID(test)

(6336, 6336)

In [5]:
df = pd.concat([train, test], ignore_index = True)
row_ID(df)

(31677, 31677)

### 1. Relevant comorbidities in the year preceding metastatic diagnosis 
* HIV
* Psychosis and other significant psychiatric disorders 

In [6]:
diagnosis = pd.read_csv('Diagnosis.csv')

In [7]:
diagnosis = diagnosis[diagnosis['PatientID'].isin(df['PatientID'])]       

In [8]:
diagnosis.loc[:, 'DiagnosisDate'] = pd.to_datetime(diagnosis['DiagnosisDate'])

In [9]:
enhanced_met = pd.read_csv('Enhanced_MetastaticBreast.csv')

In [10]:
enhanced_met.loc[:, 'MetDiagnosisDate'] = pd.to_datetime(enhanced_met['MetDiagnosisDate'])

In [11]:
row_ID(diagnosis)

(1575735, 31677)

In [12]:
diagnosis = pd.merge(diagnosis, enhanced_met[['PatientID', 'MetDiagnosisDate']], on = 'PatientID', how = 'left')

In [13]:
row_ID(diagnosis)

(1575735, 31677)

In [14]:
diagnosis.loc[:, 'date_diff'] = (diagnosis['DiagnosisDate'] - diagnosis['MetDiagnosisDate']).dt.days

In [15]:
diagnosis.loc[:, 'diagnosis_code'] = diagnosis['DiagnosisCode'].replace('\.', '', regex = True)

In [16]:
# ICD-9 dataframe with unique codes for each patient. 
diagnosis_9 = (
    diagnosis
    .query('date_diff <= 0 and date_diff > -365')
    .query('DiagnosisCodeSystem == "ICD-9-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [17]:
other_comorb_9_IDs = (
    diagnosis_9[diagnosis_9['diagnosis_code'].str.match('042|'
                                                        '29[5789]')].PatientID.unique())

In [18]:
len(other_comorb_9_IDs)

4

In [19]:
cardiac_9_IDs = (
    diagnosis_9[diagnosis_9['diagnosis_code'].str.match('4010|'
                                                        '410|'
                                                        '412|'
                                                        '425')].PatientID.unique()
)

In [20]:
len(cardiac_9_IDs)

66

In [21]:
# ICD-9 dataframe with unique codes for each patient. 
diagnosis_10 = (
    diagnosis
    .query('date_diff <= 0 and  date_diff > -365')
    .query('DiagnosisCodeSystem == "ICD-10-CM"')
    .drop_duplicates(subset = (['PatientID', 'DiagnosisCode']), keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [22]:
other_comorb_10_IDs = (
    diagnosis_10[diagnosis_10['diagnosis_code'].str.match('B20|'
                                                          'F2[024589]')].PatientID.unique())

In [23]:
len(other_comorb_10_IDs)

39

In [24]:
cardiac_10_IDs = (
    diagnosis_10[diagnosis_10['diagnosis_code'].str.match('I16|'
                                                          'I21|'
                                                          'I42')].PatientID.unique())

In [25]:
len(cardiac_10_IDs)

89

In [26]:
other_comorb_IDs = np.unique(np.concatenate([other_comorb_9_IDs, other_comorb_10_IDs]))

In [27]:
len(other_comorb_IDs)

43

In [28]:
cardiac_IDs = np.unique(np.concatenate([cardiac_9_IDs, cardiac_10_IDs]))

In [29]:
len(cardiac_IDs)

154

### 2. CNS metastasis at start of treatment 

In [30]:
mets = pd.read_csv('Enhanced_MetBreastSitesOfMet.csv')

In [31]:
mets = mets[mets['PatientID'].isin(df['PatientID'])]

In [32]:
mets.loc[:, 'DateOfMetastasis'] = pd.to_datetime(mets['DateOfMetastasis'])

In [33]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [34]:
line_therapy = line_therapy[line_therapy['PatientID'].isin(df['PatientID'])]       

In [35]:
line_therapy.loc[:, 'StartDate'] = pd.to_datetime(line_therapy['StartDate'])

In [36]:
therapy_fl = (
    line_therapy
    .query('LineNumber == 1')
    [['PatientID', 'StartDate']]
    .rename(columns = {'StartDate': 'StartDate_fl'}))

In [37]:
therapy_sec = (
    line_therapy
    .query('LineNumber == 2')
    [['PatientID', 'StartDate']]
    .rename(columns = {'StartDate': 'StartDate_sec'}))

In [38]:
row_ID(mets)

(78955, 31540)

In [39]:
mets_fl = pd.merge(mets, therapy_fl, on = 'PatientID', how = 'left')

In [40]:
row_ID(mets_fl)

(78955, 31540)

In [41]:
mets_sec = pd.merge(mets, therapy_sec, on = 'PatientID', how = 'left')

In [42]:
row_ID(mets_sec)

(78955, 31540)

In [43]:
cns_fl_IDs = (
    mets_fl
    .assign(date_diff_fl = (mets_fl['DateOfMetastasis'] - mets_fl['StartDate_fl']).dt.days)
    .query('date_diff_fl <= 0 and date_diff_fl > -90')
    .query('SiteOfMetastasis == "Brain" or SiteOfMetastasis == "CNS site"')
    .PatientID.unique()
)

In [44]:
len(cns_fl_IDs)

1498

In [45]:
cns_sec_IDs = (
    mets_sec
    .assign(date_diff_sec = (mets_sec['DateOfMetastasis'] - mets_sec['StartDate_sec']).dt.days)
    .query('date_diff_sec <= 0 and date_diff_sec > -90')
    .query('SiteOfMetastasis == "Brain" or SiteOfMetastasis == "CNS site"')
    .PatientID.unique()
)

In [46]:
len(cns_sec_IDs)

740

### 3. ECOG >2 at time of treatment  

In [47]:
base_ecog = pd.read_csv('BaselineECOG.csv')

In [48]:
base_ecog = base_ecog[base_ecog['PatientID'].isin(df['PatientID'])]       

In [49]:
ecog_fl_34_IDs = (
    base_ecog
    .query('LineNumber == 1')
    .query('ECOGValue == "3" or ECOGValue == "4"')
    .PatientID.unique())

In [50]:
len(ecog_fl_34_IDs)

826

In [51]:
ecog_fl_234_IDs = (
    base_ecog
    .query('LineNumber == 1')
    .query('ECOGValue == "2" or ECOGValue == "3" or ECOGValue == "4"')
    .PatientID.unique())

In [52]:
len(ecog_fl_234_IDs)

3063

In [53]:
ecog_sec_234_IDs = (
    base_ecog
    .query('LineNumber == 2')
    .query('ECOGValue == "2" or ECOGValue == "3" or ECOGValue == "4"')
    .PatientID.unique())

In [54]:
len(ecog_sec_234_IDs)

2216

### 4. Organ dysfunction at time of treatment
* Hemoglobin <9
* Creatinine >2
* Total bilirubin >3

In [55]:
lab = pd.read_csv('Lab.csv')

In [56]:
lab = lab[lab['PatientID'].isin(df['PatientID'])]

In [57]:
lab.loc[:, 'ResultDate'] = pd.to_datetime(lab['ResultDate']) 

In [58]:
row_ID(lab)

(30593212, 30259)

In [59]:
lab = pd.merge(lab, therapy_fl, on = 'PatientID', how = 'left')

In [60]:
row_ID(lab)

(30593212, 30259)

In [61]:
lab = pd.merge(lab, therapy_sec, on = 'PatientID', how = 'left')

In [62]:
# Select rows with clinically relevant labs.
lab_core = (
    lab[
    (lab['LOINC'] == "2160-0") |
    (lab['LOINC'] == "38483-4") | 
    (lab['LOINC'] == "718-7") |
    (lab['LOINC'] == "20509-6") |
    (lab['LOINC'] == "42719-5") |
    (lab['LOINC'] == "1975-2")]
    .filter(items = ['PatientID', 
                     'ResultDate', 
                     'LOINC', 
                     'LabComponent', 
                     'TestUnits', 
                     'TestUnitsCleaned', 
                     'TestResult', 
                     'TestResultCleaned', 
                     'StartDate_fl',
                     'StartDate_sec'])
)

In [63]:
conditions = [
    ((lab_core['LOINC'] == '2160-0') | (lab_core['LOINC'] == '38483-4')),
    ((lab_core['LOINC'] == '718-7') | (lab_core['LOINC'] == '20509-6')),
    ((lab_core['LOINC'] == '42719-5') | (lab_core['LOINC'] == '1975-2'))]

choices = ['creatinine', 
           'hemoglobin', 
           'total_bilirubin']

lab_core.loc[:, 'lab_name'] = np.select(conditions, choices)

In [64]:
row_ID(lab_core)

(2927968, 30127)

In [65]:
conditions = [
    (lab_core['lab_name'] == 'hemoglobin') & (lab_core['TestUnits'] == 'g/uL')]

choices = [lab_core['TestResultCleaned'] / 100000]

lab_core.loc[:, 'test_result_cleaned'] = np.select(conditions, choices, default = lab_core['TestResultCleaned'])

In [66]:
lab_fl = (
    lab_core
    .assign(date_diff_fl = (lab_core['ResultDate'] - lab_core['StartDate_fl']).dt.days)
    .query('date_diff_fl <= 0 and date_diff_fl > -90')
    .sort_values(by = ['PatientID', 'lab_name', 'date_diff_fl'], ascending = [True, True, False])
    .drop_duplicates(subset = ['PatientID', 'lab_name'], keep = 'first' )
)

In [67]:
# Select lab closest to date of advanced diagnosis and pivot to a wide table. 
lab_wide_fl = (
    lab_fl
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index())

lab_wide_fl.columns.name = None

In [68]:
lab_wide_fl.sample(3)

Unnamed: 0,PatientID,creatinine,hemoglobin,total_bilirubin
13529,FA661F42343B5,0.8,13.0,0.4
16626,FCB23331432FA,0.63,10.7,0.4
10010,F7AA38E7F9D34,0.73,13.9,0.5


In [69]:
ab_organ_fl_IDs = lab_wide_fl.query('creatinine > 2 or hemoglobin < 9 or total_bilirubin > 3').PatientID

In [70]:
len(ab_organ_fl_IDs)

1053

In [71]:
lab_sec = (
    lab_core
    .assign(date_diff_sec = (lab_core['ResultDate'] - lab_core['StartDate_sec']).dt.days)
    .query('date_diff_sec <= 0 and date_diff_sec > -90')
    .sort_values(by = ['PatientID', 'lab_name', 'date_diff_sec'], ascending = [True, True, False])
    .drop_duplicates(subset = ['PatientID', 'lab_name'], keep = 'first' )
)

In [72]:
# Select lab closest to date of advanced diagnosis and pivot to a wide table. 
lab_wide_sec = (
    lab_sec
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index())

lab_wide_sec.columns.name = None

In [73]:
lab_wide_sec.sample(3)

Unnamed: 0,PatientID,creatinine,hemoglobin,total_bilirubin
9989,F9EE2F7CC2A71,0.95,12.5,0.6
2395,F262CAFB9D02D,0.7,11.6,0.59
7615,F784A6F519839,0.61,10.5,0.47


In [74]:
ab_organ_sec_IDs = lab_wide_sec.query('creatinine > 2 or hemoglobin < 9 or total_bilirubin > 3').PatientID

In [75]:
len(ab_organ_sec_IDs)

967

In [76]:
del diagnosis
del diagnosis_10
del diagnosis_9
del enhanced_met
del lab
del lab_core
del lab_fl
del lab_sec
del lab_wide_fl
del lab_wide_sec
del line_therapy
del therapy_fl
del therapy_sec

## Part 2: In-silico trials 

### Import packages and create necessary functions

In [77]:
from scipy import stats

from sksurv.nonparametric import kaplan_meier_estimator
from survive import KaplanMeier, SurvivalData

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.utils import median_survival_times, restricted_mean_survival_time
from lifelines.statistics import logrank_test

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

import warnings

In [78]:
# Find index for value closest to input value. 
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [79]:
# Calculates median overeall suvival for risk groups. 
def mos(low, med, high, comp):
    low_os = low.median_survival_time_
    med_os = med.median_survival_time_
    high_os = high.median_survival_time_
    comp_os = comp.median_survival_time_
    mos = [low_os, med_os, high_os, comp_os]
    return (mos)

In [80]:
def rmst_mos_95ci(df, num_samples, drug, event, items_list, numerical_features, rmst_time):
    
    """
    Estimate the 95% confidence interval for RMST and mOS using bootstrap resampling.

    Parameters:
    - df: DataFrame containing survival data
    - num_samples: Number of bootstrap samples
    - drug: Treatment indicator variable
    - event: Event type ('death' or 'progression')
    - items_list: Feature list for IPTW 
    - numerical_features: List of numerical features
    - rmst_time: Time to calculate RMST 

    Returns:
    - mos_A_95: mOS 95% CI for treatment
    - mos_B_95: mOS 95% CI for control
    - rmst_A_95: RMST 95% CI for treatment
    - rmst_B_95: RMST 95% CI for control
    - difference_rmst_95: RMST 95% CI for difference between treatment and control 
    """
    
    np.random.seed(42)
    mos_A = []
    mos_B = []
    rmst_A_list = []
    rmst_B_list = []
    differences_rmst = []
    
    # Define variables based on the event type
    if event == 'death':
        time_column = 'timerisk_treatment'
        status_column = 'death_status'
        
    else:
        time_column = 'time_prog_treatment'
        status_column = 'pfs_status'
        
    # Set up preprocessor for logistical regression which will be for IPTW  
    numerical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('std_scaler', StandardScaler())])
        
    categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')
    categorical_features = list(df.select_dtypes(include = ['category']).columns)
        
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder = 'passthrough')
    
    # Boostrap HR 
    for _ in range(num_samples):
        
        # Resample data with replacement
        resampled_df = resample(df).drop(columns = ['ps', 'weight'])
        
        # Calculated IPTW for the resampled group 
        df_x = preprocessor.fit_transform(resampled_df.filter(items = items_list))
                                           
        df_lr = LogisticRegression(max_iter = 1000)
        df_lr.fit(df_x, resampled_df[drug])
        warnings.filterwarnings("ignore")
                                           
        pred = df_lr.predict_proba(df_x)        
        resampled_df['ps'] = pred[:, 1]                          
        resampled_df['weight'] = (
                np.where(resampled_df[drug] == 1, 1/resampled_df['ps'], 1/(1 - resampled_df['ps'])))
    
        # mOS from IPTW-KM
        kmf_A = KaplanMeierFitter()
        kmf_A.fit(resampled_df.query(f'{drug} == 1')[time_column]/30,
                  resampled_df.query(f'{drug} == 1')[status_column], 
                  weights = resampled_df.query(f'{drug} == 1')['weight'])

        kmf_B = KaplanMeierFitter()
        kmf_B.fit(resampled_df.query(f'{drug} == 0')[time_column]/30,
                  resampled_df.query(f'{drug} == 0')[status_column], 
                  weights = resampled_df.query(f'{drug} == 0')['weight'])
    
        mos_A.append(kmf_A.median_survival_time_)
        mos_B.append(kmf_B.median_survival_time_)
        
        # RMST from IPTW-KM
        rmst_A = restricted_mean_survival_time(kmf_A, rmst_time)
        rmst_B = restricted_mean_survival_time(kmf_B, rmst_time)
        
        rmst_A_list.append(rmst_A)
        rmst_B_list.append(rmst_B)
        differences_rmst.append(rmst_A - rmst_B)

    # Calculate the 95% confidence interval
    results = pd.Series({
    'mos_A_95': np.percentile(mos_A, [2.5, 97.5]),
    'mos_B_95': np.percentile(mos_B, [2.5, 97.5]),
    'rmst_A_95': np.percentile(rmst_A_list, [2.5, 97.5]),
    'rmst_B_95': np.percentile(rmst_B_list, [2.5, 97.5]),
    'difference_rmst_95': np.percentile(differences_rmst, [2.5, 97.5])
    })
    
    return results

In [81]:
cutoff = pd.read_csv('risk_cutoff_breast.csv', index_col = 0)

### PALOMA-2: palbociclib plus letrozole vs. letrozole in hormone-sensitive metastatic breast cancer

**INCLUSION**
* Untreated metastatic breast cancer
* Received first line palbociclib and aromatase inhibitor or aromatase inhibitor alone
* Luteinizing hormone–releasing hormone agonist (leuprolide, goserelin, and triptorelin) is allowed in either arm
* ER-positive within [-90, +30] days of first-line treatment
* HER-2 negative within [-90, +30] days of first-line treatment
* No relevant comorbidities in the year preceding metastatic diagnosis 
* No CNS metastasis at time of treatment 
* ECOG cannot be 3 or 4 at time of treatment 
* Adequate organ function at time of treatment 

#### Palbociclib plus AI 

In [82]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

31677

In [83]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [84]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1'))

In [85]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('Palbociclib')].LineName.value_counts().head(10)

Letrozole,Palbociclib                  2503
Fulvestrant,Palbociclib                1324
Anastrozole,Palbociclib                 466
Palbociclib                             261
Exemestane,Palbociclib                  123
Letrozole,Leuprolide,Palbociclib         98
Goserelin,Letrozole,Palbociclib          90
Fulvestrant,Letrozole,Palbociclib        40
Anastrozole,Fulvestrant,Palbociclib      37
Palbociclib,Tamoxifen                    31
Name: LineName, dtype: int64

In [86]:
# Of note, LH releasing hormone agonist (leuprolide, goserelin, and triptorelin) not excluded
ai = [
    'Anastrozole',
    'Letrozole',
    'Exemestane']

exc = [
    'Capecitabine',
    'Carboplatin',
    'Cisplatin',
    'Cyclophosphamide',
    'Cytarabine Liposomal',
    'Decitabine',
    'Docetaxel',
    'Doxorubicin',
    'Doxorubicin Pegylated Liposomal',
    'Eribulin',
    'Etoposide',
    'Fluorouracil',
    'Gemcitabine',
    'Hydroxyurea',
    'Leucovorin',
    'Methotrexate',
    'Oxaliplatin',
    'Paclitaxel',
    'Paclitaxel Protein-Bound',
    'Vinorelbine',
    'Nivolumab',
    'Pembrolizumab',
    'Abemaciclib',
    'Alpelisib',
    'Fulvestrant',
    'Ribociclib',
    'Tamoxifen',
    'Carfilzomib',
    'Daratumumab',
    'Everolimus',
    'Imatinib',
    'Lapatinib',
    'Lenalidomide',
    'Neratinib',
    'Olaparib',
    'Pazopanib',
    'Pertuzumab',
    'Rituximab',
    'Sorafenib',
    'Toremifene',
    'Trastuzumab',
    'Clinical Study Drug',
]

In [87]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                line_therapy_fl['LineName'].str.contains('Palbociclib') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(30)

Letrozole,Palbociclib                  2503
Anastrozole,Palbociclib                 466
Exemestane,Palbociclib                  123
Letrozole,Leuprolide,Palbociclib         98
Goserelin,Letrozole,Palbociclib          90
Anastrozole,Goserelin,Palbociclib        17
Anastrozole,Leuprolide,Palbociclib       17
Letrozole,Palbociclib,Triptorelin         5
Anastrozole,Palbociclib,Triptorelin       4
Exemestane,Goserelin,Palbociclib          4
Exemestane,Leuprolide,Palbociclib         3
Name: LineName, dtype: int64

In [88]:
let_palb = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                    line_therapy_fl['LineName'].str.contains('Palbociclib') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    [['PatientID', 'StartDate']]
)

In [89]:
row_ID(let_palb)

(3330, 3330)

In [90]:
let_palb.loc[:,'let_palb'] = 1

#### AI

In [91]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('Letrozole')].LineName.value_counts().head(10)

Letrozole                            2536
Letrozole,Palbociclib                2503
Letrozole,Ribociclib                  229
Abemaciclib,Letrozole                 190
Letrozole,Leuprolide,Palbociclib       98
Goserelin,Letrozole,Palbociclib        90
Fulvestrant,Letrozole                  64
Letrozole,Trastuzumab                  59
Fulvestrant,Letrozole,Palbociclib      40
Letrozole,Leuprolide                   40
Name: LineName, dtype: int64

In [92]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                ~line_therapy_fl['LineName'].str.contains('Palbociclib') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(40)

Anastrozole                      2943
Letrozole                        2536
Exemestane                        780
Letrozole,Leuprolide               40
Anastrozole,Leuprolide             35
Goserelin,Letrozole                26
Anastrozole,Goserelin              20
Exemestane,Goserelin                7
Exemestane,Leuprolide               4
Anastrozole,Triptorelin             2
Letrozole,Medroxyprogesterone       1
Letrozole,Triptorelin               1
Anastrozole,Megestrol               1
Name: LineName, dtype: int64

In [93]:
let = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                    ~line_therapy_fl['LineName'].str.contains('Palbociclib') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    [['PatientID', 'StartDate']]
)

In [94]:
row_ID(let)

(6396, 6396)

In [95]:
let.loc[:,'let_palb'] = 0

In [96]:
paloma2 = pd.concat([let_palb, let])

In [97]:
row_ID(paloma2)

(9726, 9726)

In [98]:
paloma2 = pd.merge(paloma2, df_full, on = 'PatientID', how = 'left')

In [99]:
row_ID(paloma2)

(9726, 9726)

In [100]:
paloma2['StartDate'] = pd.to_datetime(paloma2['StartDate'])

#### ER-positive and HER-2 negative 

In [101]:
biomarkers = pd.read_csv('Enhanced_MetBreastBiomarkers.csv')

In [102]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(paloma2['PatientID'])]

In [103]:
row_ID(biomarkers)

(69754, 9662)

In [104]:
biomarkers = pd.merge(biomarkers, paloma2[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [105]:
row_ID(biomarkers)

(69754, 9662)

In [106]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [107]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [108]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [109]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [110]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [111]:
er_status = (
    biomarkers
    .query('BiomarkerName == "ER"')
    .query('date_diff <= 30 and date_diff >= -90')
    .query('BiomarkerStatus == "Positive" or BiomarkerStatus == "Negative"') # don't select unknown values 
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False]) # select ER status closest to treatment start
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'er'})
   )

In [112]:
row_ID(er_status)

(6016, 6016)

In [113]:
her2_status = (
    biomarkers
    .query('BiomarkerName == "HER2"')
    .query('date_diff <= 30 and date_diff >= -90')
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'her2'})
)

In [114]:
row_ID(her2_status)

(5607, 5607)

In [115]:
paloma2 = pd.merge(paloma2, er_status, on  = 'PatientID', how = 'left')

In [116]:
row_ID(paloma2)

(9726, 9726)

In [117]:
paloma2 = pd.merge(paloma2, her2_status, on  = 'PatientID', how = 'left')

In [118]:
row_ID(paloma2)

(9726, 9726)

In [119]:
her2_neg = ['IHC negative (0-1+)',
            'FISH negative/not amplified',
            'IHC equivocal (2+)',
            'Negative NOS',
            'NGS negative (ERBB2 not amplified)',
            'FISH equivocal',
            'Equivocal NOS',
            'NGS equivocal (ERBB2 amplification equivocal)']

paloma2 = (
    paloma2
    .query('er == "Positive"')
    .query('her2== @her2_neg')
)

In [120]:
row_ID(paloma2)

(4842, 4842)

#### Time from treatment to progression/death or censor 

In [121]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [122]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [123]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [124]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [125]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(31677, 31677)

In [126]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [127]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [128]:
row_ID(mortality)

(31677, 31677)

In [129]:
paloma2 = pd.merge(paloma2, mortality, on = 'PatientID', how = 'left')

In [130]:
row_ID(paloma2)

(4842, 4842)

In [131]:
progression = pd.read_csv ('Enhanced_MetBreastProgression.csv')

In [132]:
progression = progression[progression['PatientID'].isin(paloma2['PatientID'])][['PatientID', 'ProgressionDate']]

In [133]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [134]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [135]:
row_ID(progression)

(4840, 4840)

In [136]:
paloma2 = pd.merge(paloma2, progression, on = 'PatientID', how = 'left')

In [137]:
row_ID(paloma2)

(4842, 4842)

In [138]:
# Percent without progression date
len(paloma2.query('ProgressionDate.isna()', engine = 'python'))/len(paloma2)

0.4012804626187526

In [139]:
conditions = [
    (paloma2.ProgressionDate.notna()),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 1)),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 0))]

choices = [
    (paloma2['ProgressionDate'] - paloma2['StartDate']).dt.days,
    (paloma2['death_date'] - paloma2['StartDate']).dt.days,
    (paloma2['last_activity'] - paloma2['StartDate']).dt.days]

paloma2.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [140]:
paloma2 = paloma2.query('time_prog_treatment >= 0')

In [141]:
len(paloma2)

4734

In [142]:
conditions = [
    (paloma2.ProgressionDate.notna()),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 1)),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 0))]

choices = [1, 1, 0]

paloma2.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient count 

In [143]:
row_ID(paloma2)

(4734, 4734)

In [144]:
# Exlcude those with other relevant comorbidities
paloma2 = paloma2[~paloma2['PatientID'].isin(other_comorb_IDs)]

In [145]:
# Exclude those with CNS metastatsis at time of first line treatment
paloma2 = paloma2[~paloma2['PatientID'].isin(cns_fl_IDs)]

In [146]:
# Exclude those with ECOG 3 or 4  
paloma2 = paloma2[~paloma2['PatientID'].isin(ecog_fl_34_IDs)]

In [147]:
# Exclude those with abnormal organ function at time of first line treatment 
paloma2 = paloma2[~paloma2['PatientID'].isin(ab_organ_fl_IDs)]

In [148]:
row_ID(paloma2)

(4276, 4276)

In [149]:
low_cutoff_paloma2 = cutoff.loc['paloma2'].low

In [150]:
high_cutoff_paloma2 = cutoff.loc['paloma2'].high

In [151]:
print('Palbociclib plus letrozole total:',  paloma2.query('let_palb == 1').shape[0])
print('High risk:', paloma2.query('let_palb == 1').query('risk_score >= @high_cutoff_paloma2').shape[0])
print('Med risk:', paloma2.query('let_palb == 1').query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2').shape[0])
print('Low risk:', paloma2.query('let_palb == 1').query('risk_score <= @low_cutoff_paloma2').shape[0])

Palbociclib plus letrozole total: 1961
High risk: 505
Med risk: 637
Low risk: 819


In [152]:
print('Letrozole:',  paloma2.query('let_palb == 0').shape[0])
print('High risk:', paloma2.query('let_palb == 0').query('risk_score >= @high_cutoff_paloma2').shape[0])
print('Med risk:', paloma2.query('let_palb == 0').query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2').shape[0])
print('Low risk:', paloma2.query('let_palb == 0').query('risk_score <= @low_cutoff_paloma2').shape[0])

Letrozole: 2315
High risk: 762
Med risk: 836
Low risk: 717


#### PFS with covariate balancing 

In [153]:
paloma2 = paloma2.set_index('PatientID')

In [154]:
conditions = [
    (paloma2['thorax_met'] == 1) |
    (paloma2['liver_met'] == 1) |
    (paloma2['cns_met'] == 1) |
    (paloma2['peritoneum_met'] == 1) |
    (paloma2['other_met'] == 1),
    (paloma2['bone_met'] == 0) &
    (paloma2['thorax_met'] == 0) &
    (paloma2['lymph_met'] == 0) &
    (paloma2['liver_met'] == 0) &
    (paloma2['cns_met'] == 0) &
    (paloma2['skin_met'] == 0) &
    (paloma2['peritoneum_met'] == 0) &
    (paloma2['other_met'] == 0)
]

choices = ['visceral', 'unknown']

paloma2['met_site'] = np.select(conditions, choices, default = 'nonvisceral')

In [155]:
paloma2['met_cat'] = pd.cut(paloma2['met_year'],
                            bins = [2010, 2016, float('inf')],
                            labels = ['11-16', '17-22'])

In [156]:
conditions = [
    ((paloma2['ecog_diagnosis'] == "1.0") | (paloma2['ecog_diagnosis'] == "0.0")),  
    ((paloma2['ecog_diagnosis'] == "2.0") | (paloma2['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

paloma2['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [157]:
paloma2_iptw = paloma2.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'let_palb',
                                       'age',
                                       'gender',
                                       'race',
                                       'p_type',
                                       'delta_met_diagnosis',
                                       'met_cat',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ses',
                                       'ecog_2',
                                       'met_site',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [158]:
paloma2_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
let_palb                  int64
age                       int64
gender                   object
race                     object
p_type                   object
delta_met_diagnosis       int64
met_cat                category
commercial              float64
medicare                float64
medicaid                float64
ses                     float64
ecog_2                   object
met_site                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
dtype: object

In [159]:
to_be_categorical = list(paloma2_iptw.select_dtypes(include = ['object']).columns)

In [160]:
to_be_categorical

['gender', 'race', 'p_type', 'ecog_2', 'met_site']

In [161]:
to_be_categorical.append('met_cat')

In [162]:
to_be_categorical.append('ses')

In [163]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    paloma2_iptw[x] = paloma2_iptw[x].astype('category')

In [164]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [165]:
# List of categorical features.
categorical_features = list(paloma2_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [166]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [167]:
paloma2_iptw_low = (
    paloma2_iptw
    .query('risk_score <= @low_cutoff_paloma2'))

paloma2_iptw_med = (
    paloma2_iptw
    .query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2'))

paloma2_iptw_high = (
    paloma2_iptw
    .query('risk_score >= @high_cutoff_paloma2'))

paloma2_iptw_all = paloma2_iptw

In [168]:
paloma2_low_x = preprocessor.fit_transform(paloma2_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma2_med_x = preprocessor.fit_transform(paloma2_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma2_high_x = preprocessor.fit_transform(paloma2_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'p_type',
                                                                              'delta_met_diagnosis',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ses',
                                                                              'ecog_2',
                                                                              'met_site', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change', 
                                                                              'risk_score']))

paloma2_all_x = preprocessor.fit_transform(paloma2_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

In [169]:
lr_paloma2_low = LogisticRegression(max_iter = 1000)
lr_paloma2_low.fit(paloma2_low_x, paloma2_iptw_low['let_palb'])

LogisticRegression(max_iter=1000)

In [170]:
lr_paloma2_med = LogisticRegression(max_iter = 1000)
lr_paloma2_med.fit(paloma2_med_x, paloma2_iptw_med['let_palb'])

LogisticRegression(max_iter=1000)

In [171]:
lr_paloma2_high = LogisticRegression(max_iter = 1000)
lr_paloma2_high.fit(paloma2_high_x, paloma2_iptw_high['let_palb'])

LogisticRegression(max_iter=1000)

In [172]:
lr_paloma2_all = LogisticRegression(max_iter = 1000)
lr_paloma2_all.fit(paloma2_all_x, paloma2_iptw_all['let_palb'])

LogisticRegression(max_iter=1000)

In [173]:
pred_low = lr_paloma2_low.predict_proba(paloma2_low_x)
pred_med = lr_paloma2_med.predict_proba(paloma2_med_x)
pred_high = lr_paloma2_high.predict_proba(paloma2_high_x)
pred_all = lr_paloma2_all.predict_proba(paloma2_all_x)

In [174]:
paloma2_iptw_low['ps'] = pred_low[:, 1]
paloma2_iptw_med['ps'] = pred_med[:, 1]
paloma2_iptw_high['ps'] = pred_high[:, 1]
paloma2_iptw_all['ps'] = pred_all[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [175]:
paloma2_iptw_low['weight'] = (
    np.where(paloma2_iptw_low['let_palb'] == 1, 1/paloma2_iptw_low['ps'], 1/(1 - paloma2_iptw_low['ps'])))

paloma2_iptw_med['weight'] = (
    np.where(paloma2_iptw_med['let_palb'] == 1, 1/paloma2_iptw_med['ps'], 1/(1 - paloma2_iptw_med['ps'])))

paloma2_iptw_high['weight'] = (
    np.where(paloma2_iptw_high['let_palb'] == 1, 1/paloma2_iptw_high['ps'], 1/(1 - paloma2_iptw_high['ps'])))

paloma2_iptw_all['weight'] = (
    np.where(paloma2_iptw_all['let_palb'] == 1, 1/paloma2_iptw_all['ps'], 1/(1 - paloma2_iptw_all['ps'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [176]:
# Low KM curves
kmf_low_plet_paloma2_iptw = KaplanMeierFitter()
kmf_low_let_paloma2_iptw = KaplanMeierFitter()

kmf_low_plet_paloma2_iptw.fit(
    paloma2_iptw_low.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_low.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_low.query('let_palb == 1')['weight'])

kmf_low_let_paloma2_iptw.fit(
    paloma2_iptw_low.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_low.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_low.query('let_palb == 0')['weight'])

# Med KM curves
kmf_med_plet_paloma2_iptw = KaplanMeierFitter()
kmf_med_let_paloma2_iptw = KaplanMeierFitter()

kmf_med_plet_paloma2_iptw.fit(
    paloma2_iptw_med.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_med.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_med.query('let_palb == 1')['weight'])

kmf_med_let_paloma2_iptw.fit(
    paloma2_iptw_med.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_med.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_med.query('let_palb == 0')['weight'])

# High KM curves 
kmf_high_plet_paloma2_iptw = KaplanMeierFitter()
kmf_high_let_paloma2_iptw = KaplanMeierFitter()

kmf_high_plet_paloma2_iptw.fit(
    paloma2_iptw_high.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_high.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_high.query('let_palb == 1')['weight'])

kmf_high_let_paloma2_iptw.fit(
    paloma2_iptw_high.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_high.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_high.query('let_palb == 0')['weight'])

# All KM curves 
kmf_all_plet_paloma2_iptw = KaplanMeierFitter()
kmf_all_let_paloma2_iptw = KaplanMeierFitter()

kmf_all_plet_paloma2_iptw.fit(
    paloma2_iptw_all.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_all.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_all.query('let_palb == 1')['weight'])

kmf_all_let_paloma2_iptw.fit(
    paloma2_iptw_all.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_all.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_all.query('let_palb == 0')['weight'])

  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 4281.25 total observations, 1071.13 right-censored observations>

#### Calculating survival metrics

In [177]:
plet_paloma2_median_pfs = mos(kmf_low_plet_paloma2_iptw,
                              kmf_med_plet_paloma2_iptw,
                              kmf_high_plet_paloma2_iptw,
                              kmf_all_plet_paloma2_iptw)

let_paloma2_median_pfs = mos(kmf_low_let_paloma2_iptw,
                             kmf_med_let_paloma2_iptw,
                             kmf_high_let_paloma2_iptw,
                             kmf_all_let_paloma2_iptw)

In [178]:
paloma2_iptw_all_imputed = paloma2_iptw_all.copy()
paloma2_iptw_all_imputed['albumin_diag'] = paloma2_iptw_all_imputed['albumin_diag'].fillna(paloma2_iptw_all_imputed['albumin_diag'].median())
paloma2_iptw_all_imputed['weight_pct_change'] = paloma2_iptw_all_imputed['weight_pct_change'].fillna(paloma2_iptw_all_imputed['weight_pct_change'].median())
paloma2_iptw_all_imputed['ses'] = paloma2_iptw_all_imputed['ses'].cat.add_categories('unknown')
paloma2_iptw_all_imputed['ses'] = paloma2_iptw_all_imputed['ses'].fillna('unknown')

In [179]:
paloma2_hr_all = CoxPHFitter()
paloma2_hr_all.fit(paloma2_iptw_all_imputed,
                   duration_col = 'time_prog_treatment',
                   event_col = 'pfs_status',
                   formula = 'let_palb + age + gender + race + p_type + delta_met_diagnosis + met_cat + commercial + medicare + medicaid + ses + ecog_2 + met_site + albumin_diag + weight_pct_change + risk_score',
                   weights_col = 'weight',
                   robust = True)

<lifelines.CoxPHFitter: fitted with 8535.86 total observations, 2441.47 right-censored observations>

In [180]:
paloma2_all_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_all,
                                        1000,
                                        'let_palb',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site', 
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                        ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        48)

In [181]:
paloma2_low_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_low,
                                        1000,
                                        'let_palb',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site', 
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                        ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        48)

In [182]:
paloma2_med_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_med,
                                        1000,
                                        'let_palb',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site', 
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                        ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        48)

In [183]:
paloma2_high_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_high,
                                         1000,
                                         'let_palb',
                                         'progression',
                                         ['age',
                                          'gender',
                                          'race',
                                          'p_type',
                                          'delta_met_diagnosis',
                                          'met_cat',
                                          'commercial',
                                          'medicare',
                                          'medicaid',
                                          'ses',
                                          'ecog_2',
                                          'met_site', 
                                          'albumin_diag', 
                                          'weight_pct_change', 
                                          'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         48)

In [184]:
paloma2_data = [
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'low', 
     's_trt_mos': plet_paloma2_median_pfs[0],
     's_trt_mos_95': paloma2_low_rmst_mos_95.mos_A_95,
     's_cont_mos': let_paloma2_median_pfs[0],
     's_cont_mos_95': paloma2_low_rmst_mos_95.mos_B_95,
     's_mos_diff': plet_paloma2_median_pfs[0] - let_paloma2_median_pfs[0], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     's_trt_rmst': restricted_mean_survival_time(kmf_low_plet_paloma2_iptw, 48),
     's_trt_rmst_95': paloma2_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_let_paloma2_iptw, 48),
     's_cont_rmst_95': paloma2_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_plet_paloma2_iptw, 48) - restricted_mean_survival_time(kmf_low_let_paloma2_iptw, 48),
     's_diff_rmst_95': paloma2_low_rmst_mos_95.difference_rmst_95,
     'scount': paloma2.query('risk_score <= @low_cutoff_paloma2').shape[0]},
    
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'medium', 
     's_trt_mos': plet_paloma2_median_pfs[1],
     's_trt_mos_95': paloma2_med_rmst_mos_95.mos_A_95,
     's_cont_mos': let_paloma2_median_pfs[1],
     's_cont_mos_95': paloma2_med_rmst_mos_95.mos_B_95,
     's_mos_diff': plet_paloma2_median_pfs[1] - let_paloma2_median_pfs[1], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     's_trt_rmst': restricted_mean_survival_time(kmf_med_plet_paloma2_iptw, 48),
     's_trt_rmst_95': paloma2_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_let_paloma2_iptw, 48),
     's_cont_rmst_95': paloma2_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_plet_paloma2_iptw, 48) - restricted_mean_survival_time(kmf_med_let_paloma2_iptw, 48),
     's_diff_rmst_95': paloma2_med_rmst_mos_95.difference_rmst_95,
     'scount': paloma2.query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2').shape[0]},
    
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'high', 
     's_trt_mos': plet_paloma2_median_pfs[2],
     's_trt_mos_95': paloma2_high_rmst_mos_95.mos_A_95,
     's_cont_mos': let_paloma2_median_pfs[2],
     's_cont_mos_95': paloma2_high_rmst_mos_95.mos_B_95,
     's_mos_diff': plet_paloma2_median_pfs[2] - let_paloma2_median_pfs[2], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     's_trt_rmst': restricted_mean_survival_time(kmf_high_plet_paloma2_iptw, 48),
     's_trt_rmst_95': paloma2_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_let_paloma2_iptw, 48),
     's_cont_rmst_95': paloma2_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_plet_paloma2_iptw, 48) - restricted_mean_survival_time(kmf_high_let_paloma2_iptw, 48),
     's_diff_rmst_95': paloma2_high_rmst_mos_95.difference_rmst_95,
     'scount': paloma2.query('risk_score >= @high_cutoff_paloma2').shape[0]},
    
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'all', 
     's_hr': paloma2_hr_all.hazard_ratios_['let_palb'],
     's_hr_95': [paloma2_hr_all.summary.loc['let_palb']['exp(coef) lower 95%'], paloma2_hr_all.summary.loc['let_palb']['exp(coef) upper 95%']],
     's_trt_mos': plet_paloma2_median_pfs[3],
     's_trt_mos_95': paloma2_all_rmst_mos_95.mos_A_95,
     's_cont_mos': let_paloma2_median_pfs[3],
     's_cont_mos_95': paloma2_all_rmst_mos_95.mos_B_95,
     's_mos_diff': plet_paloma2_median_pfs[3] - let_paloma2_median_pfs[3], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     'scount': paloma2.shape[0]}
]

### PALOMA-3: palbociclib plus fulvestrant vs. fulvestrant in hormone-sensitive metastatic breast cancer that had previously progressed on endocrine therapy

**INCLUSION**
* Received first line estrogen therapy +/ one line of chemotherapy 
* Received second (or third) line palbociclib plus fulvestrant or fulvestrant alone 
* Did not receive CDK 4/6 inhibitor, fulvestrant, or everolimus in earlier lines
* ER/PR positive and HER-2 negative within (-inf, +30] days of start of treatment
* No relevant comorbidities in the year preceding metastatic diagnosis 
* No CNS metastasis at time of treatment 
* ECOG cannot be 2, 3, or 4 at time of treatment 
* Adequate organ function at time of treatment 

#### 1. First line endorcine therapy + chemotherapy, second line fulvestrant +/- palbociclib

In [185]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

31677

In [186]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [187]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1'))

In [188]:
et = [
    'Anastrozole',
    'Letrozole',
    'Exemestane',
    'Tamoxifen']

chemo = [
    'Capecitabine',
    'Carboplatin',
    'Cyclophosphamide',
    'Docetaxel',
    'Eribulin',
    'Gemcitabine',
    'Paclitaxel',
    'Paclitaxel Protein-Bound',
    'Vinorelbine']

exc =[
    'Abemaciclib',
    'Palbociclib',
    'Ribociclib',
    'Fulvestrant',
    'Everolimus', 
    'Clinical Study Drug']

In [189]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(et)) 
                & line_therapy_fl['LineName'].str.contains('|'.join(chemo))
                & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(10)

Capecitabine,Letrozole                        31
Anastrozole,Capecitabine                      25
Anastrozole,Paclitaxel                        18
Letrozole,Paclitaxel Protein-Bound            17
Anastrozole,Paclitaxel Protein-Bound          14
Capecitabine,Tamoxifen                        14
Capecitabine,Exemestane                       13
Letrozole,Paclitaxel                          13
Paclitaxel,Tamoxifen                          11
Docetaxel,Letrozole,Pertuzumab,Trastuzumab    11
Name: LineName, dtype: int64

In [190]:
etchemo_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & line_therapy_fl['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [191]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(etchemo_id)]
    .query('LineNumber == 2'))

In [192]:
etchemo_pf = (
    line_therapy_sec
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [193]:
row_ID(etchemo_pf)

(13, 13)

In [194]:
etchemo_f = (
    line_therapy_sec
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [195]:
row_ID(etchemo_pf)

(13, 13)

#### 2. First line endocrine therapy,  second line chemotherapy, third line palbociclib + fulvestrant

In [196]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(et)) 
                & ~line_therapy_fl['LineName'].str.contains('|'.join(chemo))
                & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(10)

Anastrozole                2943
Letrozole                  2536
Tamoxifen                  1305
Exemestane                  780
Anastrozole,Trastuzumab      66
Letrozole,Trastuzumab        59
Leuprolide,Tamoxifen         56
Letrozole,Leuprolide         40
Goserelin,Tamoxifen          36
Anastrozole,Leuprolide       35
Name: LineName, dtype: int64

In [197]:
et_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [198]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2')
)

In [199]:
et_chemo_id = (
    line_therapy_sec
    [line_therapy_sec['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_sec['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [200]:
et_chemo_pf = (
    line_therapy[line_therapy['PatientID'].isin(et_chemo_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [201]:
row_ID(et_chemo_pf)

(33, 33)

In [202]:
et_chemo_f = (
    line_therapy[line_therapy['PatientID'].isin(et_chemo_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [203]:
row_ID(et_chemo_f)

(57, 57)

#### 3. First line chemotherapy, second line endocrine therapy, third line palbociclib + fulvestrant

In [204]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(chemo)) 
                & ~line_therapy_fl['LineName'].str.contains('|'.join(et))
                & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(10)

Capecitabine                             1324
Cyclophosphamide,Doxorubicin              690
Docetaxel,Pertuzumab,Trastuzumab          676
Paclitaxel                                589
Paclitaxel Protein-Bound                  584
Carboplatin,Gemcitabine                   407
Paclitaxel,Pertuzumab,Trastuzumab         289
Eribulin                                  261
Carboplatin,Paclitaxel                    250
Atezolizumab,Paclitaxel Protein-Bound     226
Name: LineName, dtype: int64

In [205]:
chemo_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [206]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(chemo_id)]
    .query('LineNumber == 2')
)

In [207]:
chemo_et_id = (
    line_therapy_sec
    [line_therapy_sec['LineName'].str.contains('|'.join(et))
     & ~line_therapy_sec['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [208]:
chemo_et_pf = (
    line_therapy[line_therapy['PatientID'].isin(chemo_et_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [209]:
row_ID(chemo_et_pf)

(37, 37)

In [210]:
chemo_et_f = (
    line_therapy[line_therapy['PatientID'].isin(chemo_et_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [211]:
row_ID(chemo_et_f)

(42, 42)

#### 4. First line endocrine therapy, second line palbociclib + fulvestrant

In [212]:
et_pf = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2')
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [213]:
row_ID(et_pf)

(440, 440)

In [214]:
et_f = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2')
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [215]:
row_ID(et_f)

(784, 784)

In [216]:
paloma3_pf = pd.concat([etchemo_pf, et_chemo_pf, chemo_et_pf, et_pf])

In [217]:
paloma3_pf.loc[:, 'pfulv'] = 1

In [218]:
row_ID(paloma3_pf)

(523, 523)

In [219]:
paloma3_f = pd.concat([etchemo_f, et_chemo_f, chemo_et_f, et_f])

In [220]:
paloma3_f.loc[:, 'pfulv'] = 0

In [221]:
row_ID(paloma3_f)

(897, 897)

In [222]:
paloma3 = pd.concat([paloma3_pf, paloma3_f])

In [223]:
row_ID(paloma3)

(1420, 1420)

In [224]:
paloma3 = pd.merge(paloma3, df_full, on = 'PatientID', how = 'left')

In [225]:
row_ID(paloma3)

(1420, 1420)

In [226]:
paloma3['StartDate'] = pd.to_datetime(paloma3['StartDate'])

#### ER- or PR-positive and HER-2 negative 

In [227]:
biomarkers = pd.read_csv('Enhanced_MetBreastBiomarkers.csv')

In [228]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(paloma3['PatientID'])]

In [229]:
row_ID(biomarkers)

(10590, 1415)

In [230]:
biomarkers = pd.merge(biomarkers, paloma3[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [231]:
row_ID(biomarkers)

(10590, 1415)

In [232]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [233]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [234]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [235]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [236]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [237]:
er_status = (
    biomarkers
    .query('BiomarkerName == "ER"')
    .query('date_diff <= 30')
    .query('BiomarkerStatus == "Positive" or BiomarkerStatus == "Negative"') # don't select unknown values 
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False]) # select ER status closest to treatment start
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'er'})
   )

In [238]:
row_ID(er_status)

(1283, 1283)

In [239]:
pr_status = (
    biomarkers
    .query('BiomarkerName == "PR"')
    .query('date_diff <= 30')
    .query('BiomarkerStatus == "Positive" or BiomarkerStatus == "Negative"') # don't select unknown values 
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False]) # select ER status closest to treatment start
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'pr'})
   )

In [240]:
row_ID(pr_status)

(1243, 1243)

In [241]:
her2_status = (
    biomarkers
    .query('BiomarkerName == "HER2"')
    .query('date_diff <= 30')
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'her2'})
)

In [242]:
row_ID(her2_status)

(1236, 1236)

In [243]:
paloma3 = pd.merge(paloma3, er_status, on  = 'PatientID', how = 'left')

In [244]:
row_ID(paloma3)

(1420, 1420)

In [245]:
paloma3 = pd.merge(paloma3, pr_status, on  = 'PatientID', how = 'left')

In [246]:
row_ID(paloma3)

(1420, 1420)

In [247]:
paloma3 = pd.merge(paloma3, her2_status, on  = 'PatientID', how = 'left')

In [248]:
row_ID(paloma3)

(1420, 1420)

In [249]:
her2_neg = ['IHC negative (0-1+)',
            'FISH negative/not amplified',
            'IHC equivocal (2+)',
            'Negative NOS',
            'NGS negative (ERBB2 not amplified)',
            'FISH equivocal',
            'Equivocal NOS',
            'NGS equivocal (ERBB2 amplification equivocal)']

paloma3 = (
    paloma3
    .query('er == "Positive" or pr == "Positive"')
    .query('her2== @her2_neg')
)

In [250]:
row_ID(paloma3)

(1153, 1153)

#### Time from treatment to progression/death or censor 

In [251]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [252]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [253]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [254]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [255]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(31677, 31677)

In [256]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [257]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [258]:
row_ID(mortality)

(31677, 31677)

In [259]:
paloma3 = pd.merge(paloma3, mortality, on = 'PatientID', how = 'left')

In [260]:
row_ID(paloma3)

(1153, 1153)

In [261]:
progression = pd.read_csv ('Enhanced_MetBreastProgression.csv')

In [262]:
progression = progression[progression['PatientID'].isin(paloma3['PatientID'])][['PatientID', 'ProgressionDate']]

In [263]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [264]:
progression = pd.merge(progression, paloma3[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [265]:
progression['date_diff'] = (progression['ProgressionDate'] - progression['StartDate']).dt.days

In [266]:
progression = (
    progression.query('date_diff > 0')
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)   

In [267]:
row_ID(progression)

(831, 831)

In [268]:
progression = progression[['PatientID', 'ProgressionDate']]

In [269]:
paloma3 = pd.merge(paloma3, progression, on = 'PatientID', how = 'left')

In [270]:
row_ID(paloma3)

(1153, 1153)

In [271]:
# Percent without progression date
len(paloma3.query('ProgressionDate.isna()', engine = 'python'))/len(paloma3)

0.2792714657415438

In [272]:
conditions = [
    (paloma3.ProgressionDate.notna()),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 1)),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 0))]

choices = [
    (paloma3['ProgressionDate'] - paloma3['StartDate']).dt.days,
    (paloma3['death_date'] - paloma3['StartDate']).dt.days,
    (paloma3['last_activity'] - paloma3['StartDate']).dt.days]

paloma3.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [273]:
paloma3 = paloma3.query('time_prog_treatment >= 0')

In [274]:
len(paloma3)

1153

In [275]:
conditions = [
    (paloma3.ProgressionDate.notna()),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 1)),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 0))]

choices = [1, 1, 0]

paloma3.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient count 

In [276]:
row_ID(paloma3)

(1153, 1153)

In [277]:
# Exclude those with CNS metastatsis at time of metastatic diagnosis 
paloma3 = paloma3[~paloma3['PatientID'].isin(cns_sec_IDs)]

In [278]:
# Exlcude those with other relevant comorbidities
paloma3 = paloma3[~paloma3['PatientID'].isin(other_comorb_IDs)]

In [279]:
# Exclude those with ECOG 2, 3, or 4 
paloma3 = paloma3[~paloma3['PatientID'].isin(ecog_sec_234_IDs)]

In [280]:
# Exclude those with abnormal organ function at time of second line treatment 
paloma3 = paloma3[~paloma3['PatientID'].isin(ab_organ_sec_IDs)]

In [281]:
row_ID(paloma3)

(940, 940)

In [282]:
low_cutoff_paloma3 = cutoff.loc['paloma3'].low

In [283]:
high_cutoff_paloma3 = cutoff.loc['paloma3'].high

In [284]:
print('Palobociclib plus fulvestrant total:',  paloma3.query('pfulv == 1').shape[0])
print('High risk:', paloma3.query('pfulv == 1').query('risk_score >= @high_cutoff_paloma3').shape[0])
print('Med risk:', paloma3.query('pfulv == 1').query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3').shape[0])
print('Low risk:', paloma3.query('pfulv == 1').query('risk_score <= @low_cutoff_paloma3').shape[0])

Palobociclib plus fulvestrant total: 382
High risk: 106
Med risk: 129
Low risk: 147


In [285]:
print('Fulvestrant total:',  paloma3.query('pfulv == 0').shape[0])
print('High risk:', paloma3.query('pfulv == 0').query('risk_score >= @high_cutoff_paloma3').shape[0])
print('Med risk:', paloma3.query('pfulv == 0').query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3').shape[0])
print('Low risk:', paloma3.query('pfulv == 0').query('risk_score <= @low_cutoff_paloma3').shape[0])

Fulvestrant total: 558
High risk: 161
Med risk: 195
Low risk: 202


#### Survival with covariate balancing 

In [286]:
paloma3 = paloma3.set_index('PatientID')

In [287]:
conditions = [
    (paloma3['thorax_met'] == 1) |
    (paloma3['liver_met'] == 1) |
    (paloma3['cns_met'] == 1) |
    (paloma3['peritoneum_met'] == 1) |
    (paloma3['other_met'] == 1),
    (paloma3['bone_met'] == 0) &
    (paloma3['thorax_met'] == 0) &
    (paloma3['lymph_met'] == 0) &
    (paloma3['liver_met'] == 0) &
    (paloma3['cns_met'] == 0) &
    (paloma3['skin_met'] == 0) &
    (paloma3['peritoneum_met'] == 0) &
    (paloma3['other_met'] == 0)
]

choices = ['visceral', 'unknown']

paloma3['met_site'] = np.select(conditions, choices, default = 'nonvisceral')

In [288]:
paloma3['met_cat'] = pd.cut(paloma3['met_year'],
                            bins = [2010, 2016, float('inf')],
                            labels = ['11-16', '17-22'])

In [289]:
conditions = [
    ((paloma3['ecog_diagnosis'] == "1.0") | (paloma3['ecog_diagnosis'] == "0.0")),  
    ((paloma3['ecog_diagnosis'] == "2.0") | (paloma3['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

paloma3['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [290]:
paloma3_iptw = paloma3.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'pfulv',
                                       'age',
                                       'gender',
                                       'race',
                                       'p_type',
                                       'delta_met_diagnosis',
                                       'met_cat',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ses',
                                       'ecog_2',
                                       'met_site',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [291]:
paloma3_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
pfulv                     int64
age                       int64
gender                   object
race                     object
p_type                   object
delta_met_diagnosis       int64
met_cat                category
commercial              float64
medicare                float64
medicaid                float64
ses                     float64
ecog_2                   object
met_site                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
dtype: object

In [292]:
to_be_categorical = list(paloma3_iptw.select_dtypes(include = ['object']).columns)

In [293]:
to_be_categorical

['gender', 'race', 'p_type', 'ecog_2', 'met_site']

In [294]:
to_be_categorical.append('met_cat')

In [295]:
to_be_categorical.append('ses')

In [296]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    paloma3_iptw[x] = paloma3_iptw[x].astype('category')

In [297]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [298]:
# List of categorical features.
categorical_features = list(paloma3_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [299]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [300]:
paloma3_iptw_low = (
    paloma3_iptw
    .query('risk_score <= @low_cutoff_paloma3'))

paloma3_iptw_med = (
    paloma3_iptw
    .query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3'))

paloma3_iptw_high = (
    paloma3_iptw
    .query('risk_score >= @high_cutoff_paloma3'))

paloma3_iptw_all = paloma3_iptw

In [301]:
paloma3_low_x = preprocessor.fit_transform(paloma3_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma3_med_x = preprocessor.fit_transform(paloma3_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma3_high_x = preprocessor.fit_transform(paloma3_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'p_type',
                                                                              'delta_met_diagnosis',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ses',
                                                                              'ecog_2',
                                                                              'met_site', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change', 
                                                                              'risk_score']))

paloma3_all_x = preprocessor.fit_transform(paloma3_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [302]:
lr_paloma3_low = LogisticRegression(max_iter = 1000)
lr_paloma3_low.fit(paloma3_low_x, paloma3_iptw_low['pfulv'])

LogisticRegression(max_iter=1000)

In [303]:
lr_paloma3_med = LogisticRegression(max_iter = 1000)
lr_paloma3_med.fit(paloma3_med_x, paloma3_iptw_med['pfulv'])

LogisticRegression(max_iter=1000)

In [304]:
lr_paloma3_high = LogisticRegression(max_iter = 1000)
lr_paloma3_high.fit(paloma3_high_x, paloma3_iptw_high['pfulv'])

LogisticRegression(max_iter=1000)

In [305]:
lr_paloma3_all = LogisticRegression(max_iter = 1000)
lr_paloma3_all.fit(paloma3_all_x, paloma3_iptw_all['pfulv'])

LogisticRegression(max_iter=1000)

In [306]:
pred_low = lr_paloma3_low.predict_proba(paloma3_low_x)
pred_med = lr_paloma3_med.predict_proba(paloma3_med_x)
pred_high = lr_paloma3_high.predict_proba(paloma3_high_x)
pred_all = lr_paloma3_all.predict_proba(paloma3_all_x)

In [307]:
paloma3_iptw_low['ps'] = pred_low[:, 1]
paloma3_iptw_med['ps'] = pred_med[:, 1]
paloma3_iptw_high['ps'] = pred_high[:, 1]
paloma3_iptw_all['ps'] = pred_all[:, 1]

In [308]:
paloma3_iptw_low['weight'] = (
    np.where(paloma3_iptw_low['pfulv'] == 1, 1/paloma3_iptw_low['ps'], 1/(1 - paloma3_iptw_low['ps'])))

paloma3_iptw_med['weight'] = (
    np.where(paloma3_iptw_med['pfulv'] == 1, 1/paloma3_iptw_med['ps'], 1/(1 - paloma3_iptw_med['ps'])))

paloma3_iptw_high['weight'] = (
    np.where(paloma3_iptw_high['pfulv'] == 1, 1/paloma3_iptw_high['ps'], 1/(1 - paloma3_iptw_high['ps'])))

paloma3_iptw_all['weight'] = (
    np.where(paloma3_iptw_all['pfulv'] == 1, 1/paloma3_iptw_all['ps'], 1/(1 - paloma3_iptw_all['ps'])))

In [309]:
# Low KM curves
kmf_low_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_low_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_low_pfulv_paloma3_iptw.fit(
    paloma3_iptw_low.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_low.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_low.query('pfulv == 1')['weight'])

kmf_low_fulv_paloma3_iptw.fit(
    paloma3_iptw_low.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_low.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_low.query('pfulv == 0')['weight'])

# Med KM curves
kmf_med_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_med_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_med_pfulv_paloma3_iptw.fit(
    paloma3_iptw_med.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_med.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_med.query('pfulv == 1')['weight'])

kmf_med_fulv_paloma3_iptw.fit(
    paloma3_iptw_med.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_med.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_med.query('pfulv == 0')['weight'])

# High KM curves 
kmf_high_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_high_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_high_pfulv_paloma3_iptw.fit(
    paloma3_iptw_high.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_high.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_high.query('pfulv == 1')['weight'])

kmf_high_fulv_paloma3_iptw.fit(
    paloma3_iptw_high.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_high.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_high.query('pfulv == 0')['weight'])

# All KM curves 
kmf_all_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_all_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_all_pfulv_paloma3_iptw.fit(
    paloma3_iptw_all.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_all.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_all.query('pfulv == 1')['weight'])

kmf_all_fulv_paloma3_iptw.fit(
    paloma3_iptw_all.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_all.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_all.query('pfulv == 0')['weight'])


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 943.643 total observations, 116.817 right-censored observations>

#### Calculating survival metrics

In [310]:
pfulv_paloma3_median_os = mos(kmf_low_pfulv_paloma3_iptw,
                              kmf_med_pfulv_paloma3_iptw,
                              kmf_high_pfulv_paloma3_iptw,
                              kmf_all_pfulv_paloma3_iptw)

fulv_paloma3_median_os = mos(kmf_low_fulv_paloma3_iptw,
                             kmf_med_fulv_paloma3_iptw,
                             kmf_high_fulv_paloma3_iptw,
                             kmf_all_fulv_paloma3_iptw)

In [311]:
paloma3_iptw_all_imputed = paloma3_iptw_all.copy()
paloma3_iptw_all_imputed['albumin_diag'] = paloma3_iptw_all_imputed['albumin_diag'].fillna(paloma3_iptw_all_imputed['albumin_diag'].median())
paloma3_iptw_all_imputed['weight_pct_change'] = paloma3_iptw_all_imputed['weight_pct_change'].fillna(paloma3_iptw_all_imputed['weight_pct_change'].median())
paloma3_iptw_all_imputed['ses'] = paloma3_iptw_all_imputed['ses'].cat.add_categories('unknown')
paloma3_iptw_all_imputed['ses'] = paloma3_iptw_all_imputed['ses'].fillna('unknown')

In [312]:
paloma3_hr_all = CoxPHFitter()
paloma3_hr_all.fit(paloma3_iptw_all_imputed,
                   duration_col = 'time_prog_treatment', 
                   event_col = 'pfs_status', 
                   formula = 'pfulv + age + gender + race + p_type + delta_met_diagnosis + met_cat + commercial + medicare + medicaid + ses + ecog_2 + met_site + albumin_diag + weight_pct_change + risk_score',
                   weights_col = 'weight',
                   robust = True)

<lifelines.CoxPHFitter: fitted with 1859.37 total observations, 258.259 right-censored observations>

In [313]:
paloma3_all_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_all,
                                        1000,
                                        'pfulv',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site',
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         24)

In [314]:
paloma3_low_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_low,
                                        1000,
                                        'pfulv',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site',
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         24)

In [315]:
paloma3_med_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_med,
                                        1000,
                                        'pfulv',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site',
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         24)

In [316]:
paloma3_high_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_high,
                                         1000,
                                         'pfulv',
                                         'progression',
                                         ['age',
                                          'gender',
                                          'race',
                                          'p_type',
                                          'delta_met_diagnosis',
                                          'met_cat',
                                          'commercial',
                                          'medicare',
                                          'medicaid',
                                          'ses',
                                          'ecog_2',
                                          'met_site',
                                          'albumin_diag', 
                                          'weight_pct_change', 
                                          'risk_score'],
                                          ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                          24)

In [317]:
paloma3_data = [
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'low', 
     's_trt_mos': pfulv_paloma3_median_os[0],
     's_trt_mos_95': paloma3_low_rmst_mos_95.mos_A_95,
     's_cont_mos': fulv_paloma3_median_os[0],
     's_cont_mos_95': paloma3_low_rmst_mos_95.mos_B_95,
     's_mos_diff': pfulv_paloma3_median_os[0] - fulv_paloma3_median_os[0], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     's_trt_rmst': restricted_mean_survival_time(kmf_low_pfulv_paloma3_iptw, 24),
     's_trt_rmst_95': paloma3_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_fulv_paloma3_iptw, 24),
     's_cont_rmst_95': paloma3_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_pfulv_paloma3_iptw, 24) - restricted_mean_survival_time(kmf_low_fulv_paloma3_iptw, 24),
     's_diff_rmst_95': paloma3_low_rmst_mos_95.difference_rmst_95,
     'scount': paloma3.query('risk_score <= @low_cutoff_paloma3').shape[0]},
    
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'medium', 
     's_trt_mos': pfulv_paloma3_median_os[1],
     's_trt_mos_95': paloma3_med_rmst_mos_95.mos_A_95,
     's_cont_mos': fulv_paloma3_median_os[1],
     's_cont_mos_95': paloma3_med_rmst_mos_95.mos_B_95,
     's_mos_diff': pfulv_paloma3_median_os[1] - fulv_paloma3_median_os[1], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     's_trt_rmst': restricted_mean_survival_time(kmf_med_pfulv_paloma3_iptw, 24),
     's_trt_rmst_95': paloma3_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_fulv_paloma3_iptw, 24),
     's_cont_rmst_95': paloma3_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_pfulv_paloma3_iptw, 24) - restricted_mean_survival_time(kmf_med_fulv_paloma3_iptw, 24),
     's_diff_rmst_95': paloma3_med_rmst_mos_95.difference_rmst_95,
     'scount': paloma3.query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3').shape[0]},
    
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'high', 
     's_trt_mos': pfulv_paloma3_median_os[2],
     's_trt_mos_95': paloma3_high_rmst_mos_95.mos_A_95,
     's_cont_mos': fulv_paloma3_median_os[2],
     's_cont_mos_95': paloma3_high_rmst_mos_95.mos_B_95,
     's_mos_diff': pfulv_paloma3_median_os[2] - fulv_paloma3_median_os[2], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     's_trt_rmst': restricted_mean_survival_time(kmf_high_pfulv_paloma3_iptw, 24),
     's_trt_rmst_95': paloma3_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_fulv_paloma3_iptw, 24),
     's_cont_rmst_95': paloma3_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_pfulv_paloma3_iptw, 24) - restricted_mean_survival_time(kmf_high_fulv_paloma3_iptw, 24),
     's_diff_rmst_95': paloma3_high_rmst_mos_95.difference_rmst_95,
     'scount': paloma3.query('risk_score >= @high_cutoff_paloma3').shape[0]},
    
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'all', 
     's_hr': paloma3_hr_all.hazard_ratios_['pfulv'],
     's_hr_95': [paloma3_hr_all.summary.loc['pfulv']['exp(coef) lower 95%'], paloma3_hr_all.summary.loc['pfulv']['exp(coef) upper 95%']],
     's_trt_mos': pfulv_paloma3_median_os[3],
     's_trt_mos_95': paloma3_all_rmst_mos_95.mos_A_95,
     's_cont_mos': fulv_paloma3_median_os[3],
     's_cont_mos_95': paloma3_all_rmst_mos_95.mos_B_95,
     's_mos_diff': pfulv_paloma3_median_os[3] - fulv_paloma3_median_os[3], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     'scount': paloma3.shape[0]}
]

### CLEOPATRA: pertuzumab, trastuzumab, and docetaxel in HER2-positive metastatic breast cancer

**INCLUSION**
* Untreated metastatic breast cancer
* Received first line pertuzumab, trastuzumab, and docetaxel/paclitaxel or first line trastuzumab + docetaxel/paclitaxel 
* First line of hormonal treatment is allowed
* HER-2 positive within (+30, -inf) of receipt of treatment 
* No relevant comorbidities in the year preceding metastatic diagnosis 
* No CNS metastasis at time of treatment 
* ECOG cannot be 2, 3, or 4 at time of treatment 
* Adequate organ function at time of treatment 

#### First line trastuzumab + docetaxel/paclitaxel +/- pertuzumab 

In [318]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

31677

In [319]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [320]:
tpdp = [
    'Docetaxel,Pertuzumab,Trastuzumab',
    'Docetaxel,Pertuzumab,Trastuzumab-Anns',
    'Docetaxel,Pertuzumab,Trastuzumab-Qyyp',
    'Paclitaxel,Pertuzumab,Trastuzumab',
    'Paclitaxel,Pertuzumab,Trastuzumab-Anns',
    'Paclitaxel,Pertuzumab,Trastuzumab-Qyyp',
    'Paclitaxel Protein-Bound,Pertuzumab,Trastuzumab',
    'Paclitaxel Protein-Bound,Pertuzumab,Trastuzumab-Anns',
    'Paclitaxel Protein-Bound,Pertuzumab,Trastuzumab-Qyyp']

tpdp_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('LineName == @tpdp')
    [['PatientID', 'StartDate']]
)

In [321]:
tpdp_fl.loc[:, 'tpdp'] = 1

In [322]:
row_ID(tpdp_fl)

(1219, 1219)

In [323]:
tpd = [
    'Docetaxel,Trastuzumab',
    'Docetaxel,Trastuzumab-Anns',
    'Docetaxel,Trastuzumab-Qyyp',
    'Paclitaxel,Trastuzumab',
    'Paclitaxel,Trastuzumab-Anns',
    'Paclitaxel,Trastuzumab-Qyyp',
    'Paclitaxel Protein-Bound,Trastuzumab',
    'Paclitaxel Protein-Bound,Trastuzumab-Anns',
    'Paclitaxel Protein-Bound,Trastuzumab-Qyyp'] 

tpd_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('LineName == @tpd')
    [['PatientID', 'StartDate']]
)

In [324]:
tpd_fl.loc[:, 'tpdp'] = 0

In [325]:
row_ID(tpdp_fl)

(1219, 1219)

#### First line endocrine therapy, second line trastuzumab + docetaxel/paclitaxel +/- pertuzumab 

In [326]:
all_treatment = line_therapy.LineName.unique()

In [327]:
treatment_list = []
for trt in all_treatment:
    treatment_list.extend(trt.split(","))

In [328]:
unique = list(dict.fromkeys(treatment_list))

In [329]:
et = [
    'Anastrozole',
    'Letrozole',
    'Exemestane',
    'Tamoxifen',
    'Leuprolide',
    'Goserelin',
    'Triptorelin']

In [330]:
unique.remove('Anastrozole')
unique.remove('Letrozole')
unique.remove('Exemestane')
unique.remove('Tamoxifen')
unique.remove('Leuprolide')
unique.remove('Goserelin')
unique.remove('Triptorelin')

In [331]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1'))

In [332]:
(
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(unique))]
    .LineName.value_counts().head(10)
)

Anastrozole               2943
Letrozole                 2536
Tamoxifen                 1305
Exemestane                 780
Leuprolide                  77
Goserelin                   60
Leuprolide,Tamoxifen        56
Letrozole,Leuprolide        40
Goserelin,Tamoxifen         36
Anastrozole,Leuprolide      35
Name: LineName, dtype: int64

In [333]:
et_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(unique))]
    .PatientID
)

In [334]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2'))

In [335]:
et_tpdp = (
    line_therapy_sec
    .query('LineName == @tpdp')
    [['PatientID', 'StartDate']]
)

In [336]:
et_tpdp.loc[:, 'tpdp'] = 1

In [337]:
row_ID(et_tpdp)

(79, 79)

In [338]:
et_tpd = (
    line_therapy_sec
    .query('LineName == @tpd')
    [['PatientID', 'StartDate']]
)

In [339]:
et_tpd.loc[:, 'tpdp'] = 0

In [340]:
row_ID(et_tpd)

(5, 5)

In [341]:
cleopatra_p = pd.concat([tpdp_fl, et_tpdp])

In [342]:
row_ID(cleopatra_p)

(1298, 1298)

In [343]:
cleopatra_np = pd.concat([tpd_fl, et_tpd])

In [344]:
row_ID(cleopatra_np)

(184, 184)

In [345]:
cleopatra = pd.concat([cleopatra_p, cleopatra_np])

In [346]:
row_ID(cleopatra)

(1482, 1482)

In [347]:
cleopatra = pd.merge(cleopatra, df_full, on = 'PatientID', how = 'left')

In [348]:
row_ID(cleopatra)

(1482, 1482)

In [349]:
cleopatra['StartDate'] = pd.to_datetime(cleopatra['StartDate'])

#### HER-2 positive

In [350]:
biomarkers = pd.read_csv('Enhanced_MetBreastBiomarkers.csv')

In [351]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(cleopatra['PatientID'])]

In [352]:
row_ID(biomarkers)

(10765, 1482)

In [353]:
biomarkers = pd.merge(biomarkers, cleopatra[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [354]:
row_ID(biomarkers)

(10765, 1482)

In [355]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [356]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [357]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [358]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [359]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [360]:
her2_rel = ['IHC negative (0-1+)',
            'FISH negative/not amplified',
            'Negative NOS',
            'NGS negative (ERBB2 not amplified)',
            'IHC positive (3+)',
            'FISH positive/amplified',
            'Positive NOS',
            'NGS positive (ERBB2 amplified)']

her2_status = (
    biomarkers
    .query('BiomarkerName == "HER2"')
    .query('date_diff <= 30')
    .query('BiomarkerStatus == @her2_rel')
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'her2'})
)

In [361]:
row_ID(her2_status)

(1377, 1377)

In [362]:
cleopatra = pd.merge(cleopatra, her2_status, on  = 'PatientID', how = 'left')

In [363]:
row_ID(cleopatra)

(1482, 1482)

In [364]:
her2_pos = ['IHC positive (3+)',
            'FISH positive/amplified',
            'Positive NOS',
            'NGS positive (ERBB2 amplified)']

cleopatra = (
    cleopatra
    .query('her2== @her2_pos')
)

In [365]:
row_ID(cleopatra)

(1296, 1296)

#### Time from treatment to progression/death or censor 

In [366]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [367]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [368]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [369]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [370]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(31677, 31677)

In [371]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [372]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [373]:
row_ID(mortality)

(31677, 31677)

In [374]:
cleopatra = pd.merge(cleopatra, mortality, on = 'PatientID', how = 'left')

In [375]:
row_ID(cleopatra)

(1296, 1296)

In [376]:
conditions = [
    (cleopatra['death_status'] == 1),
    (cleopatra['death_status'] == 0)]

choices = [
    (cleopatra['death_date'] - cleopatra['StartDate']).dt.days,
    (cleopatra['last_activity'] - cleopatra['StartDate']).dt.days]

cleopatra.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [377]:
cleopatra = cleopatra.query('timerisk_treatment >= 0')

#### Patient count 

In [378]:
row_ID(cleopatra)

(1294, 1294)

In [379]:
# Exclude those with CNS metastatsis at time of metastatic diagnosis 
cleopatra = cleopatra[~cleopatra['PatientID'].isin(cns_fl_IDs)]

In [380]:
# Exlcude those with cardiac comorbidities
cleopatra = cleopatra[~cleopatra['PatientID'].isin(cardiac_IDs)]

In [381]:
# Exlcude those with other relevant comorbidities
cleopatra = cleopatra[~cleopatra['PatientID'].isin(other_comorb_IDs)]

In [382]:
# Exclude those with ECOG 2, 3, or 4
cleopatra = cleopatra[~cleopatra['PatientID'].isin(ecog_fl_234_IDs)]

In [383]:
# Exclude those with abnormal organ function at time of second line treatment 
cleopatra = cleopatra[~cleopatra['PatientID'].isin(ab_organ_fl_IDs)]

In [384]:
row_ID(cleopatra)

(1005, 1005)

In [385]:
low_cutoff_cleopatra = cutoff.loc['cleopatra'].low

In [386]:
high_cutoff_cleopatra = cutoff.loc['cleopatra'].high

In [387]:
print('Pertuzumab + trastuzumab + docetaxel/paclitaxel total:',  cleopatra.query('tpdp == 1').shape[0])
print('High risk:', cleopatra.query('tpdp == 1').query('risk_score >= @high_cutoff_cleopatra').shape[0])
print('Med risk:', cleopatra.query('tpdp == 1').query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra').shape[0])
print('Low risk:', cleopatra.query('tpdp == 1').query('risk_score <= @low_cutoff_cleopatra').shape[0])

Pertuzumab + trastuzumab + docetaxel/paclitaxel total: 892
High risk: 219
Med risk: 321
Low risk: 352


In [388]:
print('Trastuzumab + docetaxel/paclitaxel total:',  cleopatra.query('tpdp == 0').shape[0])
print('High risk:', cleopatra.query('tpdp == 0').query('risk_score >= @high_cutoff_cleopatra').shape[0])
print('Med risk:', cleopatra.query('tpdp == 0').query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra').shape[0])
print('Low risk:', cleopatra.query('tpdp == 0').query('risk_score <= @low_cutoff_cleopatra').shape[0])

Trastuzumab + docetaxel/paclitaxel total: 113
High risk: 39
Med risk: 35
Low risk: 39


#### Survival curves with covariate balancing 

In [389]:
conditions = [
    (cleopatra['thorax_met'] == 1) |
    (cleopatra['liver_met'] == 1) |
    (cleopatra['cns_met'] == 1) |
    (cleopatra['peritoneum_met'] == 1) |
    (cleopatra['other_met'] == 1),
    (cleopatra['bone_met'] == 0) &
    (cleopatra['thorax_met'] == 0) &
    (cleopatra['lymph_met'] == 0) &
    (cleopatra['liver_met'] == 0) &
    (cleopatra['cns_met'] == 0) &
    (cleopatra['skin_met'] == 0) &
    (cleopatra['peritoneum_met'] == 0) &
    (cleopatra['other_met'] == 0)
]

choices = ['visceral', 'unknown']

cleopatra['met_site'] = np.select(conditions, choices, default = 'nonvisceral')

In [390]:
cleopatra['met_cat'] = pd.cut(cleopatra['met_year'],
                              bins = [2010, 2016, float('inf')],
                              labels = ['11-16', '17-22'])

In [391]:
conditions = [
    ((cleopatra['ecog_diagnosis'] == "1.0") | (cleopatra['ecog_diagnosis'] == "0.0")),  
    ((cleopatra['ecog_diagnosis'] == "2.0") | (cleopatra['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

cleopatra['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [392]:
conditions = [
    ((cleopatra['ses'] == 1) | (cleopatra['ses'] == 2) | (cleopatra['ses'] == 3)), 
    ((cleopatra['ses'] == 4) | (cleopatra['ses'] == 5))
]

choices = ['lt_4', 'gte_4']

cleopatra['ses_cat'] = np.select(conditions, choices, default = 'unknown')

In [393]:
cleopatra['race_cat'] = np.where(cleopatra['race'] == "White", 1, 0)

In [394]:
cleopatra = cleopatra.set_index('PatientID')

In [395]:
cleopatra_iptw = cleopatra.filter(items = ['death_status',
                                           'timerisk_treatment',
                                           'tpdp',
                                           'age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2',
                                           'albumin_diag',
                                           'weight_pct_change',
                                           'risk_score'])

In [396]:
cleopatra_iptw.dtypes

death_status               bool
timerisk_treatment      float64
tpdp                      int64
age                       int64
gender                   object
race_cat                  int64
p_type                   object
delta_met_diagnosis       int64
met_cat                category
ses_cat                  object
ecog_2                   object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
dtype: object

In [397]:
to_be_categorical = list(cleopatra_iptw.select_dtypes(include = ['object']).columns)

In [398]:
to_be_categorical

['gender', 'p_type', 'ses_cat', 'ecog_2']

In [399]:
to_be_categorical.append('met_cat')

In [400]:
to_be_categorical.append('race_cat')

In [401]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    cleopatra_iptw[x] = cleopatra_iptw[x].astype('category')

In [402]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_met_diagnosis']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [403]:
# List of categorical features.
categorical_features = list(cleopatra_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [404]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [405]:
cleopatra_iptw_low = (
    cleopatra_iptw
    .query('risk_score <= @low_cutoff_cleopatra')
    .drop(columns = ['risk_score']))

cleopatra_iptw_med = (
    cleopatra_iptw
    .query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra')
    .drop(columns = ['risk_score']))

cleopatra_iptw_high = (
    cleopatra_iptw
    .query('risk_score >= @high_cutoff_cleopatra')
    .drop(columns = ['risk_score']))

cleopatra_iptw_all = (
    cleopatra_iptw
    .drop(columns = ['risk_score']))

In [406]:
cleopatra_low_x = preprocessor.fit_transform(cleopatra_iptw_low.filter(items = ['age',
                                                                                'gender',
                                                                                'race_cat',
                                                                                'p_type',
                                                                                'delta_met_diagnosis',
                                                                                'met_cat',
                                                                                'ses_cat',
                                                                                'ecog_2']))

cleopatra_med_x = preprocessor.fit_transform(cleopatra_iptw_med.filter(items = ['age',
                                                                                'gender',
                                                                                'race_cat',
                                                                                'p_type',
                                                                                'delta_met_diagnosis',
                                                                                'met_cat',
                                                                                'ses_cat',
                                                                                'ecog_2']))

cleopatra_high_x = preprocessor.fit_transform(cleopatra_iptw_high.filter(items = ['age',
                                                                                  'gender',
                                                                                  'race_cat',
                                                                                  'p_type',
                                                                                  'delta_met_diagnosis',
                                                                                  'met_cat',
                                                                                  'ses_cat',
                                                                                  'ecog_2']))

cleopatra_all_x = preprocessor.fit_transform(cleopatra_iptw_all.filter(items = ['age',
                                                                                'gender',
                                                                                'race_cat',
                                                                                'p_type',
                                                                                'delta_met_diagnosis',
                                                                                'met_cat',
                                                                                'ses_cat',
                                                                                'ecog_2']))

In [407]:
lr_cleopatra_low = LogisticRegression(max_iter = 1000)
lr_cleopatra_low.fit(cleopatra_low_x, cleopatra_iptw_low['tpdp'])

LogisticRegression(max_iter=1000)

In [408]:
lr_cleopatra_med = LogisticRegression(max_iter = 1000)
lr_cleopatra_med.fit(cleopatra_med_x, cleopatra_iptw_med['tpdp'])

LogisticRegression(max_iter=1000)

In [409]:
lr_cleopatra_high = LogisticRegression(max_iter = 1000)
lr_cleopatra_high.fit(cleopatra_high_x, cleopatra_iptw_high['tpdp'])

LogisticRegression(max_iter=1000)

In [410]:
lr_cleopatra_all = LogisticRegression(max_iter = 1000)
lr_cleopatra_all.fit(cleopatra_all_x, cleopatra_iptw_all['tpdp'])

LogisticRegression(max_iter=1000)

In [411]:
pred_low = lr_cleopatra_low.predict_proba(cleopatra_low_x)
pred_med = lr_cleopatra_med.predict_proba(cleopatra_med_x)
pred_high = lr_cleopatra_high.predict_proba(cleopatra_high_x)
pred_all = lr_cleopatra_all.predict_proba(cleopatra_all_x)

In [412]:
cleopatra_iptw_low['ps'] = pred_low[:, 1]
cleopatra_iptw_med['ps'] = pred_med[:, 1]
cleopatra_iptw_high['ps'] = pred_high[:, 1]
cleopatra_iptw_all['ps'] = pred_all[:, 1]

In [413]:
cleopatra_iptw_low['weight'] = (
    np.where(cleopatra_iptw_low['tpdp'] == 1, 1/cleopatra_iptw_low['ps'], 1/(1 - cleopatra_iptw_low['ps'])))

cleopatra_iptw_med['weight'] = (
    np.where(cleopatra_iptw_med['tpdp'] == 1, 1/cleopatra_iptw_med['ps'], 1/(1 - cleopatra_iptw_med['ps'])))

cleopatra_iptw_high['weight'] = (
    np.where(cleopatra_iptw_high['tpdp'] == 1, 1/cleopatra_iptw_high['ps'], 1/(1 - cleopatra_iptw_high['ps'])))

cleopatra_iptw_all['weight'] = (
    np.where(cleopatra_iptw_all['tpdp'] == 1, 1/cleopatra_iptw_all['ps'], 1/(1 - cleopatra_iptw_all['ps'])))

In [414]:
# Low KM curves
kmf_low_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_low_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_low_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_low.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_low.query('tpdp == 1')['death_status'],
    weights = cleopatra_iptw_low.query('tpdp == 1')['weight'])

kmf_low_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_low.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_low.query('tpdp == 0')['death_status'],
    weights = cleopatra_iptw_low.query('tpdp == 0')['weight'])

# Med KM curves
kmf_med_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_med_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_med_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_med.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_med.query('tpdp == 1')['death_status'], 
    weights = cleopatra_iptw_med.query('tpdp == 1')['weight'])

kmf_med_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_med.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_med.query('tpdp == 0')['death_status'], 
    weights = cleopatra_iptw_med.query('tpdp == 0')['weight'])

# High KM curves 
kmf_high_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_high_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_high_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_high.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_high.query('tpdp == 1')['death_status'], 
    weights = cleopatra_iptw_high.query('tpdp == 1')['weight'])

kmf_high_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_high.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_high.query('tpdp == 0')['death_status'], 
    weights = cleopatra_iptw_high.query('tpdp == 0')['weight'])

# All KM curves 
kmf_all_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_all_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_all_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_all.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_all.query('tpdp == 1')['death_status'], 
    weights = cleopatra_iptw_all.query('tpdp == 1')['weight'])

kmf_all_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_all.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_all.query('tpdp == 0')['death_status'], 
    weights = cleopatra_iptw_all.query('tpdp == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 936.338 total observations, 303.26 right-censored observations>

#### Calculating survival metrics 

In [415]:
tpdp_cleopatra_median_os = mos(kmf_low_tpdp_cleopatra_iptw,
                               kmf_med_tpdp_cleopatra_iptw,
                               kmf_high_tpdp_cleopatra_iptw,
                               kmf_all_tpdp_cleopatra_iptw)

tdp_cleopatra_median_os = mos(kmf_low_tdp_cleopatra_iptw,
                              kmf_med_tdp_cleopatra_iptw,
                              kmf_high_tdp_cleopatra_iptw,
                              kmf_all_tdp_cleopatra_iptw)

In [416]:
cleopatra_iptw_all_imputed = cleopatra_iptw_all.copy()
cleopatra_iptw_all_imputed['albumin_diag'] = cleopatra_iptw_all_imputed['albumin_diag'].fillna(cleopatra_iptw_all_imputed['albumin_diag'].median())
cleopatra_iptw_all_imputed['weight_pct_change'] = cleopatra_iptw_all_imputed['weight_pct_change'].fillna(cleopatra_iptw_all_imputed['weight_pct_change'].median())

In [417]:
cleopatra_iptw_all_imputed = pd.merge(cleopatra_iptw_all_imputed.reset_index(), df_full.reset_index()[['PatientID', 'risk_score']], on = 'PatientID', how = 'left')

In [418]:
cleopatra_hr_all = CoxPHFitter()
cleopatra_hr_all.fit(cleopatra_iptw_all_imputed,
                    duration_col = 'timerisk_treatment', 
                    event_col = 'death_status', 
                    formula = 'tpdp + age + gender + race_cat + p_type + delta_met_diagnosis + met_cat + ses_cat + ecog_2 + albumin_diag + weight_pct_change + risk_score',
                    weights_col = 'weight',
                    robust = True)

<lifelines.CoxPHFitter: fitted with 1941.77 total observations, 864.898 right-censored observations>

In [419]:
cleopatra_all_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_all,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [420]:
cleopatra_low_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_low,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [421]:
cleopatra_med_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_med,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [422]:
cleopatra_high_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_high,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [423]:
cleopatra_data = [
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'low', 
     's_trt_mos': tpdp_cleopatra_median_os[0],
     's_trt_mos_95': cleopatra_low_rmst_mos_95.mos_A_95,
     's_cont_mos': tdp_cleopatra_median_os[0],
     's_cont_mos_95': cleopatra_low_rmst_mos_95.mos_B_95,
     's_mos_diff': tpdp_cleopatra_median_os[0] - tdp_cleopatra_median_os[0], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     's_trt_rmst': restricted_mean_survival_time(kmf_low_tpdp_cleopatra_iptw, 60),
     's_trt_rmst_95': cleopatra_low_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_low_tdp_cleopatra_iptw, 60),
     's_cont_rmst_95': cleopatra_low_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_low_tpdp_cleopatra_iptw, 60) - restricted_mean_survival_time(kmf_low_tdp_cleopatra_iptw, 60),
     's_diff_rmst_95': cleopatra_low_rmst_mos_95.difference_rmst_95,
     'scount': cleopatra.query('risk_score <= @low_cutoff_cleopatra').shape[0]},
    
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'medium', 
     's_trt_mos': tpdp_cleopatra_median_os[1],
     's_trt_mos_95': cleopatra_med_rmst_mos_95.mos_A_95,
     's_cont_mos': tdp_cleopatra_median_os[1],
     's_cont_mos_95': cleopatra_med_rmst_mos_95.mos_B_95,
     's_mos_diff': tpdp_cleopatra_median_os[1] - tdp_cleopatra_median_os[1], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     's_trt_rmst': restricted_mean_survival_time(kmf_med_tpdp_cleopatra_iptw, 60),
     's_trt_rmst_95': cleopatra_med_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_med_tdp_cleopatra_iptw, 60),
     's_cont_rmst_95': cleopatra_med_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_med_tpdp_cleopatra_iptw, 60) - restricted_mean_survival_time(kmf_med_tdp_cleopatra_iptw, 60),
     's_diff_rmst_95': cleopatra_med_rmst_mos_95.difference_rmst_95,
     'scount': cleopatra.query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra').shape[0]},
    
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'high', 
     's_trt_mos': tpdp_cleopatra_median_os[2],
     's_trt_mos_95': cleopatra_high_rmst_mos_95.mos_A_95,
     's_cont_mos': tdp_cleopatra_median_os[2],
     's_cont_mos_95': cleopatra_high_rmst_mos_95.mos_B_95,
     's_mos_diff': tpdp_cleopatra_median_os[2] - tdp_cleopatra_median_os[2], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     's_trt_rmst': restricted_mean_survival_time(kmf_high_tpdp_cleopatra_iptw, 60),
     's_trt_rmst_95': cleopatra_high_rmst_mos_95.rmst_A_95,
     's_cont_rmst': restricted_mean_survival_time(kmf_high_tdp_cleopatra_iptw, 60),
     's_cont_rmst_95': cleopatra_high_rmst_mos_95.rmst_B_95,
     's_diff_rmst': restricted_mean_survival_time(kmf_high_tpdp_cleopatra_iptw, 60) - restricted_mean_survival_time(kmf_high_tdp_cleopatra_iptw, 60),
     's_diff_rmst_95': cleopatra_high_rmst_mos_95.difference_rmst_95,
     'scount': cleopatra.query('risk_score >= @high_cutoff_cleopatra').shape[0]},
    
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'all', 
     's_hr': cleopatra_hr_all.hazard_ratios_['tpdp'],
     's_hr_95': [cleopatra_hr_all.summary.loc['tpdp']['exp(coef) lower 95%'], cleopatra_hr_all.summary.loc['tpdp']['exp(coef) upper 95%']],
     's_trt_mos': tpdp_cleopatra_median_os[3],
     's_trt_mos_95': cleopatra_all_rmst_mos_95.mos_A_95,
     's_cont_mos': tdp_cleopatra_median_os[3],
     's_cont_mos_95': cleopatra_all_rmst_mos_95.mos_B_95,
     's_mos_diff': tpdp_cleopatra_median_os[3] - tdp_cleopatra_median_os[3], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     'scount': cleopatra.shape[0]}
]

## Part 3. Combining dictionaries 

In [424]:
data_combined = paloma2_data + paloma3_data + cleopatra_data

In [425]:
strials_mos_rmst_boot = pd.DataFrame(data_combined)

In [426]:
strials_mos_rmst_boot

Unnamed: 0,trial_name,risk_group,s_trt_mos,s_trt_mos_95,s_cont_mos,s_cont_mos_95,s_mos_diff,rct_trt_arm,rct_cont_arm,rct_mos_diff,s_trt_rmst,s_trt_rmst_95,s_cont_rmst,s_cont_rmst_95,s_diff_rmst,s_diff_rmst_95,scount,s_hr,s_hr_95
0,PALOMA-2,low,32.033333,"[28.461666666666666, 36.6]",21.933333,"[20.266666666666666, 24.3]",10.1,27.6,14.5,13.1,29.875194,"[28.503408253277513, 31.44550477465725]",25.439408,"[23.984425776555053, 27.045772834853526]",4.435786,"[2.3338482690161215, 6.635226477842613]",1536,,
1,PALOMA-2,medium,24.566667,"[20.7325, 30.0]",18.266667,"[16.7, 20.3]",6.3,27.6,14.5,13.1,26.41296,"[24.70039556414195, 28.437863276744036]",22.327226,"[21.042807537681696, 23.665775410666328]",4.085734,"[2.0449036572259836, 6.384466339419586]",1473,,
2,PALOMA-2,high,11.8,"[10.066666666666666, 13.966666666666667]",9.433333,"[8.333333333333334, 10.7]",2.366667,27.6,14.5,13.1,16.506114,"[14.878941058363512, 18.071185535360634]",13.386579,"[12.370227515662082, 14.377660039483194]",3.119535,"[1.2620525354220855, 4.881942149141819]",1267,,
3,PALOMA-2,all,20.666667,"[18.925833333333333, 21.833333333333332]",16.266667,"[15.099166666666667, 17.533333333333335]",4.4,27.6,14.5,13.1,,,,,,,4276,0.770165,"[0.7044686557501216, 0.8419871839991265]"
4,PALOMA-3,low,13.766667,"[11.2, 17.033333333333335]",9.8,"[7.156666666666667, 13.102499999999997]",3.966667,9.5,4.6,4.9,14.19239,"[12.763424882456649, 15.672814923500392]",11.933394,"[10.476250981503593, 13.373207299008232]",2.258996,"[0.26220238761223574, 4.295560598738964]",349,,
5,PALOMA-3,medium,9.366667,"[6.266666666666667, 13.7]",4.9,"[4.233333333333333, 6.266666666666667]",4.466667,9.5,4.6,4.9,11.438872,"[9.688231653203616, 13.220207972731593]",8.733969,"[7.4299719383278235, 10.227282876909998]",2.704903,"[0.26978706848601974, 4.935723780590562]",324,,
6,PALOMA-3,high,5.566667,"[4.033333333333333, 8.966666666666667]",4.5,"[4.066666666666666, 5.633333333333334]",1.066667,9.5,4.6,4.9,8.549052,"[7.101670110535841, 10.165603144941937]",6.507426,"[5.495823850757027, 7.61497483371504]",2.041625,"[0.23134300795921506, 3.8686579692175136]",267,,
7,PALOMA-3,all,10.533333,"[8.9, 12.366666666666667]",6.066667,"[5.199166666666667, 6.767499999999999]",4.466667,9.5,4.6,4.9,,,,,,,940,0.740055,"[0.6279207192837161, 0.8722152368682689]"
8,CLEOPATRA,low,90.933333,"[82.76666666666667, nan]",52.066667,"[33.333333333333336, 78.8]",38.866667,57.1,40.8,16.3,51.647747,"[49.75908211685452, 53.44090161337312]",44.534412,"[37.71709752464466, 51.10464672272865]",7.113335,"[0.2909044456568117, 14.28987569917692]",391,,
9,CLEOPATRA,medium,50.866667,"[44.666666666666664, 63.1]",42.2,"[23.666666666666668, 60.13333333333333]",8.666667,57.1,40.8,16.3,43.527447,"[41.26401326689026, 45.58319309908114]",39.808802,"[32.4666490975901, 47.655869244418035]",3.718645,"[-4.49595122362415, 11.109113989835501]",356,,


In [427]:
strials_mos_rmst_boot.to_csv('strials_mos_rmst_boot.csv', index = False)