# Flatiron Health mBC: Survival metrics for key elgibility criteria
**Background: Calculate survival metrics for emulated trials involving patients meeting key elgibliity criteria. Hazard ratio for the full cohort is calculated from a Cox-IPTW model. Restricted mean survival time and median overall survival are calculated for phenotypes using an IPTW-adjusted KM curve.** 

## Part 1: Preprocessing

### 1.1 Import packages and create necessary functions

In [1]:
import numpy as np
import pandas as pd

from scipy import stats

from sksurv.nonparametric import kaplan_meier_estimator
from survive import KaplanMeier, SurvivalData

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.utils import median_survival_times, restricted_mean_survival_time
from lifelines.statistics import logrank_test

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

import warnings

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

In [3]:
# Find index for value closest to input value. 
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [4]:
# Calculates median overeall suvival for risk groups. 
def mos(low, med, high, comp):
    low_os = low.median_survival_time_
    med_os = med.median_survival_time_
    high_os = high.median_survival_time_
    comp_os = comp.median_survival_time_
    mos = [low_os, med_os, high_os, comp_os]
    return (mos)

In [5]:
def rmst_mos_95ci(df, num_samples, drug, event, items_list, numerical_features, rmst_time):
    
    """
    Estimate the 95% confidence interval for RMST and mOS using bootstrap resampling.

    Parameters:
    - df: DataFrame containing survival data
    - num_samples: Number of bootstrap samples
    - drug: Treatment indicator variable
    - event: Event type ('death' or 'progression')
    - items_list: Feature list for IPTW 
    - numerical_features: List of numerical features
    - rmst_time: Time to calculate RMST 

    Returns:
    - mos_A_95: mOS 95% CI for treatment
    - mos_B_95: mOS 95% CI for control
    - rmst_A_95: RMST 95% CI for treatment
    - rmst_B_95: RMST 95% CI for control
    - difference_rmst_95: RMST 95% CI for difference between treatment and control 
    """
    
    np.random.seed(42)
    mos_A = []
    mos_B = []
    rmst_A_list = []
    rmst_B_list = []
    differences_rmst = []
    
    # Define variables based on the event type
    if event == 'death':
        time_column = 'timerisk_treatment'
        status_column = 'death_status'
        
    else:
        time_column = 'time_prog_treatment'
        status_column = 'pfs_status'
        
    # Set up preprocessor for logistical regression which will be for IPTW  
    numerical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('std_scaler', StandardScaler())])
        
    categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')
    categorical_features = list(df.select_dtypes(include = ['category']).columns)
        
    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder = 'passthrough')
    
    # Boostrap HR 
    for _ in range(num_samples):
        
        # Resample data with replacement
        resampled_df = resample(df).drop(columns = ['ps', 'weight'])
        
        # Calculated IPTW for the resampled group 
        df_x = preprocessor.fit_transform(resampled_df.filter(items = items_list))
                                           
        df_lr = LogisticRegression(max_iter = 1000)
        df_lr.fit(df_x, resampled_df[drug])
        warnings.filterwarnings("ignore")
                                           
        pred = df_lr.predict_proba(df_x)        
        resampled_df['ps'] = pred[:, 1]                          
        resampled_df['weight'] = (
                np.where(resampled_df[drug] == 1, 1/resampled_df['ps'], 1/(1 - resampled_df['ps'])))
    
        # mOS from IPTW-KM
        kmf_A = KaplanMeierFitter()
        kmf_A.fit(resampled_df.query(f'{drug} == 1')[time_column]/30,
                  resampled_df.query(f'{drug} == 1')[status_column], 
                  weights = resampled_df.query(f'{drug} == 1')['weight'])

        kmf_B = KaplanMeierFitter()
        kmf_B.fit(resampled_df.query(f'{drug} == 0')[time_column]/30,
                  resampled_df.query(f'{drug} == 0')[status_column], 
                  weights = resampled_df.query(f'{drug} == 0')['weight'])
    
        mos_A.append(kmf_A.median_survival_time_)
        mos_B.append(kmf_B.median_survival_time_)
        
        # RMST from IPTW-KM
        rmst_A = restricted_mean_survival_time(kmf_A, rmst_time)
        rmst_B = restricted_mean_survival_time(kmf_B, rmst_time)
        
        rmst_A_list.append(rmst_A)
        rmst_B_list.append(rmst_B)
        differences_rmst.append(rmst_A - rmst_B)

    # Calculate the 95% confidence interval
    results = pd.Series({
    'mos_A_95': np.percentile(mos_A, [2.5, 97.5]),
    'mos_B_95': np.percentile(mos_B, [2.5, 97.5]),
    'rmst_A_95': np.percentile(rmst_A_list, [2.5, 97.5]),
    'rmst_B_95': np.percentile(rmst_B_list, [2.5, 97.5]),
    'difference_rmst_95': np.percentile(differences_rmst, [2.5, 97.5])
    })
    
    return results

## Part 2: In silico trials 

### PALOMA-2: palbociclib plus letrozole vs. letrozole in hormone-sensitive metastatic breast cancer

**INCLUSION**
* Untreated metastatic breast cancer
* Received first line palbociclib and aromatase inhibitor or aromatase inhibitor alone
* Luteinizing hormone–releasing hormone agonist (leuprolide, goserelin, and triptorelin) is allowed in either arm
* ER-positive within [-90, +30] days of first-line treatment
* HER-2 negative within [-90, +30] days of first-line treatment

#### Palbociclib plus AI 

In [6]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

31677

In [7]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [8]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1'))

In [9]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('Palbociclib')].LineName.value_counts().head(10)

Letrozole,Palbociclib                  2503
Fulvestrant,Palbociclib                1324
Anastrozole,Palbociclib                 466
Palbociclib                             261
Exemestane,Palbociclib                  123
Letrozole,Leuprolide,Palbociclib         98
Goserelin,Letrozole,Palbociclib          90
Fulvestrant,Letrozole,Palbociclib        40
Anastrozole,Fulvestrant,Palbociclib      37
Palbociclib,Tamoxifen                    31
Name: LineName, dtype: int64

In [10]:
# Of note, LH releasing hormone agonist (leuprolide, goserelin, and triptorelin) not excluded
ai = [
    'Anastrozole',
    'Letrozole',
    'Exemestane']

exc = [
    'Capecitabine',
    'Carboplatin',
    'Cisplatin',
    'Cyclophosphamide',
    'Cytarabine Liposomal',
    'Decitabine',
    'Docetaxel',
    'Doxorubicin',
    'Doxorubicin Pegylated Liposomal',
    'Eribulin',
    'Etoposide',
    'Fluorouracil',
    'Gemcitabine',
    'Hydroxyurea',
    'Leucovorin',
    'Methotrexate',
    'Oxaliplatin',
    'Paclitaxel',
    'Paclitaxel Protein-Bound',
    'Vinorelbine',
    'Nivolumab',
    'Pembrolizumab',
    'Abemaciclib',
    'Alpelisib',
    'Fulvestrant',
    'Ribociclib',
    'Tamoxifen',
    'Carfilzomib',
    'Daratumumab',
    'Everolimus',
    'Imatinib',
    'Lapatinib',
    'Lenalidomide',
    'Neratinib',
    'Olaparib',
    'Pazopanib',
    'Pertuzumab',
    'Rituximab',
    'Sorafenib',
    'Toremifene',
    'Trastuzumab',
    'Clinical Study Drug',
]

In [11]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                line_therapy_fl['LineName'].str.contains('Palbociclib') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(30)

Letrozole,Palbociclib                  2503
Anastrozole,Palbociclib                 466
Exemestane,Palbociclib                  123
Letrozole,Leuprolide,Palbociclib         98
Goserelin,Letrozole,Palbociclib          90
Anastrozole,Goserelin,Palbociclib        17
Anastrozole,Leuprolide,Palbociclib       17
Letrozole,Palbociclib,Triptorelin         5
Anastrozole,Palbociclib,Triptorelin       4
Exemestane,Goserelin,Palbociclib          4
Exemestane,Leuprolide,Palbociclib         3
Name: LineName, dtype: int64

In [12]:
let_palb = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                    line_therapy_fl['LineName'].str.contains('Palbociclib') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    [['PatientID', 'StartDate']]
)

In [13]:
row_ID(let_palb)

(3330, 3330)

In [14]:
let_palb.loc[:,'let_palb'] = 1

#### AI

In [15]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('Letrozole')].LineName.value_counts().head(10)

Letrozole                            2536
Letrozole,Palbociclib                2503
Letrozole,Ribociclib                  229
Abemaciclib,Letrozole                 190
Letrozole,Leuprolide,Palbociclib       98
Goserelin,Letrozole,Palbociclib        90
Fulvestrant,Letrozole                  64
Letrozole,Trastuzumab                  59
Fulvestrant,Letrozole,Palbociclib      40
Letrozole,Leuprolide                   40
Name: LineName, dtype: int64

In [16]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                ~line_therapy_fl['LineName'].str.contains('Palbociclib') &
                ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(40)

Anastrozole                      2943
Letrozole                        2536
Exemestane                        780
Letrozole,Leuprolide               40
Anastrozole,Leuprolide             35
Goserelin,Letrozole                26
Anastrozole,Goserelin              20
Exemestane,Goserelin                7
Exemestane,Leuprolide               4
Anastrozole,Triptorelin             2
Letrozole,Medroxyprogesterone       1
Letrozole,Triptorelin               1
Anastrozole,Megestrol               1
Name: LineName, dtype: int64

In [17]:
let = (
    line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(ai)) & 
                    ~line_therapy_fl['LineName'].str.contains('Palbociclib') &
                    ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    [['PatientID', 'StartDate']]
)

In [18]:
row_ID(let)

(6396, 6396)

In [19]:
let.loc[:,'let_palb'] = 0

In [20]:
paloma2 = pd.concat([let_palb, let])

In [21]:
row_ID(paloma2)

(9726, 9726)

In [22]:
paloma2 = pd.merge(paloma2, df_full, on = 'PatientID', how = 'left')

In [23]:
row_ID(paloma2)

(9726, 9726)

In [24]:
paloma2['StartDate'] = pd.to_datetime(paloma2['StartDate'])

#### ER-positive and HER-2 negative 

In [25]:
biomarkers = pd.read_csv('Enhanced_MetBreastBiomarkers.csv')

In [26]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(paloma2['PatientID'])]

In [27]:
row_ID(biomarkers)

(69754, 9662)

In [28]:
biomarkers = pd.merge(biomarkers, paloma2[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [29]:
row_ID(biomarkers)

(69754, 9662)

In [30]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [31]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [32]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [33]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [34]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [35]:
er_status = (
    biomarkers
    .query('BiomarkerName == "ER"')
    .query('date_diff <= 30 and date_diff >= -90')
    .query('BiomarkerStatus == "Positive" or BiomarkerStatus == "Negative"') # don't select unknown values 
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False]) # select ER status closest to treatment start
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'er'})
   )

In [36]:
row_ID(er_status)

(6016, 6016)

In [37]:
her2_status = (
    biomarkers
    .query('BiomarkerName == "HER2"')
    .query('date_diff <= 30 and date_diff >= -90')
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'her2'})
)

In [38]:
row_ID(her2_status)

(5607, 5607)

In [39]:
paloma2 = pd.merge(paloma2, er_status, on  = 'PatientID', how = 'left')

In [40]:
row_ID(paloma2)

(9726, 9726)

In [41]:
paloma2 = pd.merge(paloma2, her2_status, on  = 'PatientID', how = 'left')

In [42]:
row_ID(paloma2)

(9726, 9726)

In [43]:
her2_neg = ['IHC negative (0-1+)',
            'FISH negative/not amplified',
            'IHC equivocal (2+)',
            'Negative NOS',
            'NGS negative (ERBB2 not amplified)',
            'FISH equivocal',
            'Equivocal NOS',
            'NGS equivocal (ERBB2 amplification equivocal)']

paloma2 = (
    paloma2
    .query('er == "Positive"')
    .query('her2== @her2_neg')
)

In [44]:
row_ID(paloma2)

(4842, 4842)

#### Time from treatment to progression/death or censor 

In [45]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [46]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [47]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [48]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [49]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(31677, 31677)

In [50]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [51]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [52]:
row_ID(mortality)

(31677, 31677)

In [53]:
paloma2 = pd.merge(paloma2, mortality, on = 'PatientID', how = 'left')

In [54]:
row_ID(paloma2)

(4842, 4842)

In [55]:
progression = pd.read_csv ('Enhanced_MetBreastProgression.csv')

In [56]:
progression = progression[progression['PatientID'].isin(paloma2['PatientID'])][['PatientID', 'ProgressionDate']]

In [57]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [58]:
progression = (
    progression
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)

In [59]:
row_ID(progression)

(4840, 4840)

In [60]:
paloma2 = pd.merge(paloma2, progression, on = 'PatientID', how = 'left')

In [61]:
row_ID(paloma2)

(4842, 4842)

In [62]:
# Percent without progression date
len(paloma2.query('ProgressionDate.isna()', engine = 'python'))/len(paloma2)

0.4012804626187526

In [63]:
conditions = [
    (paloma2.ProgressionDate.notna()),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 1)),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 0))]

choices = [
    (paloma2['ProgressionDate'] - paloma2['StartDate']).dt.days,
    (paloma2['death_date'] - paloma2['StartDate']).dt.days,
    (paloma2['last_activity'] - paloma2['StartDate']).dt.days]

paloma2.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [64]:
paloma2 = paloma2.query('time_prog_treatment >= 0')

In [65]:
len(paloma2)

4734

In [66]:
conditions = [
    (paloma2.ProgressionDate.notna()),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 1)),
    ((paloma2.ProgressionDate.isna()) & (paloma2['death_status'] == 0))]

choices = [1, 1, 0]

paloma2.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient count 

In [67]:
low_cutoff_paloma2 = paloma2.risk_score.quantile(1/3)

In [68]:
high_cutoff_paloma2 = paloma2.risk_score.quantile(2/3)

In [69]:
print('Palbociclib plus letrozole total:',  paloma2.query('let_palb == 1').shape[0])
print('High risk:', paloma2.query('let_palb == 1').query('risk_score >= @high_cutoff_paloma2').shape[0])
print('Med risk:', paloma2.query('let_palb == 1').query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2').shape[0])
print('Low risk:', paloma2.query('let_palb == 1').query('risk_score <= @low_cutoff_paloma2').shape[0])

Palbociclib plus letrozole total: 2137
High risk: 615
Med risk: 681
Low risk: 841


In [70]:
print('Letrozole:',  paloma2.query('let_palb == 0').shape[0])
print('High risk:', paloma2.query('let_palb == 0').query('risk_score >= @high_cutoff_paloma2').shape[0])
print('Med risk:', paloma2.query('let_palb == 0').query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2').shape[0])
print('Low risk:', paloma2.query('let_palb == 0').query('risk_score <= @low_cutoff_paloma2').shape[0])

Letrozole: 2597
High risk: 963
Med risk: 897
Low risk: 737


#### PFS with covariate balancing 

In [71]:
paloma2 = paloma2.set_index('PatientID')

In [72]:
conditions = [
    (paloma2['thorax_met'] == 1) |
    (paloma2['liver_met'] == 1) |
    (paloma2['cns_met'] == 1) |
    (paloma2['peritoneum_met'] == 1) |
    (paloma2['other_met'] == 1),
    (paloma2['bone_met'] == 0) &
    (paloma2['thorax_met'] == 0) &
    (paloma2['lymph_met'] == 0) &
    (paloma2['liver_met'] == 0) &
    (paloma2['cns_met'] == 0) &
    (paloma2['skin_met'] == 0) &
    (paloma2['peritoneum_met'] == 0) &
    (paloma2['other_met'] == 0)
]

choices = ['visceral', 'unknown']

paloma2['met_site'] = np.select(conditions, choices, default = 'nonvisceral')

In [73]:
paloma2['met_cat'] = pd.cut(paloma2['met_year'],
                            bins = [2010, 2016, float('inf')],
                            labels = ['11-16', '17-22'])

In [74]:
conditions = [
    ((paloma2['ecog_diagnosis'] == "1.0") | (paloma2['ecog_diagnosis'] == "0.0")),  
    ((paloma2['ecog_diagnosis'] == "2.0") | (paloma2['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

paloma2['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [75]:
paloma2_iptw = paloma2.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'let_palb',
                                       'age',
                                       'gender',
                                       'race',
                                       'p_type',
                                       'delta_met_diagnosis',
                                       'met_cat',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ses',
                                       'ecog_2',
                                       'met_site',
                                       'albumin_diag',
                                       'weight_pct_change',
                                       'risk_score'])

In [76]:
paloma2_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
let_palb                  int64
age                       int64
gender                   object
race                     object
p_type                   object
delta_met_diagnosis       int64
met_cat                category
commercial              float64
medicare                float64
medicaid                float64
ses                     float64
ecog_2                   object
met_site                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
dtype: object

In [77]:
to_be_categorical = list(paloma2_iptw.select_dtypes(include = ['object']).columns)

In [78]:
to_be_categorical

['gender', 'race', 'p_type', 'ecog_2', 'met_site']

In [79]:
to_be_categorical.append('met_cat')

In [80]:
to_be_categorical.append('ses')

In [81]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    paloma2_iptw[x] = paloma2_iptw[x].astype('category')

In [82]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [83]:
# List of categorical features.
categorical_features = list(paloma2_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [84]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [85]:
paloma2_iptw_low = (
    paloma2_iptw
    .query('risk_score <= @low_cutoff_paloma2'))

paloma2_iptw_med = (
    paloma2_iptw
    .query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2'))

paloma2_iptw_high = (
    paloma2_iptw
    .query('risk_score >= @high_cutoff_paloma2'))

paloma2_iptw_all = paloma2_iptw

In [86]:
paloma2_low_x = preprocessor.fit_transform(paloma2_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma2_med_x = preprocessor.fit_transform(paloma2_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma2_high_x = preprocessor.fit_transform(paloma2_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'p_type',
                                                                              'delta_met_diagnosis',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ses',
                                                                              'ecog_2',
                                                                              'met_site', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change', 
                                                                              'risk_score']))

paloma2_all_x = preprocessor.fit_transform(paloma2_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

In [87]:
lr_paloma2_low = LogisticRegression(max_iter = 1000)
lr_paloma2_low.fit(paloma2_low_x, paloma2_iptw_low['let_palb'])

LogisticRegression(max_iter=1000)

In [88]:
lr_paloma2_med = LogisticRegression(max_iter = 1000)
lr_paloma2_med.fit(paloma2_med_x, paloma2_iptw_med['let_palb'])

LogisticRegression(max_iter=1000)

In [89]:
lr_paloma2_high = LogisticRegression(max_iter = 1000)
lr_paloma2_high.fit(paloma2_high_x, paloma2_iptw_high['let_palb'])

LogisticRegression(max_iter=1000)

In [90]:
lr_paloma2_all = LogisticRegression(max_iter = 1000)
lr_paloma2_all.fit(paloma2_all_x, paloma2_iptw_all['let_palb'])

LogisticRegression(max_iter=1000)

In [91]:
pred_low = lr_paloma2_low.predict_proba(paloma2_low_x)
pred_med = lr_paloma2_med.predict_proba(paloma2_med_x)
pred_high = lr_paloma2_high.predict_proba(paloma2_high_x)
pred_all = lr_paloma2_all.predict_proba(paloma2_all_x)

In [92]:
paloma2_iptw_low['ps'] = pred_low[:, 1]
paloma2_iptw_med['ps'] = pred_med[:, 1]
paloma2_iptw_high['ps'] = pred_high[:, 1]
paloma2_iptw_all['ps'] = pred_all[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [93]:
paloma2_iptw_low['weight'] = (
    np.where(paloma2_iptw_low['let_palb'] == 1, 1/paloma2_iptw_low['ps'], 1/(1 - paloma2_iptw_low['ps'])))

paloma2_iptw_med['weight'] = (
    np.where(paloma2_iptw_med['let_palb'] == 1, 1/paloma2_iptw_med['ps'], 1/(1 - paloma2_iptw_med['ps'])))

paloma2_iptw_high['weight'] = (
    np.where(paloma2_iptw_high['let_palb'] == 1, 1/paloma2_iptw_high['ps'], 1/(1 - paloma2_iptw_high['ps'])))

paloma2_iptw_all['weight'] = (
    np.where(paloma2_iptw_all['let_palb'] == 1, 1/paloma2_iptw_all['ps'], 1/(1 - paloma2_iptw_all['ps'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [94]:
# Low KM curves
kmf_low_plet_paloma2_iptw = KaplanMeierFitter()
kmf_low_let_paloma2_iptw = KaplanMeierFitter()

kmf_low_plet_paloma2_iptw.fit(
    paloma2_iptw_low.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_low.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_low.query('let_palb == 1')['weight'])

kmf_low_let_paloma2_iptw.fit(
    paloma2_iptw_low.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_low.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_low.query('let_palb == 0')['weight'])

# Med KM curves
kmf_med_plet_paloma2_iptw = KaplanMeierFitter()
kmf_med_let_paloma2_iptw = KaplanMeierFitter()

kmf_med_plet_paloma2_iptw.fit(
    paloma2_iptw_med.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_med.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_med.query('let_palb == 1')['weight'])

kmf_med_let_paloma2_iptw.fit(
    paloma2_iptw_med.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_med.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_med.query('let_palb == 0')['weight'])

# High KM curves 
kmf_high_plet_paloma2_iptw = KaplanMeierFitter()
kmf_high_let_paloma2_iptw = KaplanMeierFitter()

kmf_high_plet_paloma2_iptw.fit(
    paloma2_iptw_high.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_high.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_high.query('let_palb == 1')['weight'])

kmf_high_let_paloma2_iptw.fit(
    paloma2_iptw_high.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_high.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_high.query('let_palb == 0')['weight'])

# All KM curves 
kmf_all_plet_paloma2_iptw = KaplanMeierFitter()
kmf_all_let_paloma2_iptw = KaplanMeierFitter()

kmf_all_plet_paloma2_iptw.fit(
    paloma2_iptw_all.query('let_palb == 1').time_prog_treatment/30,
    paloma2_iptw_all.query('let_palb == 1').pfs_status,
    weights = paloma2_iptw_all.query('let_palb == 1')['weight'])

kmf_all_let_paloma2_iptw.fit(
    paloma2_iptw_all.query('let_palb == 0').time_prog_treatment/30,
    paloma2_iptw_all.query('let_palb == 0').pfs_status,
    weights = paloma2_iptw_all.query('let_palb == 0')['weight'])

  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
                  


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 4746.12 total observations, 1167.3 right-censored observations>

#### Calculating survival metrics

In [95]:
plet_paloma2_median_pfs = mos(kmf_low_plet_paloma2_iptw,
                              kmf_med_plet_paloma2_iptw,
                              kmf_high_plet_paloma2_iptw,
                              kmf_all_plet_paloma2_iptw)

let_paloma2_median_pfs = mos(kmf_low_let_paloma2_iptw,
                             kmf_med_let_paloma2_iptw,
                             kmf_high_let_paloma2_iptw,
                             kmf_all_let_paloma2_iptw)

In [96]:
paloma2_iptw_all_imputed = paloma2_iptw_all.copy()
paloma2_iptw_all_imputed['albumin_diag'] = paloma2_iptw_all_imputed['albumin_diag'].fillna(paloma2_iptw_all_imputed['albumin_diag'].median())
paloma2_iptw_all_imputed['weight_pct_change'] = paloma2_iptw_all_imputed['weight_pct_change'].fillna(paloma2_iptw_all_imputed['weight_pct_change'].median())
paloma2_iptw_all_imputed['ses'] = paloma2_iptw_all_imputed['ses'].cat.add_categories('unknown')
paloma2_iptw_all_imputed['ses'] = paloma2_iptw_all_imputed['ses'].fillna('unknown')

In [97]:
paloma2_hr_all = CoxPHFitter()
paloma2_hr_all.fit(paloma2_iptw_all_imputed,
                   duration_col = 'time_prog_treatment',
                   event_col = 'pfs_status',
                   formula = 'let_palb + age + gender + race + p_type + delta_met_diagnosis + met_cat + commercial + medicare + medicaid + ses + ecog_2 + met_site + albumin_diag + weight_pct_change + risk_score',
                   weights_col = 'weight',
                   robust = True)

<lifelines.CoxPHFitter: fitted with 9442.92 total observations, 2651.83 right-censored observations>

In [98]:
paloma2_all_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_all,
                                        1000,
                                        'let_palb',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site', 
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                        ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        48)

In [99]:
paloma2_low_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_low,
                                        1000,
                                        'let_palb',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site', 
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                        ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        48)

In [100]:
paloma2_med_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_med,
                                        1000,
                                        'let_palb',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site', 
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                        ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                        48)

In [101]:
paloma2_high_rmst_mos_95 = rmst_mos_95ci(paloma2_iptw_high,
                                         1000,
                                         'let_palb',
                                         'progression',
                                         ['age',
                                          'gender',
                                          'race',
                                          'p_type',
                                          'delta_met_diagnosis',
                                          'met_cat',
                                          'commercial',
                                          'medicare',
                                          'medicaid',
                                          'ses',
                                          'ecog_2',
                                          'met_site', 
                                          'albumin_diag', 
                                          'weight_pct_change', 
                                          'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         48)

In [102]:
paloma2_data = [
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'low', 
     'r_trt_mos': plet_paloma2_median_pfs[0],
     'r_trt_mos_95': paloma2_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': let_paloma2_median_pfs[0],
     'r_cont_mos_95': paloma2_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': plet_paloma2_median_pfs[0] - let_paloma2_median_pfs[0], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     'trt_rmst': restricted_mean_survival_time(kmf_low_plet_paloma2_iptw, 48),
     'trt_rmst_95': paloma2_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_let_paloma2_iptw, 48),
     'cont_rmst_95': paloma2_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_plet_paloma2_iptw, 48) - restricted_mean_survival_time(kmf_low_let_paloma2_iptw, 48),
     'diff_rmst_95': paloma2_low_rmst_mos_95.difference_rmst_95,
     'rcount': paloma2.query('risk_score <= @low_cutoff_paloma2').shape[0]},
    
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'medium', 
     'r_trt_mos': plet_paloma2_median_pfs[1],
     'r_trt_mos_95': paloma2_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': let_paloma2_median_pfs[1],
     'r_cont_mos_95': paloma2_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': plet_paloma2_median_pfs[1] - let_paloma2_median_pfs[1], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     'trt_rmst': restricted_mean_survival_time(kmf_med_plet_paloma2_iptw, 48),
     'trt_rmst_95': paloma2_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_let_paloma2_iptw, 48),
     'cont_rmst_95': paloma2_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_plet_paloma2_iptw, 48) - restricted_mean_survival_time(kmf_med_let_paloma2_iptw, 48),
     'diff_rmst_95': paloma2_med_rmst_mos_95.difference_rmst_95,
     'rcount': paloma2.query('risk_score < @high_cutoff_paloma2 and risk_score > @low_cutoff_paloma2').shape[0]},
    
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'high', 
     'r_trt_mos': plet_paloma2_median_pfs[2],
     'r_trt_mos_95': paloma2_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': let_paloma2_median_pfs[2],
     'r_cont_mos_95': paloma2_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': plet_paloma2_median_pfs[2] - let_paloma2_median_pfs[2], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     'trt_rmst': restricted_mean_survival_time(kmf_high_plet_paloma2_iptw, 48),
     'trt_rmst_95': paloma2_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_let_paloma2_iptw, 48),
     'cont_rmst_95': paloma2_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_plet_paloma2_iptw, 48) - restricted_mean_survival_time(kmf_high_let_paloma2_iptw, 48),
     'diff_rmst_95': paloma2_high_rmst_mos_95.difference_rmst_95,
     'rcount': paloma2.query('risk_score >= @high_cutoff_paloma2').shape[0]},
    
    {'trial_name': 'PALOMA-2', 
     'risk_group': 'all', 
     'r_hr': paloma2_hr_all.hazard_ratios_['let_palb'],
     'r_hr_95': [paloma2_hr_all.summary.loc['let_palb']['exp(coef) lower 95%'], paloma2_hr_all.summary.loc['let_palb']['exp(coef) upper 95%']],
     'r_trt_mos': plet_paloma2_median_pfs[3],
     'r_trt_mos_95': paloma2_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': let_paloma2_median_pfs[3],
     'r_cont_mos_95': paloma2_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': plet_paloma2_median_pfs[3] - let_paloma2_median_pfs[3], 
     'rct_trt_arm': 27.6,
     'rct_cont_arm': 14.5,
     'rct_mos_diff': 27.6-14.5,
     'rcount': paloma2.shape[0]}
]

### PALOMA-3: palbociclib plus fulvestrant vs. fulvestrant in hormone-sensitive metastatic breast cancer that had previously progressed on endocrine therapy

**INCLUSION**
* Received first line estrogen therapy +/ one line of chemotherapy 
* Received second (or third) line palbociclib plus fulvestrant or fulvestrant alone 
* Did not receive CDK 4/6 inhibitor, fulvestrant, or everolimus in earlier lines
* ER/PR positive and HER-2 negative within (-inf, +30] days of start of treatment

#### 1. First line endorcine therapy + chemotherapy, second line fulvestrant +/- palbociclib

In [103]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

31677

In [104]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [105]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1'))

In [106]:
et = [
    'Anastrozole',
    'Letrozole',
    'Exemestane',
    'Tamoxifen']

chemo = [
    'Capecitabine',
    'Carboplatin',
    'Cyclophosphamide',
    'Docetaxel',
    'Eribulin',
    'Gemcitabine',
    'Paclitaxel',
    'Paclitaxel Protein-Bound',
    'Vinorelbine']

exc =[
    'Abemaciclib',
    'Palbociclib',
    'Ribociclib',
    'Fulvestrant',
    'Everolimus',
    'Clinical Study Drug']

In [107]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(et)) 
                & line_therapy_fl['LineName'].str.contains('|'.join(chemo))
                & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(10)

Capecitabine,Letrozole                        31
Anastrozole,Capecitabine                      25
Anastrozole,Paclitaxel                        18
Letrozole,Paclitaxel Protein-Bound            17
Anastrozole,Paclitaxel Protein-Bound          14
Capecitabine,Tamoxifen                        14
Capecitabine,Exemestane                       13
Letrozole,Paclitaxel                          13
Paclitaxel,Tamoxifen                          11
Docetaxel,Letrozole,Pertuzumab,Trastuzumab    11
Name: LineName, dtype: int64

In [108]:
etchemo_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & line_therapy_fl['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [109]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(etchemo_id)]
    .query('LineNumber == 2'))

In [110]:
etchemo_pf = (
    line_therapy_sec
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [111]:
row_ID(etchemo_pf)

(13, 13)

In [112]:
etchemo_f = (
    line_therapy_sec
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [113]:
row_ID(etchemo_pf)

(13, 13)

#### 2. First line endocrine therapy,  second line chemotherapy, third line palbociclib + fulvestrant

In [114]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(et)) 
                & ~line_therapy_fl['LineName'].str.contains('|'.join(chemo))
                & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(10)

Anastrozole                2943
Letrozole                  2536
Tamoxifen                  1305
Exemestane                  780
Anastrozole,Trastuzumab      66
Letrozole,Trastuzumab        59
Leuprolide,Tamoxifen         56
Letrozole,Leuprolide         40
Goserelin,Tamoxifen          36
Anastrozole,Leuprolide       35
Name: LineName, dtype: int64

In [115]:
et_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [116]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2')
)

In [117]:
et_chemo_id = (
    line_therapy_sec
    [line_therapy_sec['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_sec['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [118]:
et_chemo_pf = (
    line_therapy[line_therapy['PatientID'].isin(et_chemo_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [119]:
row_ID(et_chemo_pf)

(33, 33)

In [120]:
et_chemo_f = (
    line_therapy[line_therapy['PatientID'].isin(et_chemo_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [121]:
row_ID(et_chemo_f)

(57, 57)

#### 3. First line chemotherapy, second line endocrine therapy, third line palbociclib + fulvestrant

In [122]:
line_therapy_fl[line_therapy_fl['LineName'].str.contains('|'.join(chemo)) 
                & ~line_therapy_fl['LineName'].str.contains('|'.join(et))
                & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))].LineName.value_counts().head(10)

Capecitabine                             1324
Cyclophosphamide,Doxorubicin              690
Docetaxel,Pertuzumab,Trastuzumab          676
Paclitaxel                                589
Paclitaxel Protein-Bound                  584
Carboplatin,Gemcitabine                   407
Paclitaxel,Pertuzumab,Trastuzumab         289
Eribulin                                  261
Carboplatin,Paclitaxel                    250
Atezolizumab,Paclitaxel Protein-Bound     226
Name: LineName, dtype: int64

In [123]:
chemo_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(chemo))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [124]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(chemo_id)]
    .query('LineNumber == 2')
)

In [125]:
chemo_et_id = (
    line_therapy_sec
    [line_therapy_sec['LineName'].str.contains('|'.join(et))
     & ~line_therapy_sec['LineName'].str.contains('|'.join(exc))]
    .PatientID
)

In [126]:
chemo_et_pf = (
    line_therapy[line_therapy['PatientID'].isin(chemo_et_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [127]:
row_ID(chemo_et_pf)

(37, 37)

In [128]:
chemo_et_f = (
    line_therapy[line_therapy['PatientID'].isin(chemo_et_id)]
    .query('LineNumber == 3')
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [129]:
row_ID(chemo_et_f)

(42, 42)

#### 4. First line endocrine therapy, second line palbociclib + fulvestrant

In [130]:
et_pf = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2')
    .query('LineName == "Fulvestrant,Palbociclib"')
    [['PatientID', 'StartDate']]
)

In [131]:
row_ID(et_pf)

(440, 440)

In [132]:
et_f = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2')
    .query('LineName == "Fulvestrant"')
    [['PatientID', 'StartDate']]
)

In [133]:
row_ID(et_f)

(784, 784)

In [134]:
paloma3_pf = pd.concat([etchemo_pf, et_chemo_pf, chemo_et_pf, et_pf])

In [135]:
paloma3_pf.loc[:, 'pfulv'] = 1

In [136]:
row_ID(paloma3_pf)

(523, 523)

In [137]:
paloma3_f = pd.concat([etchemo_f, et_chemo_f, chemo_et_f, et_f])

In [138]:
paloma3_f.loc[:, 'pfulv'] = 0

In [139]:
row_ID(paloma3_f)

(897, 897)

In [140]:
paloma3 = pd.concat([paloma3_pf, paloma3_f])

In [141]:
row_ID(paloma3)

(1420, 1420)

In [142]:
paloma3 = pd.merge(paloma3, df_full, on = 'PatientID', how = 'left')

In [143]:
row_ID(paloma3)

(1420, 1420)

In [144]:
paloma3['StartDate'] = pd.to_datetime(paloma3['StartDate'])

#### ER- or PR-positive and HER-2 negative 

In [145]:
biomarkers = pd.read_csv('Enhanced_MetBreastBiomarkers.csv')

In [146]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(paloma3['PatientID'])]

In [147]:
row_ID(biomarkers)

(10590, 1415)

In [148]:
biomarkers = pd.merge(biomarkers, paloma3[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [149]:
row_ID(biomarkers)

(10590, 1415)

In [150]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [151]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [152]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [153]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [154]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [155]:
er_status = (
    biomarkers
    .query('BiomarkerName == "ER"')
    .query('date_diff <= 30')
    .query('BiomarkerStatus == "Positive" or BiomarkerStatus == "Negative"') # don't select unknown values 
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False]) # select ER status closest to treatment start
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'er'})
   )

In [156]:
row_ID(er_status)

(1283, 1283)

In [157]:
pr_status = (
    biomarkers
    .query('BiomarkerName == "PR"')
    .query('date_diff <= 30')
    .query('BiomarkerStatus == "Positive" or BiomarkerStatus == "Negative"') # don't select unknown values 
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False]) # select ER status closest to treatment start
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'pr'})
   )

In [158]:
row_ID(pr_status)

(1243, 1243)

In [159]:
her2_status = (
    biomarkers
    .query('BiomarkerName == "HER2"')
    .query('date_diff <= 30')
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'her2'})
)

In [160]:
row_ID(her2_status)

(1236, 1236)

In [161]:
paloma3 = pd.merge(paloma3, er_status, on  = 'PatientID', how = 'left')

In [162]:
row_ID(paloma3)

(1420, 1420)

In [163]:
paloma3 = pd.merge(paloma3, pr_status, on  = 'PatientID', how = 'left')

In [164]:
row_ID(paloma3)

(1420, 1420)

In [165]:
paloma3 = pd.merge(paloma3, her2_status, on  = 'PatientID', how = 'left')

In [166]:
row_ID(paloma3)

(1420, 1420)

In [167]:
her2_neg = ['IHC negative (0-1+)',
            'FISH negative/not amplified',
            'IHC equivocal (2+)',
            'Negative NOS',
            'NGS negative (ERBB2 not amplified)',
            'FISH equivocal',
            'Equivocal NOS',
            'NGS equivocal (ERBB2 amplification equivocal)']

paloma3 = (
    paloma3
    .query('er == "Positive" or pr == "Positive"')
    .query('her2== @her2_neg')
)

In [168]:
row_ID(paloma3)

(1153, 1153)

#### Time from treatment to progression/death or censor 

In [169]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [170]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [171]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [172]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [173]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(31677, 31677)

In [174]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [175]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [176]:
row_ID(mortality)

(31677, 31677)

In [177]:
paloma3 = pd.merge(paloma3, mortality, on = 'PatientID', how = 'left')

In [178]:
row_ID(paloma3)

(1153, 1153)

In [179]:
progression = pd.read_csv ('Enhanced_MetBreastProgression.csv')

In [180]:
progression = progression[progression['PatientID'].isin(paloma3['PatientID'])][['PatientID', 'ProgressionDate']]

In [181]:
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [182]:
progression = pd.merge(progression, paloma3[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [183]:
progression['date_diff'] = (progression['ProgressionDate'] - progression['StartDate']).dt.days

In [184]:
progression = (
    progression.query('date_diff > 0')
    .sort_values(['PatientID', 'ProgressionDate'], ascending = [True, True])
    .drop_duplicates(subset = 'PatientID', keep = 'first')
)   

In [185]:
row_ID(progression)

(831, 831)

In [186]:
progression = progression[['PatientID', 'ProgressionDate']]

In [187]:
paloma3 = pd.merge(paloma3, progression, on = 'PatientID', how = 'left')

In [188]:
row_ID(paloma3)

(1153, 1153)

In [189]:
# Percent without progression date
len(paloma3.query('ProgressionDate.isna()', engine = 'python'))/len(paloma3)

0.2792714657415438

In [190]:
conditions = [
    (paloma3.ProgressionDate.notna()),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 1)),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 0))]

choices = [
    (paloma3['ProgressionDate'] - paloma3['StartDate']).dt.days,
    (paloma3['death_date'] - paloma3['StartDate']).dt.days,
    (paloma3['last_activity'] - paloma3['StartDate']).dt.days]

paloma3.loc[:, 'time_prog_treatment'] = np.select(conditions, choices)

In [191]:
paloma3 = paloma3.query('time_prog_treatment >= 0')

In [192]:
len(paloma3)

1153

In [193]:
conditions = [
    (paloma3.ProgressionDate.notna()),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 1)),
    ((paloma3.ProgressionDate.isna()) & (paloma3['death_status'] == 0))]

choices = [1, 1, 0]

paloma3.loc[:, 'pfs_status'] = np.select(conditions, choices)

#### Patient count 

In [194]:
low_cutoff_paloma3 = paloma3.risk_score.quantile(1/3)

In [195]:
high_cutoff_paloma3 = paloma3.risk_score.quantile(2/3)

In [196]:
print('Palobociclib plus fulvestrant total:',  paloma3.query('pfulv == 1').shape[0])
print('High risk:', paloma3.query('pfulv == 1').query('risk_score >= @high_cutoff_paloma3').shape[0])
print('Med risk:', paloma3.query('pfulv == 1').query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3').shape[0])
print('Low risk:', paloma3.query('pfulv == 1').query('risk_score <= @low_cutoff_paloma3').shape[0])

Palobociclib plus fulvestrant total: 460
High risk: 146
Med risk: 148
Low risk: 166


In [197]:
print('Fulvestrant total:',  paloma3.query('pfulv == 0').shape[0])
print('High risk:', paloma3.query('pfulv == 0').query('risk_score >= @high_cutoff_paloma3').shape[0])
print('Med risk:', paloma3.query('pfulv == 0').query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3').shape[0])
print('Low risk:', paloma3.query('pfulv == 0').query('risk_score <= @low_cutoff_paloma3').shape[0])

Fulvestrant total: 693
High risk: 239
Med risk: 235
Low risk: 219


#### Survival curves with covariate balancing 

In [198]:
paloma3 = paloma3.set_index('PatientID')

In [199]:
conditions = [
    (paloma3['thorax_met'] == 1) |
    (paloma3['liver_met'] == 1) |
    (paloma3['cns_met'] == 1) |
    (paloma3['peritoneum_met'] == 1) |
    (paloma3['other_met'] == 1),
    (paloma3['bone_met'] == 0) &
    (paloma3['thorax_met'] == 0) &
    (paloma3['lymph_met'] == 0) &
    (paloma3['liver_met'] == 0) &
    (paloma3['cns_met'] == 0) &
    (paloma3['skin_met'] == 0) &
    (paloma3['peritoneum_met'] == 0) &
    (paloma3['other_met'] == 0)
]

choices = ['visceral', 'unknown']

paloma3['met_site'] = np.select(conditions, choices, default = 'nonvisceral')

In [200]:
paloma3['met_cat'] = pd.cut(paloma3['met_year'],
                            bins = [2010, 2016, float('inf')],
                            labels = ['11-16', '17-22'])

In [201]:
conditions = [
    ((paloma3['ecog_diagnosis'] == "1.0") | (paloma3['ecog_diagnosis'] == "0.0")),  
    ((paloma3['ecog_diagnosis'] == "2.0") | (paloma3['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

paloma3['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [202]:
paloma3_iptw = paloma3.filter(items = ['pfs_status',
                                       'time_prog_treatment',
                                       'pfulv',
                                       'age',
                                       'gender',
                                       'race',
                                       'p_type',
                                       'delta_met_diagnosis',
                                       'met_cat',
                                       'commercial',
                                       'medicare',
                                       'medicaid',
                                       'ses',
                                       'ecog_2',
                                       'met_site',
                                       'albumin_diag', 
                                       'weight_pct_change',
                                       'risk_score'])

In [203]:
paloma3_iptw.dtypes

pfs_status                int64
time_prog_treatment     float64
pfulv                     int64
age                       int64
gender                   object
race                     object
p_type                   object
delta_met_diagnosis       int64
met_cat                category
commercial              float64
medicare                float64
medicaid                float64
ses                     float64
ecog_2                   object
met_site                 object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
dtype: object

In [204]:
to_be_categorical = list(paloma3_iptw.select_dtypes(include = ['object']).columns)

In [205]:
to_be_categorical

['gender', 'race', 'p_type', 'ecog_2', 'met_site']

In [206]:
to_be_categorical.append('met_cat')

In [207]:
to_be_categorical.append('ses')

In [208]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    paloma3_iptw[x] = paloma3_iptw[x].astype('category')

In [209]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [210]:
# List of categorical features.
categorical_features = list(paloma3_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [211]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [212]:
paloma3_iptw_low = (
    paloma3_iptw
    .query('risk_score <= @low_cutoff_paloma3'))

paloma3_iptw_med = (
    paloma3_iptw
    .query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3'))

paloma3_iptw_high = (
    paloma3_iptw
    .query('risk_score >= @high_cutoff_paloma3'))

paloma3_iptw_all = paloma3_iptw

In [213]:
paloma3_low_x = preprocessor.fit_transform(paloma3_iptw_low.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma3_med_x = preprocessor.fit_transform(paloma3_iptw_med.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change', 
                                                                            'risk_score']))

paloma3_high_x = preprocessor.fit_transform(paloma3_iptw_high.filter(items = ['age',
                                                                              'gender',
                                                                              'race',
                                                                              'p_type',
                                                                              'delta_met_diagnosis',
                                                                              'met_cat',
                                                                              'commercial',
                                                                              'medicare',
                                                                              'medicaid',
                                                                              'ses',
                                                                              'ecog_2',
                                                                              'met_site', 
                                                                              'albumin_diag', 
                                                                              'weight_pct_change', 
                                                                              'risk_score']))

paloma3_all_x = preprocessor.fit_transform(paloma3_iptw_all.filter(items = ['age',
                                                                            'gender',
                                                                            'race',
                                                                            'p_type',
                                                                            'delta_met_diagnosis',
                                                                            'met_cat',
                                                                            'commercial',
                                                                            'medicare',
                                                                            'medicaid',
                                                                            'ses',
                                                                            'ecog_2',
                                                                            'met_site', 
                                                                            'albumin_diag', 
                                                                            'weight_pct_change',
                                                                            'risk_score']))

In [214]:
lr_paloma3_low = LogisticRegression(max_iter = 1000)
lr_paloma3_low.fit(paloma3_low_x, paloma3_iptw_low['pfulv'])

LogisticRegression(max_iter=1000)

In [215]:
lr_paloma3_med = LogisticRegression(max_iter = 1000)
lr_paloma3_med.fit(paloma3_med_x, paloma3_iptw_med['pfulv'])

LogisticRegression(max_iter=1000)

In [216]:
lr_paloma3_high = LogisticRegression(max_iter = 1000)
lr_paloma3_high.fit(paloma3_high_x, paloma3_iptw_high['pfulv'])

LogisticRegression(max_iter=1000)

In [217]:
lr_paloma3_all = LogisticRegression(max_iter = 1000)
lr_paloma3_all.fit(paloma3_all_x, paloma3_iptw_all['pfulv'])

LogisticRegression(max_iter=1000)

In [218]:
pred_low = lr_paloma3_low.predict_proba(paloma3_low_x)
pred_med = lr_paloma3_med.predict_proba(paloma3_med_x)
pred_high = lr_paloma3_high.predict_proba(paloma3_high_x)
pred_all = lr_paloma3_all.predict_proba(paloma3_all_x)

In [219]:
paloma3_iptw_low['ps'] = pred_low[:, 1]
paloma3_iptw_med['ps'] = pred_med[:, 1]
paloma3_iptw_high['ps'] = pred_high[:, 1]
paloma3_iptw_all['ps'] = pred_all[:, 1]

In [220]:
paloma3_iptw_low['weight'] = (
    np.where(paloma3_iptw_low['pfulv'] == 1, 1/paloma3_iptw_low['ps'], 1/(1 - paloma3_iptw_low['ps'])))

paloma3_iptw_med['weight'] = (
    np.where(paloma3_iptw_med['pfulv'] == 1, 1/paloma3_iptw_med['ps'], 1/(1 - paloma3_iptw_med['ps'])))

paloma3_iptw_high['weight'] = (
    np.where(paloma3_iptw_high['pfulv'] == 1, 1/paloma3_iptw_high['ps'], 1/(1 - paloma3_iptw_high['ps'])))

paloma3_iptw_all['weight'] = (
    np.where(paloma3_iptw_all['pfulv'] == 1, 1/paloma3_iptw_all['ps'], 1/(1 - paloma3_iptw_all['ps'])))

In [221]:
# Low KM curves
kmf_low_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_low_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_low_pfulv_paloma3_iptw.fit(
    paloma3_iptw_low.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_low.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_low.query('pfulv == 1')['weight'])

kmf_low_fulv_paloma3_iptw.fit(
    paloma3_iptw_low.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_low.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_low.query('pfulv == 0')['weight'])

# Med KM curves
kmf_med_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_med_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_med_pfulv_paloma3_iptw.fit(
    paloma3_iptw_med.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_med.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_med.query('pfulv == 1')['weight'])

kmf_med_fulv_paloma3_iptw.fit(
    paloma3_iptw_med.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_med.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_med.query('pfulv == 0')['weight'])

# High KM curves 
kmf_high_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_high_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_high_pfulv_paloma3_iptw.fit(
    paloma3_iptw_high.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_high.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_high.query('pfulv == 1')['weight'])

kmf_high_fulv_paloma3_iptw.fit(
    paloma3_iptw_high.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_high.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_high.query('pfulv == 0')['weight'])

# All KM curves 
kmf_all_pfulv_paloma3_iptw = KaplanMeierFitter()
kmf_all_fulv_paloma3_iptw = KaplanMeierFitter()

kmf_all_pfulv_paloma3_iptw.fit(
    paloma3_iptw_all.query('pfulv == 1').time_prog_treatment/30,
    paloma3_iptw_all.query('pfulv == 1').pfs_status,
    weights = paloma3_iptw_all.query('pfulv == 1')['weight'])

kmf_all_fulv_paloma3_iptw.fit(
    paloma3_iptw_all.query('pfulv == 0').time_prog_treatment/30,
    paloma3_iptw_all.query('pfulv == 0').pfs_status,
    weights = paloma3_iptw_all.query('pfulv == 0')['weight'])


<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 1155.13 total observations, 139.566 right-censored observations>

#### Calculating survival metrics

In [222]:
pfulv_paloma3_median_os = mos(kmf_low_pfulv_paloma3_iptw,
                              kmf_med_pfulv_paloma3_iptw,
                              kmf_high_pfulv_paloma3_iptw,
                              kmf_all_pfulv_paloma3_iptw)

fulv_paloma3_median_os = mos(kmf_low_fulv_paloma3_iptw,
                             kmf_med_fulv_paloma3_iptw,
                             kmf_high_fulv_paloma3_iptw,
                             kmf_all_fulv_paloma3_iptw)

In [223]:
paloma3_iptw_all_imputed = paloma3_iptw_all.copy()
paloma3_iptw_all_imputed['albumin_diag'] = paloma3_iptw_all_imputed['albumin_diag'].fillna(paloma3_iptw_all_imputed['albumin_diag'].median())
paloma3_iptw_all_imputed['weight_pct_change'] = paloma3_iptw_all_imputed['weight_pct_change'].fillna(paloma3_iptw_all_imputed['weight_pct_change'].median())
paloma3_iptw_all_imputed['ses'] = paloma3_iptw_all_imputed['ses'].cat.add_categories('unknown')
paloma3_iptw_all_imputed['ses'] = paloma3_iptw_all_imputed['ses'].fillna('unknown')

In [224]:
paloma3_hr_all = CoxPHFitter()
paloma3_hr_all.fit(paloma3_iptw_all_imputed,
                   duration_col = 'time_prog_treatment', 
                   event_col = 'pfs_status', 
                   formula = 'pfulv + age + gender + race + p_type + delta_met_diagnosis + met_cat + commercial + medicare + medicaid + ses + ecog_2 + met_site + albumin_diag + weight_pct_change + risk_score',
                   weights_col = 'weight',
                   robust = True)

<lifelines.CoxPHFitter: fitted with 2279.94 total observations, 313.11 right-censored observations>

In [225]:
paloma3_all_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_all,
                                        1000,
                                        'pfulv',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site',
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         24)

In [226]:
paloma3_low_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_low,
                                        1000,
                                        'pfulv',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site',
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         24)

In [227]:
paloma3_med_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_med,
                                        1000,
                                        'pfulv',
                                        'progression',
                                        ['age',
                                         'gender',
                                         'race',
                                         'p_type',
                                         'delta_met_diagnosis',
                                         'met_cat',
                                         'commercial',
                                         'medicare',
                                         'medicaid',
                                         'ses',
                                         'ecog_2',
                                         'met_site',
                                         'albumin_diag', 
                                         'weight_pct_change', 
                                         'risk_score'],
                                         ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                         24)

In [228]:
paloma3_high_rmst_mos_95 = rmst_mos_95ci(paloma3_iptw_high,
                                         1000,
                                         'pfulv',
                                         'progression',
                                         ['age',
                                          'gender',
                                          'race',
                                          'p_type',
                                          'delta_met_diagnosis',
                                          'met_cat',
                                          'commercial',
                                          'medicare',
                                          'medicaid',
                                          'ses',
                                          'ecog_2',
                                          'met_site',
                                          'albumin_diag', 
                                          'weight_pct_change', 
                                          'risk_score'],
                                          ['age', 'delta_met_diagnosis', 'albumin_diag', 'weight_pct_change', 'risk_score'],
                                          24)

In [229]:
paloma3_data = [
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'low', 
     'r_trt_mos': pfulv_paloma3_median_os[0],
     'r_trt_mos_95': paloma3_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': fulv_paloma3_median_os[0],
     'r_cont_mos_95': paloma3_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': pfulv_paloma3_median_os[0] - fulv_paloma3_median_os[0], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     'trt_rmst': restricted_mean_survival_time(kmf_low_pfulv_paloma3_iptw, 24),
     'trt_rmst_95': paloma3_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_fulv_paloma3_iptw, 24),
     'cont_rmst_95': paloma3_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_pfulv_paloma3_iptw, 24) - restricted_mean_survival_time(kmf_low_fulv_paloma3_iptw, 24),
     'diff_rmst_95': paloma3_low_rmst_mos_95.difference_rmst_95,
     'rcount': paloma3.query('risk_score <= @low_cutoff_paloma3').shape[0]},
    
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'medium', 
     'r_trt_mos': pfulv_paloma3_median_os[1],
     'r_trt_mos_95': paloma3_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': fulv_paloma3_median_os[1],
     'r_cont_mos_95': paloma3_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': pfulv_paloma3_median_os[1] - fulv_paloma3_median_os[1], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     'trt_rmst': restricted_mean_survival_time(kmf_med_pfulv_paloma3_iptw, 24),
     'trt_rmst_95': paloma3_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_fulv_paloma3_iptw, 24),
     'cont_rmst_95': paloma3_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_pfulv_paloma3_iptw, 24) - restricted_mean_survival_time(kmf_med_fulv_paloma3_iptw, 24),
     'diff_rmst_95': paloma3_med_rmst_mos_95.difference_rmst_95,
     'rcount': paloma3.query('risk_score < @high_cutoff_paloma3 and risk_score > @low_cutoff_paloma3').shape[0]},
    
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'high', 
     'r_trt_mos': pfulv_paloma3_median_os[2],
     'r_trt_mos_95': paloma3_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': fulv_paloma3_median_os[2],
     'r_cont_mos_95': paloma3_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': pfulv_paloma3_median_os[2] - fulv_paloma3_median_os[2], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     'trt_rmst': restricted_mean_survival_time(kmf_high_pfulv_paloma3_iptw, 24),
     'trt_rmst_95': paloma3_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_fulv_paloma3_iptw, 24),
     'cont_rmst_95': paloma3_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_pfulv_paloma3_iptw, 24) - restricted_mean_survival_time(kmf_high_fulv_paloma3_iptw, 24),
     'diff_rmst_95': paloma3_high_rmst_mos_95.difference_rmst_95,
     'rcount': paloma3.query('risk_score >= @high_cutoff_paloma3').shape[0]},
    
    {'trial_name': 'PALOMA-3', 
     'risk_group': 'all', 
     'r_hr': paloma3_hr_all.hazard_ratios_['pfulv'],
     'r_hr_95': [paloma3_hr_all.summary.loc['pfulv']['exp(coef) lower 95%'], paloma3_hr_all.summary.loc['pfulv']['exp(coef) upper 95%']],
     'r_trt_mos': pfulv_paloma3_median_os[3],
     'r_trt_mos_95': paloma3_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': fulv_paloma3_median_os[3],
     'r_cont_mos_95': paloma3_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': pfulv_paloma3_median_os[3] - fulv_paloma3_median_os[3], 
     'rct_trt_arm': 9.5,
     'rct_cont_arm': 4.6,
     'rct_mos_diff': 9.5-4.6,
     'rcount': paloma3.shape[0]}
]

### CLEOPATRA: pertuzumab, trastuzumab, and docetaxel in HER2-positive metastatic breast cancer

**INCLUSION**
* Untreated metastatic breast cancer
* Received first line pertuzumab, trastuzumab, and docetaxel/paclitaxel or first line trastuzumab + docetaxel/paclitaxel 
* First line of hormonal treatment is allowed
* HER-2 positive within (+30, -inf) of receipt of treatment 

#### First line trastuzumab + docetaxel/paclitaxel +/- pertuzumab 

In [230]:
df_full = pd.read_csv('df_risk_crude.csv', index_col = 'PatientID', dtype = {'death_status': bool})
df_full.index.nunique()

31677

In [231]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [232]:
tpdp = [
    'Docetaxel,Pertuzumab,Trastuzumab',
    'Docetaxel,Pertuzumab,Trastuzumab-Anns',
    'Docetaxel,Pertuzumab,Trastuzumab-Qyyp',
    'Paclitaxel,Pertuzumab,Trastuzumab',
    'Paclitaxel,Pertuzumab,Trastuzumab-Anns',
    'Paclitaxel,Pertuzumab,Trastuzumab-Qyyp',
    'Paclitaxel Protein-Bound,Pertuzumab,Trastuzumab',
    'Paclitaxel Protein-Bound,Pertuzumab,Trastuzumab-Anns',
    'Paclitaxel Protein-Bound,Pertuzumab,Trastuzumab-Qyyp']

tpdp_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('LineName == @tpdp')
    [['PatientID', 'StartDate']]
)

In [233]:
tpdp_fl.loc[:, 'tpdp'] = 1

In [234]:
row_ID(tpdp_fl)

(1219, 1219)

In [235]:
tpd = [
    'Docetaxel,Trastuzumab',
    'Docetaxel,Trastuzumab-Anns',
    'Docetaxel,Trastuzumab-Qyyp',
    'Paclitaxel,Trastuzumab',
    'Paclitaxel,Trastuzumab-Anns',
    'Paclitaxel,Trastuzumab-Qyyp',
    'Paclitaxel Protein-Bound,Trastuzumab',
    'Paclitaxel Protein-Bound,Trastuzumab-Anns',
    'Paclitaxel Protein-Bound,Trastuzumab-Qyyp'] 

tpd_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1')
    .query('LineName == @tpd')
    [['PatientID', 'StartDate']]
)

In [236]:
tpd_fl.loc[:, 'tpdp'] = 0

In [237]:
row_ID(tpdp_fl)

(1219, 1219)

#### First line endocrine therapy, second line trastuzumab + docetaxel/paclitaxel +/- pertuzumab 

In [238]:
all_treatment = line_therapy.LineName.unique()

In [239]:
treatment_list = []
for trt in all_treatment:
    treatment_list.extend(trt.split(","))

In [240]:
unique = list(dict.fromkeys(treatment_list))

In [241]:
et = [
    'Anastrozole',
    'Letrozole',
    'Exemestane',
    'Tamoxifen',
    'Leuprolide',
    'Goserelin',
    'Triptorelin']

In [242]:
unique.remove('Anastrozole')
unique.remove('Letrozole')
unique.remove('Exemestane')
unique.remove('Tamoxifen')
unique.remove('Leuprolide')
unique.remove('Goserelin')
unique.remove('Triptorelin')

In [243]:
line_therapy_fl = (
    line_therapy[line_therapy['PatientID'].isin(df_full.index)]
    .query('LineNumber == 1'))

In [244]:
(
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(unique))]
    .LineName.value_counts().head(10)
)

Anastrozole               2943
Letrozole                 2536
Tamoxifen                 1305
Exemestane                 780
Leuprolide                  77
Goserelin                   60
Leuprolide,Tamoxifen        56
Letrozole,Leuprolide        40
Goserelin,Tamoxifen         36
Anastrozole,Leuprolide      35
Name: LineName, dtype: int64

In [245]:
et_id = (
    line_therapy_fl
    [line_therapy_fl['LineName'].str.contains('|'.join(et))
     & ~line_therapy_fl['LineName'].str.contains('|'.join(unique))]
    .PatientID
)

In [246]:
line_therapy_sec = (
    line_therapy[line_therapy['PatientID'].isin(et_id)]
    .query('LineNumber == 2'))

In [247]:
et_tpdp = (
    line_therapy_sec
    .query('LineName == @tpdp')
    [['PatientID', 'StartDate']]
)

In [248]:
et_tpdp.loc[:, 'tpdp'] = 1

In [249]:
row_ID(et_tpdp)

(79, 79)

In [250]:
et_tpd = (
    line_therapy_sec
    .query('LineName == @tpd')
    [['PatientID', 'StartDate']]
)

In [251]:
et_tpd.loc[:, 'tpdp'] = 0

In [252]:
row_ID(et_tpd)

(5, 5)

In [253]:
cleopatra_p = pd.concat([tpdp_fl, et_tpdp])

In [254]:
row_ID(cleopatra_p)

(1298, 1298)

In [255]:
cleopatra_np = pd.concat([tpd_fl, et_tpd])

In [256]:
row_ID(cleopatra_np)

(184, 184)

In [257]:
cleopatra = pd.concat([cleopatra_p, cleopatra_np])

In [258]:
row_ID(cleopatra)

(1482, 1482)

In [259]:
cleopatra = pd.merge(cleopatra, df_full, on = 'PatientID', how = 'left')

In [260]:
row_ID(cleopatra)

(1482, 1482)

In [261]:
cleopatra['StartDate'] = pd.to_datetime(cleopatra['StartDate'])

#### HER-2 positive

In [262]:
biomarkers = pd.read_csv('Enhanced_MetBreastBiomarkers.csv')

In [263]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(cleopatra['PatientID'])]

In [264]:
row_ID(biomarkers)

(10765, 1482)

In [265]:
biomarkers = pd.merge(biomarkers, cleopatra[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [266]:
row_ID(biomarkers)

(10765, 1482)

In [267]:
biomarkers['StartDate'] = pd.to_datetime(biomarkers['StartDate'])

In [268]:
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [269]:
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [270]:
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [271]:
biomarkers.loc[:, 'date_diff'] = (biomarkers['result_date'] - biomarkers['StartDate']).dt.days

In [272]:
her2_rel = ['IHC negative (0-1+)',
            'FISH negative/not amplified',
            'Negative NOS',
            'NGS negative (ERBB2 not amplified)',
            'IHC positive (3+)',
            'FISH positive/amplified',
            'Positive NOS',
            'NGS positive (ERBB2 amplified)']

her2_status = (
    biomarkers
    .query('BiomarkerName == "HER2"')
    .query('date_diff <= 30')
    .query('BiomarkerStatus == @her2_rel')
    .sort_values(['PatientID', 'date_diff'], ascending = [True, False])
    .drop_duplicates(subset = ['PatientID'], keep = 'first')
    [['PatientID', 'BiomarkerStatus']]
    .rename(columns = {'BiomarkerStatus': 'her2'})
)

In [273]:
row_ID(her2_status)

(1377, 1377)

In [274]:
cleopatra = pd.merge(cleopatra, her2_status, on  = 'PatientID', how = 'left')

In [275]:
row_ID(cleopatra)

(1482, 1482)

In [276]:
her2_pos = ['IHC positive (3+)',
            'FISH positive/amplified',
            'Positive NOS',
            'NGS positive (ERBB2 amplified)']

cleopatra = (
    cleopatra
    .query('her2== @her2_pos')
)

In [277]:
row_ID(cleopatra)

(1296, 1296)

#### Time from treatment to progression/death or censor 

In [278]:
mortality_tr = pd.read_csv('mortality_cleaned_tr.csv')

In [279]:
mortality_te = pd.read_csv('mortality_cleaned_te.csv')

In [280]:
mortality_tr = mortality_tr[['PatientID', 'death_date', 'last_activity']]

In [281]:
mortality_te = mortality_te[['PatientID', 'death_date', 'last_activity']]

In [282]:
mortality = pd.concat([mortality_tr, mortality_te], ignore_index = True)
row_ID(mortality)

(31677, 31677)

In [283]:
mortality.loc[:, 'last_activity'] = pd.to_datetime(mortality['last_activity'])

In [284]:
mortality.loc[:, 'death_date'] = pd.to_datetime(mortality['death_date'])

In [285]:
row_ID(mortality)

(31677, 31677)

In [286]:
cleopatra = pd.merge(cleopatra, mortality, on = 'PatientID', how = 'left')

In [287]:
row_ID(cleopatra)

(1296, 1296)

In [288]:
conditions = [
    (cleopatra['death_status'] == 1),
    (cleopatra['death_status'] == 0)]

choices = [
    (cleopatra['death_date'] - cleopatra['StartDate']).dt.days,
    (cleopatra['last_activity'] - cleopatra['StartDate']).dt.days]

cleopatra.loc[:, 'timerisk_treatment'] = np.select(conditions, choices)

In [289]:
cleopatra = cleopatra.query('timerisk_treatment >= 0')

#### Patient count 

In [290]:
low_cutoff_cleopatra = cleopatra.risk_score.quantile(1/3)

In [291]:
high_cutoff_cleopatra = cleopatra.risk_score.quantile(2/3)

In [292]:
print('Pertuzumab + trastuzumab + docetaxel/paclitaxel total:',  cleopatra.query('tpdp == 1').shape[0])
print('High risk:', cleopatra.query('tpdp == 1').query('risk_score >= @high_cutoff_cleopatra').shape[0])
print('Med risk:', cleopatra.query('tpdp == 1').query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra').shape[0])
print('Low risk:', cleopatra.query('tpdp == 1').query('risk_score <= @low_cutoff_cleopatra').shape[0])

Pertuzumab + trastuzumab + docetaxel/paclitaxel total: 1151
High risk: 375
Med risk: 390
Low risk: 386


In [293]:
print('Trastuzumab + docetaxel/paclitaxel total:',  cleopatra.query('tpdp == 0').shape[0])
print('High risk:', cleopatra.query('tpdp == 0').query('risk_score >= @high_cutoff_cleopatra').shape[0])
print('Med risk:', cleopatra.query('tpdp == 0').query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra').shape[0])
print('Low risk:', cleopatra.query('tpdp == 0').query('risk_score <= @low_cutoff_cleopatra').shape[0])

Trastuzumab + docetaxel/paclitaxel total: 143
High risk: 57
Med risk: 41
Low risk: 45


#### Survival curves with covariate balancing 

In [294]:
conditions = [
    (cleopatra['thorax_met'] == 1) |
    (cleopatra['liver_met'] == 1) |
    (cleopatra['cns_met'] == 1) |
    (cleopatra['peritoneum_met'] == 1) |
    (cleopatra['other_met'] == 1),
    (cleopatra['bone_met'] == 0) &
    (cleopatra['thorax_met'] == 0) &
    (cleopatra['lymph_met'] == 0) &
    (cleopatra['liver_met'] == 0) &
    (cleopatra['cns_met'] == 0) &
    (cleopatra['skin_met'] == 0) &
    (cleopatra['peritoneum_met'] == 0) &
    (cleopatra['other_met'] == 0)
]

choices = ['visceral', 'unknown']

cleopatra['met_site'] = np.select(conditions, choices, default = 'nonvisceral')

In [295]:
cleopatra['met_cat'] = pd.cut(cleopatra['met_year'],
                              bins = [2010, 2016, float('inf')],
                              labels = ['11-16', '17-22'])

In [296]:
conditions = [
    ((cleopatra['ecog_diagnosis'] == "1.0") | (cleopatra['ecog_diagnosis'] == "0.0")),  
    ((cleopatra['ecog_diagnosis'] == "2.0") | (cleopatra['ecog_diagnosis'] == "3.0"))
]

choices = ['lt_2', 'gte_2']

cleopatra['ecog_2'] = np.select(conditions, choices, default = 'unknown')

In [297]:
conditions = [
    ((cleopatra['ses'] == 1) | (cleopatra['ses'] == 2) | (cleopatra['ses'] == 3)), 
    ((cleopatra['ses'] == 4) | (cleopatra['ses'] == 5))
]

choices = ['lt_4', 'gte_4']

cleopatra['ses_cat'] = np.select(conditions, choices, default = 'unknown')

In [298]:
cleopatra['race_cat'] = np.where(cleopatra['race'] == "White", 1, 0)

In [299]:
cleopatra = cleopatra.set_index('PatientID')

In [300]:
cleopatra_iptw = cleopatra.filter(items = ['death_status',
                                           'timerisk_treatment',
                                           'tpdp',
                                           'age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2',
                                           'albumin_diag',
                                           'weight_pct_change',
                                           'risk_score'])

In [301]:
cleopatra_iptw.dtypes

death_status               bool
timerisk_treatment      float64
tpdp                      int64
age                       int64
gender                   object
race_cat                  int64
p_type                   object
delta_met_diagnosis       int64
met_cat                category
ses_cat                  object
ecog_2                   object
albumin_diag            float64
weight_pct_change       float64
risk_score              float64
dtype: object

In [302]:
to_be_categorical = list(cleopatra_iptw.select_dtypes(include = ['object']).columns)

In [303]:
to_be_categorical

['gender', 'p_type', 'ses_cat', 'ecog_2']

In [304]:
to_be_categorical.append('met_cat')

In [305]:
to_be_categorical.append('race_cat')

In [306]:
# Convert variables in list to categorical.
for x in list(to_be_categorical):
    cleopatra_iptw[x] = cleopatra_iptw[x].astype('category')

In [307]:
# List of numeric variables, excluding binary variables. 
numerical_features = ['age', 'delta_met_diagnosis']

# Transformer will first calculate column median and impute, and then apply a standard scaler. 
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())])

In [308]:
# List of categorical features.
categorical_features = list(cleopatra_iptw.select_dtypes(include = ['category']).columns)

# One-hot-encode categorical features.
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

In [309]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder = 'passthrough')

In [310]:
cleopatra_iptw_low = (
    cleopatra_iptw
    .query('risk_score <= @low_cutoff_cleopatra')
    .drop(columns = ['risk_score']))

cleopatra_iptw_med = (
    cleopatra_iptw
    .query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra')
    .drop(columns = ['risk_score']))

cleopatra_iptw_high = (
    cleopatra_iptw
    .query('risk_score >= @high_cutoff_cleopatra')
    .drop(columns = ['risk_score']))

cleopatra_iptw_all = (
    cleopatra_iptw
    .drop(columns = ['risk_score']))

In [311]:
cleopatra_low_x = preprocessor.fit_transform(cleopatra_iptw_low.filter(items = ['age',
                                                                                'gender',
                                                                                'race_cat',
                                                                                'p_type',
                                                                                'delta_met_diagnosis',
                                                                                'met_cat',
                                                                                'ses_cat',
                                                                                'ecog_2']))

cleopatra_med_x = preprocessor.fit_transform(cleopatra_iptw_med.filter(items = ['age',
                                                                                'gender',
                                                                                'race_cat',
                                                                                'p_type',
                                                                                'delta_met_diagnosis',
                                                                                'met_cat',
                                                                                'ses_cat',
                                                                                'ecog_2']))

cleopatra_high_x = preprocessor.fit_transform(cleopatra_iptw_high.filter(items = ['age',
                                                                                  'gender',
                                                                                  'race_cat',
                                                                                  'p_type',
                                                                                  'delta_met_diagnosis',
                                                                                  'met_cat',
                                                                                  'ses_cat',
                                                                                  'ecog_2']))

cleopatra_all_x = preprocessor.fit_transform(cleopatra_iptw_all.filter(items = ['age',
                                                                                'gender',
                                                                                'race_cat',
                                                                                'p_type',
                                                                                'delta_met_diagnosis',
                                                                                'met_cat',
                                                                                'ses_cat',
                                                                                'ecog_2']))

In [312]:
lr_cleopatra_low = LogisticRegression(max_iter = 1000)
lr_cleopatra_low.fit(cleopatra_low_x, cleopatra_iptw_low['tpdp'])

LogisticRegression(max_iter=1000)

In [313]:
lr_cleopatra_med = LogisticRegression(max_iter = 1000)
lr_cleopatra_med.fit(cleopatra_med_x, cleopatra_iptw_med['tpdp'])

LogisticRegression(max_iter=1000)

In [314]:
lr_cleopatra_high = LogisticRegression(max_iter = 1000)
lr_cleopatra_high.fit(cleopatra_high_x, cleopatra_iptw_high['tpdp'])

LogisticRegression(max_iter=1000)

In [315]:
lr_cleopatra_all = LogisticRegression(max_iter = 1000)
lr_cleopatra_all.fit(cleopatra_all_x, cleopatra_iptw_all['tpdp'])

LogisticRegression(max_iter=1000)

In [316]:
pred_low = lr_cleopatra_low.predict_proba(cleopatra_low_x)
pred_med = lr_cleopatra_med.predict_proba(cleopatra_med_x)
pred_high = lr_cleopatra_high.predict_proba(cleopatra_high_x)
pred_all = lr_cleopatra_all.predict_proba(cleopatra_all_x)

In [317]:
cleopatra_iptw_low['ps'] = pred_low[:, 1]
cleopatra_iptw_med['ps'] = pred_med[:, 1]
cleopatra_iptw_high['ps'] = pred_high[:, 1]
cleopatra_iptw_all['ps'] = pred_all[:, 1]

In [318]:
cleopatra_iptw_low['weight'] = (
    np.where(cleopatra_iptw_low['tpdp'] == 1, 1/cleopatra_iptw_low['ps'], 1/(1 - cleopatra_iptw_low['ps'])))

cleopatra_iptw_med['weight'] = (
    np.where(cleopatra_iptw_med['tpdp'] == 1, 1/cleopatra_iptw_med['ps'], 1/(1 - cleopatra_iptw_med['ps'])))

cleopatra_iptw_high['weight'] = (
    np.where(cleopatra_iptw_high['tpdp'] == 1, 1/cleopatra_iptw_high['ps'], 1/(1 - cleopatra_iptw_high['ps'])))

cleopatra_iptw_all['weight'] = (
    np.where(cleopatra_iptw_all['tpdp'] == 1, 1/cleopatra_iptw_all['ps'], 1/(1 - cleopatra_iptw_all['ps'])))

In [319]:
# Low KM curves
kmf_low_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_low_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_low_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_low.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_low.query('tpdp == 1')['death_status'],
    weights = cleopatra_iptw_low.query('tpdp == 1')['weight'])

kmf_low_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_low.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_low.query('tpdp == 0')['death_status'],
    weights = cleopatra_iptw_low.query('tpdp == 0')['weight'])

# Med KM curves
kmf_med_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_med_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_med_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_med.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_med.query('tpdp == 1')['death_status'], 
    weights = cleopatra_iptw_med.query('tpdp == 1')['weight'])

kmf_med_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_med.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_med.query('tpdp == 0')['death_status'], 
    weights = cleopatra_iptw_med.query('tpdp == 0')['weight'])

# High KM curves 
kmf_high_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_high_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_high_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_high.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_high.query('tpdp == 1')['death_status'], 
    weights = cleopatra_iptw_high.query('tpdp == 1')['weight'])

kmf_high_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_high.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_high.query('tpdp == 0')['death_status'], 
    weights = cleopatra_iptw_high.query('tpdp == 0')['weight'])

# All KM curves 
kmf_all_tpdp_cleopatra_iptw = KaplanMeierFitter()
kmf_all_tdp_cleopatra_iptw = KaplanMeierFitter()

kmf_all_tpdp_cleopatra_iptw.fit(
    cleopatra_iptw_all.query('tpdp == 1')['timerisk_treatment']/30,
    cleopatra_iptw_all.query('tpdp == 1')['death_status'], 
    weights = cleopatra_iptw_all.query('tpdp == 1')['weight'])

kmf_all_tdp_cleopatra_iptw.fit(
    cleopatra_iptw_all.query('tpdp == 0')['timerisk_treatment']/30,
    cleopatra_iptw_all.query('tpdp == 0')['death_status'], 
    weights = cleopatra_iptw_all.query('tpdp == 0')['weight'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 1263.51 total observations, 427.177 right-censored observations>

#### Calculating survival metrics 

In [320]:
tpdp_cleopatra_median_os = mos(kmf_low_tpdp_cleopatra_iptw,
                               kmf_med_tpdp_cleopatra_iptw,
                               kmf_high_tpdp_cleopatra_iptw,
                               kmf_all_tpdp_cleopatra_iptw)

tdp_cleopatra_median_os = mos(kmf_low_tdp_cleopatra_iptw,
                              kmf_med_tdp_cleopatra_iptw,
                              kmf_high_tdp_cleopatra_iptw,
                              kmf_all_tdp_cleopatra_iptw)

In [321]:
cleopatra_iptw_all_imputed = cleopatra_iptw_all.copy()
cleopatra_iptw_all_imputed['albumin_diag'] = cleopatra_iptw_all_imputed['albumin_diag'].fillna(cleopatra_iptw_all_imputed['albumin_diag'].median())
cleopatra_iptw_all_imputed['weight_pct_change'] = cleopatra_iptw_all_imputed['weight_pct_change'].fillna(cleopatra_iptw_all_imputed['weight_pct_change'].median())

In [322]:
cleopatra_iptw_all_imputed = pd.merge(cleopatra_iptw_all_imputed.reset_index(), df_full.reset_index()[['PatientID', 'risk_score']], on = 'PatientID', how = 'left')
cleopatra_iptw_all = pd.merge(cleopatra_iptw_all.reset_index(), df_full.reset_index()[['PatientID', 'risk_score']], on = 'PatientID', how = 'left')

In [323]:
cleopatra_hr_all = CoxPHFitter()
cleopatra_hr_all.fit(cleopatra_iptw_all_imputed,
                    duration_col = 'timerisk_treatment', 
                    event_col = 'death_status', 
                    formula =  'tpdp + age + gender + race_cat + p_type + delta_met_diagnosis + met_cat + ses_cat + ecog_2 + albumin_diag + weight_pct_change + risk_score',
                    weights_col = 'weight',
                    robust = True)

<lifelines.CoxPHFitter: fitted with 2557.74 total observations, 1108.84 right-censored observations>

In [324]:
cleopatra_all_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_all,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [325]:
cleopatra_low_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_low,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [326]:
cleopatra_med_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_med,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [327]:
cleopatra_high_rmst_mos_95 = rmst_mos_95ci(cleopatra_iptw_high,
                                          1000,
                                          'tpdp',
                                          'death',
                                          ['age',
                                           'gender',
                                           'race_cat',
                                           'p_type',
                                           'delta_met_diagnosis',
                                           'met_cat',
                                           'ses_cat',
                                           'ecog_2'],
                                          ['age', 'delta_met_diagnosis'],
                                          60)

In [328]:
cleopatra_data = [
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'low', 
     'r_trt_mos': tpdp_cleopatra_median_os[0],
     'r_trt_mos_95': cleopatra_low_rmst_mos_95.mos_A_95,
     'r_cont_mos': tdp_cleopatra_median_os[0],
     'r_cont_mos_95': cleopatra_low_rmst_mos_95.mos_B_95,
     'r_mos_diff': tpdp_cleopatra_median_os[0] - tdp_cleopatra_median_os[0], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     'trt_rmst': restricted_mean_survival_time(kmf_low_tpdp_cleopatra_iptw, 60),
     'trt_rmst_95': cleopatra_low_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_low_tdp_cleopatra_iptw, 60),
     'cont_rmst_95': cleopatra_low_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_low_tpdp_cleopatra_iptw, 60) - restricted_mean_survival_time(kmf_low_tdp_cleopatra_iptw, 60),
     'diff_rmst_95': cleopatra_low_rmst_mos_95.difference_rmst_95,
     'rcount': cleopatra.query('risk_score <= @low_cutoff_cleopatra').shape[0]},
    
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'medium', 
     'r_trt_mos': tpdp_cleopatra_median_os[1],
     'r_trt_mos_95': cleopatra_med_rmst_mos_95.mos_A_95,
     'r_cont_mos': tdp_cleopatra_median_os[1],
     'r_cont_mos_95': cleopatra_med_rmst_mos_95.mos_B_95,
     'r_mos_diff': tpdp_cleopatra_median_os[1] - tdp_cleopatra_median_os[1], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     'trt_rmst': restricted_mean_survival_time(kmf_med_tpdp_cleopatra_iptw, 60),
     'trt_rmst_95': cleopatra_med_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_med_tdp_cleopatra_iptw, 60),
     'cont_rmst_95': cleopatra_med_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_med_tpdp_cleopatra_iptw, 60) - restricted_mean_survival_time(kmf_med_tdp_cleopatra_iptw, 60),
     'diff_rmst_95': cleopatra_med_rmst_mos_95.difference_rmst_95,
     'rcount': cleopatra.query('risk_score < @high_cutoff_cleopatra and risk_score > @low_cutoff_cleopatra').shape[0]},
    
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'high', 
     'r_trt_mos': tpdp_cleopatra_median_os[2],
     'r_trt_mos_95': cleopatra_high_rmst_mos_95.mos_A_95,
     'r_cont_mos': tdp_cleopatra_median_os[2],
     'r_cont_mos_95': cleopatra_high_rmst_mos_95.mos_B_95,
     'r_mos_diff': tpdp_cleopatra_median_os[2] - tdp_cleopatra_median_os[2], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     'trt_rmst': restricted_mean_survival_time(kmf_high_tpdp_cleopatra_iptw, 60),
     'trt_rmst_95': cleopatra_high_rmst_mos_95.rmst_A_95,
     'cont_rmst': restricted_mean_survival_time(kmf_high_tdp_cleopatra_iptw, 60),
     'cont_rmst_95': cleopatra_high_rmst_mos_95.rmst_B_95,
     'diff_rmst': restricted_mean_survival_time(kmf_high_tpdp_cleopatra_iptw, 60) - restricted_mean_survival_time(kmf_high_tdp_cleopatra_iptw, 60),
     'diff_rmst_95': cleopatra_high_rmst_mos_95.difference_rmst_95,
     'rcount': cleopatra.query('risk_score >= @high_cutoff_cleopatra').shape[0]},
    
    {'trial_name': 'CLEOPATRA', 
     'risk_group': 'all', 
     'r_hr': cleopatra_hr_all.hazard_ratios_['tpdp'],
     'r_hr_95': [cleopatra_hr_all.summary.loc['tpdp']['exp(coef) lower 95%'], cleopatra_hr_all.summary.loc['tpdp']['exp(coef) upper 95%']],
     'r_trt_mos': tpdp_cleopatra_median_os[3],
     'r_trt_mos_95': cleopatra_all_rmst_mos_95.mos_A_95,
     'r_cont_mos': tdp_cleopatra_median_os[3],
     'r_cont_mos_95': cleopatra_all_rmst_mos_95.mos_B_95,
     'r_mos_diff': tpdp_cleopatra_median_os[3] - tdp_cleopatra_median_os[3], 
     'rct_trt_arm': 57.1,
     'rct_cont_arm': 40.8,
     'rct_mos_diff': 57.1-40.8,
     'rcount': cleopatra.shape[0]}
]

## Part 3. Combining dictionaries 

In [329]:
data_combined = paloma2_data + paloma3_data + cleopatra_data

In [330]:
rtrials_mos_rmst_boot = pd.DataFrame(data_combined)

In [331]:
rtrials_mos_rmst_boot

Unnamed: 0,trial_name,risk_group,r_trt_mos,r_trt_mos_95,r_cont_mos,r_cont_mos_95,r_mos_diff,rct_trt_arm,rct_cont_arm,rct_mos_diff,trt_rmst,trt_rmst_95,cont_rmst,cont_rmst_95,diff_rmst,diff_rmst_95,rcount,r_hr,r_hr_95
0,PALOMA-2,low,32.033333,"[27.433333333333334, 35.93333333333333]",21.866667,"[20.2, 24.3]",10.166667,27.6,14.5,13.1,29.872153,"[28.287844962950516, 31.436144230327013]",25.330078,"[23.93608793780265, 27.02555447304788]",4.542075,"[2.148845025478604, 6.608525647071166]",1578,,
1,PALOMA-2,medium,24.0,"[20.265, 28.6]",18.266667,"[16.6, 20.166666666666668]",5.733333,27.6,14.5,13.1,26.333322,"[24.57382965563522, 28.08193888360373]",22.207777,"[20.98609428992929, 23.384876409462308]",4.125545,"[2.054684052981503, 6.233565097318803]",1578,,
2,PALOMA-2,high,10.833333,"[9.765833333333335, 12.6]",9.266667,"[8.365833333333335, 10.666666666666666]",1.566667,27.6,14.5,13.1,15.783458,"[14.322103628886445, 17.15820045412908]",13.289877,"[12.405609237175105, 14.16243576587341]",2.493581,"[0.7508768382688206, 4.187160675520319]",1578,,
3,PALOMA-2,all,19.566667,"[17.533333333333335, 20.966666666666665]",15.5,"[14.766666666666667, 16.666666666666668]",4.066667,27.6,14.5,13.1,,,,,,,4734,0.790242,"[0.726141554380528, 0.8600018594135159]"
4,PALOMA-3,low,13.766667,"[11.2, 17.20666666666666]",8.633333,"[6.4, 11.8]",5.133333,9.5,4.6,4.9,14.211243,"[12.842844158902214, 15.714275527345622]",11.52327,"[10.25911121439129, 12.787418435885066]",2.687974,"[0.7815761919031117, 4.528143197515613]",385,,
5,PALOMA-3,medium,9.366667,"[6.533333333333333, 13.7]",5.4,"[4.666666666666667, 6.6]",3.966667,9.5,4.6,4.9,11.828401,"[10.154930809848906, 13.469124739074818]",8.943854,"[7.772293147150985, 10.207339834829467]",2.884547,"[0.8347788054278571, 4.832022469911095]",383,,
6,PALOMA-3,high,4.566667,"[3.829166666666667, 6.133333333333334]",4.3,"[3.7333333333333334, 5.233333333333333]",0.266667,9.5,4.6,4.9,7.573165,"[6.392102225062113, 8.882334709337963]",6.786363,"[5.772680811755951, 7.884548563288684]",0.786801,"[-0.8244567985182658, 2.3630414440498684]",385,,
7,PALOMA-3,all,9.366667,"[7.8625, 11.2]",5.933333,"[5.166666666666667, 6.566666666666666]",3.433333,9.5,4.6,4.9,,,,,,,1153,0.795126,"[0.6851783277820629, 0.9227161053067006]"
8,CLEOPATRA,low,90.6,"[80.8, nan]",51.9,"[34.1, 76.3]",38.7,57.1,40.8,16.3,51.408647,"[49.62428000381762, 53.069461392101815]",44.435526,"[37.940465170696385, 49.78478485839959]",6.973121,"[1.600658183271298, 13.458759867564078]",431,,
9,CLEOPATRA,medium,53.133333,"[47.36666666666667, 63.9]",42.2,"[23.666666666666668, 60.13333333333333]",10.933333,57.1,40.8,16.3,44.025827,"[41.843920465853024, 46.00685819299512]",39.424588,"[31.948401758459394, 45.794889270353316]",4.601239,"[-1.6616639811332705, 12.098087628544324]",431,,


In [332]:
rtrials_mos_rmst_boot.to_csv('rtrials_mos_rmst_boot.csv', index = False)