# Flatiron Health aNSCLC: Data Wrangling Test Set

**OBJECTIVE: Select patients from test cohort and create a dataframe of relevant variables which can be used to build machine learning survival models.**

**BACKGROUND: The 10 CSV Flatiron files will be cleaned in the exact same fashion for the test set patients as for the training set patients. For more information on the cleaning process refer to Notebook: Data Wrangling Training Set.**

**OUTLINE:**
1. **File cleaning for patients in test set**
2. **Merge files to create master test dataframe** 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from scipy.stats import linregress 

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    """Return the row count and the number of distinct PatientIDs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table that contains a 'PatientID' column.

    Returns
    -------
    tuple
        ``(number_of_rows, number_of_unique_PatientIDs)``. Missing PatientID
        values are not counted as an ID (``Series.nunique`` default).
    """
    n_rows = len(dataframe)
    n_patients = dataframe['PatientID'].nunique()
    return n_rows, n_patients

In [3]:
#Import test IDs saved from Data Wrangling Training Set file. 
test_IDs = pd.read_csv('test_IDs.csv')

In [4]:
# Array of PatientIDs in test set.
test_IDs = test_IDs['PatientID'].to_numpy()

## File cleaning for patients in test set

### 1. Demographics 

In [5]:
demographics = pd.read_csv('Demographics.csv')

In [6]:
demographics = demographics[demographics['PatientID'].isin(test_IDs)]

In [7]:
row_ID(demographics)

(13697, 13697)

#### 1.1 Race and Ethnicity

In [8]:
# Recode a Race of 'Hispanic or Latino' to 'unknown'; every other value,
# including missing ones, is carried over unchanged.
demographics['race'] = demographics['Race'].mask(
    demographics['Race'] == 'Hispanic or Latino', 'unknown'
)

In [9]:
# Missing race value will be recoded as Unknown
demographics['race'] = demographics['race'].fillna('unknown')

In [10]:
demographics['race'].value_counts().sum()

13697

In [11]:
# A Race of 'Hispanic or Latino' sets ethnicity to 'hispanic_latino';
# otherwise the recorded Ethnicity value is kept as-is.
demographics['ethnicity'] = demographics['Ethnicity'].mask(
    demographics['Race'] == 'Hispanic or Latino', 'hispanic_latino'
)

In [12]:
demographics['ethnicity'] = demographics['ethnicity'].fillna('unknown')

In [13]:
demographics['ethnicity'] = demographics['ethnicity'].replace({'Hispanic or Latino': 'hispanic_latino'})

In [14]:
demographics = demographics.drop(columns = ['Race', 'Ethnicity'])

#### 1.2 BirthYear

In [15]:
enhanced_adv = pd.read_csv('Enhanced_AdvancedNSCLC.csv')

In [16]:
demographics = pd.merge(demographics, enhanced_adv[['PatientID', 'AdvancedDiagnosisDate']], on = 'PatientID')

In [17]:
# Assign the parsed dates as a replacement column. `.loc[:, col] = ...` writes
# into the existing object-dtype column on pandas >= 2.0, which would leave the
# dtype as object and break the `.dt` accessor used to compute age.
demographics['AdvancedDiagnosisDate'] = pd.to_datetime(demographics['AdvancedDiagnosisDate'])

In [18]:
demographics.loc[:, 'age'] = demographics['AdvancedDiagnosisDate'].dt.year - demographics['BirthYear']

In [19]:
demographics = demographics.drop(columns = ['BirthYear', 'AdvancedDiagnosisDate'])

#### 1.3 PracticeType

**No changes made to variable.** 

#### 1.4 Gender

In [20]:
# Impute missing gender as 'M', since 'M' was imputed for missing values in the training set. 
demographics['Gender'] = demographics['Gender'].fillna('M')

In [21]:
demographics = demographics.rename(columns = {'Gender': 'gender'})

#### 1.5 State

In [22]:
# Group states into Census-Bureau regions  
state_dict = { 
    'ME': 'northeast', 
    'NH': 'northeast',
    'VT': 'northeast', 
    'MA': 'northeast',
    'CT': 'northeast',
    'RI': 'northeast',  
    'NY': 'northeast', 
    'NJ': 'northeast', 
    'PA': 'northeast', 
    'IL': 'midwest', 
    'IN': 'midwest', 
    'MI': 'midwest', 
    'OH': 'midwest', 
    'WI': 'midwest',
    'IA': 'midwest',
    'KS': 'midwest',
    'MN': 'midwest',
    'MO': 'midwest', 
    'NE': 'midwest',
    'ND': 'midwest',
    'SD': 'midwest',
    'DE': 'south',
    'FL': 'south',
    'GA': 'south',
    'MD': 'south',
    'NC': 'south', 
    'SC': 'south',
    'VA': 'south',
    'DC': 'south',
    'WV': 'south',
    'AL': 'south',
    'KY': 'south',
    'MS': 'south',
    'TN': 'south',
    'AR': 'south',
    'LA': 'south',
    'OK': 'south',
    'TX': 'south',
    'AZ': 'west',
    'CO': 'west',
    'ID': 'west',
    'MT': 'west',
    'NV': 'west',
    'NM': 'west',
    'UT': 'west',
    'WY': 'west',
    'AK': 'west',
    'CA': 'west',
    'HI': 'west',
    'OR': 'west',
    'WA': 'west',
    'PR': 'unknown'
}

demographics['region'] = demographics['State'].map(state_dict)

In [23]:
demographics['region'] = demographics['region'].fillna('unknown')

In [24]:
demographics['region'].value_counts(dropna = False).sum()

13697

In [25]:
demographics = demographics.drop(columns = ['State'])

#### 1.6 PracticeID

In [26]:
demographics = demographics.drop(columns = ['PracticeID'])

#### 1.7 PrimaryPhysicianID

In [27]:
demographics = demographics.drop(columns = ['PrimaryPhysicianID'])

In [28]:
# Final test demographics table.
demographics.sample(5)

Unnamed: 0,PatientID,PracticeType,gender,race,ethnicity,age,region
3748,F89B68E8E6D3B,COMMUNITY,M,Other Race,unknown,67,northeast
12585,F28A0C4FB43A3,ACADEMIC,M,White,unknown,68,unknown
5860,F3D961D32CC50,COMMUNITY,M,White,unknown,65,south
2175,F941B86667B06,COMMUNITY,F,White,unknown,79,south
12834,FA6394B42E670,ACADEMIC,M,White,unknown,60,unknown


In [29]:
row_ID(demographics)

(13697, 13697)

In [30]:
%whos DataFrame

Variable       Type         Data/Info
-------------------------------------
demographics   DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
enhanced_adv   DataFrame               PatientID Diag<...>n[68483 rows x 6 columns]


In [31]:
del enhanced_adv

### 2. Enhanced_AdvancedNSCLC

In [32]:
enhanced_adv = pd.read_csv('Enhanced_AdvancedNSCLC.csv')

In [33]:
enhanced_adv = enhanced_adv[enhanced_adv['PatientID'].isin(test_IDs)]

In [34]:
row_ID(enhanced_adv)

(13697, 13697)

#### 2.1 Histology

**No changes made to variable.** 

#### 2.2 GroupStage

In [35]:
# Dictionary for regrouping stages
stage_dict = { 
    'Stage I': 'I',
    'Stage IA': 'I',
    'Stage IA1': 'I', 
    'Stage IA2': 'I',
    'Stage IA3': 'I', 
    'Stage IB': 'I',
    'Stage II': 'II',
    'Stage IIA': 'II',
    'Stage IIB': 'II',
    'Stage III': 'III',
    'Stage IIIA': 'IIIA',
    'Stage IIIB': 'IIIB',
    'Stage IIIC': 'IIIC',
    'Stage IV': 'IV',
    'Stage IVA': 'IV',
    'Stage IVB': 'IV'
}

enhanced_adv['stage'] = enhanced_adv['GroupStage'].map(stage_dict)

In [36]:
enhanced_adv['stage'] = enhanced_adv['stage'].fillna('unknown')

In [37]:
enhanced_adv = enhanced_adv.drop(columns = ['GroupStage'])

#### 2.3 SmokingStatus

**No changes made to variable.** 

#### 2.4 AdvancedDiagnosisDate

In [38]:
# Assign the parsed dates as a replacement column. `.loc[:, col] = ...` writes
# into the existing object-dtype column on pandas >= 2.0, which would leave the
# dtype as object and break the later `.dt.year` calls.
enhanced_adv['AdvancedDiagnosisDate'] = pd.to_datetime(enhanced_adv['AdvancedDiagnosisDate'])

In [39]:
enhanced_adv = enhanced_adv.rename(columns = {'AdvancedDiagnosisDate': 'adv_date'})

In [40]:
# Bucket the advanced-diagnosis year into two-year categories. pd.cut uses
# right-closed bins by default, so (2010, 2012] -> '11-12', (2012, 2014] ->
# '13-14', ..., (2018, 2020] -> '19-20'. Years outside (2010, 2020] become NaN.
enhanced_adv['adv_year_cat'] = pd.cut(enhanced_adv['adv_date'].dt.year,
                                      bins = [2010, 2012, 2014, 2016, 2018, 2020],
                                      labels = ['11-12', '13-14', '15-16', '17-18', '19-20'])

In [41]:
enhanced_adv.loc[:, 'adv_year'] = enhanced_adv['adv_date'].dt.year

#### 2.5 DiagnosisDate

In [42]:
enhanced_adv = enhanced_adv.rename(columns = {'DiagnosisDate': 'diagnosis_date'})

In [43]:
# Missing diagnosis_date is replaced with adv_date; other dates are left untouched.
# Both columns are parsed before filling so the result is a proper datetime
# column; np.where over a datetime column and a column of raw date strings
# would instead produce an object array with mixed element types.
enhanced_adv['diagnosis_date'] = (
    pd.to_datetime(enhanced_adv['diagnosis_date'])
    .fillna(pd.to_datetime(enhanced_adv['adv_date']))
)

In [44]:
enhanced_adv['diagnosis_date'] = pd.to_datetime(enhanced_adv['diagnosis_date'])

#### 2.6 Time from diagnosis date to advanced date

In [45]:
enhanced_adv.loc[:, 'delta_adv_diagnosis'] = (enhanced_adv['adv_date'] - enhanced_adv['diagnosis_date']).dt.days

In [46]:
# Final enhanced_adv dataframe
enhanced_adv.sample(5)

Unnamed: 0,PatientID,diagnosis_date,adv_date,Histology,SmokingStatus,stage,adv_year_cat,adv_year,delta_adv_diagnosis
18967,F47C4D4E0908D,2013-01-01,2013-01-01,Non-squamous cell carcinoma,No history of smoking,IV,13-14,2013,0
38995,FB4DB20A88839,2015-09-02,2015-09-02,Non-squamous cell carcinoma,History of smoking,IV,15-16,2015,0
63455,FB731BD275C22,2015-08-27,2016-10-12,Non-squamous cell carcinoma,History of smoking,I,15-16,2016,412
30120,FE91E9FF373CC,2015-09-14,2015-09-14,NSCLC histology NOS,History of smoking,IV,15-16,2015,0
59224,F649A61A8426D,2014-06-20,2014-06-20,Non-squamous cell carcinoma,History of smoking,IV,13-14,2014,0


In [47]:
row_ID(enhanced_adv)

(13697, 13697)

In [48]:
%whos DataFrame

Variable       Type         Data/Info
-------------------------------------
demographics   DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
enhanced_adv   DataFrame               PatientID diag<...>n[13697 rows x 9 columns]


### 3. Mortality 

In [49]:
mortality = pd.read_csv('Enhanced_Mortality_V2.csv')

In [50]:
mortality = mortality[mortality['PatientID'].isin(test_IDs)]

In [51]:
row_ID(mortality)

(9806, 9806)

In [52]:
mortality = mortality.rename(columns = {'DateOfDeath': 'death_date'})

In [53]:
# Year-only death dates (strings of length 4) are set to the middle of the
# year by appending '-07-01'; all other values pass through unchanged.
mortality['death_date'] = mortality['death_date'].mask(
    mortality['death_date'].str.len() == 4,
    mortality['death_date'] + '-07-01'
)

In [54]:
# Year-month death dates (strings of length 7) are set to the 15th of the
# month by appending '-15'; all other values pass through unchanged.
mortality['death_date'] = mortality['death_date'].mask(
    mortality['death_date'].str.len() == 7,
    mortality['death_date'] + '-15'
)

In [55]:
mortality['death_date'] = pd.to_datetime(mortality['death_date'])

#### Censoring

**For patients for whom a date of death is not known, the censor date can be defined either as the data cutoff date or as the last confirmed activity date. The last confirmed activity date is broadly defined as the last date at which there is evidence in the EHR that a patient is alive. Evidence of a record in at least one of the items listed below qualifies as patient-level confirmed activity:**
* **Visit: VisitDate**
* **Enhanced_AdvNSCLC_Orals: StartDate or EndDate**
* **Enhanced_AdvNSCLCBiomarkers: SpecimenCollectedDate**
* **Enhanced_AdvNSCLCProgression: LastClinicNoteDate or ProgressionDate**

In [56]:
visit = pd.read_csv('Visit.csv')
telemedicine = pd.read_csv('Telemedicine.csv')
orals = pd.read_csv('Enhanced_AdvNSCLC_Orals.csv')
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')
progression = pd.read_csv ('Enhanced_AdvNSCLCProgression.csv')

##### Visit and Telemedicine

In [57]:
# Stack the telemedicine rows under the visit rows (visit-only flag columns
# are dropped first). pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
visit_tele = pd.concat(
    [
        visit.drop(columns = ['VisitType', 'IsVitalsVisit', 'IsTreatmentVisit', 'IsLabVisit']),
        telemedicine,
    ],
    sort = False,
)

In [58]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
visit_tele['VisitDate'] = pd.to_datetime(visit_tele['VisitDate'])

In [59]:
# Select max VisitDate from combined Visit and Telemedicine table.
visit_tele_max = (
    visit_tele
    [visit_tele['PatientID'].isin(test_IDs)]
    .groupby('PatientID')['VisitDate'].max()
    .to_frame(name = 'visit_max')
    .reset_index()
)

##### Orals

In [60]:
orals = orals[orals['PatientID'].isin(test_IDs)]

In [61]:
row_ID(orals)

(2830, 1985)

In [62]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
orals['StartDate'] = pd.to_datetime(orals['StartDate'])

In [63]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
orals['EndDate'] = pd.to_datetime(orals['EndDate'])

In [64]:
orals_max = (
    orals
    .assign(max_date = orals[['StartDate', 'EndDate']].max(axis = 1))
    .groupby('PatientID')['max_date'].max()
    .to_frame(name = 'orals_max')
    .reset_index()
)

##### Biomarkers

In [65]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(test_IDs)]

In [66]:
row_ID(biomarkers)

(51601, 10160)

In [67]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
biomarkers['SpecimenCollectedDate'] = pd.to_datetime(biomarkers['SpecimenCollectedDate'])

In [68]:
biomarkers_max = (
    biomarkers
    .groupby('PatientID')['SpecimenCollectedDate'].max()
    .to_frame(name = 'biomarkers_max')
    .reset_index()
)

##### Progression

In [69]:
progression = progression[progression['PatientID'].isin(test_IDs)]

In [70]:
row_ID(progression)

(17248, 9984)

In [71]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
progression['ProgressionDate'] = pd.to_datetime(progression['ProgressionDate'])

In [72]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
progression['LastClinicNoteDate'] = pd.to_datetime(progression['LastClinicNoteDate'])

In [73]:
progression_max = (
    progression
    .assign(max_date = progression[['ProgressionDate', 'LastClinicNoteDate']].max(axis = 1))
    .groupby('PatientID')['max_date'].max()
    .to_frame(name = 'progression_max')
    .reset_index()
)

##### Max date merge

In [74]:
last_activity = pd.merge(visit_tele_max, orals_max, on = 'PatientID', how = 'outer')

In [75]:
last_activity = pd.merge(last_activity, biomarkers_max, on = 'PatientID', how = 'outer')

In [76]:
last_activity = pd.merge(last_activity, progression_max, on = 'PatientID', how = 'outer')

In [77]:
row_ID(last_activity)

(13697, 13697)

In [78]:
# Find max of each row. 
last_activity = (
    last_activity
    .assign(last_activity = last_activity[['visit_max', 'orals_max', 'biomarkers_max', 'progression_max']].max(axis = 1))
    .filter(items = ['PatientID', 'last_activity'])
)

In [79]:
# Add a PatientID-only row (all other columns missing) for every test-set
# patient that has no mortality record. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
mortality = pd.concat(
    [
        mortality,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(mortality['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [80]:
mortality = pd.merge(mortality, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID')

In [81]:
mortality = pd.merge(mortality, last_activity, on = 'PatientID')

In [82]:
row_ID(mortality)

(13697, 13697)

In [83]:
mortality.loc[:, 'death_status'] = np.where(mortality['death_date'].isna(), 0, 1)

In [84]:
mortality.loc[:, 'timerisk_activity'] = (
    np.where(mortality['death_date'].isna(),
             (mortality['last_activity'] - mortality['adv_date']).dt.days,
             (mortality['death_date'] - mortality['adv_date']).dt.days)
)

In [85]:
# Negative timerisk_activity values are floored at 0; everything else
# (including missing values) is left unchanged.
mortality['timerisk_activity'] = mortality['timerisk_activity'].clip(lower = 0)

In [86]:
mortality = pd.merge(mortality, enhanced_adv[['PatientID', 'diagnosis_date']], on = 'PatientID', how = 'outer')

In [87]:
# timerisk_activity_first is time from first diagnosis (advanced or not) to death or last activity if no death date.
mortality.loc[:, 'timerisk_activity_first'] = (
    np.where(mortality['death_date'].isna(),
             (mortality['last_activity'] - mortality['diagnosis_date']).dt.days,
             (mortality['death_date'] - mortality['diagnosis_date']).dt.days)
)

In [88]:
# Negative timerisk_activity_first values are floored at 0; everything else
# (including missing values) is left unchanged.
mortality['timerisk_activity_first'] = mortality['timerisk_activity_first'].clip(lower = 0)

In [89]:
mortality.to_csv('mortality_cleaned_te.csv', index = False, header = True)

In [90]:
mortality = mortality.filter(items = ['PatientID', 'death_status', 'timerisk_activity'])

In [91]:
%whos DataFrame

Variable          Type         Data/Info
----------------------------------------
biomarkers        DataFrame                PatientID Bio<...>[51601 rows x 18 columns]
biomarkers_max    DataFrame               PatientID biom<...>n[10160 rows x 2 columns]
demographics      DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
enhanced_adv      DataFrame               PatientID diag<...>n[13697 rows x 9 columns]
last_activity     DataFrame               PatientID last<...>n[13697 rows x 2 columns]
mortality         DataFrame               PatientID  dea<...>n[13697 rows x 3 columns]
orals             DataFrame               PatientID     <...>\n[2830 rows x 5 columns]
orals_max         DataFrame              PatientID  oral<...>\n[1985 rows x 2 columns]
progression       DataFrame               PatientID Prog<...>n[17248 rows x 8 columns]
progression_max   DataFrame              PatientID progr<...>\n[9984 rows x 2 columns]
telemedicine      DataFrame               Patien

In [92]:
# Keep mortality
del biomarkers
del biomarkers_max
del last_activity
del orals
del orals_max
del progression 
del progression_max
del telemedicine
del visit
del visit_tele
del visit_tele_max

### 4. MedicationAdministration

In [93]:
med_admin = pd.read_csv('MedicationAdministration.csv')

In [94]:
row_ID(med_admin)

(3368216, 52146)

In [95]:
med_admin = med_admin[med_admin['PatientID'].isin(test_IDs)]

In [96]:
row_ID(med_admin)

(671300, 10459)

**The Medication Administration table documents the date a medication was administered to a patient. An indicator variable for clinically relevant administered medications received between -90 days from advanced diagnosis and up to time of first line of therapy will be created as columns.** 

In [97]:
line_therapy = pd.read_csv('LineOfTherapy.csv')

In [98]:
line_therapy_1 = (
    line_therapy 
    .query('LineNumber == 1 and IsMaintenanceTherapy == False')
)

In [99]:
med_admin = pd.merge(med_admin, line_therapy_1[['PatientID', 'StartDate']], on = 'PatientID', how = 'left')

In [100]:
med_admin = pd.merge(med_admin, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID', how = 'left')

In [101]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
med_admin['AdministeredDate'] = pd.to_datetime(med_admin['AdministeredDate'])

In [102]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
med_admin['StartDate'] = pd.to_datetime(med_admin['StartDate'])

In [103]:
# upper_bound is the last eligible day, counted from advanced diagnosis:
#   * no StartDate (no treatment received), or StartDate more than 30 days
#     after advanced diagnosis -> day +30
#   * StartDate 30 days or less after advanced diagnosis -> the day before StartDate
days_to_start = (med_admin['StartDate'] - med_admin['adv_date']).dt.days

conditions = [
    med_admin['StartDate'].isna() | (days_to_start > 30),
    days_to_start <= 30,
]
choices = [30, days_to_start - 1]

# Rows that satisfy neither condition receive np.select's default value of 0.
med_admin.loc[:, 'upper_bound'] = np.select(conditions, choices)

In [104]:
med_admin.loc[:, 'upper_bound_date'] = (
    np.where(med_admin['upper_bound'] != 30, 
             med_admin['StartDate'] - pd.DateOffset(days = 1), 
             med_admin['adv_date'] + pd.DateOffset(days = 30))
)

In [105]:
# Select window of -90 to +30 or start date of first line (whichever comes first) from time of advanced diagnosis and remove clinical study drug.
# .copy() makes med_admin_win an independent frame, so the indicator columns
# added to it afterwards do not raise SettingWithCopyWarning.
med_admin_win = (
    med_admin
    [((med_admin['AdministeredDate'] - med_admin['adv_date']).dt.days >= -90) &
    (med_admin['AdministeredDate'] <= med_admin['upper_bound_date']) &
    (med_admin['CommonDrugName'] != 'Clinical study drug')]
    .copy()
)

In [106]:
row_ID(med_admin_win)

(41432, 2843)

#### 4.1 Antineoplastic

**No indicator variable created.** 

#### 4.2 Antiemetic

**No indicator variable created.** 

#### 4.3 Solution-fluid

**No indicator variable created.** 

#### 4.4 Glucocorticoid & steroid

In [107]:
# steroid_diag = 1 when the drug is in the glucocorticoid category (or is
# hydrocortisone) and was given by the intravenous, oral or intrajejunal route.
is_steroid = (
    (med_admin_win['DrugCategory'] == 'glucocorticoid') |
    (med_admin_win['CommonDrugName'] == 'hydrocortisone')
)
is_steroid_route = med_admin_win['Route'].isin(['Intravenous', 'Oral', 'Intrajejunal'])
med_admin_win.loc[:, 'steroid_diag'] = np.where(is_steroid & is_steroid_route, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


#### 4.5 Pain agent

**Clinically relevant. Three indicator variables from 'pain agent' category will be created:**
* **Opioid PO: patient received oral opioid (fentanyl patch included)**
* **Non-opioid PO: patient received non-opioid orally** 
* **Pain IV: patient received pain medication intravenously**

##### 4.5a opioid PO

In [108]:
# List of available opioids in the US. 
opioid_list = [
    'buprenorphine',
    'codeine',
    'fentanyl',
    'hydrocodone',
    'hydromorphone',
    'methadone',
    'morphine',
    'oxycodone',
    'oxymorphone',
    'tapentadol',
    'tramadol'
]

In [109]:
# opioid_PO_diag = 1 when the route is oral, transdermal or sublingual and
# the drug name contains any entry of opioid_list.
is_opioid_route = med_admin_win['Route'].isin(['Oral', 'Transdermal', 'Sublingual'])
is_opioid_name = med_admin_win['CommonDrugName'].str.contains('|'.join(opioid_list))
med_admin_win.loc[:, 'opioid_PO_diag'] = np.where(is_opioid_route & is_opioid_name, 1, 0)

##### 4.5b Nonopioid PO

In [110]:
med_admin_win.loc[:, 'nonopioid_PO_diag'] = (
    np.where((med_admin_win['DrugCategory'] == 'pain agent') & 
             (med_admin_win['Route'] == 'Oral') & 
             (~med_admin_win['CommonDrugName'].str.contains('|'.join(opioid_list))), 1, 0)
)

##### 4.5c Pain IV

In [111]:
med_admin_win.loc[:, 'pain_IV_diag'] = (
    np.where((med_admin_win['DrugCategory'] == 'pain agent') & 
             (med_admin_win['Route'] == 'Intravenous') & 
             (med_admin_win['CommonDrugName'] != 'abatacept') & 
             (med_admin_win['CommonDrugName'] != 'lidocaine'), 1, 0)
)

#### 4.6 A. Hematological agent

**A therapeutic anticoagulation indicator variable will be created from the hematological agent category and will consist of:** 
* **Heparin: patients on a heparin gtt or other parenteral agents (eg., bivalirudin or argatroban)**
* **Enoxaparin: patients administered greater than prophylactic dosing of enoxaparin or other subcutaneous agents (eg., dalteparin or fondaparinux)**
* **DOAC: patients administered any dose of apixaban, rivaroxaban, edoxaban, or dabigatran**
* **Warfarin: patients administered any dose of warfarin**  

##### 4.6 Aa. Heparin and other parenteral agents

In [112]:
med_admin_win.loc[:, 'heparin_diag'] = (
    np.where(((med_admin_win['CommonDrugName'].str.contains('heparin')) & 
              (med_admin_win['AdministeredUnits'] == 'unit/kg/hr')) | 
             (med_admin_win['CommonDrugName'].str.contains('bivalirudin')) | 
             (med_admin_win['CommonDrugName'].str.contains('argatroban')), 1, 0)
)

##### 4.6 Ab. Enoxaparin and other subcutaneous agents 

In [113]:
med_admin_win.loc[:, 'enoxaparin_diag'] = (
    np.where(((med_admin_win['CommonDrugName'].str.contains('enoxaparin')) & 
              (med_admin_win['AdministeredAmount'] > 40)) | 
             ((med_admin_win['CommonDrugName'].str.contains('dalteparin')) & 
              (med_admin_win['AdministeredAmount'] > 5000)) | 
             ((med_admin_win['CommonDrugName'].str.contains('fondaparinux')) & 
              (med_admin_win['AdministeredAmount'] > 2.5)), 1, 0)
)

##### 4.6 Ac. DOAC

In [114]:
med_admin_win.loc[:, 'doac_diag'] = (
    np.where((med_admin_win['CommonDrugName'].str.contains('apixaban')) | 
             (med_admin_win['CommonDrugName'].str.contains('rivaroxaban')) | 
             (med_admin_win['CommonDrugName'].str.contains('dabigatran')) | 
             (med_admin_win['CommonDrugName'].str.contains('edoxaban')), 1, 0)
)

##### 4.6 Ad. Warfarin

In [115]:
med_admin_win.loc[:, 'warfarin_diag'] = np.where((med_admin_win['CommonDrugName'].str.contains('warfarin')), 1, 0)

##### 4.6 Ae. Anticoagulation merge 

In [116]:
# Combine heparin, enoxaparin, DOAC, and warfarin columns into a single anticoagulation indicator variable. 
med_admin_win['ac_diag'] = (
    med_admin_win['heparin_diag'] + med_admin_win['enoxaparin_diag'] + med_admin_win['doac_diag'] + med_admin_win['warfarin_diag']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [117]:
# Drop heparin, enoxaparin, DOAC, and warfarin columns. 
med_admin_win = med_admin_win.drop(columns = ['heparin_diag', 'enoxaparin_diag', 'doac_diag', 'warfarin_diag'])

##### 4.6 B. DAPT

**No indicator variable created.** 

##### 4.6 C. G-CSF

**No indicator variable created.**

##### 4.6 D. ESA

**No indicator variable created.**

##### 4.6 E. tPA

**No indicator variable created.**

#### 4.7 Anti-infective 

**Clinically relevant; two separate indicator variables from the anti-infective drug category will be created:** 
* **Antiinfective IV group** 
* **Antiinfective group**

##### 4.7a Antiinfective IV 

In [118]:
med_admin_win.loc[:, 'antiinfective_IV_diag'] = (
    np.where((med_admin_win['DrugCategory'] == 'anti-infective') & 
             (med_admin_win['Route'] == 'Intravenous'), 1, 0)
)

##### 4.7b Antiinfective PO

In [119]:
med_admin_win.loc[:, 'antiinfective_diag'] = (
    np.where((med_admin_win['DrugCategory'] == 'anti-infective') & 
             (med_admin_win['Route'] == 'Oral'), 1, 0)
)

#### 4.8 Anesthetic

**No indicator variable created.** 

#### 4.9 Cytoprotective

**No indicator variable created.**

#### 4.10 Antihyperglycemic

In [120]:
med_admin_win.loc[:, 'antihyperglycemic_diag'] = np.where(med_admin_win['DrugCategory'] == 'antihyperglycemic', 1, 0)

#### 4.11 Proton pump inhibitor

In [121]:
med_admin_win.loc[:, 'ppi_diag'] = np.where(med_admin_win['DrugCategory'] == 'proton pump inhibitor', 1, 0)

#### 4.12 Antidepressant

In [122]:
med_admin_win.loc[:, 'antidepressant_diag'] = np.where(med_admin_win['DrugCategory'] == 'antidepressant', 1, 0)

#### 4.13 Bone therapy agent

In [123]:
med_admin_win.loc[:, 'bta_diag'] = np.where(med_admin_win['DrugCategory'] == 'bone therapy agent (bta)', 1, 0)

#### 4.14 Hormone

In [124]:
med_admin_win.loc[:, 'thyroid_diag'] = np.where(med_admin_win['CommonDrugName'] == 'levothyroxine', 1, 0)

#### 4.15 Gout and hyperuricemia agent 

**No indicator variable created.**

#### 4.16 Immunosuppressive 

In [125]:
med_admin_win.loc[:, 'is_diag'] = np.where(med_admin_win['DrugCategory'] == 'immunosuppressive', 1, 0)

#### 4.17 Sedative agent 

**No indicator variable created.**

#### 4.18 Endocrine 

**No indicator variable created.**

#### 4.19 Antidote and reversal agent

**No indicator variable created.**

#### 4.20 Hyperglycemic

**No indicator variable created.**

#### 4.21 Antithyroid agent

**No indicator variable created.**

#### 4.22 Anticholinergic

**No indicator variable created.**

#### 4.23 Calcimimetic 

**No indicator variable created.**

#### 4.24 Targeted therapy 

**No indicator variable created.**

#### 4.25 Anti-convulsant agent

**No indicator variable created.**

#### 4.26 Route

**No indicator variable created.**

#### 4.27 Condensing 

In [126]:
# Select columns with indicator variables and PatientID, then collapse rows by PatientID and sum columns. 
med_admin_wide = (
    med_admin_win
    [med_admin_win.columns[med_admin_win.columns.str.contains('diag|PatientID')]]
    .groupby('PatientID').sum()
)

In [127]:
# Cap every per-patient sum at 1 so each column is a 0/1 indicator, then
# turn the PatientID index back into a regular column.
med_admin_wide = med_admin_wide.clip(upper = 1).reset_index()

In [128]:
row_ID(med_admin_wide)

(2843, 2843)

In [129]:
# Add a row for every test-set patient with no qualifying medication record
# and set all of their indicators to 0. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
med_admin_wide = (
    pd.concat(
        [
            med_admin_wide,
            pd.Series(test_IDs)[~pd.Series(test_IDs).isin(med_admin_wide['PatientID'])].to_frame(name = 'PatientID'),
        ],
        sort = False,
    )
    .fillna(0)
)

In [130]:
row_ID(med_admin_wide)

(13697, 13697)

In [131]:
%whos DataFrame

Variable         Type         Data/Info
---------------------------------------
demographics     DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
enhanced_adv     DataFrame               PatientID diag<...>n[13697 rows x 9 columns]
line_therapy     DataFrame               PatientID     <...>n[95320 rows x 9 columns]
line_therapy_1   DataFrame               PatientID     <...>n[49820 rows x 9 columns]
med_admin        DataFrame                PatientID    <...>671300 rows x 14 columns]
med_admin_wide   DataFrame               PatientID  ste<...>[13697 rows x 14 columns]
med_admin_win    DataFrame                PatientID    <...>[41432 rows x 27 columns]
mortality        DataFrame               PatientID  dea<...>n[13697 rows x 3 columns]


In [132]:
# Keep med_admin_wide.
del line_therapy
del line_therapy_1
del med_admin
del med_admin_win

### 5. Biomarkers 

In [133]:
biomarkers = pd.read_csv('Enhanced_AdvNSCLCBiomarkers.csv')

In [134]:
row_ID(biomarkers)

(255246, 50357)

In [135]:
biomarkers = biomarkers[biomarkers['PatientID'].isin(test_IDs)]

In [136]:
row_ID(biomarkers)

(51601, 10160)

**The Biomarkers dataframe is in a long format. A single-row-per-patient dataframe will be built where each column represents biomarker status. Index date will be date of advanced diagnosis and eligibility window is negative infinity to +30 days of advanced diagnosis.** 

#### 5.1 Assigning patient-level biomarker status 

**Result date is the date the biomarker result was first reported, and therefore represents the date on which the clinician would be expected to have information about the patient’s biomarker status to inform the course of treatment. Flatiron recommends using result date as the relevant biomarker test date and using specimen received date as the proxy when result date is not available.** 

In [137]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
biomarkers['ResultDate'] = pd.to_datetime(biomarkers['ResultDate'])

In [138]:
# Replace the column by plain assignment so its dtype becomes datetime64;
# `.loc[:, col] = ...` writes into the existing object column on pandas >= 2.0.
biomarkers['SpecimenReceivedDate'] = pd.to_datetime(biomarkers['SpecimenReceivedDate'])

In [139]:
# Replace missing result date with specimen received date. 
biomarkers.loc[:, 'result_date'] = (
    np.where(biomarkers['ResultDate'].isna(), biomarkers['SpecimenReceivedDate'], biomarkers['ResultDate'])
)

In [140]:
biomarkers = pd.merge(biomarkers, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID', how = 'left')

In [141]:
# Create new variable 'bio_date_diff' that marks the difference in days between the biomarker result date and advanced diagnosis. 
biomarkers.loc[:, 'bio_date_diff'] = (biomarkers['result_date'] - biomarkers['adv_date']).dt.days

In [142]:
# Keep biomarker results dated no later than 30 days after advanced diagnosis.
# .copy() makes biomarker_win an independent frame, so the column added to it
# afterwards does not raise SettingWithCopyWarning.
biomarker_win = biomarkers[biomarkers['bio_date_diff'] <= 30].copy()

In [143]:
row_ID(biomarker_win)

(25753, 6980)

**Patients with at least one confirmed positive test result for the biomarker of interest within the eligibility window will be considered “ever-positive”. This will include patients who may have confirmed negative results before and/or after a positive result within the eligibility window.** 

**Patients with at least one confirmed negative test result for the biomarker of interest, and no confirmed positive test results for the same biomarker within the eligibility window may be considered “only-negative”.** 

In [144]:
# Encode each result as 2 if positive, 1 if negative, and 0 otherwise
# (unknown or missing status).
positive_labels = ['Rearrangement present', 'Mutation positive', 'PD-L1 positive']
negative_labels = ['Rearrangement not present', 'Mutation negative', 'PD-L1 negative/not detected']

conditions = [
    biomarker_win['BiomarkerStatus'].isin(positive_labels),
    biomarker_win['BiomarkerStatus'].isin(negative_labels),
]
choices = [2, 1]

biomarker_win.loc[:, 'bio_status'] = np.select(conditions, choices, default = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [145]:
# Excluding PDL1, select highest number biomarker status among duplicates, merge with nonduplicates, then pivot. 
biomarker_notpdl1 = (
    biomarker_win.query('BiomarkerName != "PDL1"')
    .sort_values(by = ['PatientID', 'BiomarkerName','bio_status'], ascending = False)
    .drop_duplicates(subset = ['PatientID', 'BiomarkerName'], keep = 'first')
    .pivot(index = 'PatientID', columns = 'BiomarkerName', values = 'bio_status')
    .reset_index()
)
biomarker_notpdl1.columns.name = None

In [146]:
row_ID(biomarker_notpdl1)

(6248, 6248)

In [147]:
# Append the test-set PatientIDs that have no non-PDL1 biomarker row; their biomarker
# columns are left as NaN. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
biomarker_notpdl1_wide = pd.concat(
    [
        biomarker_notpdl1,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(biomarker_notpdl1['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [148]:
row_ID(biomarker_notpdl1_wide)

(13697, 13697)

In [149]:
# Translate the numeric status indicator into labels for each non-PDL1 biomarker.
# 2 -> 'positive', 1 -> 'negative'; 0 and NaN (no row for the patient) -> 'unknown'.
# One shared mapping and a loop replace five otherwise identical cells.
status_labels = {
    2: 'positive',
    1: 'negative',
    0: 'unknown',
    np.nan: 'unknown',
}

for biomarker_col in ['ALK', 'BRAF', 'EGFR', 'KRAS', 'ROS1']:
    biomarker_notpdl1_wide[biomarker_col] = (
        biomarker_notpdl1_wide[biomarker_col].replace(status_labels)
    )

#### 5.2 Assigning patient-level PD-L1

**Flatiron recommends using PercentStaining as the primary source of truth to assess PD-L1 status over other options in the Biomarker table. For patients with multiple PDL1 testing instances, the maximum PercentStaining level will be selected and assigned to the patient. Flatiron recommends excluding from the study cohort patients with one or more PD-L1 testing instances within their eligibility window that fall on or before December 31, 2016. PD-L1 testing instances from earlier years are likely to be missing PercentStaining values and these missing values may lead to bias.** 

In [154]:
# PDL1 rows within the eligibility window. Copied so that the percent_staining column
# assigned to this frame later is written to an independent frame, not a slice of biomarker_win.
biomarker_win_pdl1 = (
    biomarker_win
    .query('BiomarkerName == "PDL1"')
    .copy()
)

In [155]:
row_ID(biomarker_win_pdl1)

(4101, 3460)

In [156]:
# Ordinal rank of each PercentStaining category, lowest to highest; a missing value ranks 0.
staining_levels = [
    np.nan,
    '0%',
    '< 1%',
    '1%',
    '2% - 4%',
    '5% - 9%',
    '10% - 19%',
    '20% - 29%',
    '30% - 39%',
    '40% - 49%',
    '50% - 59%',
    '60% - 69%',
    '70% - 79%',
    '80% - 89%',
    '90% - 99%',
    '100%',
]
pdl1_dict = {level: rank for rank, level in enumerate(staining_levels)}

biomarker_win_pdl1.loc[:, 'percent_staining'] = biomarker_win_pdl1['PercentStaining'].map(pdl1_dict)

In [157]:
# For patients with repeat PDL1 testing keep the row with the highest staining rank,
# then reshape to a two-column table: PatientID and pdl1_staining.
biomarker_pdl1_staining = (
    biomarker_win_pdl1
    .sort_values(by = ['PatientID', 'percent_staining'], ascending = [False, False])
    .drop_duplicates(subset = ['PatientID'])
    .pivot(index = 'PatientID', columns = 'BiomarkerName', values = 'percent_staining')
    .rename(columns = {'PDL1': 'pdl1_staining'})
    .rename_axis(columns = None)
    .reset_index()
)

In [158]:
row_ID(biomarker_pdl1_staining)

(3460, 3460)

In [159]:
# Collapse the staining ranks into three reporting groups; rank 0 (missing) stays NaN.
#   1-2   -> '0%'       ('0%' and '< 1%')
#   3-9   -> '1-49%'
#   10-15 -> '50-100%'
pdl1_dict_rev = {0: np.nan}
pdl1_dict_rev.update({rank: '0%' for rank in range(1, 3)})
pdl1_dict_rev.update({rank: '1-49%' for rank in range(3, 10)})
pdl1_dict_rev.update({rank: '50-100%' for rank in range(10, 16)})

# Replace the column outright: `.loc[:, col] = ...` would try to write the string labels
# into the existing numeric column in place, which recent pandas warns about.
biomarker_pdl1_staining['pdl1_staining'] = biomarker_pdl1_staining['pdl1_staining'].map(pdl1_dict_rev)

In [160]:
# Append the test-set PatientIDs without a PDL1 row, then label every missing staining
# group as 'unknown'. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
biomarker_pdl1_staining_wide = (
    pd.concat(
        [
            biomarker_pdl1_staining,
            pd.Series(test_IDs)[~pd.Series(test_IDs).isin(biomarker_pdl1_staining['PatientID'])].to_frame(name = 'PatientID'),
        ],
        sort = False,
    )
    .fillna('unknown')
)

In [161]:
biomarker_pdl1_staining_wide.shape

(13697, 2)

In [162]:
biomarker_pdl1_staining_wide.sample(5)

Unnamed: 0,PatientID,pdl1_staining
2988,FDF1277A013FB,unknown
319,F1A0AE9D5ABC6,1-49%
11204,FE5D3C47DE131,unknown
694,F35E6D48B4560,50-100%
13386,FBBD606DC8F76,unknown


**Flatiron recommends considering using BiomarkerStatus to impute the missing patient-level PercentStaining category value. Impute missing PercentStaining values as follows:**
* **Impute a PercentStaining value of “≥1%” for patients with at least one confirmed positive PD-L1 result within the eligible window** 
* **Impute a PercentStaining value of “0%” to patients with no confirmed positive PD-L1 results and at least one confirmed negative PD-L1 result within the eligible window** 
* **Do not impute a PercentStaining value to patients who have no confirmed positive or negative PD-L1 results within the eligible window**

In [163]:
# For patients with repeat PDL1 testing keep the row with the highest bio_status
# (2 = positive beats 1 = negative beats 0 = unknown), then reshape to PatientID / pdl1_status.
biomarker_pdl1_status = (
    biomarker_win_pdl1
    .sort_values(by = ['PatientID', 'bio_status'], ascending = [False, False])
    .drop_duplicates(subset = ['PatientID'])
    .pivot(index = 'PatientID', columns = 'BiomarkerName', values = 'bio_status')
    .rename(columns = {'PDL1': 'pdl1_status'})
    .rename_axis(columns = None)
    .reset_index()
)

In [164]:
row_ID(biomarker_pdl1_status)

(3460, 3460)

In [165]:
# Append the test-set PatientIDs without a PDL1 row and give them status 0 (unknown).
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
biomarker_pdl1_status_wide = (
    pd.concat(
        [
            biomarker_pdl1_status,
            pd.Series(test_IDs)[~pd.Series(test_IDs).isin(biomarker_pdl1_status['PatientID'])].to_frame(name = 'PatientID'),
        ],
        sort = False,
    )
    .fillna(0)
)

In [166]:
# Combine the staining group and the status indicator into one row per patient.
biomarker_pdl1 = biomarker_pdl1_staining_wide.merge(biomarker_pdl1_status_wide, on = 'PatientID')

In [167]:
# Patient-level PD-L1 category (pdl1_n):
#   staining known   -> '>=1%' when 1-100%, '0%' when 0%.
#   staining unknown -> '>=1%' if ever positive (status 2), '0%' if only negative (status 1),
#                       'unknown' if there is no confirmed result (status 0).
conditions = [
    ((biomarker_pdl1['pdl1_staining'] == 'unknown') & (biomarker_pdl1['pdl1_status'] == 2)) |
    (((biomarker_pdl1['pdl1_staining'] == '1-49%') | (biomarker_pdl1['pdl1_staining'] == '50-100%'))),
    ((biomarker_pdl1['pdl1_staining'] == 'unknown') & (biomarker_pdl1['pdl1_status'] == 1)) |
    (biomarker_pdl1['pdl1_staining'] == '0%'),
    ((biomarker_pdl1['pdl1_staining'] == 'unknown') & (biomarker_pdl1['pdl1_status'] == 0)),
]

choices = ['>=1%', '0%', 'unknown']

# An explicit string default is required: np.select's implicit default is the integer 0,
# which has no common dtype with the string choices on recent NumPy releases.
biomarker_pdl1.loc[:, 'pdl1_n'] = np.select(conditions, choices, default = 'unknown')

In [168]:
biomarker_pdl1.sample(5)

Unnamed: 0,PatientID,pdl1_staining,pdl1_status,pdl1_n
11161,F8C7ED13640B1,unknown,0.0,unknown
13680,FAC8804353ADA,unknown,0.0,unknown
4640,F5881D36361A4,unknown,0.0,unknown
11229,F773F3106A800,unknown,0.0,unknown
10168,F302113E4258B,unknown,0.0,unknown


In [169]:
# Final PDL1 table: the staining group is exposed as 'pdl1'; the helper status column is dropped.
biomarker_pdl1_wide = (
    biomarker_pdl1
    .rename(columns = {'pdl1_staining': 'pdl1'})
    .drop(columns = 'pdl1_status')
)

In [170]:
# One row per patient with all biomarker columns (non-PDL1 labels plus the PDL1 columns).
biomarker_wide = biomarker_notpdl1_wide.merge(biomarker_pdl1_wide, on = 'PatientID')

In [171]:
row_ID(biomarker_wide)

(13697, 13697)

In [172]:
biomarker_wide.sample(5)

Unnamed: 0,PatientID,ALK,BRAF,EGFR,KRAS,ROS1,pdl1,pdl1_n
5156,FD22C1C245A73,negative,negative,negative,negative,negative,1-49%,>=1%
10375,FE5A63D988653,unknown,unknown,unknown,unknown,unknown,unknown,unknown
13145,FB03AA50B0B71,unknown,unknown,unknown,unknown,unknown,unknown,unknown
4822,FC51116592F25,negative,negative,negative,positive,negative,unknown,unknown
7754,FC3C905B52D14,unknown,unknown,unknown,unknown,unknown,unknown,unknown


In [173]:
%whos DataFrame

Variable                       Type         Data/Info
-----------------------------------------------------
biomarker_notpdl1              DataFrame              PatientID  ALK <...>\n[6248 rows x 6 columns]
biomarker_notpdl1_wide         DataFrame               PatientID     <...>n[13697 rows x 6 columns]
biomarker_pdl1                 DataFrame               PatientID pdl1<...>n[13697 rows x 4 columns]
biomarker_pdl1_staining        DataFrame              PatientID pdl1_<...>\n[3460 rows x 2 columns]
biomarker_pdl1_staining_wide   DataFrame               PatientID pdl1<...>n[13697 rows x 2 columns]
biomarker_pdl1_status          DataFrame              PatientID  pdl1<...>\n[3460 rows x 2 columns]
biomarker_pdl1_status_wide     DataFrame               PatientID  pdl<...>n[13697 rows x 2 columns]
biomarker_pdl1_wide            DataFrame               PatientID     <...>n[13697 rows x 3 columns]
biomarker_wide                 DataFrame               PatientID     <...>n[13697 rows x 8 c

In [174]:
# Only biomarker_wide is needed from here on; drop every intermediate biomarker frame.
del (
    biomarker_notpdl1,
    biomarker_notpdl1_wide,
    biomarker_pdl1,
    biomarker_pdl1_staining,
    biomarker_pdl1_staining_wide,
    biomarker_pdl1_status,
    biomarker_pdl1_status_wide,
    biomarker_pdl1_wide,
    biomarker_win,
    biomarker_win_pdl1,
    biomarkers,
)

### 6. Insurance 

In [175]:
insurance = pd.read_csv('Insurance.csv')

In [176]:
row_ID(insurance)

(147794, 63396)

In [177]:
insurance = insurance[insurance['PatientID'].isin(test_IDs)]

In [178]:
row_ID(insurance)

(29437, 12700)

**The insurance table contains patient insurance/payer information. Patients may have multiple payer categories concurrently. Start date is populated roughly 80% of the time, while end date is populated about 20% of the time. This multiple-row-per-patient table will be transformed into a single-row-per-patient table. Indicator variables for each payer category active at time of advanced diagnosis will be made as columns. Insurance will be considered active if its start date is no later than 30 days after advanced diagnosis, regardless of end date.** 

In [179]:
# Parse StartDate into datetime64. assign() replaces the column outright; on pandas >= 2.0
# `.loc[:, col] = ...` writes into the existing object column in place, which leaves the
# dtype as object and breaks the `.dt` accessor used on StartDate later.
insurance = insurance.assign(StartDate = pd.to_datetime(insurance['StartDate']))

In [180]:
insurance = pd.merge(insurance, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID', how = 'left')

In [181]:
# Keep rows whose StartDate year is 1900 or later; earlier years (e.g., 1694) are likely
# coding errors. Rows with a missing StartDate fail the comparison and are dropped as well.
insurance = insurance.loc[insurance['StartDate'].dt.year.ge(1900)]

In [182]:
insurance.loc[:, 'insurance_date_diff'] = (insurance['StartDate'] - insurance['adv_date']).dt.days

In [183]:
# Insurance rows that start no later than 30 days after advanced diagnosis.
# Copied so the payer indicator columns added afterwards are written to an independent
# frame rather than a slice of `insurance` (avoids SettingWithCopyWarning).
insurance_win = insurance[insurance['insurance_date_diff'] <= 30].copy()

In [184]:
row_ID(insurance)

(24261, 11408)

#### 6.1 Commercial

In [185]:
# 1 when the row's payer category is a commercial health plan, otherwise 0.
insurance_win.loc[:, 'commercial'] = insurance_win['PayerCategory'].eq('Commercial Health Plan').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


#### 6.2 Medicare

In [186]:
# 1 when the row's payer category is Medicare, otherwise 0.
insurance_win.loc[:, 'medicare'] = insurance_win['PayerCategory'].eq('Medicare').astype(int)

#### 6.3 Medicaid

In [187]:
# 1 when the row's payer category is Medicaid, otherwise 0.
insurance_win.loc[:, 'medicaid'] = insurance_win['PayerCategory'].eq('Medicaid').astype(int)

#### 6.4 Other

In [188]:
# 1 for every payer category that is not commercial, Medicare or Medicaid
# (a missing category also counts as "other"), otherwise 0.
insurance_win.loc[:, 'other_insurance'] = (
    (~insurance_win['PayerCategory'].isin(['Commercial Health Plan', 'Medicare', 'Medicaid']))
    .astype(int)
)

#### 6.5 Condense rows 

In [189]:
# Add up the four payer indicators per patient.
# The indicator columns are selected explicitly: insurance_win also carries text and
# datetime columns, which cannot be summed (recent pandas raises on them instead of
# silently dropping them), and insurance_date_diff must not be summed either.
insurance_wide = (
    insurance_win
    .groupby('PatientID')[['commercial', 'medicare', 'medicaid', 'other_insurance']]
    .sum()
)

In [190]:
# Cap the per-patient counts at 1 so each column is a 0/1 indicator, then restore
# PatientID as a regular column.
insurance_wide = insurance_wide.clip(upper = 1).reset_index()

In [191]:
row_ID(insurance_wide)

(10084, 10084)

In [192]:
# Append the test-set PatientIDs that have no insurance row in the window.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
insurance_wide = pd.concat(
    [
        insurance_wide,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(insurance_wide['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [193]:
row_ID(insurance_wide)

(13697, 13697)

In [194]:
insurance_wide = insurance_wide.fillna(0)

In [195]:
%whos DataFrame

Variable         Type         Data/Info
---------------------------------------
biomarker_wide   DataFrame               PatientID     <...>n[13697 rows x 8 columns]
demographics     DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
enhanced_adv     DataFrame               PatientID diag<...>n[13697 rows x 9 columns]
insurance        DataFrame               PatientID     <...>n[24261 rows x 6 columns]
insurance_wide   DataFrame               PatientID  com<...>n[13697 rows x 5 columns]
insurance_win    DataFrame               PatientID     <...>[17591 rows x 10 columns]
med_admin_wide   DataFrame               PatientID  ste<...>[13697 rows x 14 columns]
mortality        DataFrame               PatientID  dea<...>n[13697 rows x 3 columns]


In [196]:
# Only insurance_wide is needed from here on.
del insurance, insurance_win

### 7. ECOG

In [197]:
ecog = pd.read_csv('ECOG.csv')

In [198]:
row_ID(ecog)

(822210, 48418)

In [199]:
ecog = ecog[ecog['PatientID'].isin(test_IDs)]

In [200]:
row_ID(ecog)

(164057, 9750)

**The ECOG table is a longitudinal record of structured ECOG scores captured in the EHR for each patient. Many patients have multiple ECOG scores reported. A new dataframe will be built where one ECOG score will be assigned to each patient. The index date will be date of advanced diagnosis with an eligible window period of +30 days to -90 days from index date. The ECOG score closest to index date will be assigned to the patient. In the case of two ECOG scores on the same day or equidistant but on opposite sides of the index date, the highest ECOG score (worse performance) will be selected.** 

**BaselineECOG is a composite table that selects one ECOG score within +7 days and -30 days of a line of therapy. Of note, patients might have two baseline ECOG values for line number 1 due to maintenance therapy. This table will be pivoted to a wide format and merged with the above ECOG table.** 

#### 7.1 ECOG at time of advanced diagnosis

In [201]:
ecog = pd.merge(ecog, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID', how = 'left')

In [202]:
# Parse EcogDate into datetime64. Plain column assignment replaces the column; on
# pandas >= 2.0 `.loc[:, col] = ...` writes into the existing object column in place and
# would leave the dtype as object, breaking the date arithmetic that follows.
ecog['EcogDate'] = pd.to_datetime(ecog['EcogDate'])

In [203]:
ecog.loc[:, 'ecog_date_diff'] = (ecog['EcogDate'] - ecog['adv_date']).dt.days

In [204]:
# ECOG scores recorded from 90 days before to 30 days after advanced diagnosis.
# Copied so the in-place update of ecog_date_diff afterwards acts on an independent
# frame rather than a slice of `ecog` (avoids SettingWithCopyWarning).
ecog_win = ecog[(ecog['ecog_date_diff'] >= -90) & (ecog['ecog_date_diff'] <= 30)].copy()

In [205]:
row_ID(ecog)

(164057, 9750)

In [206]:
# Delta in ECOG date and advanced diagnosis will be converted to an absolute value. 
ecog_win.loc[:, 'ecog_date_diff'] = ecog_win['ecog_date_diff'].abs()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [207]:
# One ECOG score per patient: order rows so the score recorded closest to advanced
# diagnosis comes first (the highest score wins when the distance is tied), then keep
# the first row of every patient.
ecog_diagnosis_wide = (
    ecog_win
    .sort_values(by = ['PatientID', 'ecog_date_diff', 'EcogValue'], ascending = [True, True, False])
    .loc[lambda frame: ~frame.duplicated(subset = ['PatientID'])]
    .filter(items = ['PatientID', 'EcogValue'])
    .rename(columns = {'EcogValue': 'ecog_diagnosis'})
)

In [208]:
row_ID(ecog_diagnosis_wide)

(6514, 6514)

In [209]:
# Append the test-set PatientIDs without an ECOG score in the window and label their
# score 'unknown'. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
ecog_diagnosis_wide = (
    pd.concat(
        [
            ecog_diagnosis_wide,
            pd.Series(test_IDs)[~pd.Series(test_IDs).isin(ecog_diagnosis_wide['PatientID'])].to_frame(name = 'PatientID'),
        ],
        sort = False,
    )
    .fillna('unknown')
)

In [210]:
row_ID(ecog_diagnosis_wide)

(13697, 13697)

In [211]:
%whos DataFrame

Variable              Type         Data/Info
--------------------------------------------
biomarker_wide        DataFrame               PatientID     <...>n[13697 rows x 8 columns]
demographics          DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
ecog                  DataFrame                PatientID   E<...>[164057 rows x 5 columns]
ecog_diagnosis_wide   DataFrame                PatientID eco<...>n[13697 rows x 2 columns]
ecog_win              DataFrame                PatientID   E<...>n[16453 rows x 5 columns]
enhanced_adv          DataFrame               PatientID diag<...>n[13697 rows x 9 columns]
insurance_wide        DataFrame               PatientID  com<...>n[13697 rows x 5 columns]
med_admin_wide        DataFrame               PatientID  ste<...>[13697 rows x 14 columns]
mortality             DataFrame               PatientID  dea<...>n[13697 rows x 3 columns]


In [212]:
# Only ecog_diagnosis_wide is needed from here on.
del ecog, ecog_win

### 8. Vitals

In [213]:
vitals = pd.read_csv('Vitals.csv')

In [214]:
row_ID(vitals)

(12814491, 68313)

In [215]:
vitals = vitals[vitals['PatientID'].isin(test_IDs)]

In [216]:
row_ID(vitals)

(2533726, 13668)

**The Vitals table is a longitudinal record of vitals captured in the EHR for each patient. A BMI variable at time of advanced diagnosis will be created. The eligibility window will be -90 days to +30 days from advanced diagnosis. Average height from all visits will be used to calculate BMI. In the case of two weights on the same day or equidistant but on opposite sides of the index date, the lowest weight will be selected. In addition, percent change in weight will be calculated using an eligibility window of -90 days to +90 days from advanced diagnosis (the window applied in section 8.2). Patients must have at least two weight recordings to calculate percent change in weight.**

#### 8.1 Weight and BMI

In [217]:
# Body-weight recordings only, reduced to PatientID, TestDate and the cleaned result
# (renamed 'weight'); recordings that are missing or equal to zero are discarded.
weight = (
    vitals
    .loc[vitals['Test'] == 'body weight', ['PatientID', 'TestDate', 'TestResultCleaned']]
    .rename(columns = {'TestResultCleaned': 'weight'})
    .loc[lambda frame: frame['weight'].notna() & (frame['weight'] != 0)]
)

In [218]:
# Parse TestDate into datetime64. assign() replaces the column outright; on pandas >= 2.0
# `.loc[:, col] = ...` writes into the existing object column in place and would leave the
# dtype as object, breaking the date arithmetic that follows.
weight = weight.assign(TestDate = pd.to_datetime(weight['TestDate']))

In [219]:
weight = pd.merge(weight, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID', how = 'left')

In [220]:
# Weight elgibliity window is -90 and +30 from advanced diagnosis. 
weight_win_bmi = (
    weight
    .assign(weight_date_diff = (weight['TestDate'] - weight['adv_date']).dt.days)
    .query('weight_date_diff >= -90 and weight_date_diff <= 30')
)

In [221]:
row_ID(weight_win_bmi)

(35521, 10806)

In [222]:
weight_win_bmi.loc[:, 'weight_date_diff'] = weight_win_bmi['weight_date_diff'].abs()

In [223]:
# One weight per patient: the recording closest to advanced diagnosis, with the lowest
# weight winning when two recordings are equally close.
weight_bmi_wide = (
    weight_win_bmi
    .sort_values(by = ['PatientID', 'weight_date_diff', 'weight'])
    .loc[lambda frame: ~frame.duplicated(subset = ['PatientID']), ['PatientID', 'weight']]
    .rename(columns = {'weight': 'weight_diag'})
)

In [224]:
row_ID(weight_bmi_wide)

(10806, 10806)

In [225]:
# Mean of all body-height recordings per patient, as a PatientID / height_avg table.
height_avg = (
    vitals
    .loc[vitals['Test'] == 'body height']
    .groupby('PatientID', as_index = False)['TestResultCleaned']
    .mean()
    .rename(columns = {'TestResultCleaned': 'height_avg'})
)

In [226]:
weight_bmi_wide = pd.merge(weight_bmi_wide, height_avg, on = 'PatientID', how = 'left')

In [227]:
# BMI at diagnosis = weight / height^2 * 10000, after which the height column is dropped.
# NOTE(review): the factor 10000 presumably reflects weight in kg and height in cm -- confirm.
weight_bmi_wide = (
    weight_bmi_wide
    .assign(
        bmi_diag = lambda frame: (
            frame['weight_diag']
            .div(frame['height_avg'].mul(frame['height_avg']))
            .mul(10000)
        )
    )
    .drop(columns = 'height_avg')
)

In [228]:
weight_bmi_wide.describe()

Unnamed: 0,weight_diag,bmi_diag
count,10806.0,10721.0
mean,75.09229,26.305994
std,18.975591,5.922231
min,5.443104,0.590576
25%,61.688512,22.239938
50%,72.824196,25.547351
75%,86.18248,29.401605
max,187.152059,91.651652


In [229]:
# Append the test-set PatientIDs without a weight in the window; their weight and BMI
# stay NaN so a missing-value indicator can be derived from them.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
weight_bmi_wide = pd.concat(
    [
        weight_bmi_wide,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(weight_bmi_wide['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [230]:
row_ID(weight_bmi_wide)

(13697, 13697)

In [231]:
weight_bmi_wide.loc[:, 'bmi_diag_na'] = np.where(weight_bmi_wide['bmi_diag'].isna(), 1, 0)

#### 8.2 Percent change 

In [232]:
# Select elgbility window of -90 to +90 days from advanced diagnosis.
weight_win_summary = (
    weight
    .assign(weight_date_diff = (weight['TestDate'] - weight['adv_date']).dt.days)
    .query('weight_date_diff >= -90 and weight_date_diff <= 90')
)

In [233]:
# Select patients with more than 1 weight recording within elgibility window.
weight_win_summary = weight_win_summary[weight_win_summary.duplicated(subset = ['PatientID'], keep = False)]

In [234]:
row_ID(weight_win_summary)

(79234, 11047)

In [235]:
# Select weight from the earliest time within elgibility window. 
weight_tmin = weight_win_summary.loc[weight_win_summary.groupby('PatientID')['weight_date_diff'].idxmin()]

In [236]:
# Select weight from the latest time within elgibility window. 
weight_tmax = weight_win_summary.loc[weight_win_summary.groupby('PatientID')['weight_date_diff'].idxmax()]

In [237]:
# Combine above two dataframes and sort from earliest recorded weight to latest recorded weight for each patient. 
weight_tcomb = (
    pd.concat([weight_tmin, weight_tmax])
    .sort_values(by = ['PatientID', 'weight_date_diff'], ascending = True)
)

In [238]:
row_ID(weight_tcomb)

(22094, 11047)

In [239]:
weight_tcomb.loc[:, 'weight_pct_change'] = weight_tcomb.groupby('PatientID')['weight'].pct_change()

In [240]:
# Days between each patient's earliest and latest weight in the window. Grouped by patient,
# like the weight_pct_change calculation, so a patient's first row is NaN instead of a
# difference against the previous patient's last row.
weight_tcomb.loc[:, 'diff_date_diff'] = weight_tcomb.groupby('PatientID')['weight_date_diff'].diff()

In [241]:
# Drop empty rows for weight_pct_change.
weight_pct_wide = (
    weight_tcomb
    .dropna(subset = ['weight_pct_change'])
    .filter(items = ['PatientID', 'weight_pct_change', 'diff_date_diff'])
)

In [242]:
row_ID(weight_pct_wide)

(11047, 11047)

In [243]:
# Append the test-set PatientIDs without a weight percent change (left as NaN so a
# missing-value indicator can be derived) and drop the helper column diff_date_diff.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
weight_pct_wide = (
    pd.concat(
        [
            weight_pct_wide,
            pd.Series(test_IDs)[~pd.Series(test_IDs).isin(weight_pct_wide['PatientID'])].to_frame(name = 'PatientID'),
        ],
        sort = False,
    )
    .drop(columns = ['diff_date_diff'])
)

In [244]:
row_ID(weight_pct_wide)

(13697, 13697)

In [245]:
weight_pct_wide.loc[:, 'weight_pct_na'] = np.where(weight_pct_wide['weight_pct_change'].isna(), 1, 0)

#### 8.3 Weight slope

In [246]:
weight_win_summary.loc[:, 'date_ordinal'] = weight_win_summary['TestDate'].map(dt.datetime.toordinal)

In [247]:
# Slope (kg/day) of a least-squares line through each patient's weight recordings in the
# window, returned as a PatientID / weight_slope table.
# When all of a patient's recordings share one date the slope is undefined: linregress then
# divides by zero (NaN plus a RuntimeWarning) and recent SciPy releases may raise instead,
# so NaN is returned explicitly for those patients.
weight_slope_wide = (
    weight_win_summary
    .groupby('PatientID')
    .apply(
        lambda grp: linregress(grp['date_ordinal'], grp['weight']).slope
        if grp['date_ordinal'].nunique() > 1
        else np.nan
    )
    .rename('weight_slope')
    .reset_index()
)

  slope = ssxym / ssxm


In [248]:
row_ID(weight_slope_wide)

(11047, 11047)

In [249]:
# Append the test-set PatientIDs without a weight slope (left as NaN).
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
weight_slope_wide = pd.concat(
    [
        weight_slope_wide,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(weight_slope_wide['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [250]:
row_ID(weight_slope_wide)

(13697, 13697)

#### 8.4 Merge 

In [251]:
weight_wide = pd.merge(weight_bmi_wide, weight_pct_wide, on = 'PatientID')

In [252]:
weight_wide = pd.merge(weight_wide, weight_slope_wide, on = 'PatientID')

In [253]:
row_ID(weight_wide)

(13697, 13697)

In [254]:
%whos DataFrame

Variable              Type         Data/Info
--------------------------------------------
biomarker_wide        DataFrame               PatientID     <...>n[13697 rows x 8 columns]
demographics          DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
ecog_diagnosis_wide   DataFrame                PatientID eco<...>n[13697 rows x 2 columns]
enhanced_adv          DataFrame               PatientID diag<...>n[13697 rows x 9 columns]
height_avg            DataFrame               PatientID  hei<...>n[13523 rows x 2 columns]
insurance_wide        DataFrame               PatientID  com<...>n[13697 rows x 5 columns]
med_admin_wide        DataFrame               PatientID  ste<...>[13697 rows x 14 columns]
mortality             DataFrame               PatientID  dea<...>n[13697 rows x 3 columns]
vitals                DataFrame                  PatientID  <...>533726 rows x 15 columns]
weight                DataFrame                PatientID   T<...>[329929 rows x 4 columns]


In [255]:
# Only weight_wide is needed from here on; drop every intermediate vitals/weight frame.
del (
    height_avg,
    vitals,
    weight,
    weight_bmi_wide,
    weight_pct_wide,
    weight_slope_wide,
    weight_tcomb,
    weight_tmax,
    weight_tmin,
    weight_win_bmi,
    weight_win_summary,
)

### 9. Lab

In [256]:
lab = pd.read_csv('Lab.csv')

In [257]:
row_ID(lab)

(39492037, 64852)

In [258]:
# Restrict the lab table to the patients listed in test_IDs.
lab = lab.loc[lab['PatientID'].isin(test_IDs)]

In [259]:
row_ID(lab)

(7863557, 13019)

**The Lab table is a longitudinal record of lab captured in the EHR with multiple-rows-per-patient. A single-patient-per-row table will be built focusing on the following NCCN recommended labs (ie., CMP and CBC):**

* **Creatinine -- (LOINC: 2160-0 and 38483-4)**
* **Hemoglobin -- (LOINC: 718-7 and 20509-6)**
* **White blood cell count -- (LOINC: 26464-8 and 6690-2)**
* **Neutrophil count -- (LOINC: 26499-4, 751-8, 30451-9, and 753-4)**
* **Albumin, serum -- (LOINC: 1751-7)**
* **Total bilirubin -- (LOINC: 42719-5 and 1975-2)**
* **Sodium — (LOINC: 2947-0 and 2951-2)**
* **Bicarb — (LOINC: 1963-8, 1959-6, 14627-4, 1960-4, and 2028-9)**
* **Calcium — (LOINC: 17861-6 and 49765-1)**
* **AST — (LOINC: 1920-8)**
* **ALT — (LOINC: 1742-6, 1743-4, and 1744-2)**
* **Platelet -- (LOINC: 26515-7, 777-3, 778-1, and 49497-1)**
* **Potassium -- (LOINC: 6298-4 and 2823-3)**
* **Chloride -- (LOINC: 2075-0)**
* **BUN -- (LOINC: 3094-0)**
* **ALP -- (LOINC: 6768-6)**

**The index date will be time of advanced diagnosis with an eligibility window of -90 days to +30 days. The lab value closest to the index date will be selected for each patient. The following summary statistics, using an eligibility window of negative infinity to +30 days from advanced diagnosis, will also be created for some of the above variables:** 
* **Max**
* **Min**
* **Mean**
* **Standard deviation** 
* **Slope**

In [260]:
lab = pd.merge(lab, enhanced_adv[['PatientID', 'adv_date']], on = 'PatientID', how = 'left')

In [261]:
# Parse ResultDate into datetime64. Plain column assignment replaces the column; on
# pandas >= 2.0 `.loc[:, col] = ...` writes into the existing object column in place and
# would leave the dtype as object, breaking the date arithmetic that follows.
lab['ResultDate'] = pd.to_datetime(lab['ResultDate'])

In [262]:
# Select rows for the clinically relevant labs and keep only the columns used downstream.
# A single isin() over the LOINC list replaces 35 chained equality comparisons, each of
# which scanned the whole lab table and built its own boolean mask.
core_loincs = [
    '2160-0', '38483-4',                                # creatinine
    '718-7', '20509-6',                                 # hemoglobin
    '26464-8', '6690-2',                                # white blood cell count
    '26499-4', '751-8', '30451-9', '753-4',             # neutrophil count
    '1751-7',                                           # albumin
    '42719-5', '1975-2',                                # total bilirubin
    '2947-0', '2951-2',                                 # sodium
    '1963-8', '1959-6', '14627-4', '1960-4', '2028-9',  # bicarb
    '17861-6', '49765-1',                               # calcium
    '1920-8',                                           # AST
    '1742-6', '1743-4', '1744-2',                       # ALT
    '26515-7', '777-3', '778-1', '49497-1',             # platelet
    '6298-4', '2823-3',                                 # potassium
    '2075-0',                                           # chloride
    '3094-0',                                           # BUN
    '6768-6',                                           # ALP
]

lab_core = (
    lab[lab['LOINC'].isin(core_loincs)]
    .filter(items = ['PatientID',
                     'ResultDate',
                     'LOINC',
                     'LabComponent',
                     'TestUnits',
                     'TestUnitsCleaned',
                     'TestResult',
                     'TestResultCleaned',
                     'adv_date'])
)

In [263]:
# Map every LOINC code to a short lab name. A dictionary lookup replaces np.select, whose
# implicit default (the integer 0) has no common dtype with the string names on recent
# NumPy releases; a code absent from the mapping would yield NaN.
loinc_to_lab = {
    '2160-0': 'creatinine', '38483-4': 'creatinine',
    '718-7': 'hemoglobin', '20509-6': 'hemoglobin',
    '26464-8': 'wbc', '6690-2': 'wbc',
    '26499-4': 'neutrophil_count', '751-8': 'neutrophil_count',
    '30451-9': 'neutrophil_count', '753-4': 'neutrophil_count',
    '1751-7': 'albumin',
    '42719-5': 'total_bilirubin', '1975-2': 'total_bilirubin',
    '2947-0': 'sodium', '2951-2': 'sodium',
    '1963-8': 'bicarb', '1959-6': 'bicarb', '14627-4': 'bicarb',
    '1960-4': 'bicarb', '2028-9': 'bicarb',
    '17861-6': 'calcium', '49765-1': 'calcium',
    '1920-8': 'ast',
    '1742-6': 'alt', '1743-4': 'alt', '1744-2': 'alt',
    '26515-7': 'platelet', '777-3': 'platelet', '778-1': 'platelet', '49497-1': 'platelet',
    '6298-4': 'potassium', '2823-3': 'potassium',
    '2075-0': 'chloride',
    '3094-0': 'bun',
    '6768-6': 'alp',
}

lab_core.loc[:, 'lab_name'] = lab_core['LOINC'].map(loinc_to_lab)

In [264]:
# Remove missing lab values. 
lab_core = lab_core.dropna(subset = ['TestResultCleaned'])

In [265]:
# Build test_result_cleaned from TestResultCleaned, rescaling two unit-specific cases:
#   wbc / neutrophil_count / platelet reported with TestUnits '10*3/L' -> multiplied by 1,000,000
#   hemoglobin reported with TestUnits 'g/uL'                          -> divided by 100,000
# Every other row keeps TestResultCleaned unchanged.
# NOTE(review): the scale factors are presumably corrections for mis-scaled cleaned values -- confirm.
conditions = [
    lab_core['lab_name'].isin(['wbc', 'neutrophil_count', 'platelet']) & lab_core['TestUnits'].eq('10*3/L'),
    lab_core['lab_name'].eq('hemoglobin') & lab_core['TestUnits'].eq('g/uL'),
]

choices = [
    lab_core['TestResultCleaned'] * 1000000,
    lab_core['TestResultCleaned'] / 100000,
]

lab_core.loc[:, 'test_result_cleaned'] = np.select(conditions, choices, default = lab_core['TestResultCleaned'])

In [266]:
# Elgibliity window is -90 and +30 from advanced diagnosis. 
lab_core_win = (
    lab_core
    .assign(lab_date_diff = (lab_core['ResultDate'] - lab_core['adv_date']).dt.days)
    .query('lab_date_diff >= -90 and lab_date_diff <= 30')
    .filter(items = ['PatientID', 'ResultDate', 'TestResultCleaned', 'lab_name', 'adv_date', 'test_result_cleaned', 'lab_date_diff'])
)

In [267]:
lab_core_win.loc[:, 'lab_date_diff'] = lab_core_win['lab_date_diff'].abs()

In [268]:
# For each patient and lab keep the result closest to advanced diagnosis, then reshape to
# one row per patient with one '<lab>_diag' column per lab.
lab_diag_wide = (
    lab_core_win
    .loc[lab_core_win.groupby(['PatientID', 'lab_name'])['lab_date_diff'].idxmin()]
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .rename(columns = {
        lab: lab + '_diag'
        for lab in [
            'albumin',
            'creatinine',
            'hemoglobin',
            'neutrophil_count',
            'total_bilirubin',
            'wbc',
            'sodium',
            'bicarb',
            'calcium',
            'ast',
            'alt',
            'platelet',
            'potassium',
            'chloride',
            'bun',
            'alp',
        ]
    })
    .rename_axis(columns = None)
    .reset_index()
)

In [269]:
row_ID(lab_diag_wide)

(9519, 9519)

In [270]:
# Append the test-set PatientIDs without any core lab in the window (lab columns left NaN).
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
lab_diag_wide = pd.concat(
    [
        lab_diag_wide,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(lab_diag_wide['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [271]:
row_ID(lab_diag_wide)

(13697, 13697)

In [272]:
# Add a 0/1 missingness indicator named '<column>_na' for each lab column
# (every column after PatientID). The column list is snapshotted first so the
# indicators added inside the loop are not themselves processed.
for lab_col in list(lab_diag_wide.columns[1:]):
    lab_diag_wide.loc[:, lab_col + '_na'] = np.where(lab_diag_wide[lab_col].isna(), 1, 0)

In [273]:
list(lab_diag_wide.columns)

['PatientID',
 'albumin_diag',
 'alp_diag',
 'alt_diag',
 'ast_diag',
 'bicarb_diag',
 'bun_diag',
 'calcium_diag',
 'chloride_diag',
 'creatinine_diag',
 'hemoglobin_diag',
 'neutrophil_count_diag',
 'platelet_diag',
 'potassium_diag',
 'sodium_diag',
 'total_bilirubin_diag',
 'wbc_diag',
 'albumin_diag_na',
 'alp_diag_na',
 'alt_diag_na',
 'ast_diag_na',
 'bicarb_diag_na',
 'bun_diag_na',
 'calcium_diag_na',
 'chloride_diag_na',
 'creatinine_diag_na',
 'hemoglobin_diag_na',
 'neutrophil_count_diag_na',
 'platelet_diag_na',
 'potassium_diag_na',
 'sodium_diag_na',
 'total_bilirubin_diag_na',
 'wbc_diag_na']

#### Mean, max, min, and standard deviation

In [274]:
# Eligibility window for summary statistics: every result dated up to 30 days
# after advanced diagnosis (no lower bound).
lab_core_win_summ = (
    lab_core
    .assign(lab_date_diff = lambda labs: (labs['ResultDate'] - labs['adv_date']).dt.days)
    .query('lab_date_diff <= 30')
    .filter(items = [
        'PatientID',
        'ResultDate',
        'TestResultCleaned',
        'lab_name',
        'adv_date',
        'test_result_cleaned',
        'lab_date_diff',
    ])
)

In [275]:
# Wide table (one row per patient) of the mean of each core lab over the rows of
# lab_core_win_summ, i.e. all results dated up to 30 days after advanced diagnosis.
lab_avg_wide = (
    lab_core_win_summ
    .groupby(['PatientID', 'lab_name'])['test_result_cleaned']
    .mean()
    .reset_index()
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index()
    .rename(columns = {
        lab: lab + '_avg'
        for lab in (
            'albumin', 'creatinine', 'hemoglobin', 'neutrophil_count',
            'total_bilirubin', 'wbc', 'sodium', 'bicarb',
            'calcium', 'ast', 'alt', 'platelet',
            'potassium', 'chloride', 'bun', 'alp',
        )
    })
)

# Drop the 'lab_name' label left on the column axis by the pivot.
lab_avg_wide.columns.name = None

In [276]:
row_ID(lab_avg_wide)

(9924, 9924)

In [277]:
# Wide table (one row per patient) of the maximum of each core lab over the rows of
# lab_core_win_summ, i.e. all results dated up to 30 days after advanced diagnosis.
lab_max_wide = (
    lab_core_win_summ
    .groupby(['PatientID', 'lab_name'])['test_result_cleaned']
    .max()
    .reset_index()
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index()
    .rename(columns = {
        lab: lab + '_max'
        for lab in (
            'albumin', 'creatinine', 'hemoglobin', 'neutrophil_count',
            'total_bilirubin', 'wbc', 'sodium', 'bicarb',
            'calcium', 'ast', 'alt', 'platelet',
            'potassium', 'chloride', 'bun', 'alp',
        )
    })
)

# Drop the 'lab_name' label left on the column axis by the pivot.
lab_max_wide.columns.name = None

In [278]:
row_ID(lab_max_wide)

(9924, 9924)

In [279]:
# Wide table (one row per patient) of the minimum of each core lab over the rows of
# lab_core_win_summ, i.e. all results dated up to 30 days after advanced diagnosis.
lab_min_wide = (
    lab_core_win_summ
    .groupby(['PatientID', 'lab_name'])['test_result_cleaned']
    .min()
    .reset_index()
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index()
    .rename(columns = {
        lab: lab + '_min'
        for lab in (
            'albumin', 'creatinine', 'hemoglobin', 'neutrophil_count',
            'total_bilirubin', 'wbc', 'sodium', 'bicarb',
            'calcium', 'ast', 'alt', 'platelet',
            'potassium', 'chloride', 'bun', 'alp',
        )
    })
)

# Drop the 'lab_name' label left on the column axis by the pivot.
lab_min_wide.columns.name = None

In [280]:
row_ID(lab_min_wide)

(9924, 9924)

In [281]:
# Wide table (one row per patient) of the standard deviation of each core lab over
# the rows of lab_core_win_summ, i.e. all results dated up to 30 days after
# advanced diagnosis.
lab_std_wide = (
    lab_core_win_summ
    .groupby(['PatientID', 'lab_name'])['test_result_cleaned']
    .std()
    .reset_index()
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'test_result_cleaned')
    .reset_index()
    .rename(columns = {
        lab: lab + '_std'
        for lab in (
            'albumin', 'creatinine', 'hemoglobin', 'neutrophil_count',
            'total_bilirubin', 'wbc', 'sodium', 'bicarb',
            'calcium', 'ast', 'alt', 'platelet',
            'potassium', 'chloride', 'bun', 'alp',
        )
    })
)

# Drop the 'lab_name' label left on the column axis by the pivot.
lab_std_wide.columns.name = None

In [282]:
row_ID(lab_std_wide)

(9924, 9924)

In [283]:
# Start the summary table: outer-join the mean and maximum tables on PatientID.
lab_summary_wide = lab_avg_wide.merge(lab_max_wide, how = 'outer', on = 'PatientID')

In [284]:
# Outer-join the minimum table onto the summary table on PatientID.
lab_summary_wide = lab_summary_wide.merge(lab_min_wide, how = 'outer', on = 'PatientID')

In [285]:
# Outer-join the standard-deviation table onto the summary table on PatientID.
lab_summary_wide = lab_summary_wide.merge(lab_std_wide, how = 'outer', on = 'PatientID')

In [286]:
row_ID(lab_summary_wide)

(9924, 9924)

In [287]:
# Add a row (summary columns left as NaN) for every test patient that has no row
# yet, so the table ends up with one row per test patient.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; row order and index labels are the same as with append.
lab_summary_wide = pd.concat(
    [
        lab_summary_wide,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(lab_summary_wide['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [288]:
row_ID(lab_summary_wide)

(13697, 13697)

In [289]:
list(lab_summary_wide.columns)

['PatientID',
 'albumin_avg',
 'alp_avg',
 'alt_avg',
 'ast_avg',
 'bicarb_avg',
 'bun_avg',
 'calcium_avg',
 'chloride_avg',
 'creatinine_avg',
 'hemoglobin_avg',
 'neutrophil_count_avg',
 'platelet_avg',
 'potassium_avg',
 'sodium_avg',
 'total_bilirubin_avg',
 'wbc_avg',
 'albumin_max',
 'alp_max',
 'alt_max',
 'ast_max',
 'bicarb_max',
 'bun_max',
 'calcium_max',
 'chloride_max',
 'creatinine_max',
 'hemoglobin_max',
 'neutrophil_count_max',
 'platelet_max',
 'potassium_max',
 'sodium_max',
 'total_bilirubin_max',
 'wbc_max',
 'albumin_min',
 'alp_min',
 'alt_min',
 'ast_min',
 'bicarb_min',
 'bun_min',
 'calcium_min',
 'chloride_min',
 'creatinine_min',
 'hemoglobin_min',
 'neutrophil_count_min',
 'platelet_min',
 'potassium_min',
 'sodium_min',
 'total_bilirubin_min',
 'wbc_min',
 'albumin_std',
 'alp_std',
 'alt_std',
 'ast_std',
 'bicarb_std',
 'bun_std',
 'calcium_std',
 'chloride_std',
 'creatinine_std',
 'hemoglobin_std',
 'neutrophil_count_std',
 'platelet_std',
 'potassium

#### Slope

In [290]:
# Express each result date as an ordinal day number so it can be used as the
# x variable of a linear regression.
result_ordinals = lab_core_win_summ['ResultDate'].map(dt.datetime.toordinal)
lab_core_win_summ.loc[:, 'result_date_ordinal'] = result_ordinals

In [291]:
def _lab_slope(group):
    """Return the regression slope of lab value on result date for one group.

    `group` holds the 'result_date_ordinal' and 'test_result_cleaned' columns of
    a single patient/lab combination. A slope is undefined when the group has
    fewer than two distinct result dates, so NaN is returned for those groups
    without calling linregress. Calling it anyway is what produced the
    divide-by-zero RuntimeWarnings in the original run.
    NOTE(review): recent SciPy releases raise ValueError when all x values are
    identical; confirm against the installed version.
    """
    if group['result_date_ordinal'].nunique() < 2:
        return np.nan
    return linregress(group['result_date_ordinal'], group['test_result_cleaned']).slope


# Wide table (one row per patient) with a '<lab>_slope' column per core lab.
lab_slope_wide = (
    lab_core_win_summ
    .groupby(['PatientID', 'lab_name'])[['result_date_ordinal', 'test_result_cleaned']]
    .apply(_lab_slope)
    .rename('slope')
    .reset_index()
    .pivot(index = 'PatientID', columns = 'lab_name', values = 'slope')
    .reset_index()
    .rename(columns = {
        lab: lab + '_slope'
        for lab in (
            'albumin', 'creatinine', 'hemoglobin', 'neutrophil_count',
            'total_bilirubin', 'wbc', 'sodium', 'bicarb',
            'calcium', 'ast', 'alt', 'platelet',
            'potassium', 'chloride', 'bun', 'alp',
        )
    })
)

# Drop the 'lab_name' label left on the column axis by the pivot.
lab_slope_wide.columns.name = None

  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


In [292]:
row_ID(lab_slope_wide)

(9924, 9924)

In [293]:
# Add a row (slope columns left as NaN) for every test patient that has no row
# yet, so the table ends up with one row per test patient.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; row order and index labels are the same as with append.
lab_slope_wide = pd.concat(
    [
        lab_slope_wide,
        pd.Series(test_IDs)[~pd.Series(test_IDs).isin(lab_slope_wide['PatientID'])].to_frame(name = 'PatientID'),
    ],
    sort = False,
)

In [294]:
# Add a 0/1 missingness indicator named '<column>_na' for each slope column
# (every column after PatientID). The column list is snapshotted first so the
# indicators added inside the loop are not themselves processed.
for slope_col in list(lab_slope_wide.columns[1:]):
    lab_slope_wide.loc[:, slope_col + '_na'] = np.where(lab_slope_wide[slope_col].isna(), 1, 0)

In [295]:
row_ID(lab_slope_wide)

(13697, 13697)

#### Merge

In [296]:
# Join the labs-at-diagnosis table with the summary-statistics table on PatientID.
lab_wide = lab_diag_wide.merge(lab_summary_wide, on = 'PatientID')

In [297]:
# Join the slope table onto the combined lab table on PatientID.
lab_wide = lab_wide.merge(lab_slope_wide, on = 'PatientID')

In [298]:
row_ID(lab_wide)

(13697, 13697)

In [299]:
list(lab_wide.columns)

['PatientID',
 'albumin_diag',
 'alp_diag',
 'alt_diag',
 'ast_diag',
 'bicarb_diag',
 'bun_diag',
 'calcium_diag',
 'chloride_diag',
 'creatinine_diag',
 'hemoglobin_diag',
 'neutrophil_count_diag',
 'platelet_diag',
 'potassium_diag',
 'sodium_diag',
 'total_bilirubin_diag',
 'wbc_diag',
 'albumin_diag_na',
 'alp_diag_na',
 'alt_diag_na',
 'ast_diag_na',
 'bicarb_diag_na',
 'bun_diag_na',
 'calcium_diag_na',
 'chloride_diag_na',
 'creatinine_diag_na',
 'hemoglobin_diag_na',
 'neutrophil_count_diag_na',
 'platelet_diag_na',
 'potassium_diag_na',
 'sodium_diag_na',
 'total_bilirubin_diag_na',
 'wbc_diag_na',
 'albumin_avg',
 'alp_avg',
 'alt_avg',
 'ast_avg',
 'bicarb_avg',
 'bun_avg',
 'calcium_avg',
 'chloride_avg',
 'creatinine_avg',
 'hemoglobin_avg',
 'neutrophil_count_avg',
 'platelet_avg',
 'potassium_avg',
 'sodium_avg',
 'total_bilirubin_avg',
 'wbc_avg',
 'albumin_max',
 'alp_max',
 'alt_max',
 'ast_max',
 'bicarb_max',
 'bun_max',
 'calcium_max',
 'chloride_max',
 'creatinin

In [300]:
%whos DataFrame

Variable              Type         Data/Info
--------------------------------------------
biomarker_wide        DataFrame               PatientID     <...>n[13697 rows x 8 columns]
demographics          DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
ecog_diagnosis_wide   DataFrame                PatientID eco<...>n[13697 rows x 2 columns]
enhanced_adv          DataFrame               PatientID diag<...>n[13697 rows x 9 columns]
insurance_wide        DataFrame               PatientID  com<...>n[13697 rows x 5 columns]
lab                   DataFrame                 PatientID   <...>863557 rows x 17 columns]
lab_avg_wide          DataFrame              PatientID  albu<...>n[9924 rows x 17 columns]
lab_core              DataFrame                 PatientID Re<...>637247 rows x 11 columns]
lab_core_win          DataFrame                 PatientID Re<...>[321969 rows x 7 columns]
lab_core_win_summ     DataFrame                 PatientID Re<...>[808646 rows x 8 columns]


In [301]:
# Free the intermediate lab dataframes. The tables kept for the final merge are
# biomarker_wide, demographics, ecog_diagnosis_wide, enhanced_adv, insurance_wide,
# lab_wide, med_admin_wide, mortality, and weight_wide.
del (
    lab,
    lab_avg_wide,
    lab_core,
    lab_core_win,
    lab_core_win_summ,
    lab_diag_wide,
    lab_max_wide,
    lab_min_wide,
    lab_slope_wide,
    lab_std_wide,
    lab_summary_wide,
)

### 10. Diagnosis 

In [302]:
# Load the Diagnosis table from the CSV file in the working directory.
diagnosis = pd.read_csv('Diagnosis.csv')

In [303]:
row_ID(diagnosis)

(1499292, 68483)

In [304]:
# Keep only the diagnosis rows that belong to test-set patients.
diagnosis = diagnosis.loc[diagnosis['PatientID'].isin(test_IDs)]

In [305]:
row_ID(diagnosis)

(290161, 13697)

**The Diagnosis table is in long format. ICD codes recorded before advanced diagnosis and up to 30 days after it will be mapped to the Elixhauser comorbidity index. ("Coding Algorithms for Defining Comorbidities in ICD-9-CM and ICD-10 Administrative Data" by Quan et al. is used as a guide for linking ICD codes to Elixhauser comorbidities.) Variables will also be created for (1) the presence of a concurrent or prior cancer diagnosis other than lung cancer or metastasis and (2) the sites of metastases at the time of diagnosis.** 

#### 10.1 Elixhauser

In [306]:
# Attach each patient's advanced diagnosis date to the diagnosis rows.
diagnosis = diagnosis.merge(enhanced_adv[['PatientID', 'adv_date']], how = 'left', on = 'PatientID')

In [307]:
# Parse DiagnosisDate into datetimes. The column is replaced by direct assignment
# because, from pandas 2.0 on, `df.loc[:, col] = values` first tries to write into
# the existing column in place and can leave it with its original object dtype.
diagnosis['DiagnosisDate'] = pd.to_datetime(diagnosis['DiagnosisDate'])

In [308]:
# Days from advanced diagnosis to the recorded diagnosis (negative = before).
diagnosis.loc[:, 'diagnosis_date_diff'] = diagnosis['DiagnosisDate'].sub(diagnosis['adv_date']).dt.days

In [309]:
# Remove the decimal point from each code to make mapping to Elixhauser easier.
# The pattern is a raw string: '\.' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
diagnosis.loc[:, 'diagnosis_code'] = diagnosis['DiagnosisCode'].replace(r'\.', '', regex = True)

##### 10.1a Elixhauser for ICD-9

In [310]:
# ICD-9 rows dated no later than 30 days after advanced diagnosis, reduced to one
# row per patient and code (the first occurrence is kept).
diagnosis_elix_9 = (
    diagnosis
    .query('DiagnosisCodeSystem == "ICD-9-CM"')
    .query('diagnosis_date_diff <= 30')
    .drop_duplicates(subset = ['PatientID', 'DiagnosisCode'])
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [311]:
row_ID(diagnosis_elix_9)

(28278, 6457)

In [312]:
# Elixhauser 'chf' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
chf_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '39891|'
    '402(01|11|91)|'
    '404(01|03|[19][13])|'
    '42(5[456789]|8)'
)
diagnosis_elix_9.loc[:, 'chf'] = np.where(chf_hit, 1, 0)

In [313]:
# Elixhauser 'cardiac_arrhythmias' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
cardiac_arrhythmias_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '426([079]|1[023])|'
    '427[012346789]|'
    '7850|'
    '996(01|04)|'
    'V450|'
    'V533'
)
diagnosis_elix_9.loc[:, 'cardiac_arrhythmias'] = np.where(cardiac_arrhythmias_hit, 1, 0)

In [314]:
# Elixhauser 'valvular_disease' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
valvular_disease_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '0932|'
    '39[4567]|'
    '424|'
    '746[3456]|'
    'V422|'
    'V433'
)
diagnosis_elix_9.loc[:, 'valvular_disease'] = np.where(valvular_disease_hit, 1, 0)

In [315]:
# Elixhauser 'pulmonary_circulation' flag (ICD-9): 1 when the code starts with this pattern, else 0.
pulmonary_circulation_hit = diagnosis_elix_9['diagnosis_code'].str.match('41(5[01]|6|7[089])')
diagnosis_elix_9.loc[:, 'pulmonary_circulation'] = np.where(pulmonary_circulation_hit, 1, 0)

In [316]:
# Elixhauser 'peripheral_vascular' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
peripheral_vascular_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '0930|'
    '4373|'
    '44([01]|3[123456789]|71)|'
    '557[19]|'
    'V434'
)
diagnosis_elix_9.loc[:, 'peripheral_vascular'] = np.where(peripheral_vascular_hit, 1, 0)

In [317]:
# Elixhauser 'htn_uncomplicated' flag (ICD-9): 1 when the code starts with this pattern, else 0.
htn_uncomplicated_hit = diagnosis_elix_9['diagnosis_code'].str.match('401')
diagnosis_elix_9.loc[:, 'htn_uncomplicated'] = np.where(htn_uncomplicated_hit, 1, 0)

In [318]:
# Elixhauser 'htn_complicated' flag (ICD-9): 1 when the code starts with this pattern, else 0.
htn_complicated_hit = diagnosis_elix_9['diagnosis_code'].str.match('40[2345]')
diagnosis_elix_9.loc[:, 'htn_complicated'] = np.where(htn_complicated_hit, 1, 0)

In [319]:
# Elixhauser 'paralysis' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
paralysis_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '3341|'
    '34([23]|4[01234569])'
)
diagnosis_elix_9.loc[:, 'paralysis'] = np.where(paralysis_hit, 1, 0)

In [320]:
# Elixhauser 'other_neuro_disorders' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
other_neuro_disorders_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '33(19|2[01]|3([45]|92)|[45]|62)|'
    '34([015]|8[13])|'
    '78[04]3'
)
diagnosis_elix_9.loc[:, 'other_neuro_disorders'] = np.where(other_neuro_disorders_hit, 1, 0)

In [321]:
# Elixhauser 'chronic_pulmonary' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
chronic_pulmonary_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '416[89]|'
    '49|'
    '50([012345]|64|8[18])'
)
diagnosis_elix_9.loc[:, 'chronic_pulmonary'] = np.where(chronic_pulmonary_hit, 1, 0)

In [322]:
# Elixhauser 'diabetes_uncomplicated' flag (ICD-9): 1 when the code starts with this pattern, else 0.
diabetes_uncomplicated_hit = diagnosis_elix_9['diagnosis_code'].str.match('250[0123]')
diagnosis_elix_9.loc[:, 'diabetes_uncomplicated'] = np.where(diabetes_uncomplicated_hit, 1, 0)

In [323]:
# Elixhauser 'diabetes_complicated' flag (ICD-9): 1 when the code starts with this pattern, else 0.
diabetes_complicated_hit = diagnosis_elix_9['diagnosis_code'].str.match('250[456789]')
diagnosis_elix_9.loc[:, 'diabetes_complicated'] = np.where(diabetes_complicated_hit, 1, 0)

In [324]:
# Elixhauser 'hypothyroidism' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
hypothyroidism_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '2409|'
    '24([34]|6[18])'
)
diagnosis_elix_9.loc[:, 'hypothyroidism'] = np.where(hypothyroidism_hit, 1, 0)

In [325]:
# Elixhauser 'renal_failure' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
renal_failure_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '403[019]1|'
    '404[019][23]|'
    '58([56]|80)|'
    'V4(20|51)|'
    'V56'
)
diagnosis_elix_9.loc[:, 'renal_failure'] = np.where(renal_failure_hit, 1, 0)

In [326]:
# Elixhauser 'liver_disease' flag (ICD-9): 1 when the code starts with one of the
# patterns below, else 0.
# Fix: the first alternative used to read '2[23]]'. The stray ']' required a
# literal ']' after the digits, so codes 07022 and 07023 (070.22 / 070.23) could
# never match.
# NOTE(review): if the training-set notebook shares the old pattern, apply the
# same fix there so both cohorts are coded identically.
diagnosis_elix_9.loc[:, 'liver_disease'] = (
    np.where(diagnosis_elix_9['diagnosis_code'].str.match('070(2[23]|3[23]|44|54|6|9)|'
                                                          '456[012]|'
                                                          '57([01]|2[2345678]|3[3489])|'
                                                          'V427'), 1, 0)
)

In [327]:
# Elixhauser 'peptic_ulcer_disease' flag (ICD-9): 1 when the code starts with this pattern, else 0.
peptic_ulcer_disease_hit = diagnosis_elix_9['diagnosis_code'].str.match('53[1234][79]')
diagnosis_elix_9.loc[:, 'peptic_ulcer_disease'] = np.where(peptic_ulcer_disease_hit, 1, 0)

In [328]:
# Elixhauser 'aids_hiv' flag (ICD-9): 1 when the code starts with this pattern, else 0.
aids_hiv_hit = diagnosis_elix_9['diagnosis_code'].str.match('04[234]')
diagnosis_elix_9.loc[:, 'aids_hiv'] = np.where(aids_hiv_hit, 1, 0)

In [329]:
# Elixhauser 'lymphoma' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
lymphoma_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '20([012]|30)|'
    '2386'
)
diagnosis_elix_9.loc[:, 'lymphoma'] = np.where(lymphoma_hit, 1, 0)

In [330]:
# Elixhauser 'metastatic_cancer' flag (ICD-9): 1 when the code starts with this pattern, else 0.
metastatic_cancer_hit = diagnosis_elix_9['diagnosis_code'].str.match('19[6789]')
diagnosis_elix_9.loc[:, 'metastatic_cancer'] = np.where(metastatic_cancer_hit, 1, 0)

In [331]:
# Elixhauser 'solid_tumor_wout_mets' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
solid_tumor_wout_mets_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '1[456]|'
    '17[012456789]|'
    '18|'
    '19([012345])'
)
diagnosis_elix_9.loc[:, 'solid_tumor_wout_mets'] = np.where(solid_tumor_wout_mets_hit, 1, 0)

In [332]:
# Elixhauser 'rheumatoid_arthritis' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
rheumatoid_arthritis_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '446|'
    '7010|'
    '71(0[0123489]|12|4|93)|'
    '72([05]|85|889|930)'
)
diagnosis_elix_9.loc[:, 'rheumatoid_arthritis'] = np.where(rheumatoid_arthritis_hit, 1, 0)

In [333]:
# Elixhauser 'coagulopathy' flag (ICD-9): 1 when the code starts with this pattern, else 0.
coagulopathy_hit = diagnosis_elix_9['diagnosis_code'].str.match('28(6|7[1345])')
diagnosis_elix_9.loc[:, 'coagulopathy'] = np.where(coagulopathy_hit, 1, 0)

In [334]:
# Elixhauser 'obesity' flag (ICD-9): 1 when the code starts with this pattern, else 0.
obesity_hit = diagnosis_elix_9['diagnosis_code'].str.match('2780')
diagnosis_elix_9.loc[:, 'obesity'] = np.where(obesity_hit, 1, 0)

In [335]:
# Elixhauser 'weight_loss' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
weight_loss_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '26[0123]|'
    '7832|'
    '7994'
)
diagnosis_elix_9.loc[:, 'weight_loss'] = np.where(weight_loss_hit, 1, 0)

In [336]:
# Elixhauser 'fluid_electrolyte' flag (ICD-9): 1 when the code starts with this pattern, else 0.
fluid_electrolyte_hit = diagnosis_elix_9['diagnosis_code'].str.match('2(536|76)')
diagnosis_elix_9.loc[:, 'fluid_electrolyte'] = np.where(fluid_electrolyte_hit, 1, 0)

In [337]:
# Elixhauser 'blood_loss_anemia' flag (ICD-9): 1 when the code starts with this pattern, else 0.
blood_loss_anemia_hit = diagnosis_elix_9['diagnosis_code'].str.match('2800')
diagnosis_elix_9.loc[:, 'blood_loss_anemia'] = np.where(blood_loss_anemia_hit, 1, 0)

In [338]:
# Elixhauser 'deficiency_anemia' flag (ICD-9): 1 when the code starts with this pattern, else 0.
deficiency_anemia_hit = diagnosis_elix_9['diagnosis_code'].str.match('28(0[123456789]|1)')
diagnosis_elix_9.loc[:, 'deficiency_anemia'] = np.where(deficiency_anemia_hit, 1, 0)

In [339]:
# Elixhauser 'alcohol_abuse' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
alcohol_abuse_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '2652|'
    '291[12356789]|'
    '30(3[09]|50)|'
    '3575|'
    '4255|'
    '5353|'
    '571[0123]|'
    '980|'
    'V113'
)
diagnosis_elix_9.loc[:, 'alcohol_abuse'] = np.where(alcohol_abuse_hit, 1, 0)

In [340]:
# Elixhauser 'drug_abuse' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
drug_abuse_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '292|'
    '30(4|5[23456789])|'
    'V6542'
)
diagnosis_elix_9.loc[:, 'drug_abuse'] = np.where(drug_abuse_hit, 1, 0)

In [341]:
# Elixhauser 'psychoses' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
psychoses_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '2938|'
    '296[0145]4|'
    '29[578]'
)
diagnosis_elix_9.loc[:, 'psychoses'] = np.where(psychoses_hit, 1, 0)

In [342]:
# Elixhauser 'depression' flag (ICD-9): 1 when the code starts with one of these patterns, else 0.
depression_hit = diagnosis_elix_9['diagnosis_code'].str.match(
    '296[235]|'
    '3(004|09|11)'
)
diagnosis_elix_9.loc[:, 'depression'] = np.where(depression_hit, 1, 0)

In [343]:
# Flag ICD-9 codes that matched no Elixhauser category: 1 when every column from
# the fourth onward (the flag columns after PatientID, DiagnosisCode and
# diagnosis_code) is 0, else 0.
no_elix_9_match = diagnosis_elix_9.iloc[:, 3:].eq(0).all(axis = 1)
diagnosis_elix_9.loc[:, 'elixhauser_other'] = np.where(no_elix_9_match, 1, 0)

In [344]:
# One row per patient: each flag column holds the number of that patient's
# ICD-9 codes that matched the category.
diagnosis_elix_9_wide = (
    diagnosis_elix_9
    .drop(columns = ['DiagnosisCode', 'diagnosis_code'])
    .groupby('PatientID', as_index = False)
    .sum()
)

In [345]:
row_ID(diagnosis_elix_9_wide)

(6457, 6457)

##### 10.1b Elixhauser for ICD-10

In [346]:
# ICD-10 rows dated no later than 30 days after advanced diagnosis, reduced to one
# row per patient and code (the first occurrence is kept).
diagnosis_elix_10 = (
    diagnosis
    .query('DiagnosisCodeSystem == "ICD-10-CM"')
    .query('diagnosis_date_diff <= 30')
    .drop_duplicates(subset = ['PatientID', 'DiagnosisCode'])
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [347]:
row_ID(diagnosis_elix_10)

(55385, 7614)

In [348]:
# Elixhauser 'chf' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
chf_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'I099|'
    'I1(10|3[02])|'
    'I255|'
    'I4(2[056789]|3)|'
    'I50|'
    'P290'
)
diagnosis_elix_10.loc[:, 'chf'] = np.where(chf_hit, 1, 0)

In [349]:
# Elixhauser 'cardiac_arrhythmias' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
cardiac_arrhythmias_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'I4(4[123]|5[69]|[789])|'
    'R00[018]|'
    'T821|'
    'Z[49]50'
)
diagnosis_elix_10.loc[:, 'cardiac_arrhythmias'] = np.where(cardiac_arrhythmias_hit, 1, 0)

In [350]:
# Elixhauser 'valvular_disease' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
valvular_disease_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'A520|'
    'I0([5678]|9[18])|'
    'I3[456789]|'
    'Q23[0123]|'
    'Z95[234]'
)
diagnosis_elix_10.loc[:, 'valvular_disease'] = np.where(valvular_disease_hit, 1, 0)

In [351]:
# Elixhauser 'pulmonary_circulation' flag (ICD-10): 1 when the code starts with this pattern, else 0.
pulmonary_circulation_hit = diagnosis_elix_10['diagnosis_code'].str.match('I2([67]|8[089])')
diagnosis_elix_10.loc[:, 'pulmonary_circulation'] = np.where(pulmonary_circulation_hit, 1, 0)

In [352]:
# Elixhauser 'peripheral_vascular' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
peripheral_vascular_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'I7([01]|3[189]|71|9[02])|'
    'K55[189]|'
    'Z95[89]'
)
diagnosis_elix_10.loc[:, 'peripheral_vascular'] = np.where(peripheral_vascular_hit, 1, 0)

In [353]:
# Elixhauser 'htn_uncomplicated' flag (ICD-10): 1 when the code starts with this pattern, else 0.
htn_uncomplicated_hit = diagnosis_elix_10['diagnosis_code'].str.match('I10')
diagnosis_elix_10.loc[:, 'htn_uncomplicated'] = np.where(htn_uncomplicated_hit, 1, 0)

In [354]:
# Elixhauser 'htn_complicated' flag (ICD-10): 1 when the code starts with this pattern, else 0.
htn_complicated_hit = diagnosis_elix_10['diagnosis_code'].str.match('I1[1235]')
diagnosis_elix_10.loc[:, 'htn_complicated'] = np.where(htn_complicated_hit, 1, 0)

In [355]:
# Elixhauser 'paralysis' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
paralysis_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'G041|'
    'G114|'
    'G8(0[12]|[12]|3[012349])'
)
diagnosis_elix_10.loc[:, 'paralysis'] = np.where(paralysis_hit, 1, 0)

In [356]:
# Elixhauser 'other_neuro_disorders' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
other_neuro_disorders_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'G1[0123]|'
    'G2([012]|5[45])|'
    'G3(1[289]|[2567])|'
    'G4[01]|'
    'G93[14]|'
    'R470|'
    'R56'
)
diagnosis_elix_10.loc[:, 'other_neuro_disorders'] = np.where(other_neuro_disorders_hit, 1, 0)

In [357]:
# Elixhauser 'chronic_pulmonary' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
chronic_pulmonary_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'I27[89]|'
    'J4[01234567]|'
    'J6([01234567]|84)|'
    'J70[13]'
)
diagnosis_elix_10.loc[:, 'chronic_pulmonary'] = np.where(chronic_pulmonary_hit, 1, 0)

In [358]:
# Elixhauser 'diabetes_uncomplicated' flag (ICD-10): 1 when the code starts with this pattern, else 0.
diabetes_uncomplicated_hit = diagnosis_elix_10['diagnosis_code'].str.match('E1[01234][019]')
diagnosis_elix_10.loc[:, 'diabetes_uncomplicated'] = np.where(diabetes_uncomplicated_hit, 1, 0)

In [359]:
# Elixhauser 'diabetes_complicated' flag (ICD-10): 1 when the code starts with this pattern, else 0.
diabetes_complicated_hit = diagnosis_elix_10['diagnosis_code'].str.match('E1[01234][2345678]')
diagnosis_elix_10.loc[:, 'diabetes_complicated'] = np.where(diabetes_complicated_hit, 1, 0)

In [360]:
# Elixhauser 'hypothyroidism' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
hypothyroidism_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'E0[0123]|'
    'E890'
)
diagnosis_elix_10.loc[:, 'hypothyroidism'] = np.where(hypothyroidism_hit, 1, 0)

In [361]:
# Elixhauser 'renal_failure' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
renal_failure_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'I1(20|31)|'
    'N1[89]|'
    'N250|'
    'Z49[012]|'
    'Z9(40|92)'
)
diagnosis_elix_10.loc[:, 'renal_failure'] = np.where(renal_failure_hit, 1, 0)

In [362]:
# Elixhauser 'liver_disease' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
liver_disease_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'B18|'
    'I8(5|64)|'
    'I982|'
    'K7(0|1[13457]|[234]|6[023456789])|'
    'Z944'
)
diagnosis_elix_10.loc[:, 'liver_disease'] = np.where(liver_disease_hit, 1, 0)

In [363]:
# Elixhauser 'peptic_ulcer_disease' flag (ICD-10): 1 when the code starts with this pattern, else 0.
peptic_ulcer_disease_hit = diagnosis_elix_10['diagnosis_code'].str.match('K2[5678][79]')
diagnosis_elix_10.loc[:, 'peptic_ulcer_disease'] = np.where(peptic_ulcer_disease_hit, 1, 0)

In [364]:
# Elixhauser 'aids_hiv' flag (ICD-10): 1 when the code starts with this pattern, else 0.
aids_hiv_hit = diagnosis_elix_10['diagnosis_code'].str.match('B2[0124]')
diagnosis_elix_10.loc[:, 'aids_hiv'] = np.where(aids_hiv_hit, 1, 0)

In [365]:
# Elixhauser 'lymphoma' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
lymphoma_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'C8[123458]|'
    'C9(0[02]|6)'
)
diagnosis_elix_10.loc[:, 'lymphoma'] = np.where(lymphoma_hit, 1, 0)

In [366]:
# Elixhauser 'metastatic_cancer' flag (ICD-10): 1 when the code starts with this pattern, else 0.
metastatic_cancer_hit = diagnosis_elix_10['diagnosis_code'].str.match('C(7[789]|80)')
diagnosis_elix_10.loc[:, 'metastatic_cancer'] = np.where(metastatic_cancer_hit, 1, 0)

In [367]:
# Elixhauser 'solid_tumor_wout_mets' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
solid_tumor_wout_mets_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'C[01]|'
    'C2[0123456]|'
    'C3[01234789]|'
    'C4[01356789]|'
    'C5[012345678]|'
    'C6|'
    'C7[0123456]|'
    'C97'
)
diagnosis_elix_10.loc[:, 'solid_tumor_wout_mets'] = np.where(solid_tumor_wout_mets_hit, 1, 0)

In [368]:
# Elixhauser 'rheumatoid_arthritis' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
rheumatoid_arthritis_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'L94[013]|'
    'M0[568]|'
    'M12[03]|'
    'M3(0|1[0123]|[2345])|'
    'M4(5|6[189])'
)
diagnosis_elix_10.loc[:, 'rheumatoid_arthritis'] = np.where(rheumatoid_arthritis_hit, 1, 0)

In [369]:
# Elixhauser 'coagulopathy' flag (ICD-10): 1 when the code starts with this pattern, else 0.
coagulopathy_hit = diagnosis_elix_10['diagnosis_code'].str.match('D6([5678]|9[13456])')
diagnosis_elix_10.loc[:, 'coagulopathy'] = np.where(coagulopathy_hit, 1, 0)

In [370]:
# Elixhauser 'obesity' flag (ICD-10): 1 when the code starts with this pattern, else 0.
obesity_hit = diagnosis_elix_10['diagnosis_code'].str.match('E66')
diagnosis_elix_10.loc[:, 'obesity'] = np.where(obesity_hit, 1, 0)

In [371]:
# Elixhauser 'weight_loss' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
weight_loss_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'E4[0123456]|'
    'R6(34|4)'
)
diagnosis_elix_10.loc[:, 'weight_loss'] = np.where(weight_loss_hit, 1, 0)

In [372]:
# Elixhauser 'fluid_electrolyte' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
fluid_electrolyte_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'E222|'
    'E8[67]'
)
diagnosis_elix_10.loc[:, 'fluid_electrolyte'] = np.where(fluid_electrolyte_hit, 1, 0)

In [373]:
# Elixhauser 'blood_loss_anemia' flag (ICD-10): 1 when the code starts with this pattern, else 0.
blood_loss_anemia_hit = diagnosis_elix_10['diagnosis_code'].str.match('D500')
diagnosis_elix_10.loc[:, 'blood_loss_anemia'] = np.where(blood_loss_anemia_hit, 1, 0)

In [374]:
# Elixhauser 'deficiency_anemia' flag (ICD-10): 1 when the code starts with this pattern, else 0.
deficiency_anemia_hit = diagnosis_elix_10['diagnosis_code'].str.match('D5(0[89]|[123])')
diagnosis_elix_10.loc[:, 'deficiency_anemia'] = np.where(deficiency_anemia_hit, 1, 0)

In [375]:
# Elixhauser 'alcohol_abuse' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
alcohol_abuse_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'F10|'
    'E52|'
    'G621|'
    'I426|'
    'K292|'
    'K70[039]|'
    'T51|'
    'Z502|'
    'Z7(14|21)'
)
diagnosis_elix_10.loc[:, 'alcohol_abuse'] = np.where(alcohol_abuse_hit, 1, 0)

In [376]:
# Elixhauser 'drug_abuse' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
drug_abuse_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'F1[12345689]|'
    'Z7(15|22)'
)
diagnosis_elix_10.loc[:, 'drug_abuse'] = np.where(drug_abuse_hit, 1, 0)

In [377]:
# Elixhauser 'psychoses' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
psychoses_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'F2[0234589]|'
    'F3([01]2|15)'
)
diagnosis_elix_10.loc[:, 'psychoses'] = np.where(psychoses_hit, 1, 0)

In [378]:
# Elixhauser 'depression' flag (ICD-10): 1 when the code starts with one of these patterns, else 0.
depression_hit = diagnosis_elix_10['diagnosis_code'].str.match(
    'F204|'
    'F3(1[345]|[23]|41)|'
    'F4[13]2'
)
diagnosis_elix_10.loc[:, 'depression'] = np.where(depression_hit, 1, 0)

In [379]:
# Flag ICD-10 codes that matched no Elixhauser category: 1 when every column from
# the fourth onward (the flag columns after PatientID, DiagnosisCode and
# diagnosis_code) is 0, else 0.
no_elix_10_match = diagnosis_elix_10.iloc[:, 3:].eq(0).all(axis = 1)
diagnosis_elix_10.loc[:, 'elixhauser_other'] = np.where(no_elix_10_match, 1, 0)

In [380]:
# One row per patient: each flag column holds the number of that patient's
# ICD-10 codes that matched the category.
diagnosis_elix_10_wide = (
    diagnosis_elix_10
    .drop(columns = ['DiagnosisCode', 'diagnosis_code'])
    .groupby('PatientID', as_index = False)
    .sum()
)

In [381]:
row_ID(diagnosis_elix_10_wide)

(7614, 7614)

In [382]:
# Stack the ICD-9 and ICD-10 per-patient tables and add up the rows of patients
# that appear in both; the result is indexed by PatientID.
diagnosis_elixhauser = pd.concat(
    objs = [diagnosis_elix_9_wide, diagnosis_elix_10_wide]
).groupby(by = 'PatientID').sum()

In [383]:
# Per-patient ICD count: the row-wise sum of all flag columns, including
# elixhauser_other. A code that matched several categories contributes once per
# matched category.
diagnosis_elixhauser = diagnosis_elixhauser.assign(icd_count = diagnosis_elixhauser.sum(axis = 1))

In [384]:
# Turn every column except the last one (icd_count) into a 0/1 indicator by
# capping values above 1 at 1; zeros stay zero.
diagnosis_elixhauser.iloc[:, :-1] = diagnosis_elixhauser.iloc[:, :-1].clip(upper = 1)

In [385]:
# Move PatientID from the index back into an ordinary column.
diagnosis_elixhauser = diagnosis_elixhauser.reset_index(drop = False)

In [386]:
row_ID(diagnosis_elixhauser)

(12197, 12197)

In [387]:
# Append missing test IDs: patients with no Elixhauser row get one with every column set to 0.
# pd.concat is used because DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0.
diagnosis_elixhauser = (
    pd.concat(
        [diagnosis_elixhauser,
         pd.Series(test_IDs)[~pd.Series(test_IDs).isin(diagnosis_elixhauser['PatientID'])].to_frame(name = 'PatientID')],
        sort = False)
    .fillna(0)
)

In [388]:
row_ID(diagnosis_elixhauser)

(13697, 13697)

#### 10.2 Other cancer 

##### 10.2a ICD-9 cancer codes

In [389]:
# Keep ICD-9 neoplasm codes 140-209 and 230-239; prefixes 21 and 22 (benign neoplasms, 210-229) are left out.
cancer_9 = (
    diagnosis_elix_9
    .loc[diagnosis_elix_9['DiagnosisCode'].str.startswith(
        ('14', '15', '16', '17', '18', '19', '20', '23'))]
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [390]:
row_ID(cancer_9)

(8440, 5673)

**Remove the following ICD-9 codes representing lung and other thoracic cancers, metastasis, ill-defined or unspecified neoplasms, and non-melanoma skin cancers (e.g., BCC and SCC):**
* **162 - Malignant neoplasm of trachea, bronchus, and lung**
* **163 - Malignant neoplasm of pleura**
* **164.2 - Malignant neoplasm of anterior mediastinum**
* **164.3 - Malignant neoplasm of posterior mediastinum**
* **164.8 - Malignant neoplasm other parts of mediastinum**
* **164.9 - Malignant neoplasm of mediastinum, part unspecified** 
* **165 - Malignant neoplasm of other and ill-defined sites within respiratory system** 
* **173 - Other and unspecified malignant neoplasm of skin**
* **195.1 - Malignant neoplasm of thorax** 
* **196 - Secondary and unspecified malignant neoplasm of lymph nodes**
* **197 - Secondary malignant neoplasm of respiratory and digestive systems**
* **198 - Secondary malignant neoplasm of other specified sites** 
* **199 - Malignant neoplasm without specification of site**
* **231.1 - Carcinoma in situ of trachea**
* **231.2 - Carcinoma in situ of bronchus and lung**
* **231.8 - Carcinoma in situ of other specified parts of respiratory system**
* **231.9 - Carcinoma in situ of respiratory system, part unspecified** 
* **235.7 - Neoplasm of uncertain behavior of trachea, bronchus, and lung**
* **235.8 - Neoplasm of uncertain behavior of pleura, thymus, and mediastinum**
* **235.9 - Neoplasm of uncertain behavior of other and unspecified respiratory organs**
* **238.8 - Neoplasm of uncertain behavior of other specified sites**
* **238.9 - Neoplasm of uncertain behavior, site unspecified**
* **239 - Neoplasms of unspecified nature**

In [391]:
# ICD-9 neoplasm rows left after removing lung/thoracic cancer, metastasis, non-melanoma skin
# cancer (173), and the ill-defined, uncertain-behavior and unspecified neoplasm codes.
# .copy() makes the result an independent frame, so adding a column to it afterwards
# does not raise SettingWithCopyWarning.
other_cancer_9 = (
    cancer_9[~cancer_9['diagnosis_code'].str.match('16([23]|4[2389]|5)|'
                                                   '173|'
                                                   '19(51|[6789])|'
                                                   '23(1[1289]|5[789]|8[89]|9)')]
    .copy()
)

In [392]:
# Mark every remaining row as "other cancer". assign() returns a new frame, so the flag is never
# written into a slice of cancer_9 (the .loc form raised SettingWithCopyWarning).
other_cancer_9 = other_cancer_9.assign(other_cancer_9 = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [393]:
# One row per patient: keep each PatientID's first row and only the ID and flag columns.
other_cancer_9 = (
    other_cancer_9
    .drop_duplicates(subset = ['PatientID'])
    .filter(items = ['PatientID', 'other_cancer_9'])
)

In [394]:
row_ID(other_cancer_9)

(923, 923)

In [395]:
# Append missing test IDs: patients without an ICD-9 "other cancer" code get other_cancer_9 = 0.
# pd.concat is used because DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0.
other_cancer_9 = (
    pd.concat(
        [other_cancer_9,
         pd.Series(test_IDs)[~pd.Series(test_IDs).isin(other_cancer_9['PatientID'])].to_frame(name = 'PatientID')],
        sort = False)
    .fillna(0)
)

In [396]:
row_ID(other_cancer_9)

(13697, 13697)

##### 10.2b ICD-10 cancer codes

In [397]:
# Keep ICD-10 neoplasm codes C00-D49, leaving out benign neoplasms (D10-D36) and
# benign neuroendocrine tumors (D3A): only the D0, D37-D39 and D4 prefixes are kept from the D range.
cancer_10 = (
    diagnosis_elix_10
    .loc[diagnosis_elix_10['DiagnosisCode'].str.startswith(
        ('C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'D0', 'D37', 'D38', 'D39', 'D4'))]
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

In [398]:
row_ID(cancer_10)

(12605, 7059)

**Remove the following ICD-10 codes, which capture lung and other thoracic cancers, metastasis, ill-defined or unspecified neoplasms, and non-melanoma skin cancers (e.g., BCC and SCC):**
* **C33 - Malignant neoplasm of trachea**
* **C34 - Malignant neoplasm of bronchus and lung**
* **C38.1 - Malignant neoplasm of anterior mediastinum**
* **C38.2 - Malignant neoplasm of posterior mediastinum**
* **C38.3 - Malignant neoplasm of mediastinum, part unspecified**
* **C38.4 - Malignant neoplasm of pleura**
* **C38.8 - Malignant neoplasm of overlapping sites of heart, mediastinum and pleura**
* **C39 - Malignant neoplasm of other and ill-defined sites in the respiratory system and intrathoracic organs**
* **C44 - Other and unspecified malignant neoplasm of skin**
* **C76.1 -  Malignant neoplasm of other and ill-defined sites: malignant neoplasm of thorax**
* **C77 - Secondary and unspecified malignant neoplasm of lymph nodes**
* **C78 - Secondary malignant neoplasm of respiratory and digestive organs**
* **C79 - Secondary malignant neoplasm of other and unspecified sites**
* **C80 - Malignant neoplasm without specification of site**
* **D02.1 - Carcinoma in situ of trachea**
* **D02.2 - Carcinoma in situ of bronchus and lung**
* **D02.3 - Carcinoma in situ of other parts of respiratory system**
* **D02.4 - Carcinoma in situ of respiratory system, unspecified**
* **D38 - Neoplasm of uncertain behavior of middle ear and respiratory and intrathoracic organs**
* **D47.2 - Monoclonal gammopathy**
* **D49 - Neoplasms of unspecified behavior** 

In [399]:
# ICD-10 neoplasm rows left after removing lung/thoracic cancer, metastasis, non-melanoma skin
# cancer (C44), and the uncertain-behavior and unspecified neoplasm codes.
# .copy() makes the result an independent frame, so adding a column to it afterwards
# does not raise SettingWithCopyWarning.
other_cancer_10 = (
    cancer_10[~cancer_10['diagnosis_code'].str.match('C3([34]|8[12348]|9)|'
                                                     'C44|'
                                                     'C7(61|[789])|'
                                                     'C80|'
                                                     'D(02[1234]|38|472|49)')]
    .copy()
)

In [400]:
# Mark every remaining row as "other cancer". assign() returns a new frame, so the flag is never
# written into a slice of cancer_10 (the .loc form can raise SettingWithCopyWarning).
other_cancer_10 = other_cancer_10.assign(other_cancer_10 = 1)

In [401]:
# One row per patient: keep each PatientID's first row and only the ID and flag columns.
other_cancer_10 = (
    other_cancer_10
    .drop_duplicates(subset = ['PatientID'])
    .filter(items = ['PatientID', 'other_cancer_10'])
)

In [402]:
row_ID(other_cancer_10)

(718, 718)

In [403]:
# Append missing test IDs: patients without an ICD-10 "other cancer" code get other_cancer_10 = 0.
# pd.concat is used because DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0.
other_cancer_10 = (
    pd.concat(
        [other_cancer_10,
         pd.Series(test_IDs)[~pd.Series(test_IDs).isin(other_cancer_10['PatientID'])].to_frame(name = 'PatientID')],
        sort = False)
    .fillna(0)
)

In [404]:
row_ID(other_cancer_10)

(13697, 13697)

In [405]:
other_cancer = pd.merge(other_cancer_9, other_cancer_10, on = 'PatientID')

In [406]:
# A patient has another cancer if flagged by either coding system: add the two 0/1 flags and cap at 1.
# The cap is applied to the flag column only, so PatientID values can never be rewritten
# (a frame-wide .replace(2, 1) would also change a PatientID equal to 2).
other_cancer = (
    other_cancer
    .assign(other_cancer = (other_cancer['other_cancer_9'] + other_cancer['other_cancer_10']).clip(upper = 1))
    .filter(items = ['PatientID', 'other_cancer'])
)

In [407]:
row_ID(other_cancer)

(13697, 13697)

#### 10.3 Sites of metastases at diagnosis  

##### 10.3a ICD-9 sites of metastases

In [408]:
# Create dataframe containing patients with ICD-9 codes within -90 to +30 days from advanced diagnosis and remove duplicate codes.
# Only PatientID and the two code columns are kept; duplicates are judged on (PatientID, DiagnosisCode).
diagnosis_mets_9 = (
    diagnosis
    .query('diagnosis_date_diff >= -90 and diagnosis_date_diff <= 30')
    .query('DiagnosisCodeSystem == "ICD-9-CM"')
    .drop_duplicates(subset = ['PatientID', 'DiagnosisCode'], keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

Sites of metastasis will be grouped into the following categories according to ICD-9 codes:

- CNS - 198.3 and 198.4
- Bone - 198.5
- Liver - 197.7
- Respiratory system - 197.0, 197.1, 197.2, and 197.3
- Adrenal gland - 198.7
- Other - kidney/bladder (198.0 and 198.1), intestines (197.4 and 197.5), lymph nodes (196), other (197.6, 197.8, 198.2, 198.6, and 198.8)

In [409]:
# CNS metastasis: ICD-9 198.3 and 198.4.
diagnosis_mets_9['cns_met'] = np.where(diagnosis_mets_9['diagnosis_code'].str.match('198[34]'), 1, 0)

In [410]:
# Bone metastasis: ICD-9 198.5.
diagnosis_mets_9['bone_met'] = np.where(diagnosis_mets_9['diagnosis_code'].str.match('1985'), 1, 0)

In [411]:
# Liver metastasis: ICD-9 197.7.
diagnosis_mets_9['liver_met'] = np.where(diagnosis_mets_9['diagnosis_code'].str.match('1977'), 1, 0)

In [412]:
# Respiratory system metastasis: ICD-9 197.0-197.3.
diagnosis_mets_9['resp_met'] = np.where(diagnosis_mets_9['diagnosis_code'].str.match('197[0123]'), 1, 0)

In [413]:
# Adrenal gland metastasis: ICD-9 198.7.
diagnosis_mets_9['adrenal_met'] = np.where(diagnosis_mets_9['diagnosis_code'].str.match('1987'), 1, 0)

In [414]:
# Remaining metastatic sites: lymph nodes (196), 197.4/.5/.6/.8 and 198.0/.1/.2/.6/.8.
diagnosis_mets_9['other_met'] = np.where(
    diagnosis_mets_9['diagnosis_code'].str.match('196|197[4568]|198[01268]'),
    1,
    0,
)

In [415]:
# One row per patient: drop the code columns and add up each site flag across the patient's rows.
diagnosis_mets_9 = (
    diagnosis_mets_9
    .drop(['DiagnosisCode', 'diagnosis_code'], axis = 1)
    .groupby('PatientID')
    .agg('sum')
    .reset_index()
)

##### 10.3b ICD-10 sites of metastases

In [416]:
# Create dataframe containing patients with ICD-10 codes within -90 to +30 days from advanced diagnosis and remove duplicate codes.
# Only PatientID and the two code columns are kept; duplicates are judged on (PatientID, DiagnosisCode).
diagnosis_mets_10 = (
    diagnosis
    .query('diagnosis_date_diff >= -90 and diagnosis_date_diff <= 30')
    .query('DiagnosisCodeSystem == "ICD-10-CM"')
    .drop_duplicates(subset = ['PatientID', 'DiagnosisCode'], keep = 'first')
    .filter(items = ['PatientID', 'DiagnosisCode', 'diagnosis_code'])
)

**Sites of metastasis will be grouped into the following categories according to ICD-10 codes:**
* **CNS - C79.3 and C79.4**
* **Bone - C79.5**
* **Liver - C78.7**
* **Respiratory system - C78.0, C78.1, C78.2, and C78.3**
* **Adrenal gland - C79.7**
* **Other - kidney/bladder (C79.0 and C79.1), intestines (C78.4 and C78.5), lymph nodes (C77), and other (C78.6, C78.8, C79.2, C79.6, C79.8, C79.9)**

In [417]:
# CNS metastasis: ICD-10 C79.3 and C79.4.
diagnosis_mets_10['cns_met'] = np.where(diagnosis_mets_10['diagnosis_code'].str.match('C79[34]'), 1, 0)

In [418]:
# Bone metastasis: ICD-10 C79.5.
diagnosis_mets_10['bone_met'] = np.where(diagnosis_mets_10['diagnosis_code'].str.match('C795'), 1, 0)

In [419]:
# Liver metastasis: ICD-10 C78.7.
diagnosis_mets_10['liver_met'] = np.where(diagnosis_mets_10['diagnosis_code'].str.match('C787'), 1, 0)

In [420]:
# Respiratory system metastasis: ICD-10 C78.0-C78.3.
diagnosis_mets_10['resp_met'] = np.where(diagnosis_mets_10['diagnosis_code'].str.match('C78[0123]'), 1, 0)

In [421]:
# Adrenal gland metastasis: ICD-10 C79.7.
diagnosis_mets_10['adrenal_met'] = np.where(diagnosis_mets_10['diagnosis_code'].str.match('C797'), 1, 0)

In [422]:
# Remaining metastatic sites: lymph nodes (C77), C78.4/.5/.6/.8 and C79.0/.1/.2/.6/.8/.9.
diagnosis_mets_10['other_met'] = np.where(
    diagnosis_mets_10['diagnosis_code'].str.match('C77|C78[4568]|C79[012689]'),
    1,
    0,
)

In [423]:
# One row per patient: drop the code columns and add up each site flag across the patient's rows.
diagnosis_mets_10 = (
    diagnosis_mets_10
    .drop(['DiagnosisCode', 'diagnosis_code'], axis = 1)
    .groupby('PatientID')
    .agg('sum')
    .reset_index()
)

In [424]:
# Stack the ICD-9 and ICD-10 metastasis tables, then add together the rows that share a PatientID.
# PatientID becomes the index of the result.
diagnosis_mets = pd.concat(
    [diagnosis_mets_9, diagnosis_mets_10]
).groupby('PatientID').sum()

In [425]:
# Cap every site count at 1 so each column is a 0/1 flag, then restore PatientID as a column.
diagnosis_mets = diagnosis_mets.clip(upper = 1).reset_index()

In [426]:
# Append missing test IDs: patients with no metastasis code in the window get 0 for every site.
# pd.concat is used because DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0.
diagnosis_mets = (
    pd.concat(
        [diagnosis_mets,
         pd.Series(test_IDs)[~pd.Series(test_IDs).isin(diagnosis_mets['PatientID'])].to_frame(name = 'PatientID')],
        sort = False)
    .fillna(0)
)

In [427]:
row_ID(diagnosis_mets)

(13697, 13697)

#### 10.4 Merge

In [428]:
diagnosis_wide = pd.merge(diagnosis_elixhauser, other_cancer, on = 'PatientID')

In [429]:
diagnosis_wide = pd.merge(diagnosis_wide, diagnosis_mets, on = 'PatientID')

In [430]:
row_ID(diagnosis_wide)

(13697, 13697)

In [431]:
list(diagnosis_wide.columns)

['PatientID',
 'chf',
 'cardiac_arrhythmias',
 'valvular_disease',
 'pulmonary_circulation',
 'peripheral_vascular',
 'htn_uncomplicated',
 'htn_complicated',
 'paralysis',
 'other_neuro_disorders',
 'chronic_pulmonary',
 'diabetes_uncomplicated',
 'diabetes_complicated',
 'hypothyroidism',
 'renal_failure',
 'liver_disease',
 'peptic_ulcer_disease',
 'aids_hiv',
 'lymphoma',
 'metastatic_cancer',
 'solid_tumor_wout_mets',
 'rheumatoid_arthritis',
 'coagulopathy',
 'obesity',
 'weight_loss',
 'fluid_electrolyte',
 'blood_loss_anemia',
 'deficiency_anemia',
 'alcohol_abuse',
 'drug_abuse',
 'psychoses',
 'depression',
 'elixhauser_other',
 'icd_count',
 'other_cancer',
 'cns_met',
 'bone_met',
 'liver_met',
 'resp_met',
 'adrenal_met',
 'other_met']

In [432]:
%whos DataFrame

Variable                 Type         Data/Info
-----------------------------------------------
biomarker_wide           DataFrame               PatientID     <...>n[13697 rows x 8 columns]
cancer_10                DataFrame                PatientID Dia<...>n[12605 rows x 3 columns]
cancer_9                 DataFrame                PatientID Dia<...>\n[8440 rows x 3 columns]
demographics             DataFrame               PatientID Prac<...>n[13697 rows x 7 columns]
diagnosis                DataFrame                PatientID Dia<...>[290161 rows x 8 columns]
diagnosis_elix_10        DataFrame                PatientID Dia<...>[55385 rows x 35 columns]
diagnosis_elix_10_wide   DataFrame              PatientID  chf <...>n[7614 rows x 33 columns]
diagnosis_elix_9         DataFrame                PatientID Dia<...>[28278 rows x 35 columns]
diagnosis_elix_9_wide    DataFrame              PatientID  chf <...>n[6457 rows x 33 columns]
diagnosis_elixhauser     DataFrame               PatientID

In [433]:
# Keep diagnosis_wide; release every other intermediate diagnosis table.
# The three metastasis tables are included: they are already merged into diagnosis_wide.
del cancer_10
del cancer_9
del diagnosis
del diagnosis_elix_10
del diagnosis_elix_10_wide
del diagnosis_elix_9
del diagnosis_elix_9_wide
del diagnosis_elixhauser
del diagnosis_mets
del diagnosis_mets_10
del diagnosis_mets_9
del other_cancer
del other_cancer_10
del other_cancer_9

## Merge files to create master test dataframe

In [434]:
enhanced_adv = enhanced_adv.drop(columns = ['diagnosis_date', 'adv_date', 'adv_year_cat'])

In [435]:
# Start the master table from demographics and enhanced_adv. pd.merge defaults to an inner join,
# so a patient missing from either table would be dropped; the row_ID check after the last merge shows whether any were lost.
test_full = pd.merge(demographics, enhanced_adv, on = 'PatientID')

In [436]:
test_full = pd.merge(test_full, mortality, on = 'PatientID')

In [437]:
test_full = pd.merge(test_full, med_admin_wide, on = 'PatientID')

In [438]:
test_full = pd.merge(test_full, biomarker_wide, on = 'PatientID')

In [439]:
test_full = pd.merge(test_full, insurance_wide, on = 'PatientID')

In [440]:
test_full = pd.merge(test_full, ecog_diagnosis_wide, on = 'PatientID')

In [441]:
test_full = pd.merge(test_full, weight_wide, on = 'PatientID')

In [442]:
test_full = pd.merge(test_full, lab_wide, on = 'PatientID')

In [443]:
test_full = pd.merge(test_full, diagnosis_wide, on = 'PatientID')

In [444]:
row_ID(test_full)

(13697, 13697)

In [445]:
len(test_full.columns)

213

In [446]:
list(test_full.columns)

['PatientID',
 'PracticeType',
 'gender',
 'race',
 'ethnicity',
 'age',
 'region',
 'Histology',
 'SmokingStatus',
 'stage',
 'adv_year',
 'delta_adv_diagnosis',
 'death_status',
 'timerisk_activity',
 'steroid_diag',
 'opioid_PO_diag',
 'nonopioid_PO_diag',
 'pain_IV_diag',
 'ac_diag',
 'antiinfective_IV_diag',
 'antiinfective_diag',
 'antihyperglycemic_diag',
 'ppi_diag',
 'antidepressant_diag',
 'bta_diag',
 'thyroid_diag',
 'is_diag',
 'ALK',
 'BRAF',
 'EGFR',
 'KRAS',
 'ROS1',
 'pdl1',
 'pdl1_n',
 'commercial',
 'medicare',
 'medicaid',
 'other_insurance',
 'ecog_diagnosis',
 'weight_diag',
 'bmi_diag',
 'bmi_diag_na',
 'weight_pct_change',
 'weight_pct_na',
 'weight_slope',
 'albumin_diag',
 'alp_diag',
 'alt_diag',
 'ast_diag',
 'bicarb_diag',
 'bun_diag',
 'calcium_diag',
 'chloride_diag',
 'creatinine_diag',
 'hemoglobin_diag',
 'neutrophil_count_diag',
 'platelet_diag',
 'potassium_diag',
 'sodium_diag',
 'total_bilirubin_diag',
 'wbc_diag',
 'albumin_diag_na',
 'alp_diag_na

In [447]:
test_full.to_csv('test_full.csv', index = False, header = True)