In this notebook we post-process the Sentara data and add the cancer subtypes as the outcome. We also add BMI values and DICOM tags to each study and create new features.

#### Import libraries, load original sentara data, BMIs and DICOM tags

In [1]:
import virtual_biopsy_utils as vbu
import pandas as pd
import numpy as np
import pickle as pkl
import os
import re

# Pathology tables for the three splits; the first CSV column is used as the index.
train, test, val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../input_files/sentara_train_pathologies.csv',
        '../input_files/sentara_test_pathologies.csv',
        '../input_files/sentara_val_pathologies.csv',
    )
)

# Cohort table (read without an index column).
# sen_data = pd.read_csv('../input_files/fx_sentara_chorev_extra_data_for_bmi_prediction.csv', index_col=0)
sen_data = pd.read_csv('../input_files/fx_sentara_cohort.csv')

# DICOM tags and BMIs, both merged into the cohort on study_id further down.
dicom_tags, bmis = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../input_files/sentara_avg_dicom_tags.csv',
        '../input_files/sentara_annotated_bmis_processed.csv',
    )
)

FileNotFoundError: [Errno 2] File ../input_files/sentara_train_pathologies.csv does not exist: '../input_files/sentara_train_pathologies.csv'

In [2]:
# Show the training-split pathology table as this cell's output.
train

Unnamed: 0,finding_id,image_id,study_id,pathology,provider
27761,C1,1.2.840.113681.174858545.1366870411.3968.2122,MG130425000199,Apocrine_Metaplasia,sentara
27758,C1,1.2.840.113681.174858545.1366870411.3968.2111,MG130425000199,Apocrine_Metaplasia,sentara
93796,A1,1.2.840.113681.2229469590.940.3547286568.185.1,MG130529000207,InvasiveDuctalCarcinoma,sentara
93795,A1,1.2.840.113681.2229469590.940.3547286568.178.1,MG130529000207,InvasiveDuctalCarcinoma,sentara
96517,C1,1.2.840.113681.2229444612.1016.3624270449.161.1,MG151106000287,DuctalCarcinomaInSitu,sentara
...,...,...,...,...,...
107209,C1,1.2.840.113681.2215629362.2097.3630747234.67.1,MG160120000111,DuctalCarcinomaInSitu,sentara
122903,A1,1.2.840.113681.2230568404.981.3546503214.362.1,MG130520005072,Apocrine_Metaplasia,sentara
122904,A1,1.2.840.113681.2230568404.981.3546503214.360.1,MG130520005072,Apocrine_Metaplasia,sentara
59818,T1,1.2.840.113681.2749806494.1398234480.5476.4750,MG140423003764,DuctalCarcinomaInSitu,sentara


#### drop "urgent" labs that do not exist

In [2]:
# Remove every column whose name contains 'Urgent'.
urgent_cols = [col for col in sen_data.columns if 'Urgent' in col]
sen_data.drop(columns=urgent_cols, inplace=True)

#### drop/rename some columns

In [3]:
# Remove the two BMI class columns.
sen_data.drop(columns=['bmi_class_last', 'bmi_class_max'], inplace=True)

# Remove the existing cancer-type outcome columns (Michal Chorev defined different
# classes than ours).
cancer_features = [col for col in sen_data.columns if col.startswith('outcome_cancer_type')]
sen_data.drop(columns=cancer_features, inplace=True)

# Rename age, and rename breast density so that its name marks it as an outcome variable.
sen_data.rename(
    columns={
        'birth_date_yearsToInddate': 'age',
        'breast_density_current': 'outcome_breast_density_current',
    },
    inplace=True,
)

#### fix lab duplicate names

In [4]:
# Give the numbered duplicate columns ('<name>.1', '<name>.2', '<name>.3') readable
# suffixes. The patterns are raw strings so that '\.' reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
DUPLICATE_SUFFIXES = {r'\.1$': '_last', r'\.2$': '_min', r'\.3$': '_max'}
for suffix_pattern, suffix_name in DUPLICATE_SUFFIXES.items():
    sen_data.rename(columns=lambda col: re.sub(suffix_pattern, suffix_name, col), inplace=True)

#### drop prev_birads_cnt to match maccabi

In [5]:
# prev_birads_cnt is removed from the cohort table.
sen_data.drop('prev_birads_cnt', axis=1, inplace=True)

#### add outcome_calc_biopsy_result based on outcome_biopsy_result_1yr and outcome_bc_1yr

In [6]:
# Encode the 1-year biopsy result as a letter class; codes missing from the mapping
# become NaN.
# NOTE(review): the keys are 0, 1 and 3 (there is no 2) - confirm this matches the
# coding of outcome_biopsy_result_1yr.
biopsy_result_classes = {0: 'A_12', 1: 'B', 3: 'C'}
sen_data['outcome_calc_biopsy_result'] = sen_data['outcome_biopsy_result_1yr'].map(biopsy_result_classes)

# Rows with outcome_bc_1yr == 1 are always class 'C', whatever the biopsy result says.
has_bc_1yr = sen_data['outcome_bc_1yr'] == 1
sen_data.loc[has_bc_1yr, 'outcome_calc_biopsy_result'] = 'C'

#### fix menopause according to age at event

In [7]:
# Menopause indicator: 1 when menopause_dx_ind is 1 or the age is at least the age of
# last menstruation; every other row stays NaN (no row is set to 0 here).
has_menopause_dx = sen_data['menopause_dx_ind'] == 1
reached_last_menstruation = sen_data['age'] >= sen_data['age_last_menstruation']
sen_data['menopause_ind'] = np.nan
sen_data.loc[has_menopause_dx | reached_last_menstruation, 'menopause_ind'] = 1

# Where the recorded age of last menstruation lies after the age at the event, blank
# both menstruation_years and age_last_menstruation.
before_last_menstruation = sen_data['age'] < sen_data['age_last_menstruation']
sen_data.loc[before_last_menstruation, 'menstruation_years'] = np.nan
sen_data.loc[before_last_menstruation, 'age_last_menstruation'] = np.nan
sen_data.drop(columns=['menopause_dx_ind'], inplace=True)

# Years between the first period and the first childbirth.
sen_data['menarche_to_ftp_years'] = sen_data['age_first_childbirth'].sub(sen_data['age_first_menstruation'])

# Distinguish between zero children and an unknown number of children.
children_unknown = sen_data['children_cnt'].isna()
sen_data.loc[children_unknown, 'children_ind'] = np.nan

In [8]:
def _high_ind(values, threshold):
    """Flag values at or above a threshold while keeping missing inputs missing.

    Parameters
    ----------
    values : pd.Series
        Numeric series to threshold.
    threshold : float
        Inclusive lower bound of the "high" class.

    Returns
    -------
    pd.Series
        Float series: 1.0 where ``values >= threshold``, 0.0 where it is below,
        NaN where ``values`` is missing.
    """
    # Build the flag as float from the start: writing NaN into an int column
    # afterwards depends on a silent upcast that recent pandas versions deprecate.
    flag = (values >= threshold).astype(float)
    flag[values.isnull()] = np.nan
    return flag


# The BI-RADS columns are still in their raw coding here (10, 20, 30, 40-43, 50), so
# the "high" indicators must be computed before the codes are replaced further down.
sen_data['past_birads_max'] = sen_data[['past_birads', 'past_birads_us']].max(axis='columns')
sen_data['past_birads_US_ind'] = (~sen_data['past_birads_us'].isnull()).astype(int)
sen_data['past_birads_high_ind'] = _high_ind(sen_data['past_birads_max'], 30)
sen_data['past_birads_US_high_ind'] = _high_ind(sen_data['past_birads_us'], 30)

sen_data['breast_density_past_high_ind'] = _high_ind(sen_data['breast_density_past'], 3)
sen_data['outcome_breast_density_current_high_ind'] = _high_ind(sen_data['outcome_breast_density_current'], 3)

sen_data['family_cancer_2_or_more_ind'] = _high_ind(sen_data['family_cancer_cnt'], 2)

# Complaint indicators: 1 when any nipple / lump / disorder column of the given
# period ('current' or 'past') is truthy, 0 otherwise.
complaint_terms = ('nipple', 'lump', 'disorder')
for period in ('current', 'past'):
    complaint_cols = [col for col in sen_data.columns
                      if any(term in col for term in complaint_terms) and period in col]
    sen_data['complaint_ind_' + period] = sen_data[complaint_cols].any(axis=1).astype(int)

sen_data.drop(columns=['past_birads', 'past_birads_us'], inplace=True)
sen_data.rename(columns={'outcome_birads': 'outcome_calc_max_birads'}, inplace=True)

# Replace the raw BI-RADS codes with a readable 0-5 scale.
birads_num = [np.nan, 0, 10, 20, 30, 40, 41, 42, 43, 50]
birads = [np.nan, 0, 1, 2, 3, 4, 4, 4, 4, 5]
birads_map = dict(zip(birads_num, birads))

# Assign the result back: `sen_data[col].replace(..., inplace=True)` mutates a
# temporary Series and leaves the frame untouched under pandas Copy-on-Write.
for birads_col in ('outcome_calc_max_birads', 'past_birads_max'):
    sen_data[birads_col] = sen_data[birads_col].replace(birads_map)

 #### add bmi and dicom tags to sen_data

In [9]:
# Attach the DICOM tags to each study; studies without tags are kept (left join).
sen_data = pd.merge(sen_data, dicom_tags, on='study_id', how='left')

In [10]:
# Attach the BMI values to each study; studies without a BMI are kept (left join).
sen_data = pd.merge(sen_data, bmis, on='study_id', how='left')

#### add obesity index

In [11]:
def fnc(x, threshold=30):
    """Return the obesity indicator for a single BMI value.

    A BMI equal to the threshold counts as obese (the usual cut-off is BMI >= 30);
    the previous `x <= 30` test classified a BMI of exactly 30 as non-obese.

    Parameters
    ----------
    x : float
        BMI value; may be NaN.
    threshold : float, optional
        BMI at or above which the value is flagged as obese (default 30).

    Returns
    -------
    int or float
        1 when ``x >= threshold``, 0 when ``x < threshold``, NaN when ``x`` is NaN.
    """
    if x >= threshold:
        return 1
    if x < threshold:
        return 0
    # Both comparisons are False only for NaN: an unknown BMI stays unknown.
    return np.nan
# Element-wise obesity flag derived from the current BMI (NaN where the BMI is missing).
sen_data['obesity_ind'] = sen_data['bmi_current'].map(fnc)

#### add change in bmi, bmi variance and months to first MG exam

In [12]:
# Compute the extra features with the project helper (per the section header: change
# in BMI, BMI variance and months to first MG exam - the helper itself is not shown
# here), then attach them per study and patient with a left join.
newfeats = vbu.get_change_bmi_and_months_to_first_mg_sentara(df=sen_data)
sen_data = sen_data.merge(newfeats, on=['study_id', 'patient_id'], how='left')

#### create column for race

In [13]:
# Collapse the one-hot race columns into a single integer code:
# white = 1, black = 2, asian = 3, pacific islander = 4, other = 5.
race_codes = {
    'race:white': 1,
    'race:black': 2,
    'race:asian': 3,
    'race:pacific islander': 4,
    'race:other': 5,
}
for race_col, race_code in race_codes.items():
    sen_data[race_col] = sen_data[race_col].replace(1, race_code)

race_cols = list(race_codes)
# NOTE(review): a row with no flag set ends up as 0, and a row with several flags set
# gets the sum of their codes (e.g. white + black = 3, which is the asian code) -
# confirm that the flags are mutually exclusive.
sen_data['race'] = sen_data[race_cols].sum(axis=1)

sen_data.drop(columns=race_cols, inplace=True)

#### create column for religion

In [14]:
# Collapse the one-hot religion columns into a single integer code:
# christian = 1, eastern religions = 2, jewish = 3, muslim = 4, other / unknown = 5.
religion_codes = {
    'religion:christian': 1,
    'religion:eastern_religions': 2,
    'religion:jewish': 3,
    'religion:muslim': 4,
    'religion:other': 5,
}
for religion_col, religion_code in religion_codes.items():
    sen_data[religion_col] = sen_data[religion_col].replace(1, religion_code)

religion_cols = list(religion_codes)
# NOTE(review): rows with several flags set get the sum of their codes - confirm that
# the flags are mutually exclusive.
sen_data['religion'] = sen_data[religion_cols].sum(axis=1)

sen_data.drop(columns=religion_cols, inplace=True)

# A sum of 0 means no religion flag was set: treat it as unknown = other = 5.
sen_data['religion'] = sen_data['religion'].replace(0, 5)

#### create cancer class based on pathology: first create a pathology column with each study pathology, and map them to a cancer class

In [15]:
# Stack the three pathology splits into one table.
patho = pd.concat([train, val, test])

# One row per study with the list of distinct pathologies recorded for it.
# A single groupby pass replaces re-filtering the whole table once per study
# (quadratic in the number of studies). sort=False keeps the studies in order of
# first appearance, and unique() keeps each study's pathologies in order of first
# appearance. Rows with a missing study_id are skipped by groupby; previously they
# produced one extra row with an empty pathology list.
df_patho = (
    patho.groupby('study_id', sort=False)['pathology']
    .agg(lambda labels: labels.unique().tolist())
    .reset_index()
)

In [16]:
# Pathology labels grouped by outcome class. Each bucket feeds one indicator column:
#   'A' -> outcome_cancer_type_DCIS
#   'B' -> outcome_cancer_type_Invasive
#   'C' -> outcome_cancer_type_BenignHR
#   'D' -> outcome_cancer_type_Papilloma
#   'E' -> outcome_cancer_type_Benign
# Labels are matched exactly (case- and spelling-sensitive) against the pathology
# strings, so the entries must not be "corrected" here without changing the data.
buckets = {
    'A': ['DuctalCarcinomaInSitu'],
    'B': ['Papillary_Carcinoma',
          'Invasive Mammary Carcinoma',
          'Invasive lobular adenocarcinoma',
          'MucinousCarcinoma',
          'Tubular_Carcinoma',
          'Invasive_Lobular_Carcinoma',
          'InvasiveDuctalCarcinoma',
          'InvasiveDuctalCarcinomaMulticentric',
          'invasive ductal adenocarcinoma',
          'Squamous cell carcinoma'],
    'C': ['AtypicalDuctalHyperplasia',
          'Atypical Lobular Hyperplasia',
          'Lobular carcinoma in situ of breast',
          # NOTE(review): this entry contains a literal backslash followed by 'n', not
          # a newline - confirm that this is how the label appears in the pathology data.
          'Lobular carcinoma in situ of breast\\nLobular carcinoma in situ of breast',
          'Columnar_Cell_Lesion',
          'Atypia'],
    'D': ['Intraductal papilloma',
          'Papilloma',
          'Papillomatosis',
          'PhyllodesTumor',
          'Radial_Scar',
          'SclerosingPapilloma',
          'papillary lesion'],
    'E': ['Angiolipoma',
          'Apocrine_Metaplasia',
          'Cyst of breast',
          'Edema Of Breast',
          'FatNecrosis',
          'Fibroadenoma',
          'fibroadenomatoid nodule',
          'FibrocysticChange',
          'Fibromatosis',
          'Fibrosis',
          'Granular_Cell_Tumor',
          'Gynecomastia',
          'Fibroadenolipoma',
          'Hematoma',
          'Usual Ductal Hyperplasia',
          'IntramammaryLymphNode',
          'Lipoma',
          'Lymph Node',
          'Mastitis',
          'OilCyst',
          'PseudoangiomatousStromalHyperplasia',
          'ScarTissue',
          'Sclerosing_Adenosis',
          'Seroma',
          'Complex_Cyst',
          'Complicated_Cyst',
          'Inflamed_Cyst',
          'Reactive Lymph Node',
          'Sebaceous cyst of skin of breast',
          'SimpleCyst',
          'benign breast tissue',
          'dense stromal fibrosis',
          'focal fibroadenomatoid change',
          'Plasma cell mastitis',
          'Angiomatosis',
          'Lactating_Adenoma',
          'Myofibroblastoma',
          'Lymphoid hyperplasia',
          'Neoplasm of skin of breast',
          'Granuloma',
          'DuctalAdenoma',
          'Fibrosing adenosis',
          'Follicle cyst',
          # 'Complicated_Cyst' was listed a second time here; the duplicate is removed.
          'Mammary duct ectasia',
          'Ruptured keratious cyst',
          'Seborrhoeic keratosis']
}

In [17]:
# Binarize the labels into one 0/1 column per bucket: 1 when any of the study's
# pathologies is listed in that bucket. A study can be flagged in several buckets,
# and a study whose pathologies match no bucket is 0 in every column.
outcome_columns = {
    'outcome_cancer_type_DCIS': 'A',
    'outcome_cancer_type_Invasive': 'B',
    'outcome_cancer_type_BenignHR': 'C',
    'outcome_cancer_type_Papilloma': 'D',
    'outcome_cancer_type_Benign': 'E',
}
for outcome_col, bucket_key in outcome_columns.items():
    bucket_members = buckets[bucket_key]
    df_patho[outcome_col] = df_patho['pathology'].map(
        lambda found: int(any(member in found for member in bucket_members))
    )

# The per-study pathology lists are no longer needed.
df_patho = df_patho.drop(columns='pathology')
df_patho.head()

Unnamed: 0,study_id,outcome_cancer_type_DCIS,outcome_cancer_type_Invasive,outcome_cancer_type_BenignHR,outcome_cancer_type_Papilloma,outcome_cancer_type_Benign
0,MG130425000199,0,0,0,0,1
1,MG130529000207,0,1,0,0,0
2,MG151106000287,1,0,0,0,0
3,MG161024000239,0,0,0,0,1
4,MG170410005194,0,1,0,0,0


In [18]:
# Cast the join key to object on both sides, then merge the outcome columns into the
# cohort. The default inner join keeps only studies present in both tables.
for keyed_frame in (sen_data, df_patho):
    keyed_frame['study_id'] = keyed_frame['study_id'].astype(object)

sen_data = pd.merge(sen_data, df_patho, on='study_id')

In [19]:
# Write the processed cohort to disk; the row index is written as the first column.
processed_path = '../input_files/fx_sentara_cohort_processed.csv'
sen_data.to_csv(processed_path)