In [None]:
import pandas as pd
import os 
import sys
sys.path.append('../')

from PrepareDataset.ICDExtractor import ICDExtractor
# Tabular export read by the extractor (presumably the imaging subset, judging by the
# file name — confirm). The commented-out line is an alternative export.
file_path_imaging = '../data/data/tabular/ukb668815_imaging.csv'
#file_path_imaging = '../data/data/tabular/ukb668815.csv'
# JSON file the ICD code dictionary is saved to / loaded from; alternatives are kept commented out.
icd_code_dict_file_path = r"./resources/icd_code_dict_imaging.json"
#icd_code_dict_file_path = r"./resources/icd_code_dict.json"
#icd_code_dict_file_path = r"../ICDCodesAnalysis/icd_code_dict.json"
# Extractor bound to the tabular file and the dictionary path chosen above.
icdExtractor = ICDExtractor(file_path_imaging, icd_code_dict_file_path)

In [None]:
# Tab-separated coding table annotated with Elixhauser comorbidities.
# NOTE(review): comorbidities_table is not referenced again in the visible cells.
comorbidities_table = pd.read_csv('../tabular/coding/coding19_with_elixhauser_comorbidities.tsv', sep='\t')

In [None]:
# MODE selects which cohort definition the cells below use.
MODE = 'cvd'
# Output directory for every CSV written by this notebook.
# NOTE(review): the directory name is hardcoded to 'cvd_3m_3y' and is not derived from MODE.
basis_file_path = './resources/3m_3y/' + 'cvd_3m_3y' # + MODE

## Load icd code dictionary

In [None]:
# extract icd codes from the main database
# The extractor's icd_code_dict attribute is then written to icd_code_dict_file_path.
# (Presumably extract_icd_codes() is what populates that attribute — confirm.)
icdExtractor.extract_icd_codes()
icdExtractor.save_icd_code_dict(icdExtractor.icd_code_dict, icd_code_dict_file_path)

In [None]:
# load already extracted icd codes from local file
# The return value is discarded; later cells read icdExtractor.icd_code_dict instead.
_ = icdExtractor.load_icd_code_dict()

In [None]:
# Number of entries in the ICD code dictionary (later cells iterate it as eid -> record).
len(icdExtractor.icd_code_dict)

## Define the cohort of interest with icd code pool

In [None]:
# Cohort definition per MODE. Every supported branch defines:
#   icd_selector          - codes that count as the event of interest
#   icd_selector_extended - wider code pool used for exclusion and for the positive cohort
#   icd_origin            - values of the per-code 'icd_origin' field that are accepted
#   censoring_date        - censoring date as a 'YYYY-MM-DD' string
# The modes cvd2, copd, dementia_allcause, ckd, osteoarthritis and osteoporosis
# additionally define icd_self_reported (self-reported codes).
# An unsupported MODE raises immediately instead of failing later with a NameError
# or silently reusing values left over in the kernel.
if MODE == "cvd2":
    icd_ischemic_heart_disease = [
        "I20", "I200", "I201", "I208", "I209", "I21", "I210", "I211", "I212", "I213", "I214", "I219",
        "I22", "I220", "I221", "I228", "I229", "I23", "I230", "I231", "I232", "I233", "I234", "I235",
        "I236", "I238", "I24", "I240", "I241", "I248", "I249", "I25", "I250", "I251", "I252", "I253",
        "I254", "I255", "I256", "I258", "I259"
    ]
    icd_ischemic_heart_disease_self_reported = ["1074", "1075"]
    icd_hypertensive = [
        "I10", "I11", "I110", "I119", "I12", "I120", "I129", "I13", "I130", "I131", "I132", "I139",
        "I50", "I500", "I501", "I509"
    ]
    icd_hypertensive_self_reported = ["1065", "1072", "1076"]
    icd_stroke = [
        "I60", "I600", "I601", "I602", "I603", "I604", "I605", "I606", "I607", "I608", "I609",
        "I61", "I610", "I611", "I612", "I613", "I614", "I615", "I616", "I618", "I619",
        "I63", "I630", "I631", "I632", "I633", "I634", "I635", "I636", "I638", "I639",
        "I64"
    ]
    icd_stroke_self_reported = ["1081", "1086", "1491", "1583"]

    icd_selector = icd_ischemic_heart_disease + icd_hypertensive + icd_stroke
    icd_selector += ["myoinfarction", "stroke"]

    from ComorbidityExtractor import ComorbidityExtractor
    comorbidity_selector = [
        'Congestive heart failure',
        'Cardiac arrhythmias',
        'Valvular disease',
        'Pulmonary circulation disorders',
        'Peripheral vascular disorders',
        'Hypertension, uncomplicated',
        'Hypertension, complicated'
    ]
    comorbidityExtractor = ComorbidityExtractor('../tabular/coding/coding19_with_elixhauser_comorbidities.tsv')
    icd_selector_extended = comorbidityExtractor.extract_icd_codes(comorbidity_selector)
    icd_selector_extended += icd_selector
    # icd_selector already ends with these two entries, so they appear twice here;
    # the list is only used for membership tests and set operations.
    icd_selector_extended += ["myoinfarction", "stroke"]
    icd_self_reported = icd_ischemic_heart_disease_self_reported + icd_hypertensive_self_reported + icd_stroke_self_reported

    icd_origin = ['secondary', 'ado']
    censoring_date = '2022-05-31' #cvd
elif MODE == "cvd":  # deprecated definition based only on Elixhauser comorbidities
    from ComorbidityExtractor import ComorbidityExtractor
    comorbidity_selector = [
        'Congestive heart failure',
        'Cardiac arrhythmias',
        'Valvular disease',
        'Pulmonary circulation disorders',
        'Peripheral vascular disorders',
        'Hypertension, uncomplicated',
        'Hypertension, complicated'
    ]
    comorbidityExtractor = ComorbidityExtractor('../tabular/coding/coding19_with_elixhauser_comorbidities.tsv')
    icd_selector = comorbidityExtractor.extract_icd_codes(comorbidity_selector)
    icd_selector += ["myoinfarction", "stroke"]
    # cvd cohort: the extended pool is the very same list object as icd_selector
    icd_selector_extended = icd_selector #+ ["myoinfarction", "stroke"]
    icd_origin = ['main', 'ado']
    censoring_date = '2022-05-31' #cvd
elif MODE == "cancer":
    patterns = ["C{0:02d}".format(x) for x in range(0, 98)] # all C00-C97
    patterns.remove('C44')  # without C44
    patterns.remove('C43')  # without C43
    patterns += ["D{0:02d}".format(x) for x in range(37, 49)]  # all D37-D48
    icd_selector = icdExtractor.extract_icd_codes_from_pattern(patterns)
    patterns = ["C{0:02d}".format(x) for x in range(0, 98)] + ["D{0:02d}".format(x) for x in range(37, 49)] # all C00-C97 and D37-D48
    icd_selector_extended = icdExtractor.extract_icd_codes_from_pattern(patterns)
    icd_origin = ["cancer"]
    censoring_date = '2020-05-31'
elif MODE == "liver":
    from ComorbidityExtractor import ComorbidityExtractor
    comorbidity_selector = [
        'Liver disease',
    ]
    comorbidityExtractor = ComorbidityExtractor('../tabular/coding/coding19_with_elixhauser_comorbidities.tsv')
    icd_selector = comorbidityExtractor.extract_icd_codes(comorbidity_selector)
    icd_selector += icdExtractor.extract_icd_codes_from_pattern(["C22"])
    icd_selector_extended = icd_selector
    icd_origin = ['secondary', 'cancer']
    censoring_date = '2020-05-31' # downgrade, since cancer register is used
elif MODE == "pancreas":
    from ComorbidityExtractor import ComorbidityExtractor
    comorbidity_selector = [
        'Pancreatic disease (Rickmer-defined)',
    ]
    comorbidityExtractor = ComorbidityExtractor('../tabular/coding/coding19_with_elixhauser_comorbidities.tsv')
    icd_selector = comorbidityExtractor.extract_icd_codes(comorbidity_selector)
    icd_selector += icdExtractor.extract_icd_codes_from_pattern(["C25"])
    patterns = ["C{0:02d}".format(x) for x in range(0, 98)] + ["D{0:02d}".format(x) for x in range(37, 49)] # all C00-C97 and D37-D48
    icd_selector_extended = icdExtractor.extract_icd_codes_from_pattern(patterns)
    icd_selector_extended += icd_selector
    icd_origin = ['secondary', 'cancer']
    censoring_date = '2020-05-31'  # downgrade, since cancer register is used
elif MODE == "pancreas+liver":
    from ComorbidityExtractor import ComorbidityExtractor
    comorbidity_selector = [
        'Pancreatic disease (Rickmer-defined)',
        'Liver disease',
    ]
    comorbidityExtractor = ComorbidityExtractor('../tabular/coding/coding19_with_elixhauser_comorbidities.tsv')
    icd_selector = comorbidityExtractor.extract_icd_codes(comorbidity_selector)
    icd_selector += icdExtractor.extract_icd_codes_from_pattern(["C22", "C25"])
    icd_selector_extended = icd_selector
    icd_origin = ['secondary', 'cancer']
    censoring_date = '2020-05-31'  # downgrade, since cancer register is used
elif MODE == "copd":
    icd_selector = ["copd"]
    icd_selector_extended = icd_selector
    icd_origin = ['ado']
    icd_self_reported = ['1112', '1113', '1472']
    censoring_date = '2022-05-31'
elif MODE == "dementia_allcause":  # not enough data
    icd_selector = ["dementia_allcause"]
    icd_selector_extended = icd_selector
    icd_origin = ['ado']
    icd_self_reported =['1263']
    censoring_date = '2022-05-31'
elif MODE == "ckd":
    icd_selector = ["N18", "N180", "N181", "N182", "N183", "N184", "N185", "N188", "N189"]
    icd_selector_extended = icd_selector
    icd_origin = ['secondary']
    icd_self_reported = ['1192', '1193', '1194']
    censoring_date = '2022-05-31'
elif MODE == "osteoarthritis":
    icd_selector = [
        "M15", "M150", "M1500", "M151", "M152", "M153", "M154", "M158", "M159", "M1599",
        "M16", "M160", "M161", "M162", "M163", "M164", "M165", "M166", "M167", "M169",
        "M17", "M170", "M171", "M172", "M173", "M174", "M175", "M179", "M18", "M180",
        "M181", "M182", "M183", "M184", "M185", "M189", "M19", "M190", "M1900", "M1901",
        "M1902", "M1903", "M1904", "M1905", "M1906", "M1907", "M1908", "M1909", "M191",
        "M1910", "M1911", "M1912", "M1913", "M1914", "M1915", "M1916", "M1917", "M1918",
        "M1919", "M192", "M1920", "M1921", "M1922", "M1923", "M1924", "M1925", "M1926",
        "M1927", "M1928", "M1929", "M198", "M1980", "M1981", "M1982", "M1983", "M1984",
        "M1985", "M1986", "M1987", "M1988", "M1989", "M199", "M1990", "M1991", "M1992",
        "M1993", "M1994", "M1995", "M1996", "M1997", "M1998", "M1999"
    ]
    icd_selector_extended = icd_selector
    icd_origin = ['secondary']
    icd_self_reported = ['1465']
    censoring_date = '2022-05-31'

elif MODE == "osteoporosis":  # not enough data
    icd_selector = [
        "M8000", "M8001", "M8002", "M8003", "M8004", "M8005", "M8006", "M8007", "M8008", "M8009",
        "M809", "M8090", "M8091", "M8092", "M8093", "M8094", "M8095", "M8096", "M8097", "M8098",
        "M8099"
    ]
    icd_selector_extended = icd_selector
    icd_origin = ['secondary']
    icd_self_reported = ['1309']
    censoring_date = '2022-05-31'
else:
    # Fail fast: without this, an unknown MODE leaves the selectors and the
    # censoring date undefined (or stale from a previous run).
    raise ValueError("Unsupported MODE: {!r}".format(MODE))

# Parse the censoring date; errors='coerce' turns an unparsable string into NaT instead of raising.
censoring_date = pd.to_datetime(censoring_date, errors='coerce', format='%Y-%m-%d')
# Key of the per-subject reference date inside each icd_code_dict record.
interested_date = "first_imaging_date"
#interested_date = "first_visiting_date"
# Follow-up window: the event must occur more than n_min_days days and
# fewer than 365 * n_years days after the reference date.
n_years = 3
n_min_days = 90

## Extract the cohort of interest
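1. The earliest matching ICD date, min(icd_date), is more than `n_min_days` (90) days after `first_imaging_date`.
2. The event happens within `n_years` (3) years after `first_imaging_date`.
3. No code that appears only in the extended list (in `icd_selector_extended` but not in `icd_selector`) is recorded up to `n_min_days` days after `first_imaging_date`.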

In [None]:
# Select subjects whose earliest qualifying event falls inside the follow-up window.
eids_main = []
# Sets give O(1) membership tests; the code pools can hold thousands of entries and
# are probed once per recorded code of every subject.
selector_codes = set(icd_selector)
exclude_codes = set(icd_selector_extended) - selector_codes
icd_code_exclude = list(exclude_codes)  # codes that are only in the extended pool
for eid, data in icdExtractor.icd_code_dict.items():
    icd_codes = data['icd_codes']
    code_date = []          # dates of codes of interest from an accepted origin
    code_data_exclude = []  # dates of extended-only codes from an accepted origin
    for icd_code_index, icd_code in enumerate(icd_codes):
        if data['icd_origin'][icd_code_index] not in icd_origin:
            continue
        if icd_code in selector_codes:
            code_date.append(data['icd_dates'][icd_code_index])
        if icd_code in exclude_codes:
            code_data_exclude.append(data['icd_dates'][icd_code_index])
    if len(code_date) > 0:
        # Time from the reference date to the earliest code of interest.
        diff = min(code_date) - data[interested_date]

        if len(code_data_exclude) != 0:
            diff_exclude = min(code_data_exclude) - data[interested_date]
        else:
            # No extended-only code: use a value that always passes the check below.
            diff_exclude = pd.Timedelta(days=(n_min_days + 1))
        # Keep the subject when the earliest extended-only code is later than
        # n_min_days and the event lies strictly inside (n_min_days, 365 * n_years) days.
        if (n_min_days < diff_exclude.days and
                n_min_days < diff.days < 365 * n_years):
            eids_main.append(eid)

print(len(set(eids_main)))

In [None]:
# Extract the self-reported cancer dictionary and persist it.
# It is written to the '_imaging' file name that the following cell loads
# ('./resources/self_reported_cancer_dict_imaging.json'), so a fresh
# "Restart & Run All" reads back what was just extracted.
self_reported_cancer_dict = icdExtractor.extract_self_reported_cancer()
icdExtractor.save_icd_code_dict(self_reported_cancer_dict, './resources/self_reported_cancer_dict_imaging.json')

In [None]:
# Load the self-reported cancer dictionary from its JSON file.
# Later cells use it as a mapping eid -> collection of dates (they take min() of each value).
self_reported_cancer_dict = icdExtractor.load_self_reported_icd_code_dict('./resources/self_reported_cancer_dict_imaging.json')

In [None]:
# Number of entries in the self-reported cancer dictionary.
len(self_reported_cancer_dict)

In [None]:
# Second extractor instance whose dictionary file holds the self-reported non-cancer codes.
# The path is relative to the notebook's ./resources directory (the same location as
# icd_code_dict_file_path) rather than an absolute path inside one user's home directory.
icdExtractor_self_reported = ICDExtractor(file_path_imaging, "./resources/self_reported_noncancer_dict_imaging.json")
self_reported_noncancer_dict = icdExtractor_self_reported.load_icd_code_dict()

In [None]:
# Remove subjects whose earliest relevant self-report lies fewer than n_min_days
# days after the reference date (this includes reports made before it).
# NOTE(review): "dementia_allcause" defines icd_self_reported but is handled by
# neither branch here — confirm this is intended.
if MODE in ("cancer", "liver", "pancreas", "pancreas+liver"):
    eids_self_reported_cancer = []
    for eid in eids_main:
        if eid not in self_reported_cancer_dict:
            continue
        earliest_report = min(self_reported_cancer_dict[eid])
        diff = earliest_report - icdExtractor.icd_code_dict[eid][interested_date]
        # a positive difference means the report came after the reference date
        if diff.days < n_min_days:
            eids_self_reported_cancer.append(eid)
    eids_main = list(set(eids_main) - set(eids_self_reported_cancer))
elif MODE in ("ckd", "copd", "osteoarthritis", "osteoporosis", "cvd2"):
    eids_self_reported_noncancer = []
    for eid in eids_main:
        if eid not in self_reported_noncancer_dict:
            continue
        record = self_reported_noncancer_dict[eid]
        # Dates of this subject's self-reported codes that belong to the current mode.
        icd_dates = [
            record['icd_dates'][i]
            for i, icd in enumerate(record['icd_codes'])
            if icd in icd_self_reported
        ]
        if icd_dates:
            diff = min(icd_dates) - icdExtractor.icd_code_dict[eid][interested_date]
            if diff.days < n_min_days:
                eids_self_reported_noncancer.append(eid)
    eids_main = list(set(eids_main) - set(eids_self_reported_noncancer))

len(eids_main)

In [None]:
# exclude the subjects that have the (extended) list of ICD codes before the first imaging date in all registers
extended_codes = set(icd_selector_extended)  # O(1) membership instead of scanning the list
eids_with_icd = []
for eid in eids_main:
    data = icdExtractor.get_all_before_date(eid, icdExtractor.icd_code_dict[eid][interested_date])
    # Record each subject once and stop scanning at the first matching code.
    if any(icd_code in extended_codes for icd_code in data['icd_codes']):
        eids_with_icd.append(eid)
print(len(set(eids_with_icd)))

In [None]:
# Drop the subjects flagged in eids_with_icd; the result is a de-duplicated list.
eids_main = list(set(eids_main) - set(eids_with_icd))
len(eids_main)

## Define the positive cohort (has event in any register)

In [None]:
# Positive subjects: anyone with at least one code from the extended pool,
# regardless of the code's origin or date.
extended_codes = set(icd_selector_extended)  # O(1) membership instead of scanning the list
eids_positive = [
    eid
    for eid, data in icdExtractor.icd_code_dict.items()
    # any() stops at the first hit, so each subject is listed once
    if any(icd_code in extended_codes for icd_code in data['icd_codes'])
]
print(len(set(eids_positive)))

In [None]:
# Extend the positive cohort with self-reported cases for the current mode.
if MODE in ("cancer", "liver", "pancreas", "pancreas+liver"):
    # every subject with any self-reported cancer entry
    eids_positive.extend(set(self_reported_cancer_dict.keys()))
elif MODE in ("ckd", "copd", "osteoarthritis", "osteoporosis", "cvd2"):
    # one entry per (subject, matching self-reported code) pair
    eids_positive.extend(
        eid
        for eid, record in self_reported_noncancer_dict.items()
        for icd in icd_self_reported
        if icd in record['icd_codes']
    )
            

In [None]:
# De-duplicate the positive cohort and write it to <basis_file_path>/eids_positive.csv.
eids_positive = list(set(eids_positive))
print(len(set(eids_positive)))
# Make sure the output directory exists so the write does not fail on a fresh checkout.
os.makedirs(basis_file_path, exist_ok=True)
pd.DataFrame(eids_positive, columns=['eid']).to_csv(os.path.join(basis_file_path, "eids_positive.csv"), index=False)

In [None]:
PROJECT_DIR = "put yours"  # set this to the project root directory
# Sub-directory (relative to PROJECT_DIR) that holds most of the eid lists below.
CANCER_COHORT_DIR = "cancer_C00-C43_C45-C97_D37-D48"


def _read_eids(*path_parts):
    """Return the 'eid' column of the CSV at PROJECT_DIR/<path_parts...> as a list.

    os.path.join is used so PROJECT_DIR works with or without a trailing separator.
    """
    return pd.read_csv(os.path.join(PROJECT_DIR, *path_parts))['eid'].tolist()


eids_with_mri = _read_eids(CANCER_COHORT_DIR, "eids_all_mri.csv")

eids_with_all_features = _read_eids(CANCER_COHORT_DIR, "eids_with_features", "eids_with_all_features.csv")

eids_with_cardiac_features = _read_eids(CANCER_COHORT_DIR, "eids_with_features", "eids_with_cardiac_features.csv")

eids_with_brain_features = _read_eids(CANCER_COHORT_DIR, "eids_with_features", "eids_with_brain_features.csv")

eids_with_wholebody_features = _read_eids(CANCER_COHORT_DIR, "eids_with_features", "eids_with_wholebody_features.csv")
eids_with_wholebody_images = _read_eids(CANCER_COHORT_DIR, "eids_with_features", "eids_with_wholebody_images.csv")

eids_with_cardiac_mri = _read_eids(CANCER_COHORT_DIR, "eids_with_cardiac_mri.csv")

eids_with_liver_mri = _read_eids("data/data/abdominal/liver_data/projections_all", "eids_with_projections_all.csv")

eids_with_pancreas_mri = _read_eids("data/data/abdominal/pancreas_data/projections_all", "eids_with_projections_all.csv")

eids_with_wholebody_mris = _read_eids(CANCER_COHORT_DIR, "eids_with_features", "eids_with_wholebodymris.csv")


In [None]:
# Rebind every eid collection to a set so the cells below can intersect them.
(
    eids_with_all_features,
    eids_with_cardiac_features,
    eids_with_brain_features,
    eids_with_wholebody_features,
    eids_with_wholebody_images,
    eids_with_mri,
    eids_with_cardiac_mri,
    eids_with_liver_mri,
    eids_with_pancreas_mri,
    eids_with_wholebody_mris,
) = map(set, (
    eids_with_all_features,
    eids_with_cardiac_features,
    eids_with_brain_features,
    eids_with_wholebody_features,
    eids_with_wholebody_images,
    eids_with_mri,
    eids_with_cardiac_mri,
    eids_with_liver_mri,
    eids_with_pancreas_mri,
    eids_with_wholebody_mris,
))

In [None]:
# Restrict the cohort to subjects that have the imaging data required for MODE
# and an entry in the ICD code dictionary.
eids_in_icd_dict = set(icdExtractor.icd_code_dict.keys())
if MODE == "cancer":
    eids_basis = set.intersection(eids_with_wholebody_images, eids_in_icd_dict)
elif MODE in ("copd", "ckd", "osteoarthritis", "osteoporosis"):
    eids_basis = set.intersection(eids_with_wholebody_images, eids_with_wholebody_mris, eids_in_icd_dict)
elif MODE in ("cvd", "cvd2"):
    eids_basis = set.intersection(eids_with_cardiac_mri, eids_with_wholebody_mris, eids_in_icd_dict)
elif MODE in ("liver", "pancreas+liver"):
    eids_basis = set.intersection(eids_with_liver_mri, eids_in_icd_dict)
elif MODE == "pancreas":
    eids_basis = set.intersection(eids_with_pancreas_mri, eids_in_icd_dict)
else:
    # Without this branch an unsupported MODE leaves eids_basis undefined, or stale
    # from an earlier run, and the intersection below silently uses the wrong pool.
    raise ValueError("No imaging basis defined for MODE: {!r}".format(MODE))

eids_main = list(set(eids_basis).intersection(set(eids_main)))

In [None]:
# Cohort size after restricting to subjects with the required imaging data.
len(eids_main)

In [None]:
# Persist the cohort as a single-column CSV ('eid') under basis_file_path.
pd.DataFrame(eids_main, columns=['eid']).to_csv(os.path.join(basis_file_path, "eids_main.csv"), index=False)

In [None]:
# Reload the cohort from the CSV under basis_file_path; this replaces the in-memory list.
eids_main = pd.read_csv(os.path.join(basis_file_path, "eids_main.csv"))['eid'].tolist()
len(eids_main)

In [None]:
# Size of the imaging-based candidate pool for the current MODE.
len(eids_basis)

## Normal Cohort Definition

In [None]:
# Candidate pool for controls: every eid in the ICD code dictionary.
# NOTE(review): the following cell reassigns eids_basis, so this value is only used
# when that cell is skipped.
eids_basis = list(icdExtractor.icd_code_dict.keys())
len(eids_basis)

In [None]:
# Candidate pool for controls: all eids in the ICD code dictionary minus those listed
# in the 'cancer_redefine' labels file.
# NOTE(review): the labels path is fixed to 'cancer_redefine' regardless of MODE, and
# PROJECT_DIR must end with a path separator for the concatenation to work — confirm both.
eids_basis = set(list(icdExtractor.icd_code_dict.keys())) - set(pd.read_csv(PROJECT_DIR + "PrepareDataset/resources/cancer_redefine/labels.csv")['eid'].tolist())
eids_basis = list(eids_basis)
len(eids_basis)

In [None]:
# Configure the extractor that builds the control cohort.
from NormalControlExtractor import NormalControlExtractor
# PROJECT_DIR must end with a path separator for this concatenation to give a valid path.
basic_features_path = PROJECT_DIR + "PrepareDataset/resources/features/677795/basic_data_imaging_fields.csv"
normalControlExtractor = NormalControlExtractor(file_path_imaging, interested_date, basic_features_path)

# "cencoring" is the spelling of the method name as it is called here.
normalControlExtractor.set_cencoring_date(n_years, censoring_date)    
normalControlExtractor.set_eids_pool(eids_basis)               # candidate pool
normalControlExtractor.set_eids_positive_cohort(eids_main)     # cohort of interest
normalControlExtractor.set_eids_positive(eids_positive)        # everyone with an event in any register

# Column names passed as propensity attributes
# (the "-2.0" suffix presumably denotes the imaging-visit instance — TODO confirm).
sex_field = ["31-0.0"]
age_field = ["21003-2.0"] 
bmi_field = ["21001-2.0"] 
ethnicity = ['21000-0.0']
propensity_attributes = bmi_field + sex_field + age_field + ethnicity
normalControlExtractor.set_propensity_attributes(propensity_attributes)

In [None]:
# Run this cell only if you already have eids_selected (a previously chosen control list).
# NOTE(review): it overwrites eids_main with the 'cancer_all_redefine' cohort regardless
# of MODE and uses the "-0.0" age/BMI columns, unlike the preceding setup cell — confirm.
from NormalControlExtractor import NormalControlExtractor
# PROJECT_DIR must end with a path separator for these concatenations to give valid paths.
basic_features_path = PROJECT_DIR + "PrepareDataset/resources/features/677795/basic_data.csv"
eids_main = pd.read_csv(PROJECT_DIR + "resources/cancer_all_redefine/eids_main.csv")['eid'].tolist()
eids_selected = pd.read_csv(PROJECT_DIR + "resources/cancer_all_redefine/eids_selected_normal_control.csv")['eid'].tolist()
normalControlExtractor = NormalControlExtractor(file_path_imaging, interested_date, basic_features_path)
normalControlExtractor.set_eids_positive_cohort(eids_main)

# Column names passed as propensity attributes.
sex_field = ["31-0.0"]
age_field = ["21003-0.0"] 
bmi_field = ["21001-0.0"] 
ethnicity = ['21000-0.0']
propensity_attributes = sex_field + age_field + bmi_field + ethnicity
normalControlExtractor.set_propensity_attributes(propensity_attributes)

# Build the matched dataset from the pre-selected controls and save it under basis_file_path.
matched_dataset = normalControlExtractor.extract_normal_control(eids_selected)
matched_dataset.to_csv(os.path.join(basis_file_path, "matched_dataset.csv"), index=False)

In [None]:
# Run the extraction with the configured extractor and save the result as matched_dataset.csv.
# (Later cells read the 'eid' and 'status' columns of this frame.)
matched_dataset = normalControlExtractor.extract()
matched_dataset.to_csv(os.path.join(basis_file_path, "matched_dataset.csv"), index=False)

In [None]:
# Pass the matched eids and the output path <basis_file_path>/final_cohort.csv to the extractor.
icdExtractor.save_all_with_eids(matched_dataset['eid'].tolist(), os.path.join(basis_file_path, "final_cohort.csv"))

## Label Definition and train test split

In [None]:
# Split the matched dataset into train and test, stratified by the 'status' label,
# and record the assignment in a 'split' column. The CSV is overwritten in place.
file_path_matched_dataset = os.path.join(basis_file_path, 'matched_dataset.csv')
matched_dataset = pd.read_csv(file_path_matched_dataset)

from sklearn.model_selection import train_test_split
matched_dataset_train, matched_dataset_test = train_test_split(matched_dataset, test_size=0.2, stratify=matched_dataset['status'], random_state=42)
# .assign returns new frames, so no column is set on a slice of matched_dataset
# (avoids pandas' SettingWithCopyWarning).
matched_dataset_train = matched_dataset_train.assign(split='train')
matched_dataset_test = matched_dataset_test.assign(split='test')
matched_dataset = pd.concat([matched_dataset_train, matched_dataset_test])
matched_dataset.to_csv(file_path_matched_dataset, index=False)

In [None]:
# Number of rows in the matched dataset (train and test together).
len(matched_dataset)

In [None]:
# Build per-subject labels: event flag, time to event or censoring in days, and data split.
# NOTE(review): the next cell reads labels.csv, but no visible cell writes labels_dict
# to that file — confirm where it is persisted.
selector_codes = set(icd_selector)  # O(1) membership tests
# Keep the first row per eid so a duplicated eid resolves to its first occurrence, and
# read the three columns once instead of filtering the whole frame twice per subject.
label_rows = matched_dataset.drop_duplicates(subset='eid', keep='first')
labels_dict = {}
for eid, event, split in zip(label_rows['eid'].tolist(),
                             label_rows['status'].tolist(),
                             label_rows['split'].tolist()):
    data = icdExtractor.icd_code_dict[eid]
    if event == 1:
        # Dates of all codes of interest coming from an accepted origin.
        code_date = [
            data['icd_dates'][icd_code_index]
            for icd_code_index, icd_code in enumerate(data["icd_codes"])
            if (icd_code in selector_codes) and (data['icd_origin'][icd_code_index] in icd_origin)
        ]
        if not code_date:
            # min() on an empty list would raise an uninformative ValueError.
            raise ValueError("eid {} has status 1 but no qualifying ICD code".format(eid))
        time_to_event = min(code_date) - data[interested_date]
    else:
        # No event: follow-up runs from the reference date to the censoring date.
        time_to_event = censoring_date - data[interested_date]
    labels_dict[eid] = {'event': event, 'time_to_event': time_to_event.days, 'split': split}

In [None]:
# Carve a validation subset (10%, stratified by 'event') out of the training rows of
# labels.csv and write the result to labels_with_val.csv.
# NOTE(review): labels.csv is not written by any visible cell — confirm it exists.
base_path = basis_file_path + "/"
eids_main_csv = base_path + "labels.csv"
labels = pd.read_csv(eids_main_csv)
seed = 42
from sklearn.model_selection import train_test_split
labels_train = labels[labels['split'] == 'train']
labels_test = labels[labels['split'] == 'test']
labels_train_train, labels_train_val = train_test_split(labels_train, test_size=0.1, stratify=labels_train['event'], random_state=seed)
# .assign returns a new frame, so the column is not set on a slice of `labels`
# (avoids pandas' SettingWithCopyWarning).
labels_train_val = labels_train_val.assign(split='val')
# concat train and val and test
labels_res = pd.concat([labels_train_train, labels_train_val, labels_test])
labels_res.to_csv(base_path + "labels_with_val.csv", index=False)