# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
import yaml
from tqdm.notebook import trange, tqdm
dataset_name = "cvd_interactions"
path = "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"
dataset_path = f"{data_path}/2_datasets_pre/{dataset_name}"

In [2]:
from pathlib import Path
Path(dataset_path).mkdir(parents=True, exist_ok=True)

In [3]:
data = pd.read_feather(f"{data_path}/1_decoded/ukb_data.feather")
data_field = pd.read_feather(f"{data_path}/1_decoded/ukb_data_field.feather")
data_columns = data.columns.to_list()

## Mappings + Vocabulary

In [4]:
# Drop obviouse missing data
print(len(data))
data = data.dropna(subset=["sex_f31_0_0"], axis=0)
print(len(data))

502505
502504


# Starting information

In [5]:
time0_col="birth_date"
# time0_col="date_of_attending_assessment_centre_f53_0_0"

# Baseline covariates

In [6]:
def get_fields(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields) & data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_fields_all(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields)].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_data_fields(fields, data, data_field):
    f = get_fields(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

def get_data_fields_all(fields, data, data_field):
    f = get_fields_all(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

### Basics

In [7]:
fields_basics = [
    "21022", # age at recruitment
    "31", # sex
    "21000", # ethnicity
    "189", # Townsend index
    "53", # date of baseline assessment
]

temp = get_data_fields(fields_basics, data, data_field)

temp["sex_f31_0_0"] = temp["sex_f31_0_0"].cat.set_categories(["Female", 'Male'], ordered=False)

#temp["ethnic_background_f21000_0_0"] = temp["ethnic_background_f21000_0_0"].astype("string")

ethn_bg_def = {"White": ["White", "British", "Irish", "Any other white background"],
                "Mixed": ["Mixed", "White and Black Caribbean", "White and Black African", "White and Asian", "Any other mixed background"],  
                "Asian": ["Asian or Asian British", "Indian", "Pakistani", "Bangladeshi", "Any other Asian background"], 
                "Black": ["Black or Black British", "Caribbean", "African", "Any other Black background"],
                "Chinese": ["Chinese"],  
                np.nan: ["Other ethnic group", "Do not know", "Prefer not to answer"]}

ethn_bg_dict = {}
for key, values in ethn_bg_def.items(): 
    for value in values:
        ethn_bg_dict[value]=key 
        
temp["ethnic_background_f21000_0_0"].replace(ethn_bg_dict, inplace=True)
temp["ethnic_background_f21000_0_0"] = temp["ethnic_background_f21000_0_0"].astype("category")

#
#temp["ethnic_background_f21000_0_0"] = temp["ethnic_background_f21000_0_0"].astype("category").cat.set_categories(['White', 'Black', 'Asien', 'Mixed', 'Chinese'], ordered=False)

basics = temp
print(len(temp))
display(temp.head())

from dateutil.relativedelta import relativedelta
calc_birth_date = [date_of_attending_assessment_centre - relativedelta(years=age_at_recruitment) 
                                                             for date_of_attending_assessment_centre, age_at_recruitment 
                                                             in zip(basics["date_of_attending_assessment_centre_f53_0_0"], basics["age_at_recruitment_f21022_0_0"])]
basics = basics.assign(birth_date = calc_birth_date)


basics.to_feather(os.path.join(path, dataset_path, 'temp_basics.feather'))

502504


Unnamed: 0,eid,age_at_recruitment_f21022_0_0,sex_f31_0_0,ethnic_background_f21000_0_0,townsend_deprivation_index_at_recruitment_f189_0_0,date_of_attending_assessment_centre_f53_0_0
0,1000018,49.0,Female,White,-1.85293,2009-11-12
1,1000020,59.0,Male,White,0.204248,2008-02-19
2,1000037,59.0,Female,White,-3.49886,2008-11-11
3,1000043,63.0,Male,White,-5.35115,2009-06-03
4,1000051,51.0,Female,White,-1.79908,2006-06-10


In [8]:
 print(temp["ethnic_background_f21000_0_0"].unique())

[White, Black, NaN, Asian, Mixed, Chinese]
Categories (5, object): [White, Black, Asian, Mixed, Chinese]


### Questionnaire

In [9]:
fields_questionnaire = [
    "2178", # Overall health
    "20116", # Smoking status
    "1558",
]

temp = get_data_fields(fields_questionnaire, data, data_field)

temp["overall_health_rating_f2178_0_0"] = temp["overall_health_rating_f2178_0_0"]\
    .replace({"Do not know": np.nan, "Prefer not to answer": np.nan})\
    .astype("category").cat.set_categories(['Poor', 'Fair', 'Good', 'Excellent'], ordered=True)


temp["smoking_status_f20116_0_0"] = temp["smoking_status_f20116_0_0"]\
    .replace({"Prefer not to answer": np.nan}, inplace=False)\
    .astype("category").cat.set_categories(['Current', 'Previous', 'Never'], ordered=True)

temp["alcohol_intake_frequency_f1558_0_0"] = temp["alcohol_intake_frequency_f1558_0_0"]\
    .replace({"Prefer not to answer": np.nan}, inplace=False)\
    .astype("category").cat.set_categories([
        'Daily or almost daily', 
        'Three or four times a week', 
        'Once or twice a week',
        'One to three times a month',
        'Special occasions only', 
        'Never'], ordered=True)

questionnaire = temp
print(len(temp))
display(temp.head())

questionnaire.to_feather(os.path.join(path, dataset_path, 'temp_questionnaire.feather'))

502504


Unnamed: 0,eid,overall_health_rating_f2178_0_0,smoking_status_f20116_0_0,alcohol_intake_frequency_f1558_0_0
0,1000018,Fair,Current,Once or twice a week
1,1000020,Good,Current,Once or twice a week
2,1000037,Good,Previous,Once or twice a week
3,1000043,Fair,Previous,Three or four times a week
4,1000051,Poor,Never,One to three times a month


In [10]:
print(temp["alcohol_intake_frequency_f1558_0_0"].unique())

[Once or twice a week, Three or four times a week, One to three times a month, Daily or almost daily, Special occasions only, Never, NaN]
Categories (6, object): [Daily or almost daily < Three or four times a week < Once or twice a week < One to three times a month < Special occasions only < Never]


### Physical measurements

In [11]:
from statistics import mean

fields_measurements = [
#    "100313", # Walking speed !!! MISSING !!!
    "21001", # BMI
    "21002", # weight
    "4080", # Syst. BP
    "4079", # Diast. BP
    "102",
    "21021",
    "4195",
    "48",
    "49",
    "50",
    "23127",
    "23099",
    "23105",
    "20151",
    "20150",
    "20258",
    "3064",
    
]
temp = get_data_fields(fields_measurements, data, data_field)

sbp_cols = ["systolic_blood_pressure_automated_reading_f4080_0_0", "systolic_blood_pressure_automated_reading_f4080_0_1"]
dbp_cols = ["diastolic_blood_pressure_automated_reading_f4079_0_0", "diastolic_blood_pressure_automated_reading_f4079_0_1"]
pr_cols = ["pulse_rate_automated_reading_f102_0_0", "pulse_rate_automated_reading_f102_0_1"]

temp = temp.assign(systolic_blood_pressure_automated_reading_f4080 = temp[sbp_cols].mean(axis=1),
                   diastolic_blood_pressure_automated_reading_f4079 = temp[dbp_cols].mean(axis=1),
                   pulse_rate_automated_reading_f102 = temp[pr_cols].mean(axis=1))\
    .drop(sbp_cols + dbp_cols + pr_cols, axis=1)

measurements = temp
print(len(temp))
display(temp.head())

measurements.to_feather(os.path.join(path, dataset_path, 'temp_measurements.feather'))

502504


Unnamed: 0,eid,body_mass_index_bmi_f21001_0_0,weight_f21002_0_0,pulse_wave_arterial_stiffness_index_f21021_0_0,pulse_wave_reflection_index_f4195_0_0,waist_circumference_f48_0_0,hip_circumference_f49_0_0,standing_height_f50_0_0,trunk_fat_percentage_f23127_0_0,body_fat_percentage_f23099_0_0,basal_metabolic_rate_f23105_0_0,forced_vital_capacity_fvc_best_measure_f20151_0_0,forced_expiratory_volume_in_1second_fev1_best_measure_f20150_0_0,fev1_fvc_ratio_zscore_f20258_0_0,peak_expiratory_flow_pef_f3064_0_2,peak_expiratory_flow_pef_f3064_0_1,peak_expiratory_flow_pef_f3064_0_0,systolic_blood_pressure_automated_reading_f4080,diastolic_blood_pressure_automated_reading_f4079,pulse_rate_automated_reading_f102
0,1000018,26.5557,63.8,7.277,80.0,85.0,107.0,155.0,37.5,39.5,5012.0,3.21,2.16,1.978,317.0,312.0,339.0,159.5,88.0,50.0
1,1000020,22.7465,70.7,,,87.8,94.4,176.3,33.4,28.7,6171.0,,,1.375,301.0,496.0,504.0,133.0,81.0,74.0
2,1000037,32.4211,78.9,,,101.0,112.0,156.0,47.5,48.4,5397.0,1.61,1.27,0.138,,185.0,208.0,118.5,78.0,62.5
3,1000043,29.5679,95.8,11.1111,78.0,98.0,104.0,180.0,27.6,25.6,8711.0,4.14,2.84,1.096,557.0,513.0,530.0,141.5,93.5,64.5
4,1000051,41.0222,92.3,,,123.0,129.0,150.0,48.9,50.4,6100.0,,,0.518,,,,117.0,81.0,79.0


### Lab measurements

In [12]:
fields_blood_count = [
    "30160", #	Basophill count
    "30220", #	Basophill percentage
    "30150", #	Eosinophill count
    "30210", #	Eosinophill percentage
    "30030", #	Haematocrit percentage
    "30020", #	Haemoglobin concentration
    "30300", #	High light scatter reticulocyte count
    "30290", #	High light scatter reticulocyte percentage
    "30280", #	Immature reticulocyte fraction
    "30120", #	Lymphocyte count
    "30180", #	Lymphocyte percentage
    "30050", #	Mean corpuscular haemoglobin
    "30060", #	Mean corpuscular haemoglobin concentration
    "30040", #	Mean corpuscular volume
    "30100", #	Mean platelet (thrombocyte) volume
    "30260", #	Mean reticulocyte volume
    "30270", #	Mean sphered cell volume
    "30130", #	Monocyte count
    "30190", #	Monocyte percentage
    "30140", #	Neutrophill count
    "30200", #	Neutrophill percentage
    "30170", #	Nucleated red blood cell count
    "30230", #	Nucleated red blood cell percentage
    "30080", #	Platelet count
    "30090", #	Platelet crit
    "30110", #	Platelet distribution width
    "30010", #	Red blood cell (erythrocyte) count
    "30070", #	Red blood cell (erythrocyte) distribution width
    "30250", #	Reticulocyte count
    "30240", #	Reticulocyte percentage
    "30000", #	White blood cell (leukocyte) count
]

fields_blood_biochemistry = [
    "30620",#	Alanine aminotransferase
    "30600",#	Albumin
    "30610",#	Alkaline phosphatase
    "30630",#	Apolipoprotein A
    "30640",#	Apolipoprotein B
    "30650",#	Aspartate aminotransferase
    "30710",#	C-reactive protein
    "30680",#	Calcium
    "30690",#	Cholesterol
    "30700",#	Creatinine
    "30720",#	Cystatin C
    "30660",#	Direct bilirubin
    "30730",#	Gamma glutamyltransferase
    "30740",#	Glucose
    "30750",#	Glycated haemoglobin (HbA1c)
    "30760",#	HDL cholesterol
    "30770",#	IGF-1
    "30780",#	LDL direct
    "30790",#	Lipoprotein A
    "30800",#	Oestradiol
    "30810",#	Phosphate
    "30820",#	Rheumatoid factor
    "30830",#	SHBG
    "30850",#	Testosterone
    "30840",#	Total bilirubin
    "30860",#	Total protein
    "30870",#	Triglycerides
    "30880",#	Urate
    "30670",#	Urea
    "30890",#	Vitamin D
]

fields_blood_infectious = [
    "23000", #	1gG antigen for Herpes Simplex virus-1
    "23001", #	2mgG unique antigen for Herpes Simplex virus-2
    "23049", #	Antigen assay QC indicator
    "23048", #	Antigen assay date
    "23026", #	BK VP1 antigen for Human Polyomavirus BKV
    "23039", #	CagA antigen for Helicobacter pylori
    "23043", #	Catalase antigen for Helicobacter pylori
    "23018", #	Core antigen for Hepatitis C Virus
    "23030", #	E6 antigen for Human Papillomavirus type-16
    "23031", #	E7 antigen for Human Papillomavirus type-16
    "23006", #	EA-D antigen for Epstein-Barr Virus
    "23004", #	EBNA-1 antigen for Epstein-Barr Virus
    "23042", #	GroEL antigen for Helicobacter pylori
    "23016", #	HBc antigen for Hepatitis B Virus
    "23017", #	HBe antigen for Hepatitis B Virus
    "23025", #	HIV-1 env antigen for Human Immunodeficiency Virus
    "23024", #	HIV-1 gag antigen for Human Immunodeficiency Virus
    "23023", #	HTLV-1 env antigen for Human T-Lymphotropic Virus 1
    "23022", #	HTLV-1 gag antigen for Human T-Lymphotropic Virus 1
    "23010", #	IE1A antigen for Human Herpesvirus-6
    "23011", #	IE1B antigen for Human Herpesvirus-6
    "23027", #	JC VP1 antigen for Human Polyomavirus JCV
    "23015", #	K8.1 antigen for Kaposi's Sarcoma-Associated Herpesvirus
    "23029", #	L1 antigen for Human Papillomavirus type-16
    "23032", #	L1 antigen for Human Papillomavirus type-18
    "23014", #	LANA antigen for Kaposi's Sarcoma-Associated Herpesvirus
    "23028", #	MC VP1 antigen for Merkel Cell Polyomavirus
    "23019", #	NS3 antigen for Hepatitis C Virus
    "23041", #	OMP antigen for Helicobacter pylori
    "23037", #	PorB antigen for Chlamydia trachomatis
    "23013", #	U14 antigen for Human Herpesvirus-7
    "23044", #	UreA antigen for Helicobacter pylori
    "23003", #	VCA p18 antigen for Epstein-Barr Virus
    "23040", #	VacA antigen for Helicobacter pylori
    "23005", #	ZEBRA antigen for Epstein-Barr Virus
    "23002", #	gE / gI antigen for Varicella Zoster Virus
    "23034", #	momp A antigen for Chlamydia trachomatis
    "23033", #	momp D antigen for Chlamydia trachomatis
    "23012", #	p101 k antigen for Human Herpesvirus-6
    "23020", #	p22 antigen for Toxoplasma gondii
    "23038", #	pGP3 antigen for Chlamydia trachomatis
    "23009", #	pp 28 antigen for Human Cytomegalovirus
    "23008", #	pp 52 antigen for Human Cytomegalovirus
    "23007", #	pp150 Nter antigen for Human Cytomegalovirus
    "23021", #	sag1 antigen for Toxoplasma gondii
    "23035", #	tarp-D F1 antigen for Chlamydia trachomatis
    "23036", #	tarp-D F2 antigen for Chlamydia trachomatis
]

labs = temp = get_data_fields(fields_blood_count+fields_blood_biochemistry+fields_blood_infectious, data, data_field)
print(len(temp))
display(temp.head())

labs.to_feather(os.path.join(path, dataset_path, 'temp_labs.feather'))

502504


Unnamed: 0,eid,basophill_count_f30160_0_0,basophill_percentage_f30220_0_0,eosinophill_count_f30150_0_0,eosinophill_percentage_f30210_0_0,haematocrit_percentage_f30030_0_0,haemoglobin_concentration_f30020_0_0,high_light_scatter_reticulocyte_count_f30300_0_0,high_light_scatter_reticulocyte_percentage_f30290_0_0,immature_reticulocyte_fraction_f30280_0_0,...,phosphate_f30810_0_0,rheumatoid_factor_f30820_0_0,shbg_f30830_0_0,testosterone_f30850_0_0,total_bilirubin_f30840_0_0,total_protein_f30860_0_0,triglycerides_f30870_0_0,urate_f30880_0_0,urea_f30670_0_0,vitamin_d_f30890_0_0
0,1000018,0.04,0.26,0.25,1.75,39.79,13.9,0.022,0.464,0.378,...,1.422,,70.11,1.56,7.41,71.97,1.247,221.3,5.48,70.7
1,1000020,0.0,0.3,0.3,2.5,45.0,15.6,0.014,0.29,0.3,...,1.264,,55.31,12.237,8.07,78.45,1.906,374.7,5.28,35.9
2,1000037,0.04,0.57,0.1,1.43,39.48,13.58,0.031,0.686,0.38,...,,,,,,,,,,
3,1000043,0.02,0.32,0.11,1.8,44.31,14.99,0.025,0.508,0.25,...,0.928,,31.63,11.398,8.65,69.7,5.184,322.8,6.67,63.6
4,1000051,,,,,,,,,,...,,,,,,,,,,


### Family History

In [13]:
fh_list=["Heart disease", "Stroke", "High blood pressure",  "Diabetes", "Lung cancer", "Severe depression", "Parkinson's disease", "Alzheimer's disease/dementia", "Chronic bronchitis/emphysema", "Breast cancer", "Bowel cancer"]
with open(os.path.join(path, dataset_path, 'fh_list.yaml'), 'w') as file: yaml.dump(fh_list, file, default_flow_style=False)

fields_family_history = [
    "20107", # Family history 
    "20110" # Family history
]

raw = get_data_fields(fields_family_history, data, data_field)
temp = pd.melt(raw, id_vars=["eid"], value_vars=raw.drop("eid", axis=1).columns.to_list(), var_name = "field", value_name="family_history").drop("field", axis=1)
temp = temp[temp.family_history.isin(fh_list)].assign(family_history=temp["family_history"].str.lower().replace(" ", "_", regex=True))

temp = temp.drop_duplicates().sort_values("eid").reset_index().drop("index", axis=1).assign(n=True)
temp = pd.pivot_table(temp, index="eid", columns="family_history", values="n", observed=True).add_prefix('fh_')
family_history = temp = data[["eid"]].copy().merge(temp, how="left", on="eid").fillna(False)

print(len(temp))
temp.head()

family_history.to_feather(os.path.join(path, dataset_path, 'temp_family_history.feather'))

502504


## Medications

In [14]:
# https://list.essentialmeds.org/?showRemoved=0
# essential medicines WHO?!

In [15]:
atc_mapping = pd.read_csv(f"{path}/mapping/atc/atc_matched_list.csv")
athena_concepts = pd.read_csv(f"{data_path}/athena_vocabulary/CONCEPT.csv", sep="\t").assign(vocabulary_id = lambda x: x.vocabulary_id.astype("string"), concept_class_id = lambda x: x.concept_class_id.astype("string"))
atc_concepts = athena_concepts[athena_concepts.vocabulary_id=="ATC"]
atc2_concepts = atc_concepts[atc_concepts.concept_class_id=="ATC 2nd"].sort_values("concept_code")
medication_list = dict(zip([x.lower().replace(" ", "_") for x in atc2_concepts.concept_name.to_list()], [[x] for x in atc2_concepts.concept_code.to_list()]))
medication_list_extra = {
    "antihypertensives": ["C02"],
    "statins": ["C10A", "C10B"],
    "ass": ["B01"],
    "atypical_antipsychotics" : ["N05"],
    "glucocorticoids" : ["H02"]                        
}
medication_list.update(medication_list_extra)

with open(os.path.join(path, dataset_path, 'medication_list.yaml'), 'w') as file: yaml.dump(medication_list, file, default_flow_style=False)

In [None]:
def had_medication_before(data, data_field, medications, atc_mapping):
    fields = ["20003"]
    raw = get_data_fields(fields, data, data_field)
    temp = pd.melt(raw, id_vars=["eid"], value_vars=raw.drop("eid", axis=1).columns.to_list(), var_name = "field", value_name="UKBB_code").drop("field", axis=1).drop_duplicates()

    temp.UKBB_code = temp.UKBB_code.astype(str)
    temp = temp[temp.UKBB_code!="None"].copy()
    temp.UKBB_code = temp.UKBB_code.astype(int)

    temp_atc = temp.merge(atc_mapping, how="left", on="UKBB_code").sort_values("eid").reset_index(drop=True).dropna(subset=["ATC_code"], axis=0)
    temp_atc.ATC_code = temp_atc.ATC_code.astype("string")
    temp = data[["eid"]].copy()
    for med, med_codes in tqdm(medication_list.items()):
        regex_str = "^"+"|^".join(med_codes)
        df = temp_atc[temp_atc.ATC_code.str.contains(regex_str, case=False)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(medication=True)
        temp[med] = temp.merge(df, how="left", on="eid").fillna(False).medication
        
    return temp.sort_values("eid")

In [34]:
def had_diagnosis_before_per_ph(df_before, ph, ph_codes, temp):
   # regex = "|".join(ph_codes)
    #df_ph = df_before.set_index("meaning").loc[ph_codes][["eid"]]\
    #        .drop_duplicates(subset=["eid"])\
    #        .assign(phenotype=True) 
    df_ph = df_before[df_before.meaning.isin(ph_codes)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(phenotype=True) 
    #df_ph = df_before[df_before.meaning.str.contains(regex, case=False)][["eid"]]\
    #        .drop_duplicates(subset=["eid"])\
    #        .assign(phenotype=True)   
    return temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype

def had_diagnosis_before(data, diagnoses_codes, phenotypes, time0=time0_col):
    diagnoses_codes_time = diagnoses_codes.merge(data[["eid", time0]], how="left", on="eid")
    
    temp = data[["eid"]].copy()
    df_before = diagnoses_codes_time[diagnoses_codes_time.date < diagnoses_codes_time[time0]]
                                                                                         
    df_phs = Parallel(n_jobs=20, require="sharedmem")(delayed(had_diagnosis_before_per_ph)(df_before, ph, phenotypes[ph], temp) for ph in tqdm(list(phenotypes)))
    for ph, df_ph_series in zip(tqdm(list(phenotypes)), df_phs): temp[ph] = df_ph_series#temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype
    
    return temp.sort_values("eid")  #reduce(lambda left,right: pd.merge(left,right,on=['eid'], how='left'), df_phs).

In [35]:
medications = had_medication_before(data, data_field, medication_list, atc_mapping)
print(len(medications))
medications.head(100)

medications.to_feather(os.path.join(path, dataset_path, 'temp_medications.feather'))

HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))


502504


## Diagnoses and events

In [36]:
vocab_dir = f"{data_path}/athena_vocabulary_covid"
vocab = {
    "concept": pd.read_csv(f"{vocab_dir}/CONCEPT.csv", sep='\t'),
    "domain": pd.read_csv(f"{vocab_dir}/DOMAIN.csv", sep='\t'),
    "class": pd.read_csv(f"{vocab_dir}/CONCEPT_CLASS.csv", sep='\t'),
    "relationship": pd.read_csv(f"{vocab_dir}/RELATIONSHIP.csv", sep='\t'),
    "drug_strength": pd.read_csv(f"{vocab_dir}/DRUG_STRENGTH.csv", sep='\t'),
    "vocabulary": pd.read_csv(f"{vocab_dir}/VOCABULARY.csv", sep='\t'),
    "concept_synonym": pd.read_csv(f"{vocab_dir}/CONCEPT_SYNONYM.csv", sep='\t'),
    "concept_ancestor": pd.read_csv(f"{vocab_dir}/CONCEPT_ANCESTOR.csv", sep='\t'),
    "concept_relationship": pd.read_csv(f"{vocab_dir}/CONCEPT_RELATIONSHIP.csv", sep='\t')                       
}

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Definitions

In [16]:
coding1836 = pd.read_csv(f"{path}/mapping/codings/coding1836.tsv", sep="\t").rename(columns={"coding":"code"})
phecodes = pd.read_csv(f"{path}/mapping/phecodes/phecode_icd10.csv")
def phenotype_children(phecodes, phenotype_list):
    l={}
    phecodes = phecodes.dropna(subset=["Phenotype"], axis=0)
    for ph, ph_names in phenotype_list.items():
        regex = "|".join(ph_names)
        l[ph] = list(phecodes[phecodes.Phenotype.str.contains(regex, case=False)].ICD10.str.replace("\\.", "").str.slice(0, 3).unique())
    return l

In [38]:
snomed_core = pd.read_csv(f"{path}/mapping/snomed_core_list.txt", sep="|")

In [39]:
snomed_core = snomed_core.query("SNOMED_CONCEPT_STATUS == 'Current'").copy()
new = snomed_core.SNOMED_FSN.str.split("(", n=1, expand=True)
snomed_core["snomed_name"] = new[0].str.rstrip(' ')
snomed_core["snomed_type"] = new[1].str.rstrip(')')
snomed_core_data = snomed_core#.query("(snomed_type=='disorder' | snomed_type=='finding') & USAGE>0.01")

In [40]:
snomed_names = snomed_core_data.snomed_name.to_list()
snomed_names = [str(item).lower().strip().replace(" ", "_").replace(";", "").replace(",", "") for item in snomed_names]

In [42]:
phenotype_list_snomed = dict(zip(snomed_names, snomed_core_data.SNOMED_CID.to_list()))
snomed_ids = vocab["concept"].query("vocabulary_id == 'SNOMED'").concept_id.to_list()
icd10_ids = vocab["concept"].query("vocabulary_id == 'ICD10CM'").concept_id.to_list()

ph_to_icd10_mapping = {}

def map_snomed_to_icd10(ph, snomed_code, concept, concept_ancestor, concept_relationship):
    concept_ids = concept.query("vocabulary_id == 'SNOMED' & concept_code == @snomed_code").concept_id.to_list()
    snomed_desc_ids = concept_ancestor.query("ancestor_concept_id== @concept_ids").descendant_concept_id.to_list()
    ph_desc = concept.query("concept_id == @snomed_desc_ids").query("vocabulary_id == 'SNOMED'")
    l_ph_desc_ids = ph_desc.concept_id.to_list()
    ph_icd10_ids = list(concept_relationship.query("concept_id_1==@l_ph_desc_ids").query("concept_id_2 == @icd10_ids").concept_id_2.unique())
    #ph_icd10_ids = list(concept_relationship.set_index("concept_id_1").query("index==@l_ph_desc_ids").query("concept_id_2 == @icd10_ids").query("relationship_id == 'Mapped from'").concept_id_2.unique()
    
    #ph_icd10_ids = list(temp.concept_id_2.unique())
    df = concept.query("concept_id == @ph_icd10_ids & vocabulary_id == 'ICD10CM'")
    icd10_list = list(df[~df.concept_code.str.contains("OMOP", na=False)].concept_code.unique())
    icd10_list = sorted(list(set([e[:3] for e in icd10_list])))
    #print(f"{ph}: {icd10_list}")
    return {ph: sorted(list(dict.fromkeys([str(e) for e in icd10_list])))}

from joblib import Parallel, delayed
concept_ids = vocab["concept"].query("(vocabulary_id == 'SNOMED') | (vocabulary_id == 'ICD10CM')")
vocab_concept_ids = concept_ids.concept_id.to_list()
concept_ancestor = vocab["concept_ancestor"][["ancestor_concept_id", "descendant_concept_id"]].query("ancestor_concept_id == @vocab_concept_ids")
concept_rel = vocab["concept_relationship"][["concept_id_1", "concept_id_2", "relationship_id"]].query("(concept_id_1 == @vocab_concept_ids) & (concept_id_2 == @vocab_concept_ids) & (relationship_id == 'Mapped from')")
icd10_codes = Parallel(n_jobs=10, require="sharedmem")(delayed(map_snomed_to_icd10)(ph, snomed_code, 
                                                              concept_ids, concept_ancestor, concept_rel) for ph, snomed_code in tqdm(phenotype_list_snomed.items()))

HBox(children=(FloatProgress(value=0.0, max=6181.0), HTML(value='')))




In [43]:
l10_snomed = {}
for ph in icd10_codes: l10_snomed.update(ph)

In [44]:
phenotype_list_basic = {
    "coronary_heart_disease": ["Ischemic heart disease"],
    "myocardial_infarction": ["Myocardial infarction"],
    "stroke": ["Cerebrovascular disease"],
    "diabetes1" : ["Type 1 diabetes"],
    "diabetes2" : ["Diabetes mellitus", "Type 2 diabetes"],
    "chronic_kidney_disease": ["Chronic kidney disease", "chronic renal failure"],
    "atrial_fibrillation": ["Atrial fibrillation", "Atrial flutter", "paroxysmal tachycardia"],
    "migraine": ["Migraine"],
    "rheumatoid_arthritis": ["Rheumatoid arthritis"],
    "systemic_lupus_erythematosus": ["Systemic lupus erythematosus"],
    "severe_mental_illness": ["Schizophrenia", "bipolar", "Major depressive disorder"],
    "erectile_dysfunction" : ["Erectile dysfunction"],  
}

l10_basic = phenotype_children(phecodes, phenotype_list_basic)

In [45]:
l10_all = l10_basic
for key, value in l10_snomed.items(): 
    if key not in l10_basic: l10_all[key] = value

In [46]:
l10 = {k: v for k, v in l10_all.items() if len(v)!=0}

#phenotype_list = {k: v for k, v in phenotype_list.items() if k in list(l10.keys())}

with open(os.path.join(path, dataset_path, 'phenotype_list.yaml'), 'w') as file: yaml.dump(l10, file, default_flow_style=False)

### 1. Self Reported

In [47]:
coding609 = pd.read_csv(f"{path}/mapping/codings/coding609.tsv", sep="\t").rename(columns={"coding":"code"})

In [48]:
from datetime import datetime, timedelta

def datetime_from_dec_year(dec_year):
    start = dec_year
    year = int(start)
    rem = start - year

    base = datetime(year, 1, 1)
    result = base + timedelta(seconds=(base.replace(year=base.year + 1) - base).total_seconds() * rem)
    #result.strftime("%Y-%m-%d")
    return result.date()

def extract_map_self_reported(data, data_field, code_map):
    pbar = tqdm(total=16)
    ### codes
    fields = ["20002"]; pbar.update(1)
    raw = get_data_fields_all(fields, data, data_field); pbar.update(1)
    col = "noncancer_illness_code_selfreported_f20002"; pbar.update(1)
    temp = pd.wide_to_long(raw, stubnames=[col], i="eid", j="instance_index", sep="_", suffix="\w+").reset_index(); pbar.update(1)
    codes = temp.rename(columns={col:"code"})\
        .assign(code=lambda x: x.code.astype(str))\
        .replace("None", np.nan) \
        .dropna(subset=["code"], axis=0)\
        .assign(code=lambda x: x.code.astype(int)) \
        .merge(code_map, how="left",on="code") \
        .dropna(subset=["meaning"], axis=0)\
        .sort_values(["eid", "instance_index"]) \
        .reset_index(drop=True); pbar.update(1)
    
    ### dates
    fields = ["20008"]; pbar.update(1)
    raw = get_data_fields_all(fields, data, data_field); pbar.update(1)
    col="interpolated_year_when_noncancer_illness_first_diagnosed_f20008"; pbar.update(1)
    temp = pd.wide_to_long(raw, stubnames=[col], i="eid", j="instance_index", sep="_", suffix="\w+").reset_index(); pbar.update(1)
    dates = temp.rename(columns={col:"date"})\
        .dropna(subset=["date"], axis=0)\
        .sort_values(["eid", "instance_index"]) \
        .reset_index(drop=True); pbar.update(1)

    dates = dates[dates.date!=-1]; pbar.update(1)
    dates = dates[dates.date!=-3]; pbar.update(1)
    dates.date = dates.date.apply(datetime_from_dec_year); pbar.update(1)
    
    test = codes.merge(dates, how="left", on=["eid", "instance_index"]).assign(origin="self_reported").copy(); pbar.update(1)
    
    test["instance_index"] = test["instance_index"].astype("string"); pbar.update(1)
    test[['instance','n']] = test.instance_index.str.split("_",expand=True); pbar.update(1)
    pbar.close()
    
    return test[["eid", "origin", 'instance','n', "code", "meaning", "date"]]

In [49]:
codes_self_reported = extract_map_self_reported(data, data_field, coding609)

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




### 2. Primary Care

### 3. Hospital episode statistics

In [12]:
codes_hospital_records = pd.read_feather(f"{data_path}/1_decoded/codes_hospital_records.feather").drop("level", axis=1)

### Combine diagnoses and events

In [51]:
diagnoses_codes = codes_self_reported.append(codes_hospital_records).sort_values(["eid", "instance", "n"]).dropna(subset=["date"], axis=0)
diagnoses_codes.head()

Unnamed: 0,eid,origin,instance,n,code,meaning,date
0,1000018,hes_icd10,0,1,S0240,S02,2005-06-02
1,1000018,hes_icd10,0,2,W188,W18,2005-06-02
2,1000018,hes_icd10,0,3,K37,K37,1998-05-11
3,1000018,hes_icd10,0,4,K37,K37,1998-05-16
4,1000018,hes_icd10,0,5,K37,K37,1998-06-01


In [52]:
diagnoses_codes.reset_index(drop=True).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13339466 entries, 0 to 13339465
Data columns (total 7 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   eid       int64 
 1   origin    object
 2   instance  object
 3   n         object
 4   code      object
 5   meaning   object
 6   date      object
dtypes: int64(1), object(6)
memory usage: 712.4+ MB


In [53]:
codes_hospital_records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12547626 entries, 0 to 12547625
Data columns (total 7 columns):
 #   Column    Dtype  
---  ------    -----  
 0   eid       int32  
 1   origin    object 
 2   instance  float64
 3   n         int32  
 4   code      object 
 5   meaning   object 
 6   date      object 
dtypes: float64(1), int32(2), object(4)
memory usage: 574.4+ MB


In [54]:
diagnoses_codes.reset_index(drop=True).assign(eid = lambda x: x.eid.astype(int),
                                              origin = lambda x: x.origin.astype(str),
                                              instance = lambda x: x.instance.astype(int),
                                              n = lambda x: x.n.astype(int),
                                              code = lambda x: x.code.astype(str), 
                                              meaning = lambda x: x.meaning.astype(str))\
    .to_feather(os.path.join(path, dataset_path, 'temp_diagnoses_codes.feather'))

In [13]:
diagnoses_codes = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_codes.feather'))

In [58]:
from joblib import Parallel, delayed
from functools import reduce
from numba import jit

def had_diagnosis_before_per_ph(df_before, ph, ph_codes, temp):
   # regex = "|".join(ph_codes)
    #df_ph = df_before.set_index("meaning").loc[ph_codes][["eid"]]\
    #        .drop_duplicates(subset=["eid"])\
    #        .assign(phenotype=True) 
    df_ph = df_before[df_before.meaning.isin(ph_codes)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(phenotype=True) 
    #df_ph = df_before[df_before.meaning.str.contains(regex, case=False)][["eid"]]\
    #        .drop_duplicates(subset=["eid"])\
    #        .assign(phenotype=True)   
    return temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype

def had_diagnosis_before(data, diagnoses_codes, phenotypes, time0=time0_col):
    diagnoses_codes_time = diagnoses_codes.merge(data[["eid", time0]], how="left", on="eid")
    
    temp = data[["eid"]].copy()
    df_before = diagnoses_codes_time[diagnoses_codes_time.date < diagnoses_codes_time[time0]]
                                                                                         
    df_phs = Parallel(n_jobs=20, require="sharedmem")(delayed(had_diagnosis_before_per_ph)(df_before, ph, phenotypes[ph], temp) for ph in tqdm(list(phenotypes)))
    for ph, df_ph_series in zip(tqdm(list(phenotypes)), df_phs): temp[ph] = df_ph_series#temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype
    
    return temp.sort_values("eid")  #reduce(lambda left,right: pd.merge(left,right,on=['eid'], how='left'), df_phs).

In [61]:
diagnoses = had_diagnosis_before(basics, diagnoses_codes, l10, time0=time0_col)
print(len(diagnoses))

diagnoses.to_feather(os.path.join(path, dataset_path, 'temp_diagnoses.feather'))

diagnoses.head()

HBox(children=(FloatProgress(value=0.0, max=3521.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=3521.0), HTML(value='')))


502504


Unnamed: 0,eid,coronary_heart_disease,myocardial_infarction,stroke,diabetes1,diabetes2,chronic_kidney_disease,atrial_fibrillation,migraine,rheumatoid_arthritis,...,sleep_terror_disorder,acute_frontal_sinusitis,benign_neoplasm_of_pancreas,primary_malignant_neoplasm_of_soft_tissues_of_lower_limb,neoplasm_of_uncertain_behavior_of_neck,injury_of_peroneal_nerve,dupuytren's_disease,stem_cell_donor,endemic_goiter,diplegic_cerebral_palsy
0,1000018,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1000020,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1000037,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1000043,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1000051,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [62]:
diagnoses = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses.feather'))

# Add embeddings for Snomed Diagnoses

#### Get SNOMED - node2vec dict

In [63]:
snomed_embeddings = pd.read_csv("/data/analysis/ag-reils/steinfej/data/snomed_embeddings/snomed.emb.p1.q1.w20.l40.e200.graph_format.txt", sep=" ", header=None, skiprows=1)
snomed_embeddings.columns = ["snomed_id"]+list(snomed_embeddings.columns)[:-1]

In [64]:
snomed_embeddings.head()

Unnamed: 0,snomed_id,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,129265001,0.027456,0.171026,-0.178822,-0.038667,0.299777,0.08254,-0.201284,0.215164,0.093241,...,0.21103,0.488475,0.082177,-0.047102,-0.086411,-0.012141,-0.258416,-0.113295,0.040249,-0.116884
1,360224006,-0.090187,0.086986,-0.184378,-0.028722,-0.010235,0.242439,-0.094009,0.203015,-0.097894,...,0.11275,0.333906,-0.018193,0.020459,-0.170226,0.277866,-0.007079,-0.080419,0.162456,-0.101768
2,102272007,0.55342,0.460947,0.143925,-0.053785,-0.849588,0.467587,-0.654922,-0.010799,0.510141,...,0.187371,0.248607,0.079687,-0.354121,0.412602,0.582461,-0.780365,0.04545,-0.196551,0.045745
3,39937001,0.123158,0.019882,0.050896,0.037364,0.200435,0.312911,-0.338977,-0.092584,-0.167741,...,0.05777,-0.012834,-0.139794,0.180572,0.190781,0.104039,-0.358235,-0.137954,-0.236551,-0.206458
4,23583003,0.423193,0.384338,-0.041503,0.116848,-0.055029,0.149064,-0.09281,-0.050148,0.113122,...,0.162375,-0.076505,-0.274352,-0.099204,-0.281887,-0.266345,-0.020257,-0.003843,-0.008804,-0.286832


In [65]:
diagnoses_snomed = diagnoses.columns[13:].to_list()
diagnoses_snomed_dict = {}
for d in diagnoses_snomed: diagnoses_snomed_dict[d] = phenotype_list_snomed[d]
    
snomed_codes_used = list(phenotype_list_snomed.values())
snomed_codes_emb = snomed_embeddings.snomed_id.to_list()

In [66]:
snomed_id_array = snomed_embeddings[["snomed_id"]].values
node2vec_array = snomed_embeddings.iloc[:, 1:].values

In [67]:
snomed_arrays = {}
for sid, row in zip(tqdm(snomed_id_array), node2vec_array):
    snomed_arrays[sid[0]] = row

HBox(children=(FloatProgress(value=0.0, max=373286.0), HTML(value='')))




#### Get Patient -> Snomed dict

In [68]:
diagnoses_array = diagnoses[diagnoses_snomed].values
eid_array = diagnoses[["eid"]].values

from numba import jit
import numpy as np

patient_diagnoses = {}
for eid, row in zip(tqdm(eid_array), diagnoses_array):
    patient_diagnoses[eid[0]] = list(np.argwhere(row==True).flatten())

#diagnoses.query("eid==1000092")[diagnoses_snomed]

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




In [69]:
patient_diagnoses_sid = {}
snomed_codes_emb_set = set(snomed_codes_emb)
for eid, p_d_col in tqdm(patient_diagnoses.items()):
    diagnoses_list = [diagnoses_snomed[i] for i in p_d_col]
    sid_list = [phenotype_list_snomed[i] for i in diagnoses_list]
    sid_list = [sid for sid in sid_list if sid in snomed_codes_emb_set]
    patient_diagnoses_sid[eid] = sid_list

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




#### Get Patient -> node2vec average

In [70]:
patient_node2vec_dict = {}
for eid, sids in tqdm(patient_diagnoses_sid.items()):
    array_list =  [snomed_arrays[sid] for sid in sids]
    patient_node2vec_dict[eid] = np.mean(array_list, axis=0)

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





In [71]:
# create imputation vector
arrays = [patient_node2vec_dict[key] for key in list(patient_node2vec_dict)]
arrays_ok = [array for array in tqdm(arrays) if ~np.isnan(array).any()]
imp_vector = np.mean(arrays_ok, axis=0)
imp_vector

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




array([ 0.37033895,  0.13321076, -0.10030267,  0.05625264, -0.08437331,
        0.11504787, -0.24833219, -0.11936529,  0.25239648, -0.32739932,
        0.06330914, -0.06493541, -0.28790415, -0.2885537 ,  0.02546298,
       -0.06123153, -0.17690672, -0.0867546 , -0.28337599, -0.08474496,
        0.31454532, -0.24158154,  0.15930864,  0.06124846,  0.07694602,
       -0.13752803,  0.13671128, -0.27988751, -0.03987635,  0.03632156,
       -0.13793361, -0.15857369, -0.16441636,  0.2412931 ,  0.20070578,
       -0.11412784,  0.02563965, -0.03560184,  0.17169744, -0.0153383 ,
        0.00117099,  0.1025362 , -0.06505568,  0.05646065, -0.02705149,
        0.04416442, -0.15798991, -0.10650637,  0.02082507, -0.21182802,
        0.13972325, -0.18089307, -0.12731068,  0.02907221, -0.19797107,
        0.19550177,  0.14941799,  0.21561857, -0.18085379,  0.10768238,
        0.12968045,  0.2082016 ,  0.03561408, -0.01122218, -0.27099816,
       -0.06029919, -0.18787618,  0.10084175, -0.07939234, -0.22

In [72]:
for eid, array in tqdm(patient_node2vec_dict.items()):
    if np.isnan(array).any(): 
        patient_node2vec_dict[eid] = imp_vector

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




In [73]:
array_eids = [key for key in tqdm(list(patient_node2vec_dict))]

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




In [74]:
ind_ok = [np.array([1]) if ~np.isnan(array).any() else np.array([0])  for array in tqdm(arrays)]

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




In [75]:
arrays_emb = [patient_node2vec_dict[key] for key in tqdm(list(patient_node2vec_dict))]

HBox(children=(FloatProgress(value=0.0, max=502504.0), HTML(value='')))




In [76]:
arrays_eids = np.reshape(np.stack(array_eids, axis=0),(-1,1)) 
arrays_ind = np.stack(ind_ok, axis=0)
arrays_c = np.stack(arrays_emb, axis=0)

In [77]:
arrays_complete = np.concatenate([arrays_eids, arrays_ind, arrays_c], axis=1)

In [78]:
diagnoses_emb = pd.DataFrame(data=arrays_complete, columns=["eid"]+["node2vec_available"]+[f"node2vec_{e}" for e in list(range(0, 200))])

In [79]:
diagnoses_emb

Unnamed: 0,eid,node2vec_available,node2vec_0,node2vec_1,node2vec_2,node2vec_3,node2vec_4,node2vec_5,node2vec_6,node2vec_7,...,node2vec_190,node2vec_191,node2vec_192,node2vec_193,node2vec_194,node2vec_195,node2vec_196,node2vec_197,node2vec_198,node2vec_199
0,1000018.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
1,1000020.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
2,1000037.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
3,1000043.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
4,1000051.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502499,6025150.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
502500,6025165.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
502501,6025173.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002
502502,6025182.0,0.0,0.370339,0.133211,-0.100303,0.056253,-0.084373,0.115048,-0.248332,-0.119365,...,0.073037,0.15891,-0.047578,-0.057776,-0.077222,0.167877,-0.211251,-0.101427,-0.190028,-0.064002


In [14]:
diagnoses_emb.to_feather(os.path.join(path, dataset_path, 'temp_diagnoses_emb.feather'))
diagnoses_emb = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_emb.feather'))

FileNotFoundError: [Errno 2] Failed to open local file '/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/2_datasets_pre/cvd_massive_from_birth/temp_diagnoses_emb.feather'. Detail: [errno 2] No such file or directory

# Endpoints

In [None]:
1+1

In [89]:
### define in snomed and get icd codes from there

### 1. Hospital admissions

In [17]:
endpoint_list = {
    "myocardial_infarction": ["Myocardial infarction"],
    "stroke": ["Cerebrovascular disease"],
    "cancer_breast" : ["Breast Cancer"],
    "diabetes" : ["Diabetes"],
    "atrial_fibrillation": ["Atrial fibrillation", "Atrial flutter", "paroxysmal tachycardia"],
    "copd": ["COPD"],
    "dementia":["dementia"]
}

endpoint_list = phenotype_children(phecodes, endpoint_list)
endpoint_list["cancer_breast"] = ["C50"]
endpoint_list["copd"] = ["J44"]
endpoint_list["diabetes"] = ["E10", "E11", "E12", "E13", "E14"]
endpoint_list["atrial_fibrillation"] = ["I47", "I48"]


with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file: yaml.dump(endpoint_list, file, default_flow_style=False)
endpoint_list

{'myocardial_infarction': ['I21', 'I22', 'I23', 'I24', 'I25', 'I51'],
 'stroke': ['G45', 'G46', 'I60', 'I67', 'I68', 'I69'],
 'cancer_breast': ['C50'],
 'diabetes': ['E10', 'E11', 'E12', 'E13', 'E14'],
 'atrial_fibrillation': ['I47', 'I48'],
 'copd': ['J44'],
 'dementia': ['F00', 'F01', 'F02', 'F03', 'F09', 'G31', 'R54']}

In [19]:
from dateutil.relativedelta import relativedelta

def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None):
    if level is not None: diagnoses_codes = diagnoses_codes.query("level==@level")
    diagnoses_codes_time0 = diagnoses_codes.merge(data[["eid", time0_col]], how="left", on="eid")
    
    cens_time_right = min(diagnoses_codes.sort_values('date').groupby('origin').tail(1).date.to_list())
    print(f"t_0: {time0_col}")
    print(f"t_cens: {cens_time_right}")
    
    df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) & 
                                        (diagnoses_codes_time0.date < cens_time_right)]
    
    temp = data[["eid", time0_col]].copy()
    for ph, ph_codes in tqdm(endpoint_list.items()):
        regex = "|".join(ph_codes)
        ph_df = df_interval[df_interval.meaning.str.contains(regex, case=False)] \
            .sort_values('date').groupby('eid').head(1).assign(phenotype=1, date=lambda x: x.date)
        temp_ph = temp.merge(ph_df, how="left", on="eid").fillna(0)
        temp[ph+"_event"], temp[ph+"_event_date"] = temp_ph.phenotype, temp_ph.date
        
        fill_date = {ph+"_event_date" : lambda x: [cens_time_right if event==0 else event_date for event, event_date in zip(x[ph+"_event"], x[ph+"_event_date"])]}
        calc_tte = {ph+"_event_time" : lambda x: [(event_date-time0).days/365.25  for time0, event_date in zip(x[time0_col], x[ph+"_event_date"])]}
        
        temp = temp.assign(**fill_date).assign(**calc_tte).drop([ph+"_event_date"], axis=1)
        
    temp = temp.drop([time0_col], axis=1)     
    
    return temp.drop_duplicates()

In [20]:
endpoints_hospital = extract_endpoints_tte(basics, diagnoses_codes, endpoint_list, time0_col)
print(len(endpoints_hospital))
endpoints_hospital.head()

t_0: birth_date
t_cens: 2020-03-14


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


502504


Unnamed: 0,eid,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,cancer_breast_event,cancer_breast_event_time,diabetes_event,diabetes_event_time,atrial_fibrillation_event,atrial_fibrillation_event_time,copd_event,copd_event_time,dementia_event,dementia_event_time
0,1000018,0.0,59.334702,0.0,59.334702,0.0,59.334702,0.0,59.334702,0.0,59.334702,0.0,59.334702,0.0,59.334702
1,1000020,0.0,71.063655,0.0,71.063655,0.0,71.063655,0.0,71.063655,0.0,71.063655,0.0,71.063655,0.0,71.063655
2,1000037,0.0,70.338125,0.0,70.338125,0.0,70.338125,0.0,70.338125,0.0,70.338125,0.0,70.338125,0.0,70.338125
3,1000043,1.0,68.123203,0.0,73.779603,0.0,73.779603,0.0,73.779603,0.0,73.779603,1.0,63.293634,0.0,73.779603
4,1000051,0.0,64.761123,0.0,64.761123,0.0,64.761123,1.0,45.062286,0.0,64.761123,1.0,21.062286,0.0,64.761123


### 2. Death registry

In [21]:
death_list = {
    "death_allcause":[],
    "death_cvd":['I{:02}'.format(ID+1) for ID in range(0, 98)],
}

death_codes = pd.read_feather(f"{data_path}/1_decoded/codes_death_records.feather")#.drop("level", axis=1)

with open(os.path.join(path, dataset_path, 'death_list.yaml'), 'w') as file: yaml.dump(death_list, file, default_flow_style=False)

In [23]:
endpoints_death = extract_endpoints_tte(basics, death_codes, death_list, time0_col, level="1")

t_0: birth_date
t_cens: 2020-06-28


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




## SCORES

In [24]:
scores_list = {
    "SCORE":['I{:02}'.format(ID) for ID in [10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 44, 45, 46, 47, 48, 49, 50, 51, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73]],
    "ASCVD":['I{:02}'.format(ID) for ID in [20, 21, 22, 23, 24, 25, 63]],
    "QRISK3":["G45", "I20", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],
    "MACE":["G45", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],    
}
with open(os.path.join(path, dataset_path, 'scores_list.yaml'), 'w') as file: yaml.dump(scores_list, file, default_flow_style=False)

In [25]:
scores_list_hospital = {}
scores_list_death = {}
for score, score_codes in scores_list.items():
    scores_list_hospital["hospital_"+score] = score_codes
    scores_list_death["death_"+score] = score_codes

In [26]:
endpoints_scores = {
    "hospital": extract_endpoints_tte(basics, diagnoses_codes, scores_list_hospital, time0_col=time0_col),
    "death": extract_endpoints_tte(basics, death_codes, scores_list_death, time0_col=time0_col, level=1)}

t_0: birth_date
t_cens: 2020-03-14


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


t_0: birth_date
t_cens: 2020-06-28


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [27]:
endpoints_scores_all = endpoints_scores["hospital"].merge(endpoints_scores["death"], on="eid", how="left")

### ESC SCORE (Conroy 2003)

In [28]:
score = "SCORE"

temp = endpoints_scores_all.filter(regex="eid|"+score).rename(columns={"death_SCORE_event":"SCORE_event", "death_SCORE_event_time":"SCORE_event_time"})
score_SCORE = temp = temp[["eid", "SCORE_event", "SCORE_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

5323


Unnamed: 0,eid,SCORE_event,SCORE_event_time
45,1000463,1.0,74.16564
83,1000841,1.0,76.005476
102,1001031,1.0,75.537303
122,1001237,1.0,50.132786
176,1001777,1.0,72.238193


### ASCVD (Goff 2014)

In [29]:
score = "ASCVD"
temp = endpoints_scores_all.filter(regex="eid|"+score)

aggr_event = {score +"_event" : lambda x: [1 if (hospital_event==1) | (death_event == 1) else 0 
                                                    for hospital_event, death_event in zip(x["hospital_"+score+"_event"], x["death_"+score+"_event"])]}
aggr_date = {score +"_event_time" : lambda x: [min(hospital_event_time, death_event_time)
                                                        for hospital_event_time, death_event_time in zip(x["hospital_"+score+"_event_time"], x["death_"+score+"_event_time"])]}

score_ASCVD = temp = temp.assign(**aggr_event).assign(**aggr_date)[["eid", score +"_event", score +"_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

61785


Unnamed: 0,eid,ASCVD_event,ASCVD_event_time
2,1000037,1,66.970568
3,1000043,1,68.123203
5,1000066,1,60.944559
6,1000079,1,61.054073
22,1000233,1,68.673511


### UK QRISK3 (2017)

In [30]:
score = "QRISK3"
temp = endpoints_scores_all.filter(regex="eid|"+score)

aggr_event = {score +"_event" : lambda x: [1 if (hospital_event==1) | (death_event == 1) else 0 
                                                    for hospital_event, death_event in zip(x["hospital_"+score+"_event"], x["death_"+score+"_event"])]}
aggr_date = {score +"_event_time" : lambda x: [min(hospital_event_time, death_event_time)
                                                        for hospital_event_time, death_event_time in zip(x["hospital_"+score+"_event_time"], x["death_"+score+"_event_time"])]}

score_QRISK3 = temp = temp.assign(**aggr_event).assign(**aggr_date)[["eid", score +"_event", score +"_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

69344


Unnamed: 0,eid,QRISK3_event,QRISK3_event_time
2,1000037,1,66.970568
3,1000043,1,68.123203
5,1000066,1,60.944559
6,1000079,1,61.054073
22,1000233,1,68.673511


### MACE (2020)

In [31]:
score = "MACE"
temp = endpoints_scores_all.filter(regex="eid|"+score)

aggr_event = {score +"_event" : lambda x: [1 if (hospital_event==1) | (death_event == 1) else 0 
                                                    for hospital_event, death_event in zip(x["hospital_"+score+"_event"], x["death_"+score+"_event"])]}
aggr_date = {score +"_event_time" : lambda x: [min(hospital_event_time, death_event_time)
                                                        for hospital_event_time, death_event_time in zip(x["hospital_"+score+"_event_time"], x["death_"+score+"_event_time"])]}

score_MACE = temp = temp.assign(**aggr_event).assign(**aggr_date)[["eid", score +"_event", score +"_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

62097


Unnamed: 0,eid,MACE_event,MACE_event_time
3,1000043,1,68.123203
22,1000233,1,68.673511
30,1000319,1,56.922656
45,1000463,1,74.069815
53,1000548,1,50.255989


## Merge Everything

In [32]:
data_dfs_dict = {"basics": pd.read_feather(os.path.join(path, dataset_path, 'temp_basics.feather')), 
                 "questionnaire": pd.read_feather(os.path.join(path, dataset_path, 'temp_questionnaire.feather')), 
                 "measurements": pd.read_feather(os.path.join(path, dataset_path, 'temp_measurements.feather')), 
                 "labs": pd.read_feather(os.path.join(path, dataset_path, 'temp_labs.feather')), 
                 "family_history": pd.read_feather(os.path.join(path, dataset_path, 'temp_family_history.feather')), 
                 "diagnoses": pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses.feather')),
                # "diagnoses_emb": pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_emb.feather')), 
                 "medications": pd.read_feather(os.path.join(path, dataset_path, 'temp_medications.feather')), 
                 "endpoints_hospital":endpoints_hospital, 
                 "endpoints_death":endpoints_death, 
                 "score_SCORE":score_SCORE, 
                 "score_ASCVD":score_ASCVD, 
                 "score_QRISK3":score_QRISK3,
                 "score_MACE":score_MACE}

In [33]:
def get_cols_clean(df):
    df.columns = df.columns.str.replace(r'_0_0$', '').str.replace(r'_f[0-9]+$', '').str.replace("_automated_reading", '')
    return df.columns

def clean_df(df):
    df.columns = get_cols_clean(df)
    return df

In [34]:
import pandas as pd
from functools import reduce

data_baseline = reduce(lambda x, y: pd.merge(x, y, on = 'eid'), list(data_dfs_dict.values()))

In [35]:
data_baseline = clean_df(data_baseline)

In [36]:
for col in [col for col in list(data_baseline.columns) if ("_event" in col) & ("_time" not in col)]:
    data_baseline[col] = data_baseline[col].astype(int)

In [37]:
covariates = [col for col in list(data_baseline.columns) if not "_event" in col]
targets = [col for col in list(data_baseline.columns) if "_event" in col]

# Exporting

In [38]:
data_baseline.head()

Unnamed: 0,eid,age_at_recruitment,sex,ethnic_background,townsend_deprivation_index_at_recruitment,date_of_attending_assessment_centre,birth_date,overall_health_rating,smoking_status,alcohol_intake_frequency,...,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
0,1000018,49.0,Female,White,-1.85293,2009-11-12,1960-11-12,Fair,Current,Once or twice a week,...,0,59.624914,0,59.624914,0,59.334702,0,59.334702,0,59.334702
1,1000020,59.0,Male,White,0.204248,2008-02-19,1949-02-19,Good,Current,Once or twice a week,...,0,71.353867,0,71.353867,0,71.063655,0,71.063655,0,71.063655
2,1000037,59.0,Female,White,-3.49886,2008-11-11,1949-11-11,Good,Previous,Once or twice a week,...,0,70.628337,0,70.628337,1,66.970568,1,66.970568,0,70.338125
3,1000043,63.0,Male,White,-5.35115,2009-06-03,1946-06-03,Fair,Previous,Three or four times a week,...,0,74.069815,0,74.069815,1,68.123203,1,68.123203,1,68.123203
4,1000051,51.0,Female,White,-1.79908,2006-06-10,1955-06-10,Poor,Never,One to three times a month,...,0,65.051335,0,65.051335,0,64.761123,0,64.761123,0,64.761123


In [39]:
data_cols = {}
for topic, df in data_dfs_dict.items(): 
    data_cols["eid"] = ["admin"]
    data_cols[topic]=list(get_cols_clean(df))[1:]

In [40]:
data_cols_single = {}
for topic, columns in data_cols.items():
    for col in columns:
        data_cols_single[col] = topic

In [41]:
dtypes = {"int32":"integer", "int64":"integer", "float64":"numeric", "category":"category", "object":"category", "bool":"logical"}
desc_dict = {"id": [*range(1, len(data_baseline.columns.to_list())+1)] , 
             "covariate": data_baseline.columns.to_list(), 
             "dtype":[dtypes[str(col)] for col in data_baseline.dtypes.to_list()], 
             "isTarget":[True if col in targets else False for col in data_baseline.columns.to_list()],
            "based_on":[topic for col, topic in data_cols_single.items()],
            "aggr_fn": [np.nan for col in data_baseline.columns.to_list()]}
data_baseline_description = pd.DataFrame.from_dict(desc_dict)
data_baseline_description

Unnamed: 0,id,covariate,dtype,isTarget,based_on,aggr_fn
0,1,eid,integer,False,eid,
1,2,age_at_recruitment,numeric,False,basics,
2,3,sex,category,False,basics,
3,4,ethnic_background,category,False,basics,
4,5,townsend_deprivation_index_at_recruitment,numeric,False,basics,
...,...,...,...,...,...,...
3741,3742,ASCVD_event_time,numeric,True,score_ASCVD,
3742,3743,QRISK3_event,integer,True,score_QRISK3,
3743,3744,QRISK3_event_time,numeric,True,score_QRISK3,
3744,3745,MACE_event,integer,True,score_MACE,


# Exclusion Criteria

In [42]:
data_baseline_excl = data_baseline.query("myocardial_infarction == False & coronary_heart_disease == False & statins == False").reset_index(drop=True)

In [43]:
data_baseline_excl

Unnamed: 0,eid,age_at_recruitment,sex,ethnic_background,townsend_deprivation_index_at_recruitment,date_of_attending_assessment_centre,birth_date,overall_health_rating,smoking_status,alcohol_intake_frequency,...,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
0,1000018,49.0,Female,White,-1.852930,2009-11-12,1960-11-12,Fair,Current,Once or twice a week,...,0,59.624914,0,59.624914,0,59.334702,0,59.334702,0,59.334702
1,1000020,59.0,Male,White,0.204248,2008-02-19,1949-02-19,Good,Current,Once or twice a week,...,0,71.353867,0,71.353867,0,71.063655,0,71.063655,0,71.063655
2,1000037,59.0,Female,White,-3.498860,2008-11-11,1949-11-11,Good,Previous,Once or twice a week,...,0,70.628337,0,70.628337,1,66.970568,1,66.970568,0,70.338125
3,1000043,63.0,Male,White,-5.351150,2009-06-03,1946-06-03,Fair,Previous,Three or four times a week,...,0,74.069815,0,74.069815,1,68.123203,1,68.123203,1,68.123203
4,1000079,60.0,Female,White,-2.708040,2008-03-18,1948-03-18,Fair,Never,Once or twice a week,...,0,72.279261,0,72.279261,1,61.054073,1,61.054073,0,71.989049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402296,6025150,43.0,Female,White,0.046781,2007-06-30,1964-06-30,Excellent,Never,Three or four times a week,...,0,55.994524,0,55.994524,0,55.704312,0,55.704312,0,55.704312
402297,6025165,45.0,Female,White,-2.107040,2008-09-02,1963-09-02,Good,Never,Three or four times a week,...,0,56.821355,0,56.821355,0,56.531143,0,56.531143,0,56.531143
402298,6025173,57.0,Male,White,-1.827220,2008-09-17,1951-09-17,Good,Never,Never,...,0,68.780287,0,68.780287,0,68.490075,0,68.490075,0,68.490075
402299,6025182,56.0,Male,White,-0.010764,2010-07-01,1954-07-01,Excellent,Previous,Daily or almost daily,...,0,65.993155,0,65.993155,0,65.702943,0,65.702943,0,65.702943


In [44]:
feature_dict = {}
for group in data_baseline_description.based_on.unique(): feature_dict[group] = data_baseline_description.query("based_on==@group").covariate.to_list()
with open(os.path.join(path, dataset_path, 'feature_list.yaml'), 'w') as file: yaml.dump(feature_dict, file, default_flow_style=False, allow_unicode=True)

In [45]:
#feature_dict

In [46]:
### WRITE FEATURES IN YAML!!!

In [47]:
data_baseline.to_feather(os.path.join(path, dataset_path, 'baseline_clinical.feather'))
data_baseline_excl.to_feather(os.path.join(path, dataset_path, 'baseline_clinical_excl.feather'))
data_baseline_description.to_feather(os.path.join(path, dataset_path, 'baseline_clinical_description.feather'))

In [None]:
#data_baseline.to_csv(os.path.join(path, dataset_path, 'baseline_clinical.csv'), index=False)
#data_baseline_description.to_csv(os.path.join(path, dataset_path, 'baseline_clinical_description.csv'), index=False)

# !!! REMEMBER IMPUTATION !!!