# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
import yaml
from tqdm.notebook import trange, tqdm
dataset_name = "cvd_lifetime_time_series"
path = "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"
dataset_path = f"{data_path}/2_datasets_pre/{dataset_name}"

In [2]:
from pathlib import Path
Path(dataset_path).mkdir(parents=True, exist_ok=True)

In [None]:
%%time

data = pd.read_feather(f"{data_path}/1_decoded/ukb_data.feather")
data_field = pd.read_feather(f"{data_path}/1_decoded/ukb_data_field.feather")
data_columns = data.columns.to_list()

In [5]:
data

Unnamed: 0,eid,weight_method_f21_0_0,weight_method_f21_1_0,weight_method_f21_2_0,weight_method_f21_3_0,spirometry_method_f23_0_0,spirometry_method_f23_1_0,spirometry_method_f23_2_0,spirometry_method_f23_3_0,sex_f31_0_0,...,source_of_report_of_i85_oesophageal_varices_f131407_0_0,source_of_report_of_i89_other_noninfective_disorders_of_lymphatic_vessels_and_lymph_nodes_f131415_0_0,date_i95_first_reported_hypotension_f131416_0_0,source_of_report_of_i95_hypotension_f131417_0_0,date_i97_first_reported_postprocedural_disorders_of_circulatory_system_not_elsewhere_classified_f131418_0_0,source_of_report_of_i97_postprocedural_disorders_of_circulatory_system_not_elsewhere_classified_f131419_0_0,date_i98_first_reported_other_disorders_of_circulatory_system_in_diseases_classified_elsewhere_f131420_0_0,source_of_report_of_i98_other_disorders_of_circulatory_system_in_diseases_classified_elsewhere_f131421_0_0,date_i99_first_reported_other_and_unspecified_disorders_of_circulatory_system_f131422_0_0,source_of_report_of_i99_other_and_unspecified_disorders_of_circulatory_system_f131423_0_0
0,1000018,Direct entry,,,,Direct entry,,,,Female,...,,,,,,,,,,
1,1000020,Direct entry,,,,Direct entry,,,,Male,...,,,,,,,,,,
2,1000037,Direct entry,,,,Direct entry,,,,Female,...,,,,,,,,,,
3,1000043,Direct entry,,Direct entry,,Direct entry,,Direct entry,,Male,...,,,,,,,,,,
4,1000051,Direct entry,,,,,,,,Female,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502500,6025165,Direct entry,,,,Direct entry,,,,Female,...,,,,,,,,,,
502501,6025173,Direct entry,,,,Direct entry,,,,Male,...,,,,,,,,,,
502502,6025182,Direct entry,,,,Direct entry,,,,Male,...,,,,,,,,,2012-08-16,Primary care only
502503,6025198,Direct entry,,,,Direct entry,,,,Male,...,,,,,,,,,,


## Mappings + Vocabulary

In [6]:
# Drop obviouse missing data
print(len(data))
data = data.dropna(subset=["sex_f31_0_0"], axis=0)
print(len(data))

502505
502504


# Starting information

In [7]:
time0_col="birthdate"
# time0_col="date_of_attending_assessment_centre_f53_0_0"

# Basic Covariates

In [8]:
def get_fields(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields) & data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_fields_all(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields)].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_data_fields(fields, data, data_field):
    f = get_fields(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

def get_data_fields_all(fields, data, data_field):
    f = get_fields_all(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

### Basics

In [16]:
fields_basics = [
    "21022", # age at recruitment
    "31", # sex
    "21000", # ethnicity
    "189", # Townsend index
    "53", # date of baseline assessment
]

temp = get_data_fields_all(fields_basics, data, data_field)

temp["sex_f31_0_0"] = temp["sex_f31_0_0"].cat.set_categories(["Female", 'Male'], ordered=False)
temp["ethnic_background_f21000_0_0"] = temp["ethnic_background_f21000_0_0"].replace({"Do not know": np.nan, "Prefer not to answer": np.nan}).astype("category")

basics = temp
print(len(temp))
display(temp.head())

from dateutil.relativedelta import relativedelta
calc_birth_date = [date_of_attending_assessment_centre - relativedelta(years=age_at_recruitment) 
                                                             for date_of_attending_assessment_centre, age_at_recruitment 
                                                             in zip(basics["date_of_attending_assessment_centre_f53_0_0"], basics["age_at_recruitment_f21022_0_0"])]
#basics = basics.assign(birth_date = calc_birth_date)


basics.to_feather(os.path.join(path, dataset_path, 'temp_basics.feather'))

502504


Unnamed: 0,eid,age_at_recruitment_f21022_0_0,sex_f31_0_0,ethnic_background_f21000_0_0,ethnic_background_f21000_1_0,ethnic_background_f21000_2_0,townsend_deprivation_index_at_recruitment_f189_0_0,date_of_attending_assessment_centre_f53_0_0,date_of_attending_assessment_centre_f53_1_0,date_of_attending_assessment_centre_f53_2_0,date_of_attending_assessment_centre_f53_3_0
0,1000018,49.0,Female,British,,,-1.85293,2009-11-12,,,
1,1000020,59.0,Male,British,,,0.204248,2008-02-19,,,
2,1000037,59.0,Female,British,,,-3.49886,2008-11-11,,,
3,1000043,63.0,Male,British,,,-5.35115,2009-06-03,,2018-06-08,
4,1000051,51.0,Female,British,,,-1.79908,2006-06-10,,2019-09-15,


In [11]:
#basics["t"] = (basics.date_of_attending_assessment_centre_f53_0_0-basics.birth_date).dt.days/365.2425

In [12]:
print(temp["ethnic_background_f21000_0_0"].unique())

['British', 'Caribbean', 'Other ethnic group', 'Irish', 'Indian', ..., 'White and Black African', 'Any other Black background', 'Asian or Asian British', 'Mixed', 'Black or Black British']
Length: 21
Categories (20, object): ['British', 'Caribbean', 'Other ethnic group', 'Irish', ..., 'Any other Black background', 'Asian or Asian British', 'Mixed', 'Black or Black British']


### Questionnaire

In [13]:
fields_questionnaire = [
    "2178", # Overall health
    "20116", # Smoking status
    "1558",
]

temp = get_data_fields_all(fields_questionnaire, data, data_field)

temp["overall_health_rating_f2178_0_0"] = temp["overall_health_rating_f2178_0_0"]\
    .replace({"Do not know": np.nan, "Prefer not to answer": np.nan})\
    .astype("category").cat.set_categories(['Poor', 'Fair', 'Good', 'Excellent'], ordered=True)


temp["smoking_status_f20116_0_0"] = temp["smoking_status_f20116_0_0"]\
    .replace({"Prefer not to answer": np.nan}, inplace=False)\
    .astype("category").cat.set_categories(['Current', 'Previous', 'Never'], ordered=True)

temp["alcohol_intake_frequency_f1558_0_0"] = temp["alcohol_intake_frequency_f1558_0_0"]\
    .replace({"Prefer not to answer": np.nan}, inplace=False)\
    .astype("category").cat.set_categories([
        'Daily or almost daily', 
        'Three or four times a week', 
        'Once or twice a week',
        'One to three times a month',
        'Special occasions only', 
        'Never'], ordered=True)

questionnaire = temp
print(len(temp))
display(temp.head())

questionnaire.to_feather(os.path.join(path, dataset_path, 'temp_questionnaire.feather'))

502504


Unnamed: 0,eid,overall_health_rating_f2178_0_0,overall_health_rating_f2178_1_0,overall_health_rating_f2178_2_0,overall_health_rating_f2178_3_0,smoking_status_f20116_0_0,smoking_status_f20116_1_0,smoking_status_f20116_2_0,smoking_status_f20116_3_0,alcohol_intake_frequency_f1558_0_0,alcohol_intake_frequency_f1558_1_0,alcohol_intake_frequency_f1558_2_0,alcohol_intake_frequency_f1558_3_0
0,1000018,Fair,,,,Current,,,,Once or twice a week,,,
1,1000020,Good,,,,Current,,,,Once or twice a week,,,
2,1000037,Good,,,,Previous,,,,Once or twice a week,,,
3,1000043,Fair,,Fair,,Previous,,Previous,,Three or four times a week,,Three or four times a week,
4,1000051,Poor,,Fair,,Never,,Never,,One to three times a month,,One to three times a month,


In [14]:
print(temp["alcohol_intake_frequency_f1558_0_0"].unique())

['Once or twice a week', 'Three or four times a week', 'One to three times a month', 'Daily or almost daily', 'Special occasions only', 'Never', NaN]
Categories (6, object): ['Daily or almost daily' < 'Three or four times a week' < 'Once or twice a week' < 'One to three times a month' < 'Special occasions only' < 'Never']


### Physical measurements

In [15]:
from statistics import mean

fields_measurements = [
#    "100313", # Walking speed !!! MISSING !!!
    "21001", # BMI
    "21002", # weight
    "4080", # Syst. BP
    "4079", # Diast. BP
    "102",
    "21021",
    "4195",
    "48",
    "49",
    "50",
    "23127",
    "23099",
    "23105",
    "20151",
    "20150",
    "20258",
    "3064",
    
]
temp = get_data_fields_all(fields_measurements, data, data_field)

#sbp_cols = ["systolic_blood_pressure_automated_reading_f4080_0_0", "systolic_blood_pressure_automated_reading_f4080_0_1"]
#dbp_cols = ["diastolic_blood_pressure_automated_reading_f4079_0_0", "diastolic_blood_pressure_automated_reading_f4079_0_1"]
#pr_cols = ["pulse_rate_automated_reading_f102_0_0", "pulse_rate_automated_reading_f102_0_1"]

#temp = temp.assign(systolic_blood_pressure_automated_reading_f4080 = temp[sbp_cols].mean(axis=1),
#                   diastolic_blood_pressure_automated_reading_f4079 = temp[dbp_cols].mean(axis=1),
#                   pulse_rate_automated_reading_f102 = temp[pr_cols].mean(axis=1))\
#    .drop(sbp_cols + dbp_cols + pr_cols, axis=1)

measurements = temp
print(len(temp))
display(temp.head())

measurements.to_feather(os.path.join(path, dataset_path, 'temp_measurements.feather'))

502504


Unnamed: 0,eid,body_mass_index_bmi_f21001_2_0,body_mass_index_bmi_f21001_3_0,body_mass_index_bmi_f21001_0_0,body_mass_index_bmi_f21001_1_0,weight_f21002_3_0,weight_f21002_2_0,weight_f21002_1_0,weight_f21002_0_0,systolic_blood_pressure_automated_reading_f4080_3_0,...,peak_expiratory_flow_pef_f3064_2_0,peak_expiratory_flow_pef_f3064_1_2,peak_expiratory_flow_pef_f3064_1_1,peak_expiratory_flow_pef_f3064_1_0,peak_expiratory_flow_pef_f3064_0_2,peak_expiratory_flow_pef_f3064_0_1,peak_expiratory_flow_pef_f3064_3_0,peak_expiratory_flow_pef_f3064_3_1,peak_expiratory_flow_pef_f3064_2_2,peak_expiratory_flow_pef_f3064_3_2
0,1000018,,,26.5557,,,,,63.8,,...,,,,,317.0,312.0,,,,
1,1000020,,,22.7465,,,,,70.7,,...,,,,,301.0,496.0,,,,
2,1000037,,,32.4211,,,,,78.9,,...,,,,,,185.0,,,,
3,1000043,28.4349,,29.5679,,,90.6,,95.8,,...,476.0,,,,557.0,513.0,,,390.0,
4,1000051,,,41.0222,,,,,92.3,,...,,,,,,,,,,


### Lab measurements

In [16]:
fields_blood_count = [
    "30160", #	Basophill count
    "30220", #	Basophill percentage
    "30150", #	Eosinophill count
    "30210", #	Eosinophill percentage
    "30030", #	Haematocrit percentage
    "30020", #	Haemoglobin concentration
    "30300", #	High light scatter reticulocyte count
    "30290", #	High light scatter reticulocyte percentage
    "30280", #	Immature reticulocyte fraction
    "30120", #	Lymphocyte count
    "30180", #	Lymphocyte percentage
    "30050", #	Mean corpuscular haemoglobin
    "30060", #	Mean corpuscular haemoglobin concentration
    "30040", #	Mean corpuscular volume
    "30100", #	Mean platelet (thrombocyte) volume
    "30260", #	Mean reticulocyte volume
    "30270", #	Mean sphered cell volume
    "30130", #	Monocyte count
    "30190", #	Monocyte percentage
    "30140", #	Neutrophill count
    "30200", #	Neutrophill percentage
    "30170", #	Nucleated red blood cell count
    "30230", #	Nucleated red blood cell percentage
    "30080", #	Platelet count
    "30090", #	Platelet crit
    "30110", #	Platelet distribution width
    "30010", #	Red blood cell (erythrocyte) count
    "30070", #	Red blood cell (erythrocyte) distribution width
    "30250", #	Reticulocyte count
    "30240", #	Reticulocyte percentage
    "30000", #	White blood cell (leukocyte) count
]

fields_blood_biochemistry = [
    "30620",#	Alanine aminotransferase
    "30600",#	Albumin
    "30610",#	Alkaline phosphatase
    "30630",#	Apolipoprotein A
    "30640",#	Apolipoprotein B
    "30650",#	Aspartate aminotransferase
    "30710",#	C-reactive protein
    "30680",#	Calcium
    "30690",#	Cholesterol
    "30700",#	Creatinine
    "30720",#	Cystatin C
    "30660",#	Direct bilirubin
    "30730",#	Gamma glutamyltransferase
    "30740",#	Glucose
    "30750",#	Glycated haemoglobin (HbA1c)
    "30760",#	HDL cholesterol
    "30770",#	IGF-1
    "30780",#	LDL direct
    "30790",#	Lipoprotein A
    "30800",#	Oestradiol
    "30810",#	Phosphate
    "30820",#	Rheumatoid factor
    "30830",#	SHBG
    "30850",#	Testosterone
    "30840",#	Total bilirubin
    "30860",#	Total protein
    "30870",#	Triglycerides
    "30880",#	Urate
    "30670",#	Urea
    "30890",#	Vitamin D
]

fields_blood_infectious = [
    "23000", #	1gG antigen for Herpes Simplex virus-1
    "23001", #	2mgG unique antigen for Herpes Simplex virus-2
    "23049", #	Antigen assay QC indicator
    "23048", #	Antigen assay date
    "23026", #	BK VP1 antigen for Human Polyomavirus BKV
    "23039", #	CagA antigen for Helicobacter pylori
    "23043", #	Catalase antigen for Helicobacter pylori
    "23018", #	Core antigen for Hepatitis C Virus
    "23030", #	E6 antigen for Human Papillomavirus type-16
    "23031", #	E7 antigen for Human Papillomavirus type-16
    "23006", #	EA-D antigen for Epstein-Barr Virus
    "23004", #	EBNA-1 antigen for Epstein-Barr Virus
    "23042", #	GroEL antigen for Helicobacter pylori
    "23016", #	HBc antigen for Hepatitis B Virus
    "23017", #	HBe antigen for Hepatitis B Virus
    "23025", #	HIV-1 env antigen for Human Immunodeficiency Virus
    "23024", #	HIV-1 gag antigen for Human Immunodeficiency Virus
    "23023", #	HTLV-1 env antigen for Human T-Lymphotropic Virus 1
    "23022", #	HTLV-1 gag antigen for Human T-Lymphotropic Virus 1
    "23010", #	IE1A antigen for Human Herpesvirus-6
    "23011", #	IE1B antigen for Human Herpesvirus-6
    "23027", #	JC VP1 antigen for Human Polyomavirus JCV
    "23015", #	K8.1 antigen for Kaposi's Sarcoma-Associated Herpesvirus
    "23029", #	L1 antigen for Human Papillomavirus type-16
    "23032", #	L1 antigen for Human Papillomavirus type-18
    "23014", #	LANA antigen for Kaposi's Sarcoma-Associated Herpesvirus
    "23028", #	MC VP1 antigen for Merkel Cell Polyomavirus
    "23019", #	NS3 antigen for Hepatitis C Virus
    "23041", #	OMP antigen for Helicobacter pylori
    "23037", #	PorB antigen for Chlamydia trachomatis
    "23013", #	U14 antigen for Human Herpesvirus-7
    "23044", #	UreA antigen for Helicobacter pylori
    "23003", #	VCA p18 antigen for Epstein-Barr Virus
    "23040", #	VacA antigen for Helicobacter pylori
    "23005", #	ZEBRA antigen for Epstein-Barr Virus
    "23002", #	gE / gI antigen for Varicella Zoster Virus
    "23034", #	momp A antigen for Chlamydia trachomatis
    "23033", #	momp D antigen for Chlamydia trachomatis
    "23012", #	p101 k antigen for Human Herpesvirus-6
    "23020", #	p22 antigen for Toxoplasma gondii
    "23038", #	pGP3 antigen for Chlamydia trachomatis
    "23009", #	pp 28 antigen for Human Cytomegalovirus
    "23008", #	pp 52 antigen for Human Cytomegalovirus
    "23007", #	pp150 Nter antigen for Human Cytomegalovirus
    "23021", #	sag1 antigen for Toxoplasma gondii
    "23035", #	tarp-D F1 antigen for Chlamydia trachomatis
    "23036", #	tarp-D F2 antigen for Chlamydia trachomatis
]

labs = temp = get_data_fields_all(fields_blood_count+fields_blood_biochemistry+fields_blood_infectious, data, data_field)
print(len(temp))
display(temp.head())

labs.to_feather(os.path.join(path, dataset_path, 'temp_labs.feather'))

502504


Unnamed: 0,eid,basophill_count_f30160_2_0,basophill_count_f30160_1_0,basophill_count_f30160_0_0,basophill_percentage_f30220_2_0,basophill_percentage_f30220_0_0,basophill_percentage_f30220_1_0,eosinophill_count_f30150_0_0,eosinophill_count_f30150_1_0,eosinophill_count_f30150_2_0,...,total_protein_f30860_0_0,total_protein_f30860_1_0,triglycerides_f30870_0_0,triglycerides_f30870_1_0,urate_f30880_0_0,urate_f30880_1_0,urea_f30670_1_0,urea_f30670_0_0,vitamin_d_f30890_0_0,vitamin_d_f30890_1_0
0,1000018,,,0.04,,0.26,,0.25,,,...,71.97,,1.247,,221.3,,,5.48,70.7,
1,1000020,,,0.0,,0.3,,0.3,,,...,78.45,,1.906,,374.7,,,5.28,35.9,
2,1000037,,,0.04,,0.57,,0.1,,,...,,,,,,,,,,
3,1000043,,,0.02,,0.32,,0.11,,,...,69.7,,5.184,,322.8,,,6.67,63.6,
4,1000051,,,,,,,,,,...,,,,,,,,,,


## Get Demographic Data with times

In [17]:
standard = pd.concat([basics.set_index("eid"), questionnaire.set_index("eid"), measurements.set_index("eid"), labs.set_index("eid")], axis=1).reset_index()

In [18]:
cols_raw = [c[:-4] for c in standard.drop("eid", axis=1).columns.to_list()]
cols = list(dict.fromkeys(cols_raw))

In [19]:
standard

Unnamed: 0,eid,age_at_recruitment_f21022_0_0,sex_f31_0_0,ethnic_background_f21000_0_0,ethnic_background_f21000_1_0,ethnic_background_f21000_2_0,townsend_deprivation_index_at_recruitment_f189_0_0,date_of_attending_assessment_centre_f53_0_0,date_of_attending_assessment_centre_f53_1_0,date_of_attending_assessment_centre_f53_2_0,...,total_protein_f30860_0_0,total_protein_f30860_1_0,triglycerides_f30870_0_0,triglycerides_f30870_1_0,urate_f30880_0_0,urate_f30880_1_0,urea_f30670_1_0,urea_f30670_0_0,vitamin_d_f30890_0_0,vitamin_d_f30890_1_0
0,1000018,49.0,Female,British,,,-1.852930,2009-11-12,,,...,71.97,,1.247,,221.3,,,5.48,70.7,
1,1000020,59.0,Male,British,,,0.204248,2008-02-19,,,...,78.45,,1.906,,374.7,,,5.28,35.9,
2,1000037,59.0,Female,British,,,-3.498860,2008-11-11,,,...,,,,,,,,,,
3,1000043,63.0,Male,British,,,-5.351150,2009-06-03,,2018-06-08,...,69.70,,5.184,,322.8,,,6.67,63.6,
4,1000051,51.0,Female,British,,,-1.799080,2006-06-10,,2019-09-15,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502499,6025150,43.0,Female,British,British,,0.046781,2007-06-30,2012-11-17,2017-08-12,...,72.10,,0.730,2.285,298.8,356.1,5.65,4.21,41.6,17.9
502500,6025165,45.0,Female,British,,,-2.107040,2008-09-02,,,...,74.20,,1.442,,220.2,,,4.01,72.7,
502501,6025173,57.0,Male,British,,,-1.827220,2008-09-17,,,...,72.03,,1.136,,255.5,,,5.25,41.6,
502502,6025182,56.0,Male,British,,,-0.010764,2010-07-01,,,...,70.65,,5.756,,353.6,,,4.42,45.9,


In [56]:
df = standard.copy()

In [57]:
df_long = df.set_index(["eid"]).stack().reset_index()

In [58]:
df_long.columns = ["eid", "column", "value"]

In [None]:
def split_ukb_column(df, idx_col="column"):
    column = df["column"].to_list()
    df["column"] = [e[:-4] for e in column]
    df["t"]= [e[-3:] for e in column]
    return df

def split_ukb_index(df, idx_col="t"):
    new = df[idx_col].str.split("_", n = 1, expand = True) 
    df["visit"] = new[0]
    df["measurement"]= new[1]
    return df.drop(columns =[idx_col]) 

In [59]:
%%time
df_long_split = split_ukb_column(df_long, idx_col="column")

In [70]:
def df_sort_cols(df, cols): return df[start_cols+[c for c in df.columns.to_list() if c not in start_cols]]

start_cols = ["eid", "t", "column", "value"]
df_long_split = df_sort_cols(df_long_split, start_cols)

In [72]:
df_test = df_long_split.set_index(["eid", "t"])

In [77]:
df_test2 = df_test.set_index(["column"], append=True).unstack(level=-1).reset_index()

In [88]:
mi = df_test2.columns.to_flat_index().to_list()

In [91]:
ind = pd.Index([e[1] if e[1] != "" else e[0] for e in mi])
df_test2.columns = ind

Index(['eid', 't', 'age_at_recruitment_f21022',
       'alanine_aminotransferase_f30620', 'albumin_f30600',
       'alcohol_intake_frequency_f1558', 'alkaline_phosphatase_f30610',
       'apolipoprotein_a_f30630', 'apolipoprotein_b_f30640',
       'aspartate_aminotransferase_f30650',
       ...
       'total_protein_f30860',
       'townsend_deprivation_index_at_recruitment_f189',
       'triglycerides_f30870', 'trunk_fat_percentage_f23127', 'urate_f30880',
       'urea_f30670', 'vitamin_d_f30890', 'waist_circumference_f48',
       'weight_f21002', 'white_blood_cell_leukocyte_count_f30000'],
      dtype='object', length=260)

In [95]:
df_test2.columns = ind

In [99]:
start_cols = ["eid", "t"]+cols
df_test2  = df_sort_cols(df_test2, start_cols)

In [100]:
df_test2

Unnamed: 0,eid,t,age_at_recruitment_f21022,age_at_recruitment_f21022.1,age_at_recruitment_f21022.2,sex_f31,sex_f31.1,sex_f31.2,ethnic_background_f21000,ethnic_background_f21000.1,...,triglycerides_f30870,urate_f30880,urate_f30880.1,urate_f30880.2,urea_f30670,urea_f30670.1,urea_f30670.2,vitamin_d_f30890,vitamin_d_f30890.1,vitamin_d_f30890.2
0,1000018,0_0,49,0,0,Female,0,0,British,0,...,0,221.3,0,0,5.48,0,0,70.7,0,0
1,1000018,0_1,,,,,,,,,...,,,,,,,,,,
2,1000018,0_2,,,,,,,,,...,,,,,,,,,,
3,1000020,0_0,59,0,0,Male,0,0,British,0,...,0,374.7,0,0,5.28,0,0,35.9,0,0
4,1000020,0_1,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521929,6025182,0_1,,,,,,,,,...,,,,,,,,,,
1521930,6025182,0_2,,,,,,,,,...,,,,,,,,,,
1521931,6025198,0_0,67,0,0,Male,0,0,British,0,...,0,454.8,0,0,5.14,0,0,20.2,0,0
1521932,6025198,0_1,,,,,,,,,...,,,,,,,,,,


In [98]:
standard_long

Unnamed: 0,eid,t,age_at_recruitment_f21022,sex_f31,ethnic_background_f21000,townsend_deprivation_index_at_recruitment_f189,date_of_attending_assessment_centre_f53,overall_health_rating_f2178,smoking_status_f20116,alcohol_intake_frequency_f1558,...,phosphate_f30810,rheumatoid_factor_f30820,shbg_f30830,testosterone_f30850,total_bilirubin_f30840,total_protein_f30860,triglycerides_f30870,urate_f30880,urea_f30670,vitamin_d_f30890
0,1000018,0_0,49.0,Female,British,-1.852930,2009-11-12,Fair,Current,Once or twice a week,...,1.422,,70.11,1.560,7.41,71.97,1.247,221.3,5.48,70.7
1,1000018,0_1,,,,,,,,,...,,,,,,,,,,
2,1000018,0_2,,,,,,,,,...,,,,,,,,,,
3,1000020,0_0,59.0,Male,British,0.204248,2008-02-19,Good,Current,Once or twice a week,...,1.264,,55.31,12.237,8.07,78.45,1.906,374.7,5.28,35.9
4,1000020,0_1,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521929,6025182,0_1,,,,,,,,,...,,,,,,,,,,
1521930,6025182,0_2,,,,,,,,,...,,,,,,,,,,
1521931,6025198,0_0,67.0,Male,British,-1.930650,2010-01-26,Good,Current,Daily or almost daily,...,1.163,,45.09,15.030,11.85,70.62,2.327,454.8,5.14,20.2
1521932,6025198,0_1,,,,,,,,,...,,,,,,,,,,


In [79]:
standard_long

Unnamed: 0,eid,t,age_at_recruitment_f21022,sex_f31,ethnic_background_f21000,townsend_deprivation_index_at_recruitment_f189,date_of_attending_assessment_centre_f53,overall_health_rating_f2178,smoking_status_f20116,alcohol_intake_frequency_f1558,...,phosphate_f30810,rheumatoid_factor_f30820,shbg_f30830,testosterone_f30850,total_bilirubin_f30840,total_protein_f30860,triglycerides_f30870,urate_f30880,urea_f30670,vitamin_d_f30890
0,1000018,0_0,49.0,Female,British,-1.852930,2009-11-12,Fair,Current,Once or twice a week,...,1.422,,70.11,1.560,7.41,71.97,1.247,221.3,5.48,70.7
1,1000018,0_1,,,,,,,,,...,,,,,,,,,,
2,1000018,0_2,,,,,,,,,...,,,,,,,,,,
3,1000020,0_0,59.0,Male,British,0.204248,2008-02-19,Good,Current,Once or twice a week,...,1.264,,55.31,12.237,8.07,78.45,1.906,374.7,5.28,35.9
4,1000020,0_1,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521929,6025182,0_1,,,,,,,,,...,,,,,,,,,,
1521930,6025182,0_2,,,,,,,,,...,,,,,,,,,,
1521931,6025198,0_0,67.0,Male,British,-1.930650,2010-01-26,Good,Current,Daily or almost daily,...,1.163,,45.09,15.030,11.85,70.62,2.327,454.8,5.14,20.2
1521932,6025198,0_1,,,,,,,,,...,,,,,,,,,,


In [61]:
%%time
df_raw = split_ukb_index(df_long_split, idx_col="t")

In [66]:
def df_sort_cols(df, cols): return df[start_cols+[c for c in df.columns.to_list() if c not in start_cols]]

start_cols = ["eid", "visit", "measurement", "column", "value"]
df_raw = df_sort_cols(df_raw, start_cols)

In [68]:
df_raw.set_index(["eid", "visit", "measurement"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,column,value
eid,visit,measurement,Unnamed: 3_level_1,Unnamed: 4_level_1
1000018,0,0,age_at_recruitment_f21022,49
1000018,0,0,sex_f31,Female
1000018,0,0,ethnic_background_f21000,British
1000018,0,0,townsend_deprivation_index_at_recruitment_f189,-1.85293
1000018,0,0,date_of_attending_assessment_centre_f53,2009-11-12
...,...,...,...,...
6025198,0,0,total_protein_f30860,70.62
6025198,0,0,triglycerides_f30870,2.327
6025198,0,0,urate_f30880,454.8
6025198,0,0,urea_f30670,5.14


In [102]:
%%time

from joblib import Parallel, delayed

def df_wide_to_long(df): return pd.wide_to_long(standard, cols, i="eid", j="t", sep="_", suffix='\w+').reset_index()

df_input = standard
n = 100
list_df = [df_input[i:i+n] for i in range(0,df_input.shape[0],n)]

df_list = Parallel(n_jobs=10, require="sharedmem")(delayed(df_wide_to_long)(df) for df in tqdm(list_df))
df_concat = pd.concat(df_list, axis=0).reset_index(drop=True)

KeyboardInterrupt: 

In [31]:
from joblib import Parallel, delayed
df_list = Parallel(n_jobs=50)(delayed(df_wide_to_long)(df) for df in tqdm(list_df))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=503.0), HTML(value='')))

KeyboardInterrupt: 

In [20]:
%%time

standard_long = pd.wide_to_long(standard, cols, i="eid", j="t", sep="_", suffix='\w+').reset_index()#.set_index("eid")
standard_long = standard_long.dropna(how="all", subset=cols, axis=0).reset_index(drop=True)

CPU times: user 42min 7s, sys: 6min 58s, total: 49min 6s
Wall time: 50min 49s


In [21]:
def split_ukb_index(df, idx_col="t"):
    new = df[idx_col].str.split("_", n = 1, expand = True) 
    df["visit"] = new[0]
    df["measurement"]= new[1]
    return df.drop(columns =[idx_col]) 
df_raw = split_ukb_index(standard_long, idx_col="t")

In [22]:
def process_multiple_measurements(df):
    df_nonfloat = df.set_index(["eid", "visit"]).select_dtypes(exclude=np.number)
    nonfloat_columns = [c for c in df_nonfloat.columns if c not in ["measurement"]]
    df_nonfloat = df_nonfloat.dropna(subset=nonfloat_columns, how="all")[nonfloat_columns]
    df_float = df.set_index(["eid", "visit"]).select_dtypes(include=np.number).groupby(["eid", "visit"]).mean(numeric_only=True)
    df_complete = pd.concat([df_nonfloat, df_float], axis=1).reset_index()
    return df_complete

def df_sort_cols(df, cols): return df[start_cols+[c for c in df.columns.to_list() if c not in start_cols]]

start_cols = ["eid", "visit", "date_of_attending_assessment_centre_f53", "age_at_recruitment_f21022", "sex_f31", "ethnic_background_f21000"]
df_agg_measurement = df_sort_cols(process_multiple_measurements(df_raw), start_cols)

In [23]:
def get_birthdate(df_complete):
    from dateutil.relativedelta import relativedelta
    df = df_complete[df_complete.visit=="0"].copy()#.reset_index()
    df["birthdate"] =  [date - relativedelta(years=age) for date, age in zip(df.date_of_attending_assessment_centre_f53, df.age_at_recruitment_f21022)]
    df_birthdate = df.set_index("eid")[["birthdate"]]
    return df_birthdate

def convert_dates_to_timedelta(df_birthdate, df_complete):
    df_complete_bd = pd.concat([df_birthdate, df_complete.set_index(["eid"])], axis=1).reset_index()

    start_cols = ["eid", "birthdate",  "sex_f31", "ethnic_background_f21000", "visit", "date_of_attending_assessment_centre_f53"]
    df_complete_bd = df_complete_bd[start_cols+[c for c in df_complete_bd.columns.to_list() if c not in start_cols]]

    df_complete_bd = df_complete_bd.rename(columns={"visit":"t"}).assign(t= lambda x: (x.date_of_attending_assessment_centre_f53-x.birthdate).dt.days/365.2425)
    df_complete_bd = df_complete_bd.set_index(["eid", "t"]).drop(["date_of_attending_assessment_centre_f53", "age_at_recruitment_f21022"], axis=1)
    return df_complete_bd.reset_index()

In [24]:
df_birthdate = get_birthdate(df_agg_measurement)
df_baseline_time = convert_dates_to_timedelta(df_birthdate, df_agg_measurement)

## Medications

In [14]:
# https://list.essentialmeds.org/?showRemoved=0
# essential medicines WHO?!

In [15]:
atc_mapping = pd.read_csv(f"{path}/mapping/atc/atc_matched_list.csv")
athena_concepts = pd.read_csv(f"{data_path}/athena_vocabulary/CONCEPT.csv", sep="\t").assign(vocabulary_id = lambda x: x.vocabulary_id.astype("string"), concept_class_id = lambda x: x.concept_class_id.astype("string"))
atc_concepts = athena_concepts[athena_concepts.vocabulary_id=="ATC"]
atc2_concepts = atc_concepts[atc_concepts.concept_class_id=="ATC 2nd"].sort_values("concept_code")
medication_list = dict(zip([x.lower().replace(" ", "_") for x in atc2_concepts.concept_name.to_list()], [[x] for x in atc2_concepts.concept_code.to_list()]))
medication_list_extra = {
    "antihypertensives": ["C02"],
    "statins": ["C10A", "C10B"],
    "ass": ["B01"],
    "atypical_antipsychotics" : ["N05"],
    "glucocorticoids" : ["H02"]                        
}
medication_list.update(medication_list_extra)

with open(os.path.join(path, dataset_path, 'medication_list.yaml'), 'w') as file: yaml.dump(medication_list, file, default_flow_style=False)

In [None]:
def had_medication_before(data, data_field, medications, atc_mapping):
    fields = ["20003"]
    raw = get_data_fields(fields, data, data_field)
    temp = pd.melt(raw, id_vars=["eid"], value_vars=raw.drop("eid", axis=1).columns.to_list(), var_name = "field", value_name="UKBB_code").drop("field", axis=1).drop_duplicates()

    temp.UKBB_code = temp.UKBB_code.astype(str)
    temp = temp[temp.UKBB_code!="None"].copy()
    temp.UKBB_code = temp.UKBB_code.astype(int)

    temp_atc = temp.merge(atc_mapping, how="left", on="UKBB_code").sort_values("eid").reset_index(drop=True).dropna(subset=["ATC_code"], axis=0)
    temp_atc.ATC_code = temp_atc.ATC_code.astype("string")
    temp = data[["eid"]].copy()
    for med, med_codes in tqdm(medication_list.items()):
        regex_str = "^"+"|^".join(med_codes)
        df = temp_atc[temp_atc.ATC_code.str.contains(regex_str, case=False)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(medication=True)
        temp[med] = temp.merge(df, how="left", on="eid").fillna(False).medication
        
    return temp.sort_values("eid")

In [34]:
def had_diagnosis_before_per_ph(df_before, ph, ph_codes, temp):
   # regex = "|".join(ph_codes)
    #df_ph = df_before.set_index("meaning").loc[ph_codes][["eid"]]\
    #        .drop_duplicates(subset=["eid"])\
    #        .assign(phenotype=True) 
    df_ph = df_before[df_before.meaning.isin(ph_codes)][["eid"]]\
            .drop_duplicates(subset=["eid"])\
            .assign(phenotype=True) 
    #df_ph = df_before[df_before.meaning.str.contains(regex, case=False)][["eid"]]\
    #        .drop_duplicates(subset=["eid"])\
    #        .assign(phenotype=True)   
    return temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype

def had_diagnosis_before(data, diagnoses_codes, phenotypes, time0=time0_col):
    diagnoses_codes_time = diagnoses_codes.merge(data[["eid", time0]], how="left", on="eid")
    
    temp = data[["eid"]].copy()
    df_before = diagnoses_codes_time[diagnoses_codes_time.date < diagnoses_codes_time[time0]]
                                                                                         
    df_phs = Parallel(n_jobs=20, require="sharedmem")(delayed(had_diagnosis_before_per_ph)(df_before, ph, phenotypes[ph], temp) for ph in tqdm(list(phenotypes)))
    for ph, df_ph_series in zip(tqdm(list(phenotypes)), df_phs): temp[ph] = df_ph_series#temp.merge(df_ph, how="left", on="eid").fillna(False).phenotype
    
    return temp.sort_values("eid")  #reduce(lambda left,right: pd.merge(left,right,on=['eid'], how='left'), df_phs).

In [35]:
medications = had_medication_before(data, data_field, medication_list, atc_mapping)
print(len(medications))
medications.head(100)

medications.to_feather(os.path.join(path, dataset_path, 'temp_medications.feather'))

HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))


502504


## Diagnoses and events

In [25]:
vocab_dir = f"{data_path}/athena_vocabulary_covid"
vocab = {
    "concept": pd.read_csv(f"{vocab_dir}/CONCEPT.csv", sep='\t'),
    "domain": pd.read_csv(f"{vocab_dir}/DOMAIN.csv", sep='\t'),
    "class": pd.read_csv(f"{vocab_dir}/CONCEPT_CLASS.csv", sep='\t'),
    "relationship": pd.read_csv(f"{vocab_dir}/RELATIONSHIP.csv", sep='\t'),
    "drug_strength": pd.read_csv(f"{vocab_dir}/DRUG_STRENGTH.csv", sep='\t'),
    "vocabulary": pd.read_csv(f"{vocab_dir}/VOCABULARY.csv", sep='\t'),
    "concept_synonym": pd.read_csv(f"{vocab_dir}/CONCEPT_SYNONYM.csv", sep='\t'),
    "concept_ancestor": pd.read_csv(f"{vocab_dir}/CONCEPT_ANCESTOR.csv", sep='\t'),
    "concept_relationship": pd.read_csv(f"{vocab_dir}/CONCEPT_RELATIONSHIP.csv", sep='\t')                       
}

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Definitions

In [12]:
coding1836 = pd.read_csv(f"{path}/mapping/codings/coding1836.tsv", sep="\t").rename(columns={"coding":"code"})
phecodes = pd.read_csv(f"{path}/mapping/phecodes/phecode_icd10.csv")
def phenotype_children(phecodes, phenotype_list):
    l={}
    phecodes = phecodes.dropna(subset=["Phenotype"], axis=0)
    for ph, ph_names in phenotype_list.items():
        regex = "|".join(ph_names)
        l[ph] = list(phecodes[phecodes.Phenotype.str.contains(regex, case=False)].ICD10.str.replace("\\.", "").str.slice(0, 3).unique())
    return l

In [27]:
snomed_core = pd.read_csv(f"{path}/mapping/snomed_core_list.txt", sep="|")

In [31]:
snomed_core = snomed_core.query("SNOMED_CONCEPT_STATUS == 'Current'").copy()
new = snomed_core.SNOMED_FSN.str.split("(", n=1, expand=True)
snomed_core["snomed_name"] = new[0].str.rstrip(' ')
snomed_core["snomed_type"] = new[1].str.rstrip(')')
snomed_core_data = snomed_core.query("(snomed_type=='disorder' | snomed_type=='finding') & USAGE>0.01").reset_index(drop=True)

In [32]:
snomed_core_data

Unnamed: 0,SNOMED_CID,SNOMED_FSN,SNOMED_CONCEPT_STATUS,UMLS_CUI,OCCURRENCE,USAGE,FIRST_IN_SUBSET,IS_RETIRED_FROM_SUBSET,LAST_IN_SUBSET,REPLACED_BY_SNOMED_CID,snomed_name,snomed_type
0,38341003,"Hypertensive disorder, systemic arterial (diso...",Current,C0020538,8.0,3.2242,200907,False,,,"Hypertensive disorder, systemic arterial",disorder
1,55822004,Hyperlipidemia (disorder),Current,C0020473,8.0,2.1369,200907,False,,,Hyperlipidemia,disorder
2,35489007,Depressive disorder (disorder),Current,C0011581,8.0,1.5077,200907,False,,,Depressive disorder,disorder
3,235595009,Gastroesophageal reflux disease (disorder),Current,C0017168,8.0,1.3691,200907,False,,,Gastroesophageal reflux disease,disorder
4,44054006,Diabetes mellitus type 2 (disorder),Current,C0011860,8.0,1.0432,200907,False,,,Diabetes mellitus type 2,disorder
...,...,...,...,...,...,...,...,...,...,...,...,...
1013,125601008,Injury of knee (disorder),Current,C0022744,4.0,0.0101,200907,False,,,Injury of knee,disorder
1014,127295002,Traumatic brain injury (disorder),Current,C0876926,3.0,0.0101,200907,False,,,Traumatic brain injury,disorder
1015,373623009,Osteoarthritis of glenohumeral joint (disorder),Current,C0409939,2.0,0.0101,200907,False,,,Osteoarthritis of glenohumeral joint,disorder
1016,206002004,Fetal or neonatal effect of maternal medical p...,Current,C0411175,1.0,0.0101,200907,False,,,Fetal or neonatal effect of maternal medical p...,disorder


In [33]:
snomed_names = snomed_core_data.snomed_name.to_list()
snomed_names = [str(item).lower().strip().replace(" ", "_").replace(";", "").replace(",", "") for item in snomed_names]

In [34]:
phenotype_list_snomed = dict(zip(snomed_names, snomed_core_data.SNOMED_CID.to_list()))
snomed_ids = vocab["concept"].query("vocabulary_id == 'SNOMED'").concept_id.to_list()
icd10_ids = vocab["concept"].query("vocabulary_id == 'ICD10CM'").concept_id.to_list()

ph_to_icd10_mapping = {}

def map_snomed_to_icd10(ph, snomed_code, concept, concept_ancestor, concept_relationship):
    concept_ids = concept.query("vocabulary_id == 'SNOMED' & concept_code == @snomed_code").concept_id.to_list()
    snomed_desc_ids = concept_ancestor.query("ancestor_concept_id== @concept_ids").descendant_concept_id.to_list()
    ph_desc = concept.query("concept_id == @snomed_desc_ids").query("vocabulary_id == 'SNOMED'")
    l_ph_desc_ids = ph_desc.concept_id.to_list()
    ph_icd10_ids = list(concept_relationship.query("concept_id_1==@l_ph_desc_ids").query("concept_id_2 == @icd10_ids").concept_id_2.unique())
    #ph_icd10_ids = list(concept_relationship.set_index("concept_id_1").query("index==@l_ph_desc_ids").query("concept_id_2 == @icd10_ids").query("relationship_id == 'Mapped from'").concept_id_2.unique()
    
    #ph_icd10_ids = list(temp.concept_id_2.unique())
    df = concept.query("concept_id == @ph_icd10_ids & vocabulary_id == 'ICD10CM'")
    icd10_list = list(df[~df.concept_code.str.contains("OMOP", na=False)].concept_code.unique())
    icd10_list = sorted(list(set([e[:3] for e in icd10_list])))
    #print(f"{ph}: {icd10_list}")
    return {ph: sorted(list(dict.fromkeys([str(e) for e in icd10_list])))}

from joblib import Parallel, delayed
concept_ids = vocab["concept"].query("(vocabulary_id == 'SNOMED') | (vocabulary_id == 'ICD10CM')")
vocab_concept_ids = concept_ids.concept_id.to_list()
concept_ancestor = vocab["concept_ancestor"][["ancestor_concept_id", "descendant_concept_id"]].query("ancestor_concept_id == @vocab_concept_ids")
concept_rel = vocab["concept_relationship"][["concept_id_1", "concept_id_2", "relationship_id"]].query("(concept_id_1 == @vocab_concept_ids) & (concept_id_2 == @vocab_concept_ids) & (relationship_id == 'Mapped from')")
icd10_codes = Parallel(n_jobs=10, require="sharedmem")(delayed(map_snomed_to_icd10)(ph, snomed_code, 
                                                              concept_ids, concept_ancestor, concept_rel) for ph, snomed_code in tqdm(phenotype_list_snomed.items()))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1018.0), HTML(value='')))




In [35]:
l10_snomed = {}
for ph in icd10_codes: l10_snomed.update(ph)

In [36]:
l10 = {k: v for k, v in l10_snomed.items() if len(v)!=0}

#phenotype_list = {k: v for k, v in phenotype_list.items() if k in list(l10.keys())}

with open(os.path.join(path, dataset_path, 'phenotype_list.yaml'), 'w') as file: yaml.dump(l10, file, default_flow_style=False)

### 1. Self Reported

In [None]:
coding609 = pd.read_csv(f"{path}/mapping/codings/coding609.tsv", sep="\t").rename(columns={"coding":"code"})

In [None]:
from datetime import datetime, timedelta

def datetime_from_dec_year(dec_year):
    start = dec_year
    year = int(start)
    rem = start - year

    base = datetime(year, 1, 1)
    result = base + timedelta(seconds=(base.replace(year=base.year + 1) - base).total_seconds() * rem)
    #result.strftime("%Y-%m-%d")
    return result.date()

def extract_map_self_reported(data, data_field, code_map):
    pbar = tqdm(total=16)
    ### codes
    fields = ["20002"]; pbar.update(1)
    raw = get_data_fields_all(fields, data, data_field); pbar.update(1)
    col = "noncancer_illness_code_selfreported_f20002"; pbar.update(1)
    temp = pd.wide_to_long(raw, stubnames=[col], i="eid", j="instance_index", sep="_", suffix="\w+").reset_index(); pbar.update(1)
    codes = temp.rename(columns={col:"code"})\
        .assign(code=lambda x: x.code.astype(str))\
        .replace("None", np.nan) \
        .dropna(subset=["code"], axis=0)\
        .assign(code=lambda x: x.code.astype(int)) \
        .merge(code_map, how="left",on="code") \
        .dropna(subset=["meaning"], axis=0)\
        .sort_values(["eid", "instance_index"]) \
        .reset_index(drop=True); pbar.update(1)
    
    ### dates
    fields = ["20008"]; pbar.update(1)
    raw = get_data_fields_all(fields, data, data_field); pbar.update(1)
    col="interpolated_year_when_noncancer_illness_first_diagnosed_f20008"; pbar.update(1)
    temp = pd.wide_to_long(raw, stubnames=[col], i="eid", j="instance_index", sep="_", suffix="\w+").reset_index(); pbar.update(1)
    dates = temp.rename(columns={col:"date"})\
        .dropna(subset=["date"], axis=0)\
        .sort_values(["eid", "instance_index"]) \
        .reset_index(drop=True); pbar.update(1)

    dates = dates[dates.date!=-1]; pbar.update(1)
    dates = dates[dates.date!=-3]; pbar.update(1)
    dates.date = dates.date.apply(datetime_from_dec_year); pbar.update(1)
    
    test = codes.merge(dates, how="left", on=["eid", "instance_index"]).assign(origin="self_reported").copy(); pbar.update(1)
    
    test["instance_index"] = test["instance_index"].astype("string"); pbar.update(1)
    test[['instance','n']] = test.instance_index.str.split("_",expand=True); pbar.update(1)
    pbar.close()
    
    return test[["eid", "origin", 'instance','n', "code", "meaning", "date"]]

In [126]:
codes_self_reported = extract_map_self_reported(data, data_field, coding609)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16.0), HTML(value='')))

ValueError: invalid literal for int() with base 10: 'nan'

### 2. Primary Care

### 3. Hospital episode statistics

In [18]:
codes_hospital_records = pd.read_feather(f"{data_path}/1_decoded/codes_hospital_records.feather").drop("level", axis=1)
# self reported bypass
diagnoses_codes = codes_hospital_records 
diagnoses_codes.head()

Unnamed: 0,eid,origin,instance,n,code,meaning,date
0,1000018,hes_icd10,0.0,1,S0240,S02,2005-06-02
1,1000018,hes_icd10,0.0,2,W188,W18,2005-06-02
2,1000018,hes_icd10,0.0,3,K37,K37,1998-05-11
3,1000018,hes_icd10,0.0,4,K37,K37,1998-05-16
4,1000018,hes_icd10,0.0,5,K37,K37,1998-06-01


## Combine Basics and Diagnosis

In [None]:
#diagnoses_codes = codes_self_reported.append(codes_hospital_records).sort_values(["eid", "instance", "n"]).dropna(subset=["date"], axis=0)
#diagnoses_codes.head()
#diagnoses_codes.reset_index(drop=True).info()

In [38]:
l_test = l10
icd_list = [item for sublist in l_test.values() for item in sublist]
icd_list = sorted(list(dict.fromkeys(icd_list)))

icd_dict = {}
for code in tqdm(icd_list):
    diag_list = []
    for key in l_test:
        if code in l_test[key]: diag_list.append(key)
    icd_dict[code] = diag_list

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1168.0), HTML(value='')))




In [39]:
time0=time0_col
diagnoses_codes_eid = diagnoses_codes[diagnoses_codes.eid.isin(df_birthdate.reset_index().eid.to_list())].reset_index(drop=True)
diagnoses_codes_eid_icd = diagnoses_codes_eid[diagnoses_codes_eid.meaning.isin(icd_dict)]
diagnoses_codes_time = diagnoses_codes_eid_icd.merge(df_birthdate.reset_index()[["eid", time0]], how="left", on="eid")

In [40]:
dct_simple = diagnoses_codes_time.assign(t= lambda x: (x.date-x.birthdate).dt.days/365.2425)[["eid", "t", "meaning"]]
dct_simple.t= dct_simple.t.round(1)
dct_simple["diagnosis"] = [icd_dict[code] for code in dct_simple.meaning]

In [331]:
#for col in list(l10.keys()): dct_simple[col]=False

In [41]:
dct_simple_eids = df_birthdate.reset_index()[["eid"]].merge(dct_simple, how="left", on="eid").drop(["meaning"], axis=1)

In [42]:
dct_simple_eids

Unnamed: 0,eid,t,diagnosis
0,1000018,44.6,"[injury_of_head, fracture_of_bone]"
1,1000018,37.5,[appendicitis]
2,1000018,37.5,[appendicitis]
3,1000018,37.6,[appendicitis]
4,1000018,56.0,"[vaginitis, postmenopausal_bleeding, bleeding_..."
...,...,...,...
9517540,6025198,75.9,[anemia]
9517541,6025198,75.9,[liver_function_tests_abnormal]
9517542,6025198,75.9,"[tobacco_dependence_syndrome, tobacco_user, sm..."
9517543,6025198,75.9,"[hypertensive_disorder_systemic_arterial, esse..."


In [43]:
dct = dct_simple_eids.groupby(["eid", "t"]).agg({'diagnosis': "sum"})

In [44]:
dct

Unnamed: 0_level_0,Unnamed: 1_level_0,diagnosis
eid,t,Unnamed: 2_level_1
1000018,37.5,"[appendicitis, appendicitis]"
1000018,37.6,[appendicitis]
1000018,44.6,"[injury_of_head, fracture_of_bone]"
1000018,56.0,"[vaginitis, postmenopausal_bleeding, bleeding_..."
1000018,58.3,"[melanocytic_nevus, hypertensive_disorder_syst..."
...,...,...
6025173,66.5,"[neutropenic_disorder, leukopenia]"
6025182,44.2,"[urinary_tract_infectious_disease, urinary_inc..."
6025182,50.3,"[headache, pain]"
6025198,75.8,"[sepsis, methicillin_resistant_staphylococcus_..."


In [336]:
dss_list = []
keys = list(l10.keys())
for d_list in tqdm(dct["diagnosis"].values):
    dss_list.append([True if e in d_list else False for e in list(l10.keys())])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2413496.0), HTML(value='')))




KeyboardInterrupt: 

In [348]:
%load_ext Cython

In [None]:
def get_list(d_list, keys): 
    return [True if e in d_list else False for e in list(keys)]

In [None]:
def get_diagnoses_wide(diagnoses_array, keys): 
    dss_list = []
    for d_list in diagnoses_array:
        dss_list.append([True if e in d_list else False for e in keys])  
    return dss_list

In [None]:
diagnoses_array = dct["diagnosis"].values
keys = list(l10.keys())
dss_list = get_diagnoses_wide(diagnoses_array, keys)

In [None]:
diagnoses_array

In [62]:
2+2

4

In [45]:
def get_list(d_list, keys): return [True if e in d_list else False for e in keys]

keys = list(l10.keys())
dss_list = [get_list(d_list, keys) for d_list in tqdm(dct["diagnosis"].values)]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2413496.0), HTML(value='')))




In [46]:
diagnoses_df = pd.DataFrame(data=np.array(dss_list), columns=list(l10.keys()))

In [47]:
diagnoses = pd.concat([dct.reset_index(), diagnoses_df], axis=1).set_index(["eid", "t"])

In [48]:
diagnoses

Unnamed: 0_level_0,Unnamed: 1_level_0,diagnosis,hypertensive_disorder_systemic_arterial,hyperlipidemia,depressive_disorder,gastroesophageal_reflux_disease,diabetes_mellitus_type_2,essential_hypertension,obesity,diabetes_mellitus,asthma,...,nonvenomous_insect_bite,spondylolisthesis,malignant_tumor_of_esophagus,aphthous_ulcer_of_mouth,ventricular_septal_defect,oropharyngeal_dysphagia,injury_of_knee,traumatic_brain_injury,osteoarthritis_of_glenohumeral_joint,fetal_or_neonatal_effect_of_maternal_medical_problem
eid,t,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000018,37.5,"[appendicitis, appendicitis]",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000018,37.6,[appendicitis],False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000018,44.6,"[injury_of_head, fracture_of_bone]",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000018,56.0,"[vaginitis, postmenopausal_bleeding, bleeding_...",True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000018,58.3,"[melanocytic_nevus, hypertensive_disorder_syst...",True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025173,66.5,"[neutropenic_disorder, leukopenia]",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6025182,44.2,"[urinary_tract_infectious_disease, urinary_inc...",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6025182,50.3,"[headache, pain]",False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6025198,75.8,"[sepsis, methicillin_resistant_staphylococcus_...",True,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
df_baseline_time.t= df_baseline_time.t.round(1)
df_baseline_time = df_baseline_time.set_index(["eid", "t"])

In [102]:
na_cols = df_baseline_time.columns.to_list()[1:]
df_baseline_time = df_baseline_time.dropna(how="all", subset=na_cols, axis=0)#.reset_index(drop=True)

In [103]:
df_baseline_time

Unnamed: 0_level_0,Unnamed: 1_level_0,birthdate,sex_f31,ethnic_background_f21000,overall_health_rating_f2178,smoking_status_f20116,alcohol_intake_frequency_f1558,townsend_deprivation_index_at_recruitment_f189,body_mass_index_bmi_f21001,weight_f21002,systolic_blood_pressure_automated_reading_f4080,...,phosphate_f30810,rheumatoid_factor_f30820,shbg_f30830,testosterone_f30850,total_bilirubin_f30840,total_protein_f30860,triglycerides_f30870,urate_f30880,urea_f30670,vitamin_d_f30890
eid,t,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000018,49.0,1960-11-12,Female,British,Fair,Current,Once or twice a week,-1.852930,26.5557,63.8,159.5,...,1.422,,70.11,1.560,7.41,71.97,1.247,221.3,5.48,70.7
1000020,59.0,1949-02-19,Male,British,Good,Current,Once or twice a week,0.204248,22.7465,70.7,133.0,...,1.264,,55.31,12.237,8.07,78.45,1.906,374.7,5.28,35.9
1000037,59.0,1949-11-11,Female,British,Good,Previous,Once or twice a week,-3.498860,32.4211,78.9,118.5,...,,,,,,,,,,
1000043,63.0,1946-06-03,Male,British,Fair,Previous,Three or four times a week,-5.351150,29.5679,95.8,141.5,...,0.928,,31.63,11.398,8.65,69.70,5.184,322.8,6.67,63.6
1000043,72.0,1946-06-03,,,Fair,Previous,Three or four times a week,,28.4349,90.6,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025150,55.3,1964-06-30,,,Good,Never,Three or four times a week,,33.5072,95.7,131.0,...,,,,,,,,,,
6025165,45.0,1963-09-02,Female,British,Good,Never,Three or four times a week,-2.107040,24.2275,62.8,152.5,...,0.996,,73.38,0.652,11.19,74.20,1.442,220.2,4.01,72.7
6025173,57.0,1951-09-17,Male,British,Good,Never,Never,-1.827220,25.9504,81.3,131.0,...,1.119,,50.13,13.517,6.31,72.03,1.136,255.5,5.25,41.6
6025182,56.0,1954-07-01,Male,British,Excellent,Previous,Daily or almost daily,-0.010764,29.1425,104.1,127.5,...,0.986,,24.48,10.951,9.95,70.65,5.756,353.6,4.42,45.9


In [104]:
%%time

df_baseline_diagnoses= pd.concat([df_baseline_time, diagnoses], axis=1)
df_baseline_diagnoses

CPU times: user 3min 26s, sys: 1min 11s, total: 4min 37s
Wall time: 4min 36s


Unnamed: 0_level_0,Unnamed: 1_level_0,birthdate,sex_f31,ethnic_background_f21000,overall_health_rating_f2178,smoking_status_f20116,alcohol_intake_frequency_f1558,townsend_deprivation_index_at_recruitment_f189,body_mass_index_bmi_f21001,weight_f21002,systolic_blood_pressure_automated_reading_f4080,...,nonvenomous_insect_bite,spondylolisthesis,malignant_tumor_of_esophagus,aphthous_ulcer_of_mouth,ventricular_septal_defect,oropharyngeal_dysphagia,injury_of_knee,traumatic_brain_injury,osteoarthritis_of_glenohumeral_joint,fetal_or_neonatal_effect_of_maternal_medical_problem
eid,t,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000018,37.5,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1000018,37.6,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1000018,44.6,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1000018,49.0,1960-11-12,Female,British,Fair,Current,Once or twice a week,-1.852930,26.5557,63.8,159.5,...,,,,,,,,,,
1000018,56.0,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025182,50.3,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
6025182,56.0,1954-07-01,Male,British,Excellent,Previous,Daily or almost daily,-0.010764,29.1425,104.1,127.5,...,,,,,,,,,,
6025198,67.0,1943-01-26,Male,British,Good,Current,Daily or almost daily,-1.930650,29.5988,102.4,156.5,...,,,,,,,,,,
6025198,75.8,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False


In [106]:
df_baseline_diagnoses.reset_index().to_feather(os.path.join(path, dataset_path, 'baseline_diagnoses_times.feather'))

In [107]:
df_baseline_diagnoses.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2979355 entries, (1000018, 37.5) to (6025198, 75.9)
Columns: 883 entries, birthdate to fetal_or_neonatal_effect_of_maternal_medical_problem
dtypes: float64(79), object(804)
memory usage: 19.6+ GB


In [9]:
df_baseline_diagnoses = pd.read_feather(os.path.join(path, dataset_path, 'baseline_diagnoses_times.feather'))

In [10]:
df_baseline_diagnoses

Unnamed: 0,eid,t,birthdate,sex_f31,ethnic_background_f21000,overall_health_rating_f2178,smoking_status_f20116,alcohol_intake_frequency_f1558,townsend_deprivation_index_at_recruitment_f189,body_mass_index_bmi_f21001,...,nonvenomous_insect_bite,spondylolisthesis,malignant_tumor_of_esophagus,aphthous_ulcer_of_mouth,ventricular_septal_defect,oropharyngeal_dysphagia,injury_of_knee,traumatic_brain_injury,osteoarthritis_of_glenohumeral_joint,fetal_or_neonatal_effect_of_maternal_medical_problem
0,1000018,37.5,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1,1000018,37.6,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
2,1000018,44.6,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
3,1000018,49.0,1960-11-12,Female,British,Fair,Current,Once or twice a week,-1.852930,26.5557,...,,,,,,,,,,
4,1000018,56.0,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979350,6025182,50.3,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
2979351,6025182,56.0,1954-07-01,Male,British,Excellent,Previous,Daily or almost daily,-0.010764,29.1425,...,,,,,,,,,,
2979352,6025198,67.0,1943-01-26,Male,British,Good,Current,Daily or almost daily,-1.930650,29.5988,...,,,,,,,,,,
2979353,6025198,75.8,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False


In [None]:
basic_data = pd.concat([basics_birthdate.set_index("eid")[["birthdate"]], df_baseline_diagnoses.set_index(["eid"]).drop(columns="birthdate")], axis=1)

In [57]:
basic_data.reset_index().to_feather(os.path.join(path, dataset_path, 'basic_data.feather'))

In [3]:
basic_data = pd.read_feather(os.path.join(path, dataset_path, 'basic_data.feather')).set_index(["eid", "t"])

In [4]:
basic_data

Unnamed: 0_level_0,Unnamed: 1_level_0,birthdate,sex_f31,ethnic_background_f21000,overall_health_rating_f2178,smoking_status_f20116,alcohol_intake_frequency_f1558,townsend_deprivation_index_at_recruitment_f189,body_mass_index_bmi_f21001,weight_f21002,systolic_blood_pressure_automated_reading_f4080,...,nonvenomous_insect_bite,spondylolisthesis,malignant_tumor_of_esophagus,aphthous_ulcer_of_mouth,ventricular_septal_defect,oropharyngeal_dysphagia,injury_of_knee,traumatic_brain_injury,osteoarthritis_of_glenohumeral_joint,fetal_or_neonatal_effect_of_maternal_medical_problem
eid,t,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000018,37.5,1960-11-12,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1000018,37.6,1960-11-12,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1000018,44.6,1960-11-12,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1000018,49.0,1960-11-12,Female,British,Fair,Current,Once or twice a week,-1.852930,26.5557,63.8,159.5,...,,,,,,,,,,
1000018,56.0,1960-11-12,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025182,50.3,1954-07-01,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
6025182,56.0,1954-07-01,Male,British,Excellent,Previous,Daily or almost daily,-0.010764,29.1425,104.1,127.5,...,,,,,,,,,,
6025198,67.0,1943-01-26,Male,British,Good,Current,Daily or almost daily,-1.930650,29.5988,102.4,156.5,...,,,,,,,,,,
6025198,75.8,1943-01-26,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False


In [None]:
import xarray as xa
x_data = basic_data.to_xarray()

In [9]:
x_data

In [60]:
basic_data_cols = basic_data.drop(columns="birthdate").columns.to_list()

In [61]:
basic_data_cols

['t',
 'sex_f31',
 'ethnic_background_f21000',
 'overall_health_rating_f2178',
 'smoking_status_f20116',
 'alcohol_intake_frequency_f1558',
 'townsend_deprivation_index_at_recruitment_f189',
 'body_mass_index_bmi_f21001',
 'weight_f21002',
 'systolic_blood_pressure_automated_reading_f4080',
 'diastolic_blood_pressure_automated_reading_f4079',
 'pulse_rate_automated_reading_f102',
 'pulse_wave_arterial_stiffness_index_f21021',
 'pulse_wave_reflection_index_f4195',
 'waist_circumference_f48',
 'hip_circumference_f49',
 'standing_height_f50',
 'trunk_fat_percentage_f23127',
 'body_fat_percentage_f23099',
 'basal_metabolic_rate_f23105',
 'forced_vital_capacity_fvc_best_measure_f20151',
 'forced_expiratory_volume_in_1second_fev1_best_measure_f20150',
 'fev1_fvc_ratio_zscore_f20258',
 'peak_expiratory_flow_pef_f3064',
 'basophill_count_f30160',
 'basophill_percentage_f30220',
 'eosinophill_count_f30150',
 'eosinophill_percentage_f30210',
 'haematocrit_percentage_f30030',
 'haemoglobin_concen

# Endpoints

In [89]:
### define in snomed and get icd codes from there

### 1. Hospital admissions

In [13]:
endpoint_list = {
    "myocardial_infarction": ["Myocardial infarction"],
    "stroke": ["Cerebrovascular disease"],
    "cancer_breast" : ["Breast Cancer"],
    "diabetes" : ["Diabetes"],
    "atrial_fibrillation": ["Atrial fibrillation", "Atrial flutter", "paroxysmal tachycardia"],
    "copd": ["COPD"],
    "dementia":["dementia"]
}

endpoint_list = phenotype_children(phecodes, endpoint_list)
endpoint_list["cancer_breast"] = ["C50"]
endpoint_list["copd"] = ["J44"]
endpoint_list["diabetes"] = ["E10", "E11", "E12", "E13", "E14"]
endpoint_list["atrial_fibrillation"] = ["I47", "I48"]


with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file: yaml.dump(endpoint_list, file, default_flow_style=False)
endpoint_list

{'myocardial_infarction': ['I21', 'I22', 'I23', 'I24', 'I25', 'I51'],
 'stroke': ['G45', 'G46', 'I60', 'I67', 'I68', 'I69'],
 'cancer_breast': ['C50'],
 'diabetes': ['E10', 'E11', 'E12', 'E13', 'E14'],
 'atrial_fibrillation': ['I47', 'I48'],
 'copd': ['J44'],
 'dementia': ['F00', 'F01', 'F02', 'F03', 'F09', 'G31', 'R54']}

In [14]:
from dateutil.relativedelta import relativedelta

def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None):
    if level is not None: diagnoses_codes = diagnoses_codes.query("level==@level")
    diagnoses_codes_time0 = diagnoses_codes.merge(data[["eid", time0_col]], how="left", on="eid")
    
    cens_time_right = min(diagnoses_codes.sort_values('date').groupby('origin').tail(1).date.to_list())
    print(f"t_0: {time0_col}")
    print(f"t_cens: {cens_time_right}")
    
    df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) & 
                                        (diagnoses_codes_time0.date < cens_time_right)]
    
    temp = data[["eid", time0_col]].copy()
    for ph, ph_codes in tqdm(endpoint_list.items()):
        regex = "|".join(ph_codes)
        ph_df = df_interval[df_interval.meaning.str.contains(regex, case=False)] \
            .sort_values('date').groupby('eid').head(1).assign(phenotype=1, date=lambda x: x.date)
        temp_ph = temp.merge(ph_df, how="left", on="eid").fillna(0)
        temp[ph+"_event"], temp[ph+"_event_date"] = temp_ph.phenotype, temp_ph.date
        
        fill_date = {ph+"_event_date" : lambda x: [cens_time_right if event==0 else event_date for event, event_date in zip(x[ph+"_event"], x[ph+"_event_date"])]}
        calc_tte = {ph+"_event_time" : lambda x: [(event_date-time0).days/365.25  for time0, event_date in zip(x[time0_col], x[ph+"_event_date"])]}
        
        temp = temp.assign(**fill_date).assign(**calc_tte).drop([ph+"_event_date"], axis=1)
        
    temp = temp.drop([time0_col], axis=1)     
    
    return temp.drop_duplicates()

In [20]:
basics

Unnamed: 0,eid,age_at_recruitment_f21022_0_0,sex_f31_0_0,ethnic_background_f21000_0_0,ethnic_background_f21000_1_0,ethnic_background_f21000_2_0,townsend_deprivation_index_at_recruitment_f189_0_0,date_of_attending_assessment_centre_f53_0_0,date_of_attending_assessment_centre_f53_1_0,date_of_attending_assessment_centre_f53_2_0,date_of_attending_assessment_centre_f53_3_0
0,1000018,49.0,Female,British,,,-1.852930,2009-11-12,,,
1,1000020,59.0,Male,British,,,0.204248,2008-02-19,,,
2,1000037,59.0,Female,British,,,-3.498860,2008-11-11,,,
3,1000043,63.0,Male,British,,,-5.351150,2009-06-03,,2018-06-08,
4,1000051,51.0,Female,British,,,-1.799080,2006-06-10,,2019-09-15,
...,...,...,...,...,...,...,...,...,...,...,...
502499,6025150,43.0,Female,British,British,,0.046781,2007-06-30,2012-11-17,2017-08-12,2019-10-20
502500,6025165,45.0,Female,British,,,-2.107040,2008-09-02,,,
502501,6025173,57.0,Male,British,,,-1.827220,2008-09-17,,,
502502,6025182,56.0,Male,British,,,-0.010764,2010-07-01,,,


In [22]:
basics_birthdate

Unnamed: 0,eid,age_at_recruitment_f21022_0_0,sex_f31_0_0,ethnic_background_f21000_0_0,ethnic_background_f21000_1_0,ethnic_background_f21000_2_0,townsend_deprivation_index_at_recruitment_f189_0_0,date_of_attending_assessment_centre_f53_0_0,date_of_attending_assessment_centre_f53_1_0,date_of_attending_assessment_centre_f53_2_0,date_of_attending_assessment_centre_f53_3_0,birth_date
0,1000018,49.0,Female,British,,,-1.852930,2009-11-12,,,,1960-11-12
1,1000020,59.0,Male,British,,,0.204248,2008-02-19,,,,1949-02-19
2,1000037,59.0,Female,British,,,-3.498860,2008-11-11,,,,1949-11-11
3,1000043,63.0,Male,British,,,-5.351150,2009-06-03,,2018-06-08,,1946-06-03
4,1000051,51.0,Female,British,,,-1.799080,2006-06-10,,2019-09-15,,1955-06-10
...,...,...,...,...,...,...,...,...,...,...,...,...
502499,6025150,43.0,Female,British,British,,0.046781,2007-06-30,2012-11-17,2017-08-12,2019-10-20,1964-06-30
502500,6025165,45.0,Female,British,,,-2.107040,2008-09-02,,,,1963-09-02
502501,6025173,57.0,Male,British,,,-1.827220,2008-09-17,,,,1951-09-17
502502,6025182,56.0,Male,British,,,-0.010764,2010-07-01,,,,1954-07-01


In [23]:
from dateutil.relativedelta import relativedelta
calc_birth_date = [date_of_attending_assessment_centre - relativedelta(years=age_at_recruitment) 
                                                             for date_of_attending_assessment_centre, age_at_recruitment 
                                                             in zip(basics["date_of_attending_assessment_centre_f53_0_0"], basics["age_at_recruitment_f21022_0_0"])]
basics_birthdate = basics.assign(birthdate = calc_birth_date)
endpoints_hospital = extract_endpoints_tte(basics_birthdate, diagnoses_codes, endpoint_list, time0_col)
print(len(endpoints_hospital))
endpoints_hospital.head()

t_0: birthdate
t_cens: 2020-03-31


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))


502504


Unnamed: 0,eid,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,cancer_breast_event,cancer_breast_event_time,diabetes_event,diabetes_event_time,atrial_fibrillation_event,atrial_fibrillation_event_time,copd_event,copd_event_time,dementia_event,dementia_event_time
0,1000018,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246
1,1000020,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198
2,1000037,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668
3,1000043,1.0,68.123203,0.0,73.826146,0.0,73.826146,0.0,73.826146,0.0,73.826146,1.0,63.293634,0.0,73.826146
4,1000051,0.0,64.807666,0.0,64.807666,0.0,64.807666,1.0,55.723477,0.0,64.807666,1.0,55.841205,0.0,64.807666


### 2. Death registry

In [24]:
death_list = {
    "death_allcause":[],
    "death_cvd":['I{:02}'.format(ID+1) for ID in range(0, 98)],
}

death_codes = pd.read_feather(f"{data_path}/1_decoded/codes_death_records.feather")#.drop("level", axis=1)

with open(os.path.join(path, dataset_path, 'death_list.yaml'), 'w') as file: yaml.dump(death_list, file, default_flow_style=False)

In [25]:
endpoints_death = extract_endpoints_tte(basics_birthdate, death_codes, death_list, time0_col, level="1")

t_0: birthdate
t_cens: 2020-06-28


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




## SCORES

In [26]:
scores_list = {
    "SCORE":['I{:02}'.format(ID) for ID in [10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 44, 45, 46, 47, 48, 49, 50, 51, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73]],
    "ASCVD":['I{:02}'.format(ID) for ID in [20, 21, 22, 23, 24, 25, 63]],
    "QRISK3":["G45", "I20", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],
    "MACE":["G45", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],    
}
with open(os.path.join(path, dataset_path, 'scores_list.yaml'), 'w') as file: yaml.dump(scores_list, file, default_flow_style=False)

In [27]:
scores_list_hospital = {}
scores_list_death = {}
for score, score_codes in scores_list.items():
    scores_list_hospital["hospital_"+score] = score_codes
    scores_list_death["death_"+score] = score_codes

In [28]:
endpoints_scores = {
    "hospital": extract_endpoints_tte(basics_birthdate, diagnoses_codes, scores_list_hospital, time0_col=time0_col),
    "death": extract_endpoints_tte(basics_birthdate, death_codes, scores_list_death, time0_col=time0_col, level=1)}

t_0: birthdate
t_cens: 2020-03-31


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))


t_0: birthdate
t_cens: 2020-06-28


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [29]:
endpoints_scores_all = endpoints_scores["hospital"].merge(endpoints_scores["death"], on="eid", how="left")

### ESC SCORE (Conroy 2003)

In [30]:
score = "SCORE"

temp = endpoints_scores_all.filter(regex="eid|"+score).rename(columns={"death_SCORE_event":"SCORE_event", "death_SCORE_event_time":"SCORE_event_time"})
score_SCORE = temp = temp[["eid", "SCORE_event", "SCORE_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

5323


Unnamed: 0,eid,SCORE_event,SCORE_event_time
45,1000463,1.0,74.16564
83,1000841,1.0,76.005476
102,1001031,1.0,75.537303
122,1001237,1.0,50.132786
176,1001777,1.0,72.238193


### ASCVD (Goff 2014)

In [31]:
score = "ASCVD"
temp = endpoints_scores_all.filter(regex="eid|"+score)

aggr_event = {score +"_event" : lambda x: [1 if (hospital_event==1) | (death_event == 1) else 0 
                                                    for hospital_event, death_event in zip(x["hospital_"+score+"_event"], x["death_"+score+"_event"])]}
aggr_date = {score +"_event_time" : lambda x: [min(hospital_event_time, death_event_time)
                                                        for hospital_event_time, death_event_time in zip(x["hospital_"+score+"_event_time"], x["death_"+score+"_event_time"])]}

score_ASCVD = temp = temp.assign(**aggr_event).assign(**aggr_date)[["eid", score +"_event", score +"_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

58893


Unnamed: 0,eid,ASCVD_event,ASCVD_event_time
2,1000037,1,66.970568
3,1000043,1,68.123203
5,1000066,1,65.051335
6,1000079,1,61.054073
22,1000233,1,68.673511


### UK QRISK3 (2017)

In [32]:
score = "QRISK3"
temp = endpoints_scores_all.filter(regex="eid|"+score)

aggr_event = {score +"_event" : lambda x: [1 if (hospital_event==1) | (death_event == 1) else 0 
                                                    for hospital_event, death_event in zip(x["hospital_"+score+"_event"], x["death_"+score+"_event"])]}
aggr_date = {score +"_event_time" : lambda x: [min(hospital_event_time, death_event_time)
                                                        for hospital_event_time, death_event_time in zip(x["hospital_"+score+"_event_time"], x["death_"+score+"_event_time"])]}

score_QRISK3 = temp = temp.assign(**aggr_event).assign(**aggr_date)[["eid", score +"_event", score +"_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

62625


Unnamed: 0,eid,QRISK3_event,QRISK3_event_time
2,1000037,1,66.970568
3,1000043,1,68.123203
5,1000066,1,65.051335
6,1000079,1,61.054073
22,1000233,1,68.673511


### MACE (2020)

In [33]:
score = "MACE"
temp = endpoints_scores_all.filter(regex="eid|"+score)

aggr_event = {score +"_event" : lambda x: [1 if (hospital_event==1) | (death_event == 1) else 0 
                                                    for hospital_event, death_event in zip(x["hospital_"+score+"_event"], x["death_"+score+"_event"])]}
aggr_date = {score +"_event_time" : lambda x: [min(hospital_event_time, death_event_time)
                                                        for hospital_event_time, death_event_time in zip(x["hospital_"+score+"_event_time"], x["death_"+score+"_event_time"])]}

score_MACE = temp = temp.assign(**aggr_event).assign(**aggr_date)[["eid", score +"_event", score +"_event_time"]]
print(len(temp.query(score+"_event==1")))
temp.query(score+"_event==1").head()

57031


Unnamed: 0,eid,MACE_event,MACE_event_time
3,1000043,1,68.123203
22,1000233,1,68.673511
30,1000319,1,56.922656
45,1000463,1,74.069815
83,1000841,1,75.980835


In [41]:
endpoints_all_list = [df.set_index("eid") for df in [endpoints_hospital, endpoints_death, score_SCORE, score_ASCVD, score_QRISK3, score_MACE]]
endpoints_all = pd.concat(endpoints_all_list, axis=1)

In [42]:
endpoints_all 

Unnamed: 0_level_0,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,cancer_breast_event,cancer_breast_event_time,diabetes_event,diabetes_event_time,atrial_fibrillation_event,atrial_fibrillation_event_time,...,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000018,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,...,0.0,59.624914,0.0,59.624914,0,59.381246,0,59.381246,0,59.381246
1000020,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,...,0.0,71.353867,0.0,71.353867,0,71.110198,0,71.110198,0,71.110198
1000037,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,...,0.0,70.628337,0.0,70.628337,1,66.970568,1,66.970568,0,70.384668
1000043,1.0,68.123203,0.0,73.826146,0.0,73.826146,0.0,73.826146,0.0,73.826146,...,0.0,74.069815,0.0,74.069815,1,68.123203,1,68.123203,1,68.123203
1000051,0.0,64.807666,0.0,64.807666,0.0,64.807666,1.0,55.723477,0.0,64.807666,...,0.0,65.051335,0.0,65.051335,0,64.807666,0,64.807666,0,64.807666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025150,0.0,55.750856,0.0,55.750856,0.0,55.750856,0.0,55.750856,0.0,55.750856,...,0.0,55.994524,0.0,55.994524,0,55.750856,0,55.750856,0,55.750856
6025165,0.0,56.577687,0.0,56.577687,0.0,56.577687,0.0,56.577687,0.0,56.577687,...,0.0,56.821355,0.0,56.821355,0,56.577687,0,56.577687,0,56.577687
6025173,0.0,68.536619,0.0,68.536619,0.0,68.536619,0.0,68.536619,0.0,68.536619,...,0.0,68.780287,0.0,68.780287,0,68.536619,0,68.536619,0,68.536619
6025182,0.0,65.749487,0.0,65.749487,0.0,65.749487,0.0,65.749487,0.0,65.749487,...,0.0,65.993155,0.0,65.993155,0,65.749487,0,65.749487,0,65.749487


In [43]:
endpoints_all.reset_index().to_feather(os.path.join(path, dataset_path, 'endpoints_all.feather'))

In [44]:
endpoints_all

Unnamed: 0_level_0,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,cancer_breast_event,cancer_breast_event_time,diabetes_event,diabetes_event_time,atrial_fibrillation_event,atrial_fibrillation_event_time,...,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000018,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,0.0,59.381246,...,0.0,59.624914,0.0,59.624914,0,59.381246,0,59.381246,0,59.381246
1000020,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,0.0,71.110198,...,0.0,71.353867,0.0,71.353867,0,71.110198,0,71.110198,0,71.110198
1000037,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,0.0,70.384668,...,0.0,70.628337,0.0,70.628337,1,66.970568,1,66.970568,0,70.384668
1000043,1.0,68.123203,0.0,73.826146,0.0,73.826146,0.0,73.826146,0.0,73.826146,...,0.0,74.069815,0.0,74.069815,1,68.123203,1,68.123203,1,68.123203
1000051,0.0,64.807666,0.0,64.807666,0.0,64.807666,1.0,55.723477,0.0,64.807666,...,0.0,65.051335,0.0,65.051335,0,64.807666,0,64.807666,0,64.807666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025150,0.0,55.750856,0.0,55.750856,0.0,55.750856,0.0,55.750856,0.0,55.750856,...,0.0,55.994524,0.0,55.994524,0,55.750856,0,55.750856,0,55.750856
6025165,0.0,56.577687,0.0,56.577687,0.0,56.577687,0.0,56.577687,0.0,56.577687,...,0.0,56.821355,0.0,56.821355,0,56.577687,0,56.577687,0,56.577687
6025173,0.0,68.536619,0.0,68.536619,0.0,68.536619,0.0,68.536619,0.0,68.536619,...,0.0,68.780287,0.0,68.780287,0,68.536619,0,68.536619,0,68.536619
6025182,0.0,65.749487,0.0,65.749487,0.0,65.749487,0.0,65.749487,0.0,65.749487,...,0.0,65.993155,0.0,65.993155,0,65.749487,0,65.749487,0,65.749487


## Merge Everything

In [32]:
data_dfs_dict = {"basics": pd.read_feather(os.path.join(path, dataset_path, 'temp_basics.feather')), 
                 "questionnaire": pd.read_feather(os.path.join(path, dataset_path, 'temp_questionnaire.feather')), 
                 "measurements": pd.read_feather(os.path.join(path, dataset_path, 'temp_measurements.feather')), 
                 "labs": pd.read_feather(os.path.join(path, dataset_path, 'temp_labs.feather')), 
                 "family_history": pd.read_feather(os.path.join(path, dataset_path, 'temp_family_history.feather')), 
                 "diagnoses": pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses.feather')),
                # "diagnoses_emb": pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_emb.feather')), 
                 "medications": pd.read_feather(os.path.join(path, dataset_path, 'temp_medications.feather')), 
                 "endpoints_hospital":endpoints_hospital, 
                 "endpoints_death":endpoints_death, 
                 "score_SCORE":score_SCORE, 
                 "score_ASCVD":score_ASCVD, 
                 "score_QRISK3":score_QRISK3,
                 "score_MACE":score_MACE}

In [33]:
def get_cols_clean(df):
    df.columns = df.columns.str.replace(r'_0_0$', '').str.replace(r'_f[0-9]+$', '').str.replace("_automated_reading", '')
    return df.columns

def clean_df(df):
    df.columns = get_cols_clean(df)
    return df

In [34]:
import pandas as pd
from functools import reduce

data_baseline = reduce(lambda x, y: pd.merge(x, y, on = 'eid'), list(data_dfs_dict.values()))

In [35]:
data_baseline = clean_df(data_baseline)

In [36]:
for col in [col for col in list(data_baseline.columns) if ("_event" in col) & ("_time" not in col)]:
    data_baseline[col] = data_baseline[col].astype(int)

In [37]:
covariates = [col for col in list(data_baseline.columns) if not "_event" in col]
targets = [col for col in list(data_baseline.columns) if "_event" in col]

# Exporting

In [38]:
data_baseline.head()

Unnamed: 0,eid,age_at_recruitment,sex,ethnic_background,townsend_deprivation_index_at_recruitment,date_of_attending_assessment_centre,birth_date,overall_health_rating,smoking_status,alcohol_intake_frequency,...,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
0,1000018,49.0,Female,White,-1.85293,2009-11-12,1960-11-12,Fair,Current,Once or twice a week,...,0,59.624914,0,59.624914,0,59.334702,0,59.334702,0,59.334702
1,1000020,59.0,Male,White,0.204248,2008-02-19,1949-02-19,Good,Current,Once or twice a week,...,0,71.353867,0,71.353867,0,71.063655,0,71.063655,0,71.063655
2,1000037,59.0,Female,White,-3.49886,2008-11-11,1949-11-11,Good,Previous,Once or twice a week,...,0,70.628337,0,70.628337,1,66.970568,1,66.970568,0,70.338125
3,1000043,63.0,Male,White,-5.35115,2009-06-03,1946-06-03,Fair,Previous,Three or four times a week,...,0,74.069815,0,74.069815,1,68.123203,1,68.123203,1,68.123203
4,1000051,51.0,Female,White,-1.79908,2006-06-10,1955-06-10,Poor,Never,One to three times a month,...,0,65.051335,0,65.051335,0,64.761123,0,64.761123,0,64.761123


In [39]:
data_cols = {}
for topic, df in data_dfs_dict.items(): 
    data_cols["eid"] = ["admin"]
    data_cols[topic]=list(get_cols_clean(df))[1:]

In [40]:
data_cols_single = {}
for topic, columns in data_cols.items():
    for col in columns:
        data_cols_single[col] = topic

In [41]:
dtypes = {"int32":"integer", "int64":"integer", "float64":"numeric", "category":"category", "object":"category", "bool":"logical"}
desc_dict = {"id": [*range(1, len(data_baseline.columns.to_list())+1)] , 
             "covariate": data_baseline.columns.to_list(), 
             "dtype":[dtypes[str(col)] for col in data_baseline.dtypes.to_list()], 
             "isTarget":[True if col in targets else False for col in data_baseline.columns.to_list()],
            "based_on":[topic for col, topic in data_cols_single.items()],
            "aggr_fn": [np.nan for col in data_baseline.columns.to_list()]}
data_baseline_description = pd.DataFrame.from_dict(desc_dict)
data_baseline_description

Unnamed: 0,id,covariate,dtype,isTarget,based_on,aggr_fn
0,1,eid,integer,False,eid,
1,2,age_at_recruitment,numeric,False,basics,
2,3,sex,category,False,basics,
3,4,ethnic_background,category,False,basics,
4,5,townsend_deprivation_index_at_recruitment,numeric,False,basics,
...,...,...,...,...,...,...
3741,3742,ASCVD_event_time,numeric,True,score_ASCVD,
3742,3743,QRISK3_event,integer,True,score_QRISK3,
3743,3744,QRISK3_event_time,numeric,True,score_QRISK3,
3744,3745,MACE_event,integer,True,score_MACE,


# Exclusion Criteria

In [42]:
data_baseline_excl = data_baseline.query("myocardial_infarction == False & coronary_heart_disease == False & statins == False").reset_index(drop=True)

In [43]:
data_baseline_excl

Unnamed: 0,eid,age_at_recruitment,sex,ethnic_background,townsend_deprivation_index_at_recruitment,date_of_attending_assessment_centre,birth_date,overall_health_rating,smoking_status,alcohol_intake_frequency,...,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
0,1000018,49.0,Female,White,-1.852930,2009-11-12,1960-11-12,Fair,Current,Once or twice a week,...,0,59.624914,0,59.624914,0,59.334702,0,59.334702,0,59.334702
1,1000020,59.0,Male,White,0.204248,2008-02-19,1949-02-19,Good,Current,Once or twice a week,...,0,71.353867,0,71.353867,0,71.063655,0,71.063655,0,71.063655
2,1000037,59.0,Female,White,-3.498860,2008-11-11,1949-11-11,Good,Previous,Once or twice a week,...,0,70.628337,0,70.628337,1,66.970568,1,66.970568,0,70.338125
3,1000043,63.0,Male,White,-5.351150,2009-06-03,1946-06-03,Fair,Previous,Three or four times a week,...,0,74.069815,0,74.069815,1,68.123203,1,68.123203,1,68.123203
4,1000079,60.0,Female,White,-2.708040,2008-03-18,1948-03-18,Fair,Never,Once or twice a week,...,0,72.279261,0,72.279261,1,61.054073,1,61.054073,0,71.989049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402296,6025150,43.0,Female,White,0.046781,2007-06-30,1964-06-30,Excellent,Never,Three or four times a week,...,0,55.994524,0,55.994524,0,55.704312,0,55.704312,0,55.704312
402297,6025165,45.0,Female,White,-2.107040,2008-09-02,1963-09-02,Good,Never,Three or four times a week,...,0,56.821355,0,56.821355,0,56.531143,0,56.531143,0,56.531143
402298,6025173,57.0,Male,White,-1.827220,2008-09-17,1951-09-17,Good,Never,Never,...,0,68.780287,0,68.780287,0,68.490075,0,68.490075,0,68.490075
402299,6025182,56.0,Male,White,-0.010764,2010-07-01,1954-07-01,Excellent,Previous,Daily or almost daily,...,0,65.993155,0,65.993155,0,65.702943,0,65.702943,0,65.702943


In [44]:
feature_dict = {}
for group in data_baseline_description.based_on.unique(): feature_dict[group] = data_baseline_description.query("based_on==@group").covariate.to_list()
with open(os.path.join(path, dataset_path, 'feature_list.yaml'), 'w') as file: yaml.dump(feature_dict, file, default_flow_style=False, allow_unicode=True)

In [45]:
#feature_dict

In [46]:
### WRITE FEATURES IN YAML!!!

In [47]:
data_baseline.to_feather(os.path.join(path, dataset_path, 'baseline_clinical.feather'))
data_baseline_excl.to_feather(os.path.join(path, dataset_path, 'baseline_clinical_excl.feather'))
data_baseline_description.to_feather(os.path.join(path, dataset_path, 'baseline_clinical_description.feather'))

In [None]:
#data_baseline.to_csv(os.path.join(path, dataset_path, 'baseline_clinical.csv'), index=False)
#data_baseline_description.to_csv(os.path.join(path, dataset_path, 'baseline_clinical_description.csv'), index=False)

# !!! REMEMBER IMPUTATION !!!