# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
import yaml
from tqdm.notebook import trange, tqdm
dataset_name = "210212_cvd_gp"
path = "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"
dataset_path = f"{data_path}/2_datasets_pre/{dataset_name}"

In [2]:
from pathlib import Path
Path(dataset_path).mkdir(parents=True, exist_ok=True)

In [3]:
data = pd.read_feather(f"{data_path}/1_decoded/ukb_data.feather")
data_field = pd.read_feather(f"{data_path}/1_decoded/ukb_data_field.feather")
data_columns = data.columns.to_list()

## Mappings + Vocabulary

In [4]:
# Drop obviouse missing data
print(len(data))
data = data.dropna(subset=["sex_f31_0_0"], axis=0)
print(len(data))

502505
502504


# Starting information

In [5]:
#time0_col="birth_date"
time0_col="date_of_attending_assessment_centre_f53_0_0"

# Baseline covariates

In [6]:
def get_fields(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields) & data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_fields_all(fields, data, data_field):
    f = data_field[data_field["field.showcase"].isin(fields)].copy()
    f["field"] = pd.Categorical(f["field.showcase"], categories=fields, ordered=True)
    f = f.sort_values("field").reset_index().drop("field", axis=1)
    return f

def get_data_fields(fields, data, data_field):
    f = get_fields(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

def get_data_fields_all(fields, data, data_field):
    f = get_fields_all(fields, data, data_field)
    return data[["eid"]+f["col.name"].to_list()].copy()

## Diagnoses and events

In [7]:
vocab_dir = f"{data_path}/athena_vocabulary_covid"
vocab = {
    "concept": pd.read_csv(f"{vocab_dir}/CONCEPT.csv", sep='\t'),
    "domain": pd.read_csv(f"{vocab_dir}/DOMAIN.csv", sep='\t'),
    "class": pd.read_csv(f"{vocab_dir}/CONCEPT_CLASS.csv", sep='\t'),
    "relationship": pd.read_csv(f"{vocab_dir}/RELATIONSHIP.csv", sep='\t'),
    "drug_strength": pd.read_csv(f"{vocab_dir}/DRUG_STRENGTH.csv", sep='\t'),
    "vocabulary": pd.read_csv(f"{vocab_dir}/VOCABULARY.csv", sep='\t'),
    "concept_synonym": pd.read_csv(f"{vocab_dir}/CONCEPT_SYNONYM.csv", sep='\t'),
    "concept_ancestor": pd.read_csv(f"{vocab_dir}/CONCEPT_ANCESTOR.csv", sep='\t'),
    "concept_relationship": pd.read_csv(f"{vocab_dir}/CONCEPT_RELATIONSHIP.csv", sep='\t')                       
}

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Definitions

In [8]:
coding1836 = pd.read_csv(f"{path}/mapping/codings/coding1836.tsv", sep="\t").rename(columns={"coding":"code"})
phecodes = pd.read_csv(f"{path}/mapping/phecodes/phecode_icd10.csv")
def phenotype_children(phecodes, phenotype_list):
    l={}
    phecodes = phecodes.dropna(subset=["Phenotype"], axis=0)
    for ph, ph_names in phenotype_list.items():
        regex = "|".join(ph_names)
        l[ph] = list(phecodes[phecodes.Phenotype.str.contains(regex, case=False)].ICD10.str.replace("\\.", "").str.slice(0, 3).unique())
    return l

In [19]:
diagnoses_codes = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_codes.feather')).drop("level", axis=1)

In [20]:
death_codes = pd.read_feather(f"{data_path}/1_decoded/codes_death_records_210115.feather").query("level==1").drop("level", axis=1)

In [21]:
endpoint_codes = pd.concat([diagnoses_codes, death_codes[diagnoses_codes.columns]])

# Endpoints

In [22]:
### define in snomed and get icd codes from there

### 1. Hospital admissions

In [23]:
endpoint_list = {
    "myocardial_infarction": ['I21', 'I22', 'I23', 'I24', 'I25'],
    "stroke": ['G45', "I63", "I64"],
    "diabetes" : ['E10', 'E11', 'E12', 'E13', 'E14'],
    "diabetes1" : ['E10'],
    "diabetes2" : ['E11', 'E12', 'E13', 'E14'],
    "atrial_fibrillation": ['I47', 'I48'],
    'migraine': ['G43', 'G44'],
    'rheumatoid_arthritis': ['J99', 'M05', 'M06', 'M08', 'M12', 'M13'],
    "systemic_lupus_erythematosus": ['M32'],
    'severe_mental_illness': ['F20', 'F25', 'F30', 'F31', 'F32', 'F33', 'F44'],
    "erectile_dysfunction" : ['F52', 'N48'],  
    "chronic_kidney_disease": ["I12", "N18", "N19"],
    "liver_disease":["K70", "K71", "K72", "K73", "K74", "K75", "K76", "K77"],
    "dementia":['F00', 'F01', 'F02', 'F03'],
    "copd": ['J44']}

with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file: yaml.dump(endpoint_list, file, default_flow_style=False)
endpoint_list

{'myocardial_infarction': ['I21', 'I22', 'I23', 'I24', 'I25'],
 'stroke': ['G45', 'I63', 'I64'],
 'diabetes': ['E10', 'E11', 'E12', 'E13', 'E14'],
 'diabetes1': ['E10'],
 'diabetes2': ['E11', 'E12', 'E13', 'E14'],
 'atrial_fibrillation': ['I47', 'I48'],
 'migraine': ['G43', 'G44'],
 'rheumatoid_arthritis': ['J99', 'M05', 'M06', 'M08', 'M12', 'M13'],
 'systemic_lupus_erythematosus': ['M32'],
 'severe_mental_illness': ['F20', 'F25', 'F30', 'F31', 'F32', 'F33', 'F44'],
 'erectile_dysfunction': ['F52', 'N48'],
 'chronic_kidney_disease': ['I12', 'N18', 'N19'],
 'liver_disease': ['K70', 'K71', 'K72', 'K73', 'K74', 'K75', 'K76', 'K77'],
 'dementia': ['F00', 'F01', 'F02', 'F03'],
 'copd': ['J44']}

In [24]:
from dateutil.relativedelta import relativedelta
import datetime

def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None):
    if level is not None: diagnoses_codes = diagnoses_codes.query("level==@level")
    diagnoses_codes_time0 = diagnoses_codes.merge(data[["eid", time0_col]], how="left", on="eid")
    
    #cens_time_right = max(diagnoses_codes.sort_values('date').groupby('origin').tail(1).date.to_list())
    cens_time_right = datetime.date(2020, 9, 30)
    print(f"t_0: {time0_col}")
    print(f"t_cens: {cens_time_right}")
    
    df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) & 
                                        (diagnoses_codes_time0.date < cens_time_right)]
    
    temp = data[["eid", time0_col]].copy()
    for ph, ph_codes in tqdm(endpoint_list.items()):
        regex = "|".join(ph_codes)
        ph_df = df_interval[df_interval.meaning.str.contains(regex, case=False)] \
            .sort_values('date').groupby('eid').head(1).assign(phenotype=1, date=lambda x: x.date)
        temp_ph = temp.merge(ph_df, how="left", on="eid").fillna(0)
        temp[ph+"_event"], temp[ph+"_event_date"] = temp_ph.phenotype, temp_ph.date
        
        fill_date = {ph+"_event_date" : lambda x: [cens_time_right if event==0 else event_date for event, event_date in zip(x[ph+"_event"], x[ph+"_event_date"])]}
        calc_tte = {ph+"_event_time" : lambda x: [(event_date-time0).days/365.25  for time0, event_date in zip(x[time0_col], x[ph+"_event_date"])]}
        
        temp = temp.assign(**fill_date).assign(**calc_tte).drop([ph+"_event_date"], axis=1)
        
    temp = temp.drop([time0_col], axis=1)     
    
    return temp.drop_duplicates()

In [25]:
basics = pd.read_feather(os.path.join(path, dataset_path, 'temp_basics.feather'))
endpoints_diagnoses = extract_endpoints_tte(basics, endpoint_codes, endpoint_list, time0_col)
print(len(endpoints_diagnoses))
endpoints_diagnoses.head()

t_0: date_of_attending_assessment_centre_f53_0_0
t_cens: 2020-09-30


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


502504


Unnamed: 0,eid,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,diabetes_event,diabetes_event_time,diabetes1_event,diabetes1_event_time,diabetes2_event,...,erectile_dysfunction_event,erectile_dysfunction_event_time,chronic_kidney_disease_event,chronic_kidney_disease_event_time,liver_disease_event,liver_disease_event_time,dementia_event,dementia_event_time,copd_event,copd_event_time
0,1000018,0.0,10.882957,0.0,10.882957,0.0,10.882957,0.0,10.882957,0.0,...,0.0,10.882957,0.0,10.882957,0.0,10.882957,1.0,1.305955,0.0,10.882957
1,1000020,0.0,12.613279,0.0,12.613279,0.0,12.613279,0.0,12.613279,0.0,...,0.0,12.613279,0.0,12.613279,0.0,12.613279,0.0,12.613279,0.0,12.613279
2,1000037,0.0,11.88501,0.0,11.88501,0.0,11.88501,0.0,11.88501,0.0,...,0.0,11.88501,0.0,11.88501,0.0,11.88501,0.0,11.88501,0.0,11.88501
3,1000043,1.0,5.122519,0.0,11.326489,0.0,11.326489,0.0,11.326489,0.0,...,0.0,11.326489,0.0,11.326489,0.0,11.326489,0.0,11.326489,1.0,0.29295
4,1000051,0.0,14.308008,0.0,14.308008,1.0,4.722793,0.0,14.308008,1.0,...,0.0,14.308008,1.0,4.722793,0.0,14.308008,0.0,14.308008,1.0,4.84052


### 2. Death registry

In [26]:
death_list = {
    "death_allcause":[],
    "death_cvd":['I{:02}'.format(ID+1) for ID in range(0, 98)],
}

with open(os.path.join(path, dataset_path, 'death_list.yaml'), 'w') as file: yaml.dump(death_list, file, default_flow_style=False)

In [27]:
endpoints_death = extract_endpoints_tte(basics, death_codes, death_list, time0_col)

t_0: date_of_attending_assessment_centre_f53_0_0
t_cens: 2020-09-30


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




## SCORES

In [28]:
scores_list = {
    "SCORE":['I{:02}'.format(ID) for ID in [10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 44, 45, 46, 47, 48, 49, 50, 51, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73]],
    "ASCVD":['I{:02}'.format(ID) for ID in [20, 21, 22, 23, 24, 25, 63]],
    "QRISK3":["G45", "I20", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],
    "MACE":["G45", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],    
}
with open(os.path.join(path, dataset_path, 'scores_list.yaml'), 'w') as file: yaml.dump(scores_list, file, default_flow_style=False)

In [29]:
death_scores =  extract_endpoints_tte(basics, death_codes, scores_list, time0_col=time0_col)
endpoint_scores = extract_endpoints_tte(basics, endpoint_codes, scores_list, time0_col=time0_col)

t_0: date_of_attending_assessment_centre_f53_0_0
t_cens: 2020-09-30


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))


t_0: date_of_attending_assessment_centre_f53_0_0
t_cens: 2020-09-30


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [30]:
endpoints_scores_all = death_scores[["eid", "SCORE_event", "SCORE_event_time"]].merge(endpoint_scores[["eid", "ASCVD_event", "ASCVD_event_time", "QRISK3_event", "QRISK3_event_time", "MACE_event", "MACE_event_time"]], on="eid")
endpoints_scores_all.to_feather(os.path.join(path, dataset_path, 'temp_endpoints_scores_all.feather'))

### ESC SCORE (Conroy 2003)

In [31]:
score = "SCORE"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

5540


Unnamed: 0,eid,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
45,1000463,1.0,6.16564,1.0,6.069815,1.0,6.069815,1.0,6.069815
83,1000841,1.0,11.003422,0.0,12.07666,1.0,10.978782,1.0,10.978782
102,1001031,1.0,6.53525,1.0,5.40178,1.0,5.40178,1.0,5.40178
122,1001237,1.0,2.132786,1.0,2.132786,1.0,2.132786,1.0,2.132786
176,1001777,1.0,7.238877,1.0,7.22245,1.0,7.22245,0.0,11.364819


### ASCVD (Goff 2014)

In [32]:
score = "ASCVD"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

62937


Unnamed: 0,eid,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
2,1000037,0.0,11.88501,1.0,7.969884,1.0,7.969884,0.0,11.88501
3,1000043,0.0,11.326489,1.0,5.122519,1.0,5.122519,1.0,5.122519
6,1000079,0.0,12.536619,1.0,1.054073,1.0,1.054073,0.0,12.536619
22,1000233,0.0,12.15332,1.0,3.671458,1.0,3.671458,1.0,3.671458
30,1000319,0.0,10.795346,1.0,10.20397,1.0,10.20397,1.0,10.20397


### UK QRISK3 (2017)

In [33]:
score = "QRISK3"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

68413


Unnamed: 0,eid,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
2,1000037,0.0,11.88501,1.0,7.969884,1.0,7.969884,0.0,11.88501
3,1000043,0.0,11.326489,1.0,5.122519,1.0,5.122519,1.0,5.122519
6,1000079,0.0,12.536619,1.0,1.054073,1.0,1.054073,0.0,12.536619
22,1000233,0.0,12.15332,1.0,3.671458,1.0,3.671458,1.0,3.671458
30,1000319,0.0,10.795346,1.0,10.20397,1.0,10.20397,1.0,10.20397


### MACE (2020)

In [34]:
score = "MACE"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

57869


Unnamed: 0,eid,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
3,1000043,0.0,11.326489,1.0,5.122519,1.0,5.122519,1.0,5.122519
22,1000233,0.0,12.15332,1.0,3.671458,1.0,3.671458,1.0,3.671458
30,1000319,0.0,10.795346,1.0,10.20397,1.0,10.20397,1.0,10.20397
45,1000463,1.0,6.16564,1.0,6.069815,1.0,6.069815,1.0,6.069815
72,1000731,0.0,11.321013,1.0,11.249829,1.0,11.249829,1.0,11.249829


## Merge Everything

In [41]:
data_dfs_dict = {"endpoints_diagnoses":endpoints_diagnoses, 
                 "endpoints_death":endpoints_death, 
                 "endpoints_scores_all":endpoints_scores_all}

In [42]:
def get_cols_clean(df):
    df.columns = df.columns.str.replace(r'_0_0$', '').str.replace(r'_f[0-9]+$', '').str.replace("_automated_reading", '')
    return df.columns

def clean_df(df):
    df.columns = get_cols_clean(df)
    return df

In [124]:
import pandas as pd
from functools import reduce

data_baseline = reduce(lambda x, y: pd.merge(x, y, on = 'eid'), list(data_dfs_dict.values()))
endpoint_columns = [c[:-11] for c in data_baseline.columns.tolist() if "_event_time" in c]
print(endpoint_columns)

['myocardial_infarction', 'stroke', 'diabetes', 'diabetes1', 'diabetes2', 'atrial_fibrillation', 'migraine', 'rheumatoid_arthritis', 'systemic_lupus_erythematosus', 'severe_mental_illness', 'erectile_dysfunction', 'chronic_kidney_disease', 'liver_disease', 'dementia', 'copd', 'death_allcause', 'death_cvd', 'SCORE', 'ASCVD', 'QRISK3', 'MACE']


## Competing Events

In [125]:
# endpoint < death -> 1
# death < endpoint -> 2
# time min(endpoint_time, death_time) -> time
def event_calc(endpoint, endpoint_time, death, death_time):
    endpoint = int(endpoint)
    death = int(death)
    if (endpoint==0) and (death==0): 
        return 0.0
    if (endpoint==1) and (death==0): 
        return 1.0
    elif (endpoint==0) and (death==1): 
        return 2.0
    elif (endpoint==1) and (death==1) and (endpoint_time<=death_time):
        return float(1)
    elif (endpoint==1) and (death==1) and (death_time<endpoint_time):
        return float(2)
    else: return np.nan

for c in tqdm(endpoint_columns): 
    if c!="death_allcause":
        data_baseline[f"{c}_comp_event"] = [event_calc(endpoint, endpoint_time, death, death_time) for endpoint, endpoint_time, death, death_time 
                                            in zip(data_baseline[f"{c}_event"], data_baseline[f"{c}_event_time"], data_baseline["death_allcause_event"], data_baseline["death_allcause_event_time"])]
        
        data_baseline[f"{c}_comp_event_time"] = [min(endpoint_time, death_time)
                                                 for endpoint, endpoint_time, death, death_time
                                                 in zip(data_baseline[f"{c}_event"], data_baseline[f"{c}_event_time"],
                                                        data_baseline["death_allcause_event"], data_baseline["death_allcause_event_time"])]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [126]:
data_baseline[[f"{c}_event" for c in sorted([c[:-11] for c in data_baseline.columns.tolist() if "_event_time" in c])]]

Unnamed: 0,ASCVD_event,ASCVD_comp_event,MACE_event,MACE_comp_event,QRISK3_event,QRISK3_comp_event,SCORE_event,SCORE_comp_event,atrial_fibrillation_event,atrial_fibrillation_comp_event,...,myocardial_infarction_event,myocardial_infarction_comp_event,rheumatoid_arthritis_event,rheumatoid_arthritis_comp_event,severe_mental_illness_event,severe_mental_illness_comp_event,stroke_event,stroke_comp_event,systemic_lupus_erythematosus_event,systemic_lupus_erythematosus_comp_event
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
502500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
502501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
502502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
data_baseline[data_baseline.death_allcause_event==1][["eid", "MACE_event", "MACE_event_time", "MACE_comp_event", "MACE_comp_event_time"]].query("MACE_event==0")

Unnamed: 0,eid,MACE_event,MACE_event_time,MACE_comp_event,MACE_comp_event_time
13,1000144,0.0,12.041068,2.0,3.523614
21,1000221,0.0,10.948665,2.0,6.891170
49,1000500,0.0,14.450376,2.0,12.279261
59,1000608,0.0,11.477070,2.0,2.507871
67,1000686,0.0,13.054073,2.0,10.318960
...,...,...,...,...,...
502430,6024468,0.0,11.540041,2.0,7.252567
502440,6024566,0.0,10.658453,2.0,2.329911
502445,6024611,0.0,10.324435,2.0,0.577687
502450,6024667,0.0,10.316222,2.0,2.431211


In [128]:
data_baseline.describe()

Unnamed: 0,eid,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,diabetes_event,diabetes_event_time,diabetes1_event,diabetes1_event_time,diabetes2_event,...,death_cvd_comp_event,death_cvd_comp_event_time,SCORE_comp_event,SCORE_comp_event_time,ASCVD_comp_event,ASCVD_comp_event_time,QRISK3_comp_event,QRISK3_comp_event_time,MACE_comp_event,MACE_comp_event_time
count,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,...,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0,502504.0
mean,3512606.0,0.092865,11.095097,0.030394,11.50435,0.088111,11.048187,0.019781,11.525729,0.086529,...,0.114708,11.390389,0.116879,11.390389,0.210277,10.641807,0.218651,10.567688,0.200367,10.747562
std,1450653.0,0.290243,2.274576,0.171668,1.450134,0.283456,2.430626,0.139247,1.480295,0.281143,...,0.450624,1.599884,0.457244,1.599884,0.501094,2.831019,0.503343,2.914048,0.495408,2.669474
min,1000018.0,0.0,0.002738,0.0,0.002738,0.0,0.002738,0.0,0.002738,0.0,...,0.0,0.010951,0.0,0.010951,0.0,0.002738,0.0,0.002738,0.0,0.002738
25%,2256298.0,0.0,10.78987,0.0,10.924025,0.0,10.800821,0.0,10.94319,0.0,...,0.0,10.86653,0.0,10.86653,0.0,10.532512,0.0,10.505133,0.0,10.55989
50%,3512620.0,0.0,11.556468,0.0,11.633128,0.0,11.561944,0.0,11.652293,0.0,...,0.0,11.59206,0.0,11.59206,0.0,11.427789,0.0,11.405886,0.0,11.444216
75%,4768908.0,0.0,12.290212,0.0,12.336756,0.0,12.295688,0.0,12.344969,0.0,...,0.0,12.303901,0.0,12.303901,0.0,12.219028,0.0,12.208077,0.0,12.221766
max,6025198.0,1.0,14.551677,1.0,14.551677,1.0,14.551677,1.0,14.551677,1.0,...,2.0,14.551677,2.0,14.551677,2.0,14.551677,2.0,14.551677,2.0,14.551677


In [129]:
data_baseline = clean_df(data_baseline)

In [130]:
for col in [col for col in list(data_baseline.columns) if ("_event" in col) & ("_time" not in col)]:
    data_baseline[col] = data_baseline[col].astype(int)

In [131]:
covariates = [col for col in list(data_baseline.columns) if not "_event" in col]
targets = [col for col in list(data_baseline.columns) if "_event" in col]

# Exporting

In [132]:
data_baseline.head()

Unnamed: 0,eid,myocardial_infarction_event,myocardial_infarction_event_time,stroke_event,stroke_event_time,diabetes_event,diabetes_event_time,diabetes1_event,diabetes1_event_time,diabetes2_event,...,death_cvd_comp_event,death_cvd_comp_event_time,SCORE_comp_event,SCORE_comp_event_time,ASCVD_comp_event,ASCVD_comp_event_time,QRISK3_comp_event,QRISK3_comp_event_time,MACE_comp_event,MACE_comp_event_time
0,1000018,0,10.882957,0,10.882957,0,10.882957,0,10.882957,0,...,0,10.882957,0,10.882957,0,10.882957,0,10.882957,0,10.882957
1,1000020,0,12.613279,0,12.613279,0,12.613279,0,12.613279,0,...,0,12.613279,0,12.613279,0,12.613279,0,12.613279,0,12.613279
2,1000037,0,11.88501,0,11.88501,0,11.88501,0,11.88501,0,...,0,11.88501,0,11.88501,1,7.969884,1,7.969884,0,11.88501
3,1000043,1,5.122519,0,11.326489,0,11.326489,0,11.326489,0,...,0,11.326489,0,11.326489,1,5.122519,1,5.122519,1,5.122519
4,1000051,0,14.308008,0,14.308008,1,4.722793,0,14.308008,1,...,0,14.308008,0,14.308008,0,14.308008,0,14.308008,0,14.308008


In [133]:
data_cols = {}
for topic, df in data_dfs_dict.items(): 
    data_cols["eid"] = ["admin"]
    data_cols[topic]=list(get_cols_clean(df))[1:]

In [134]:
data_cols_single = {}
for topic, columns in data_cols.items():
    for col in columns:
        data_cols_single[col] = topic

In [144]:
for c in [c for c in data_baseline.columns.tolist() if "comp" in c]:
    data_cols_single.update({c:"endpoints_competing"})

In [151]:
dtypes = {"int32":"int", "int64":"int", "float64":"float", "category":"category", "object":"category", "bool":"bool"}
desc_dict = {"id": [*range(1, len(data_baseline.columns.to_list())+1)] , 
             "covariate": data_baseline.columns.to_list(), 
             "dtype":[dtypes[str(col)] for col in data_baseline.dtypes.to_list()], 
             "isTarget":[True if col in targets else False for col in data_baseline.columns.to_list()],
            "based_on":[topic for col, topic in data_cols_single.items()],
            "aggr_fn": [np.nan for col in data_baseline.columns.to_list()]}
data_baseline_description = pd.DataFrame.from_dict(desc_dict)
data_baseline_description

Unnamed: 0,id,covariate,dtype,isTarget,based_on,aggr_fn
0,1,eid,int,False,eid,
1,2,myocardial_infarction_event,int,True,endpoints_diagnoses,
2,3,myocardial_infarction_event_time,float,True,endpoints_diagnoses,
3,4,stroke_event,int,True,endpoints_diagnoses,
4,5,stroke_event_time,float,True,endpoints_diagnoses,
...,...,...,...,...,...,...
78,79,ASCVD_comp_event_time,float,True,endpoints_competing,
79,80,QRISK3_comp_event,int,True,endpoints_competing,
80,81,QRISK3_comp_event_time,float,True,endpoints_competing,
81,82,MACE_comp_event,int,True,endpoints_competing,


In [152]:
#feature_dict

In [153]:
### WRITE FEATURES IN YAML!!!

In [154]:
data_baseline.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints.feather'))
data_baseline_description.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints_description.feather'))

In [None]:
#data_baseline.to_csv(os.path.join(path, dataset_path, 'baseline_clinical.csv'), index=False)
#data_baseline_description.to_csv(os.path.join(path, dataset_path, 'baseline_clinical_description.csv'), index=False)

# !!! REMEMBER IMPUTATION !!!