# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import os
import yaml
from tqdm.notebook import trange, tqdm
# Name of this dataset build; used as the output sub-directory name below.
dataset_name = "210212_cvd_gp"
# Root of the preprocessing code checkout (mapping tables are read from here).
path = "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
# Root of the shared data directory.
data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"
# Absolute directory holding this dataset's intermediate and final files.
dataset_path = f"{data_path}/2_datasets_pre/{dataset_name}"

In [None]:
from pathlib import Path
# Create the dataset directory (including missing parents); no error if it already exists.
Path(dataset_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the decoded main table and the table describing its fields.
data = pd.read_feather(f"{data_path}/1_decoded/ukb_data.feather")
data_field = pd.read_feather(f"{data_path}/1_decoded/ukb_data_field.feather")
# Column names of the main table as a plain list.
data_columns = data.columns.to_list()

## Mappings + Vocabulary

In [None]:
# Drop rows with obviously missing data (no value in sex_f31_0_0)
print(len(data))  # row count before filtering
data = data.dropna(subset=["sex_f31_0_0"], axis=0)
print(len(data))  # row count after filtering

# Starting information

In [None]:
# Column used as time zero (t_0) for every time-to-event calculation below.
# Alternative start point, kept for reference:
#time0_col="birth_date"
time0_col="date_of_attending_assessment_centre_f53_0_0"

# Baseline covariates

In [None]:
def get_fields(fields, data, data_field):
    """Return the field-description rows for ``fields`` whose tab name has instance 0.

    Rows are kept when ``field.showcase`` is in ``fields`` and ``field.tab``
    contains the pattern ``f.<digits>.0.<digit>``. The result is ordered like
    ``fields`` and carries the original row labels in an ``index`` column.
    ``data`` is accepted for signature symmetry but not used.
    """
    is_requested = data_field["field.showcase"].isin(fields)
    is_instance_zero = data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")
    selection = data_field[is_requested & is_instance_zero].copy()

    # Temporary ordered categorical so sorting follows the order of ``fields``.
    selection["field"] = pd.Categorical(selection["field.showcase"], categories=fields, ordered=True)
    selection = selection.sort_values("field")
    selection = selection.reset_index()
    return selection.drop("field", axis=1)

def get_fields_all(fields, data, data_field):
    """Return every field-description row whose ``field.showcase`` is in ``fields``.

    Unlike ``get_fields`` no filter on ``field.tab`` is applied. The result is
    ordered like ``fields`` and carries the original row labels in an
    ``index`` column. ``data`` is accepted but not used.
    """
    selection = data_field[data_field["field.showcase"].isin(fields)].copy()

    # Temporary ordered categorical so sorting follows the order of ``fields``.
    selection["field"] = pd.Categorical(selection["field.showcase"], categories=fields, ordered=True)
    selection = selection.sort_values("field")
    selection = selection.reset_index()
    return selection.drop("field", axis=1)

def get_data_fields(fields, data, data_field):
    """Return a copy of ``data`` limited to ``eid`` plus the columns selected by ``get_fields``."""
    selected_columns = ["eid"]
    selected_columns.extend(get_fields(fields, data, data_field)["col.name"].to_list())
    return data[selected_columns].copy()

def get_data_fields_all(fields, data, data_field):
    """Return a copy of ``data`` limited to ``eid`` plus the columns selected by ``get_fields_all``."""
    selected_columns = ["eid"]
    selected_columns.extend(get_fields_all(fields, data, data_field)["col.name"].to_list())
    return data[selected_columns].copy()

## Diagnoses and events

In [None]:
vocab_dir = f"{data_path}/athena_vocabulary_covid"
# Load every tab-separated vocabulary table from vocab_dir, keyed by a short name.
vocab = {
    key: pd.read_csv(f"{vocab_dir}/{stem}.csv", sep='\t')
    for key, stem in (
        ("concept", "CONCEPT"),
        ("domain", "DOMAIN"),
        ("class", "CONCEPT_CLASS"),
        ("relationship", "RELATIONSHIP"),
        ("drug_strength", "DRUG_STRENGTH"),
        ("vocabulary", "VOCABULARY"),
        ("concept_synonym", "CONCEPT_SYNONYM"),
        ("concept_ancestor", "CONCEPT_ANCESTOR"),
        ("concept_relationship", "CONCEPT_RELATIONSHIP"),
    )
}

### Definitions

In [None]:
# Coding table 1836 (tab-separated); its "coding" column is renamed to "code".
coding1836 = pd.read_csv(f"{path}/mapping/codings/coding1836.tsv", sep="\t").rename(columns={"coding":"code"})
# Phecode table; phenotype_children reads its "Phenotype" and "ICD10" columns.
phecodes = pd.read_csv(f"{path}/mapping/phecodes/phecode_icd10.csv")
def phenotype_children(phecodes, phenotype_list):
    """Map each phenotype label to the 3-character ICD10 codes of matching phecodes.

    Args:
        phecodes: DataFrame with a text column ``Phenotype`` and a column
            ``ICD10``. Rows without a ``Phenotype`` are ignored.
        phenotype_list: dict mapping a label to a list of regular-expression
            fragments. A row matches a label when its ``Phenotype`` contains
            any of the fragments (case-insensitive).

    Returns:
        dict mapping each label to the list of unique ICD10 values of the
        matching rows, with dots removed and truncated to three characters.
    """
    named = phecodes.dropna(subset=["Phenotype"], axis=0)
    children = {}
    for ph, ph_names in phenotype_list.items():
        pattern = "|".join(ph_names)
        hits = named[named.Phenotype.str.contains(pattern, case=False)]
        # Strip the dot as a literal character. regex=False is explicit because
        # the default of Series.str.replace differs between pandas versions.
        codes = hits.ICD10.str.replace(".", "", regex=False).str.slice(0, 3)
        children[ph] = list(codes.unique())
    return children

In [None]:
# Load the intermediate diagnosis-code table and discard its "level" column.
# NOTE: dataset_path is absolute, so os.path.join ignores `path` here and the
# file is read from dataset_path.
diagnoses_codes = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_codes.feather')).drop("level", axis=1)

In [None]:
# Load the death-record codes, keep only rows with level == 1, then discard the "level" column.
death_codes = pd.read_feather(f"{data_path}/1_decoded/codes_death_records_210115.feather").query("level==1").drop("level", axis=1)

In [None]:
# Stack diagnosis and death codes into one table, using the diagnosis table's columns and column order.
endpoint_codes = pd.concat([diagnoses_codes, death_codes[diagnoses_codes.columns]])

# Endpoints

In [None]:
### define in snomed and get icd codes from there

### 1. Hospital admissions

In [None]:
# Endpoint name -> list of code patterns. extract_endpoints_tte joins each list
# with "|" and matches it as a case-insensitive regex against the "meaning" column.
endpoint_list = {
    "myocardial_infarction": ['I21', 'I22', 'I23', 'I24', 'I25'],
    "stroke": ['G45', "I63", "I64"],
    "diabetes" : ['E10', 'E11', 'E12', 'E13', 'E14'],
    "diabetes1" : ['E10'],
    "diabetes2" : ['E11', 'E12', 'E13', 'E14'],
    "atrial_fibrillation": ['I47', 'I48'],
    'migraine': ['G43', 'G44'],
    'rheumatoid_arthritis': ['J99', 'M05', 'M06', 'M08', 'M12', 'M13'],
    "systemic_lupus_erythematosus": ['M32'],
    'severe_mental_illness': ['F20', 'F25', 'F30', 'F31', 'F32', 'F33', 'F44'],
    "erectile_dysfunction" : ['F52', 'N48'],  
    "chronic_kidney_disease": ["I12", "N18", "N19"],
    "liver_disease":["K70", "K71", "K72", "K73", "K74", "K75", "K76", "K77"],
    "dementia":['F00', 'F01', 'F02', 'F03'],
    "copd": ['J44']}

# Persist the definitions next to the dataset. dataset_path is absolute, so
# os.path.join ignores `path` and the file lands in dataset_path.
with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file: yaml.dump(endpoint_list, file, default_flow_style=False)
endpoint_list

In [None]:
from dateutil.relativedelta import relativedelta
import datetime

def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None,
                          cens_time_right=None):
    """Derive an event flag and a time-to-event (in years) for every endpoint.

    For each endpoint the earliest matching record strictly after time zero and
    strictly before the censoring date counts as the event. Rows without such a
    record get event 0 and are censored at ``cens_time_right``.

    Args:
        data: DataFrame with an ``eid`` column and the ``time0_col`` column.
        diagnoses_codes: DataFrame with ``eid``, ``date`` and ``meaning``
            columns (and ``level`` when ``level`` is given). ``date`` and
            ``time0_col`` values must be comparable with each other and with
            ``cens_time_right``, and their difference must expose ``.days``.
        endpoint_list: dict mapping an endpoint name to a list of patterns.
            The patterns are joined with ``|`` and matched as a
            case-insensitive regex against ``meaning``. An empty list means
            "any record".
        time0_col: name of the column in ``data`` holding time zero.
        level: if given, only rows of ``diagnoses_codes`` with this ``level``
            are used.
        cens_time_right: right-censoring date; defaults to 2020-09-30.

    Returns:
        DataFrame with ``eid`` plus ``<name>_event`` and ``<name>_event_time``
        for every endpoint, with duplicate rows removed. Times are
        ``days / 365.25``.
    """
    if cens_time_right is None:
        cens_time_right = datetime.date(2020, 9, 30)
    if level is not None:
        diagnoses_codes = diagnoses_codes.query("level==@level")
    diagnoses_codes_time0 = diagnoses_codes.merge(data[["eid", time0_col]], how="left", on="eid")

    print(f"t_0: {time0_col}")
    print(f"t_cens: {cens_time_right}")

    # Only records inside the open interval (t_0, t_cens) can become events.
    df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) &
                                        (diagnoses_codes_time0.date < cens_time_right)]

    temp = data[["eid", time0_col]].copy()
    for ph, ph_codes in tqdm(endpoint_list.items()):
        if ph_codes:
            regex = "|".join(ph_codes)
            # na=False: a record without a meaning cannot match a code pattern.
            candidates = df_interval[df_interval.meaning.str.contains(regex, case=False, na=False)]
        else:
            # No patterns given: every record in the interval counts.
            candidates = df_interval

        # Earliest qualifying record per eid.
        first_event = candidates.sort_values('date').groupby('eid').head(1) \
            .assign(phenotype=1)[["eid", "phenotype", "date"]]

        # first_event has at most one row per eid, so this left merge keeps the
        # row count and row order of temp; results are therefore written back
        # by position, independent of the index of `data`.
        matched = temp[["eid"]].merge(first_event, how="left", on="eid")
        event = matched["phenotype"].fillna(0).to_numpy()
        event_date = [cens_time_right if flag == 0 else date
                      for flag, date in zip(event, matched["date"])]

        temp[ph + "_event"] = event
        temp[ph + "_event_time"] = [(date - time0).days / 365.25
                                    for time0, date in zip(temp[time0_col], event_date)]

    temp = temp.drop([time0_col], axis=1)

    return temp.drop_duplicates()

In [None]:
# Load the intermediate basics table (provides eid and the time-zero column);
# dataset_path is absolute, so os.path.join ignores `path`.
basics = pd.read_feather(os.path.join(path, dataset_path, 'temp_basics.feather'))
# Event flags and event times for every endpoint in endpoint_list, from diagnosis and death codes combined.
endpoints_diagnoses = extract_endpoints_tte(basics, endpoint_codes, endpoint_list, time0_col)
print(len(endpoints_diagnoses))
endpoints_diagnoses.head()

### 2. Death registry

In [None]:
# Death endpoints. The empty list makes extract_endpoints_tte match every
# record; death_cvd covers the patterns I01 through I98.
death_list = {
    "death_allcause": [],
    "death_cvd": [f"I{number:02}" for number in range(1, 99)],
}

# Persist the definitions next to the dataset.
with open(os.path.join(path, dataset_path, 'death_list.yaml'), 'w') as file:
    yaml.dump(death_list, file, default_flow_style=False)

In [None]:
# Event flags and event times for the death endpoints, from the death-record codes only.
endpoints_death = extract_endpoints_tte(basics, death_codes, death_list, time0_col)

## SCORES

In [None]:
# Code patterns for the endpoint definition of each risk score.
scores_list = {
    "SCORE": [
        f"I{number:02}"
        for number in (*range(10, 16), *range(20, 26), *range(44, 52), *range(61, 74))
    ],
    "ASCVD": [f"I{number:02}" for number in (*range(20, 26), 63)],
    "QRISK3": ["G45", "I20", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],
    "MACE": ["G45", "I21", "I22", "I23", "I24", "I25", "I63", "I64"],
}
# Persist the definitions next to the dataset.
with open(os.path.join(path, dataset_path, 'scores_list.yaml'), 'w') as file:
    yaml.dump(scores_list, file, default_flow_style=False)

In [None]:
# Score endpoints evaluated on death records only ...
death_scores =  extract_endpoints_tte(basics, death_codes, scores_list, time0_col=time0_col)
# ... and on diagnosis plus death records combined.
endpoint_scores = extract_endpoints_tte(basics, endpoint_codes, scores_list, time0_col=time0_col)

In [None]:
# SCORE is taken from the death-only table; ASCVD, QRISK3 and MACE from the
# combined table. Inner join on eid.
endpoints_scores_all = death_scores[["eid", "SCORE_event", "SCORE_event_time"]].merge(endpoint_scores[["eid", "ASCVD_event", "ASCVD_event_time", "QRISK3_event", "QRISK3_event_time", "MACE_event", "MACE_event_time"]], on="eid")
# Save the intermediate result; dataset_path is absolute, so os.path.join ignores `path`.
endpoints_scores_all.to_feather(os.path.join(path, dataset_path, 'temp_endpoints_scores_all.feather'))

### ESC SCORE (Conroy 2003)

In [None]:
# Count and preview the rows with a SCORE event.
score = "SCORE"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

### ASCVD (Goff 2014)

In [None]:
# Count and preview the rows with an ASCVD event.
score = "ASCVD"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

### UK QRISK3 (2017)

In [None]:
# Count and preview the rows with a QRISK3 event.
score = "QRISK3"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

### MACE (2020)

In [None]:
# Count and preview the rows with a MACE event.
score = "MACE"
print(len(endpoints_scores_all.query(score+"_event==1")))
endpoints_scores_all.query(score+"_event==1").head()

## Merge Everything

In [None]:
# Topic name -> endpoint table. The insertion order is the merge order used
# below and determines the column order of the merged table.
data_dfs_dict = {"endpoints_diagnoses":endpoints_diagnoses, 
                 "endpoints_death":endpoints_death, 
                 "endpoints_scores_all":endpoints_scores_all}

In [None]:
def get_cols_clean(df):
    """Strip field suffixes from the column names of ``df`` in place.

    Removes, in this order: a trailing ``_0_0``, a trailing ``_f<digits>``,
    and every occurrence of ``_automated_reading``.

    Args:
        df: DataFrame whose ``columns`` are rewritten.

    Returns:
        The new ``df.columns`` index.
    """
    # regex=True is explicit: the first two patterns are regular expressions,
    # and the default of str.replace differs between pandas versions.
    cleaned = (
        df.columns
        .str.replace(r'_0_0$', '', regex=True)
        .str.replace(r'_f[0-9]+$', '', regex=True)
        .str.replace("_automated_reading", '', regex=False)
    )
    df.columns = cleaned
    return df.columns

def clean_df(df):
    """Rename the columns of ``df`` via ``get_cols_clean`` and return the same frame."""
    cleaned_names = get_cols_clean(df)
    df.columns = cleaned_names
    return df

In [None]:
import pandas as pd
from functools import reduce

# Inner-join all endpoint tables on eid, in the order they appear in data_dfs_dict.
data_baseline = reduce(lambda x, y: pd.merge(x, y, on = 'eid'), list(data_dfs_dict.values()))
# Endpoint names: drop the last 11 characters (the length of "_event_time")
# from every column name containing "_event_time".
endpoint_columns = [c[:-11] for c in data_baseline.columns.tolist() if "_event_time" in c]
print(endpoint_columns)

## Competing Events

In [None]:
# endpoint < death -> 1
# death < endpoint -> 2
# time min(endpoint_time, death_time) -> time
def event_calc(endpoint, endpoint_time, death, death_time):
    """Return the competing-risk status for one row.

    0.0 means neither the endpoint nor death occurred, 1.0 means the endpoint
    occurred first (or at the same time as death, or without death), 2.0 means
    death occurred without or strictly before the endpoint. Any other
    combination, including times that cannot be ordered, yields NaN.
    """
    outcome = (int(endpoint), int(death))

    if outcome == (0, 0):
        return 0.0
    if outcome == (1, 0):
        return 1.0
    if outcome == (0, 1):
        return 2.0
    if outcome == (1, 1):
        # Both occurred: whichever came first decides; ties go to the endpoint.
        if endpoint_time <= death_time:
            return 1.0
        if death_time < endpoint_time:
            return 2.0
    return np.nan

# Add competing-risk columns for every endpoint except all-cause death itself.
for c in tqdm(endpoint_columns):
    if c == "death_allcause":
        continue

    # Status: 0 = none, 1 = endpoint first, 2 = death first (see event_calc).
    data_baseline[f"{c}_comp_event"] = [
        event_calc(*row)
        for row in zip(data_baseline[f"{c}_event"], data_baseline[f"{c}_event_time"],
                       data_baseline["death_allcause_event"], data_baseline["death_allcause_event_time"])
    ]

    # Time: whichever of endpoint time and death time is smaller.
    data_baseline[f"{c}_comp_event_time"] = [
        min(times)
        for times in zip(data_baseline[f"{c}_event_time"], data_baseline["death_allcause_event_time"])
    ]

In [None]:
# Strip field suffixes from the column names of the merged table.
data_baseline = clean_df(data_baseline)

In [None]:
# Cast every event-indicator column (name contains "_event" but not "_time") to int.
for col in [name for name in data_baseline.columns.tolist() if "_event" in name and "_time" not in name]:
    data_baseline[col] = data_baseline[col].astype(int)

In [None]:
# Split the column names: anything containing "_event" is a target, the rest are covariates.
covariates = [name for name in data_baseline.columns.tolist() if "_event" not in name]
targets = [name for name in data_baseline.columns.tolist() if "_event" in name]

# Exporting

In [None]:
# Topic -> list of columns that topic contributes to the merged table.
# The identifier column "eid" belongs to the topic "admin". The mapping was
# previously stored the other way round ("eid" -> ["admin"]), which produced an
# entry for a non-existent column "admin" once the mapping was inverted.
data_cols = {"admin": ["eid"]}
for topic, df in data_dfs_dict.items():
    # Every endpoint table starts with "eid"; it is already covered by "admin".
    # Note that get_cols_clean also renames the columns of df in place.
    data_cols[topic] = list(get_cols_clean(df))[1:]

In [None]:
# Invert data_cols: column name -> topic it came from.
data_cols_single = {}
for topic, columns in data_cols.items():
    data_cols_single.update(dict.fromkeys(columns, topic))

In [None]:
# Columns whose name contains "comp" were derived above; tag them with their own topic.
for c in (name for name in data_baseline.columns.tolist() if "comp" in name):
    data_cols_single[c] = "endpoints_competing"

In [None]:
# Translate pandas dtype names into the coarse type labels used in the description table.
dtypes = {"int32":"int", "int64":"int", "float64":"float", "category":"category", "object":"category", "bool":"bool"}
# One description row per column of data_baseline. "based_on" is filled
# positionally from data_cols_single, so its entry order must match the column order.
desc_dict = {
    "id": list(range(1, len(data_baseline.columns) + 1)),
    "covariate": data_baseline.columns.to_list(),
    "dtype": [dtypes[str(dtype)] for dtype in data_baseline.dtypes],
    "isTarget": [name in targets for name in data_baseline.columns],
    "based_on": list(data_cols_single.values()),
    "aggr_fn": [np.nan] * len(data_baseline.columns),
}
data_baseline_description = pd.DataFrame.from_dict(desc_dict)
data_baseline_description

In [None]:
# Write the merged endpoint table and its description table to the dataset
# directory (dataset_path is absolute, so os.path.join ignores `path`).
data_baseline.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints.feather'))
data_baseline_description.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints_description.feather'))