In [None]:
import pandas as pd
import numpy as np
import os
import sys
import pickle

In [None]:
# Features for the modelling part (column names are "<field id>-<instance>.<array>"):
# 53-0.0: first day of attending
# 31-0.0: gender
# 34-0.0: year of birth
# 845-1.0, 845-2.0: age completed full-time education
# 6138: qualification

In [None]:
# Project root: two directory levels above the current working directory.
working_dir = os.getcwd()
root_path = os.path.dirname(os.path.dirname(working_dir))

In [None]:
# read metadata
# Only the columns listed in target_col_idx.pickle (plus the participant id
# "eid") are loaded, which keeps the memory footprint of the metadata small.
# Paths are built from separate components so they resolve on every OS
# (a hard-coded "data\\metadata\\..." only works on Windows).
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by
# this project.
with open(os.path.join(root_path, "data", "metadata", "target_col_idx.pickle"), "rb") as f:
    target_col_idx = pickle.load(f)
# Avoid requesting "eid" twice if the pickled list already contains it.
if "eid" not in target_col_idx:
    target_col_idx.append("eid")
metadata = pd.read_csv(
    os.path.join(root_path, "data", "metadata", "metadata_subset.csv"),
    usecols=target_col_idx,
)
# metadata = pd.read_csv(os.path.join(root_path, "data/metadata/metadata_subset.csv"))
# Index by participant id; assigning the result keeps the cell re-runnable.
metadata = metadata.set_index("eid")

In [None]:
# Load the ICD-10 codes used to exclude participants; rows with a missing
# code or title are discarded.
exclusion_csv = os.path.join(root_path, "data/metadata/CNS_icd10_11_merge_full.csv")
target = pd.read_csv(
    exclusion_csv,
    encoding='unicode_escape',
    usecols=["icd10Code", "icd10Title"],
).dropna(axis=0)

# Strip the dot from every code, e.g. "G30.1" -> "G301".
target_list = [code.replace(".", "") for code in target["icd10Code"]]
# Entries containing "-" are block (range) diseases; the rest are single codes.
target_block_list = [code for code in target_list if "-" in code]
target_single_list = list(set(target_list) - set(target_block_list))
print(len(target_block_list), len(target_single_list), sep="\n")

In [None]:
# Load the coding table and extract every ICD-10 value that UK Biobank uses.
codings_csv = os.path.join(root_path, "data/metadata/Codings.csv")
data_coding = pd.read_csv(codings_csv, encoding='latin-1')
# data_dict = pd.read_csv(os.path.join(root_path, "data/metadata/Data_Dictionary_Showcase.csv"))

# the coding of icd10 fields is 19
data_coding = data_coding.loc[data_coding["Coding"].eq(19)]

# Drop the "Block " prefix so block entries look like "A00-A09".
icd_diseases = [
    value.replace("Block ", "") if "Block" in value else value
    for value in data_coding["Value"]
]
# Entries containing "-" are blocks (ranges); everything else is a single entry.
icd_block_list = [value for value in icd_diseases if "-" in value]
icd_single_list = list(set(icd_diseases) - set(icd_block_list))
print(len(icd_block_list), len(icd_single_list), sep="\n")

In [None]:
# delete the icd entries that are not included in UKbiobank icd10.
# Membership is checked against sets: testing every target code against a
# plain list rescans the whole list for each code.
icd_single_set = set(icd_single_list)
icd_block_set = set(icd_block_list)

# single disease
to_be_del = [i for i in target_single_list if i not in icd_single_set]
target_single_list = list(set(target_single_list) - set(to_be_del))

# block disease
to_be_del = [i for i in target_block_list if i not in icd_block_set]
target_block_list = list(set(target_block_list) - set(to_be_del))

# double check: every remaining target code must be a known UK Biobank code
assert set(target_single_list) <= icd_single_set, "unknown single ICD-10 codes remain"
assert set(target_block_list) <= icd_block_set, "unknown ICD-10 block codes remain"

In [None]:
# Special case: add every block that starts with "C" (i.e. 'C00-C97'), then
# give all block entries the "Block " prefix.
block_prefix = "Block "
# startswith() expresses "starts with C" exactly; a plain substring test would
# also pick up any block that merely contains the letter "C".
to_be_added = [i for i in icd_block_list if i.startswith("C")]
# Strip an existing prefix first so that re-running this cell cannot produce
# "Block Block ..." entries.
unprefixed = [
    i[len(block_prefix):] if i.startswith(block_prefix) else i
    for i in target_block_list
]
# dict.fromkeys removes duplicates while keeping first-seen order (a "C" block
# may already be present in the target list, or the cell may have been re-run).
target_block_list = [block_prefix + i for i in dict.fromkeys(unprefixed + to_be_added)]

In [None]:
# Merge block entries and single codes into one list (blocks first).
full_target_list = [*target_block_list, *target_single_list]
print(len(full_target_list))

In [None]:
# A few special cases that must always be part of the exclusion list.
# Each code that is still missing is printed and then appended.
special_cases = (
    "I60", "I61", "I63", "I64", "G35", "G20", "F00", "F01", "F02", "F03",
    "G30", "G31", "G32", "Block G30-G32", "G36", "G37",
)
for code in special_cases:
    if code in full_target_list:
        continue
    print(code)
    full_target_list.append(code)

# from Boris' screenshot: Additionally these developmental disorders
# A comma is required after every entry: adjacent string literals are silently
# concatenated by Python, which previously merged "R625" and "G318" into the
# bogus code "R625G318" and dropped "G318" from the exclusion list.
# NOTE(review): "E720" and "R625" are listed twice; harmless once the list is
# de-duplicated, but confirm neither is a typo for a different code.
extra_disease = [
    "E700", "E701", "E720", "E750", "E752", "E762", "E763", "E830", "E720",
    "F70", "R620", "F71", "F819", "F79", "R625",
    "G318", "F82", "F840", "F88", "F849", "R625", "Q992", "F89", "F901",
    "F802", "F809", "F952", "R471",
]
# Append the extra codes in place, then collapse duplicates via a set.
full_target_list += extra_disease
full_target_list = list(set(full_target_list))
len(full_target_list)

In [None]:
# The final disease dictionary: field id(s) -> codes that exclude a participant.
# A key made of several field ids joined by "_" means those fields share the
# same coding and are checked together (41203 with 41205; 41202, 41204 and
# 41270 together against the full ICD-10 target list).
# NOTE(review): the ".0" suffix on the numeric codes presumably mirrors how the
# numeric columns look after being cast to str - confirm against the data.
disease_dict = {
    "20002": [
        "1081.0", "1086.0", "1491.0", "1583.0",
        "1261.0", "1262.0", "1263.0", "1397.0",
    ],
    "41203_41205": [
        "430.0", "431.0", "434.0", "436.0",
        "340.0", "332.0", "290.0", "341.0",
    ],
    "41202_41204_41270": full_target_list,
}

In [None]:
# #  ----------------------------------------------
# # we used the following code to find the relevant columns in metadata, to reduce the memory needed.
# selected_col = []
# all_col = list(metadata.columns)
# disease_fields = ['20002', '41203', '41205', '41202', '41204', '41270']

# for field in disease_fields:
#     # target_cols = [(idx, col) for idx, col in enumerate(all_col) if field in col]
#     kept_cols = [col for col in (all_col) if field in col]
#     selected_col.extend(kept_cols)

# with open(os.path.join(root_path, "data\\metadata\\target_col_idx.pickle"), 'wb') as f:
#     pickle.dump(selected_col, f)
# #  ----------------------------------------------

In [None]:
def is_included(l1, l2):
    """Return True if at least one element of ``l1`` is contained in ``l2``.

    Args:
        l1: iterable of candidate elements.
        l2: container tested with the ``in`` operator.

    Returns:
        bool: True as soon as one element of ``l1`` is found in ``l2``;
        False otherwise (including when ``l1`` is empty).
    """
    return any(item in l2 for item in l1)

In [None]:
# Build one row per participant with, for every entry of disease_dict:
#   <field>           - list of all recorded codes (as strings) for that field group
#   based_on_<field>  - True if none of the target codes is present
# and "is_selected", the logical AND of all based_on_* columns.
all_col = list(metadata.columns)
metadata_concate = pd.DataFrame(index=metadata.index)
metadata_concate["is_selected"] = True  # if the participant will be selected

for field, target_codes in disease_dict.items():
    print(field)

    # A key may bundle several field ids that share one coding, joined by "_".
    # Splitting always works: a single field id yields a one-element list, so
    # no key-length heuristic is needed.
    # NOTE(review): columns are matched by substring, so field "20002" would
    # also match a column of a longer field id containing "20002" - confirm
    # that no such column exists in metadata.
    target_fields = []
    for field_id in field.split("_"):
        target_fields.extend(col for col in all_col if field_id in col)
    if not target_fields:
        # Without this check a missing field would fail obscurely further down.
        raise ValueError(f"no metadata columns found for field {field!r}")

    # concatenate the cols under the same field to a list, and drop NAN
    combined_str = metadata[target_fields].apply(
        lambda row: ",".join(row.dropna().astype(str)),
        axis=1,
    )
    # A participant without any entry ends up with [""], which matches no code.
    metadata_concate[field] = combined_str.apply(lambda x: x.split(","))

    # we exclude the participant if he/she has any of the target diseases.
    # The target codes are turned into a set once per field so that each
    # per-row membership test is a hash lookup instead of a list scan.
    # NOTE(review): matching is exact string equality, so e.g. "I60" only
    # excludes a record containing exactly "I60" - confirm that sub-codes and
    # "Block ..." entries are represented the same way in the data.
    code_set = set(target_codes)
    is_healthy = ~metadata_concate[field].apply(
        lambda codes: is_included(l1=codes, l2=code_set)
    )
    print(is_healthy.sum())
    metadata_concate["based_on_" + field] = is_healthy
    metadata_concate["is_selected"] &= is_healthy
    print(metadata_concate["is_selected"].sum())

print(metadata_concate)
# metadata = metadata[selected_col]

In [None]:
# Show the participants that were excluded (is_selected is False).
metadata_concate.loc[~metadata_concate["is_selected"]]

In [None]:
# Persist the per-participant selection table next to the other metadata files.
metadata_concate.to_csv(
    path_or_buf=os.path.join(root_path, "data/metadata/metadata_concate.csv")
)