## Cohort Context Exploration

We extended our knowledge discovery pipeline to support data analysis for patient cohort context exploration. This setting takes the clinical profiles of a particular patient cohort (called the query cohort) and maps them to the corresponding biomedical entities (called cohort description entities) in iBKH.

Please make sure all the files follow the structure below.

```
.
└── ...
    ├── Cohort Context Exploration.ipynb
    ├── find_UMLS.py
    ├── exploration_CC.py
    ├── APOE_disease.csv
    └── APOE_logit_reg.csv
    
```

In [None]:
import pickle
from Case_study.find_UMLS import *
from Case_study.exploration_CC import *
from neo4j import GraphDatabase
import itertools
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import maxabs_scale

### Step 1: Map the AD/cognitive disorder-related traits from the APOE cohort to the entities in iBKH

We collected 36 AD/cognitive disorder-related traits through a literature review. We used the UMLS API to assign corresponding UMLS CUIs for these features to map to entities in iBKH.

In [None]:
# Load the APOE cohort table from the working directory. Later cells read its
# 'Unnamed: 0' column (the trait names) and its 'exact_proportion' column.
apoe_disease_data = pd.read_csv("APOE_disease.csv")  # Read APOE cohort data

In [None]:
# Assign the corresponding UMLS CUIs for the 36 collected traits and map them to the entities in iBKH
def map_entities2iBKH():
    """Map the APOE cohort traits to iBKH entities and pickle the result.

    Every trait name in the 'Unnamed: 0' column of ``apoe_disease_data`` is
    resolved to a UMLS CUI through ``access_UMLS_by_name``. The CUI is then
    looked up in the disease, symptom and side-effect vocabularies, in that
    order, and the first vocabulary that contains it wins. Traits whose CUI is
    found in none of them are left out of the result.

    The result maps each matched trait name to
    ``[primary_id, name, entity_type, umls_cui]``. It is printed and written to
    ``concept_iBKH_apoe.obj`` in the working directory.

    Returns:
        None.
    """
    raw_dx_list = apoe_disease_data['Unnamed: 0'].tolist()
    dx_cui = {}
    for concept_name in tqdm(raw_dx_list):
        umls_cui = access_UMLS_by_name(concept_name)
        # The UMLS name is fetched alongside the CUI but is not part of the
        # pickled output below.
        umls_name = get_UMLS_name(umls_cui)
        dx_cui[concept_name] = [umls_cui, umls_name]

    # (vocabulary table, CUIs known to that vocabulary, iBKH entity type),
    # listed in lookup priority order.
    vocabularies = (
        (di_vob, di_vobUMLS_list, 'Disease'),
        (sy_vob, sy_vobUMLS_list, 'Symptom'),
        (se_vob, se_vobUMLS_list, 'Side_Effect'),
    )

    res = {}
    for raw_dx in tqdm(dx_cui):
        umls_cui = dx_cui[raw_dx][0]
        for vocab, known_cuis, entity_type in vocabularies:
            if umls_cui not in known_cuis:
                continue
            # Filter once and take the first matching row.
            match = vocab.loc[vocab['umls_cui'] == umls_cui].iloc[0]
            res[raw_dx] = [match['primary'], match['name'], entity_type, umls_cui]
            break
    print(res)
    # The with-block closes the file; no explicit close() is needed.
    with open('concept_iBKH_apoe.obj', 'wb') as f:
        pickle.dump(res, f)

In [None]:
# Run the mapping. Besides printing the result, this writes it to
# 'concept_iBKH_apoe.obj' in the working directory, which Step 2 reads back.
map_entities2iBKH()

### Step 2: Generate the input list

We construct the context for the APOE cohort based on the mapped result. Namely, we construct the input list to explore the APOE cohort.

In [None]:
def get_cohort_context(weight_type, pval_filter, topk):
    """Build and pickle the weighted input entity list for the APOE cohort.

    Loads the trait-to-iBKH mapping from ``concept_iBKH_apoe.obj``, attaches a
    weight to every mapped trait and pickles a list of
    ``[ibkh_name, ibkh_type, weight]`` entries in the working directory.

    Args:
        weight_type: ``"LR"`` takes each weight from the ``apoe`` column of
            ``APOE_logit_reg.csv``. Any other value uses the max-abs scaled
            natural log of the ratio between the APOE and non-APOE
            ``exact_proportion`` columns.
        pval_filter: Only consulted when ``weight_type`` is ``"LR"``. When
            truthy, only traits whose ``p_apoe`` value is <= 0.2 are kept.
        topk: Not used by this function.

    Output file:
        ``input_entity_list_LR_pval.obj`` (LR with filter),
        ``input_entity_list_LR.obj`` (LR without filter) or
        ``input_entity_list.obj`` (any other weight type).

    Raises:
        KeyError: If a mapped trait has no entry in the weight table.

    Returns:
        None.
    """
    pval_threshold = 0.2

    # The with-block closes the file; no explicit close() is needed.
    with open("concept_iBKH_apoe.obj", "rb") as f:
        concept_iBKH_apoe = pickle.load(f)

    # Created locally: the list was previously appended to without ever being
    # initialised in this function.
    input_entity_list = []

    if weight_type == "LR":
        weight_data = pd.read_csv("APOE_logit_reg.csv")
        raw_dx_weight = {
            raw_dx: (weight, pval)
            for raw_dx, weight, pval in zip(
                weight_data['Unnamed: 0'], weight_data['apoe'], weight_data['p_apoe']
            )
        }
        for concept, mapped in concept_iBKH_apoe.items():
            ibkh_name, ibkh_type = mapped[1], mapped[2]
            weight, pval = raw_dx_weight[concept]
            if not pval_filter or pval <= pval_threshold:
                input_entity_list.append([ibkh_name, ibkh_type, weight])
        if pval_filter:
            out_name = "input_entity_list_LR_pval.obj"
        else:
            out_name = "input_entity_list_LR.obj"
    else:
        raw_dx_list = apoe_disease_data['Unnamed: 0'].tolist()
        apoe_proportion = np.asarray(apoe_disease_data['exact_proportion'].tolist())
        # NOTE(review): nonapoe_disease_data is not defined in the visible part
        # of this notebook; it must be loaded before this branch can run.
        nonapoe_proportion = np.asarray(nonapoe_disease_data['exact_proportion'].tolist())
        raw_weight = apoe_proportion / nonapoe_proportion
        log_weight = np.emath.log(raw_weight)
        norm_weight = maxabs_scale(log_weight)
        raw_dx_weight = dict(zip(raw_dx_list, norm_weight))
        for concept, mapped in concept_iBKH_apoe.items():
            # Each entry is a single scalar here, so it is used directly;
            # indexing it with [0] raised an IndexError.
            input_entity_list.append([mapped[1], mapped[2], raw_dx_weight[concept]])
        out_name = "input_entity_list.obj"

    with open(out_name, "wb") as f:
        pickle.dump(input_entity_list, f)

In [None]:
# A mixed-effect logistic regression model was fitted for each trait as the
# dependent variable and APOE ε4 status as the independent variable to study
# their statistical associations, adjusted for age, sex, race, and ethnicity
# as covariates.
weight_type = 'LR'
# Since we don't want to lose the variables with relatively weak signals, we
# used a P-value threshold of 0.2.
pval_filter = True
# topk was passed below without ever being assigned, which raises a NameError.
# None is the value the later steps treat as "no top-k subfolder".
# NOTE(review): set this to the intended top-k cut-off if one is wanted.
topk = None
get_cohort_context(weight_type, pval_filter, topk)

### Step 3: Cohort Context Exploration

We then predicted the context entities of the query cohort, given the description entities and their weights in the query cohort. We used several algorithms (an ensemble model, TransE, TransR, ComplEx and DistMult) to calculate the edge scores. The edge scores indicate the strength of association between the candidate entities and the constructed cohort.

In [None]:
def predeict_cohort_context(weight_type, pval_filter, topk):
    """Rank candidate context entities for the cohort and save them as CSV.

    Loads the pickled input entity list that matches ``weight_type`` and
    ``pval_filter``, obtains one ranking per embedding model (TransE, TransR,
    ComplEx, DistMult) through ``get_averaged_rank``, combines them per output
    category with ``vote_result`` and writes every non-empty result table to
    ``<result folder>/<category>.csv``.

    Args:
        weight_type: ``'LR'`` selects the logistic-regression inputs and the
            ``apoe_result/LR*`` folders; any other value selects
            ``input_entity_list.obj`` and ``apoe_result/Norm``.
        pval_filter: With ``'LR'``, truthy selects the p-value filtered input
            list and the ``LR_pavl_filter`` folder.
        topk: Forwarded to ``generate_predict_result``. When not ``None`` the
            tables go into a ``top_<topk>`` subfolder of the result folder.

    Returns:
        None.
    """
    # Pick the input file and its result folder together so the two can never
    # disagree. The 'pavl' spelling is kept because it is the on-disk name.
    if weight_type == 'LR':
        if pval_filter:
            input_file = "input_entity_list_LR_pval.obj"
            root_path = 'apoe_result/LR_pavl_filter'
        else:
            input_file = "input_entity_list_LR.obj"
            root_path = 'apoe_result/LR'
    else:
        input_file = "input_entity_list.obj"
        root_path = 'apoe_result/Norm'

    # The with-block closes the file; no explicit close() is needed.
    with open(input_file, "rb") as f:
        input_entity_list = pickle.load(f)
    print(len(input_entity_list))

    input_emb_ids = map_input2embedding_id(input_entity_list)
    model_names = ('TransE', 'TransR', 'ComplEx', 'DistMult')
    model_res = {name: get_averaged_rank(input_emb_ids, name) for name in model_names}

    output_category = ['Gene', 'Pathway', 'Drug', 'Disease', 'Symptom', 'Side_Effect']

    if topk is None:
        res_path = root_path
    else:
        res_path = root_path + '/top_' + str(topk) + '/'
    os.makedirs(res_path, exist_ok=True)

    for oc in output_category:
        per_model = [model_res[name][oc] for name in model_names]
        # An empty TransE result is treated as "nothing to vote on" for this
        # category.
        if per_model[0]:
            ensemble_oc_res = vote_result(*per_model)
        else:
            ensemble_oc_res = {}

        res_table = generate_predict_result(ensemble_oc_res, oc, topk)
        if len(res_table) > 0:
            res_table.to_csv(os.path.join(res_path, oc + '.csv'), index=False)

### Step 4: Generate Network Data

To visualize the predicted context entities of the query cohort, we retrieve the shortest paths between each cohort description entity and each context entity. We visualized the network structure using Gephi 0.9 (https://gephi.org).

In [None]:
def _cypher_string(value):
    """Escape *value* for use inside a double-quoted Cypher string literal."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def _node_pattern(alias, label, entity_type, value):
    """Build a Cypher node pattern such as ``(pre:Gene {symbol: "APOE"})``."""
    # Gene nodes are matched on their symbol, every other type on its name.
    key = 'symbol' if entity_type == 'Gene' else 'name'
    return "(" + alias + ":" + label + " {" + key + ": \"" + _cypher_string(value) + "\"})"


def generate_network_data(weight_type, pavl_filter, target_type, topk):
    """Collect shortest-path network data between inputs and predictions.

    Loads the pickled input entity list and the predicted ``target_type``
    table, builds one ``shortestPath`` Cypher query (up to 15 hops, at most 5
    paths) for every pair of input entity and differently named candidate,
    gathers the triplets returned by ``generate_network_triplets`` and pickles
    the assembled network next to the prediction CSV as
    ``<target_type>_network.obj``.

    Args:
        weight_type: ``"LR"`` selects the logistic-regression inputs and
            result folders; any other value selects ``input_entity_list.obj``
            and ``apoe_result/Norm/``.
        pavl_filter: With ``"LR"``, truthy selects the p-value filtered input
            list and the ``LR_pavl_filter`` folder.
        target_type: Name of the prediction CSV to read; also used as the node
            label of the candidates in the Cypher query.
        topk: When not ``None``, files are read from and written to the
            ``top_<topk>`` subfolder of the result folder.

    Returns:
        None.
    """
    # This function has the same name as the helper that assembles the network
    # from the collected triplets, so the bare name would call this function
    # again with the wrong arguments. Import the helper under an alias.
    # NOTE(review): assumes the helper lives in Case_study.exploration_CC, the
    # module this notebook star-imports; confirm.
    from Case_study.exploration_CC import generate_network_data as _build_network_data

    if weight_type == "LR":
        if pavl_filter:
            input_file = "input_entity_list_LR_pval.obj"
            data_path = 'apoe_result/LR_pavl_filter/'
        else:
            input_file = "input_entity_list_LR.obj"
            data_path = 'apoe_result/LR/'
    else:
        input_file = "input_entity_list.obj"
        data_path = 'apoe_result/Norm/'

    # The with-block closes the file; no explicit close() is needed.
    with open(input_file, "rb") as f:
        input_entity_list = pickle.load(f)

    # One folder for both reading and writing. Writing used to target
    # 'top_None/' when topk was None, a folder the read path never uses.
    if topk is None:
        result_dir = data_path
    else:
        result_dir = data_path + "top_" + str(topk) + "/"

    predict_res = pd.read_csv(result_dir + target_type + ".csv")
    candidate_list = predict_res.set_index('name')['type'].to_dict()

    triplets_list = []
    concepts_list = []
    for ibkh_name, ibkh_type, _weight in tqdm(input_entity_list):
        concepts_list.append(ibkh_name)
        pre_pattern = _node_pattern("pre", ibkh_type, ibkh_type, ibkh_name)
        for candidate in tqdm(candidate_list):
            if ibkh_name == candidate:
                continue
            can_pattern = _node_pattern("can", target_type, candidate_list[candidate], candidate)
            cypher_statement = (
                "MATCH " + pre_pattern + ", " + can_pattern + ", "
                + "path = shortestPath((pre)-[*..15]-(can)) RETURN path LIMIT 5"
            )
            triplets_list += generate_network_triplets(cypher_statement)

    network_data = _build_network_data(triplets_list, concepts_list, candidate_list, "APOE")

    with open(result_dir + target_type + '_network.obj', 'wb') as f:
        pickle.dump(network_data, f)