# iBKH-based Knowledge Discovery Pipeline

This is the implementation of Knowledge Discovery pipeline in our iBKH portal at http://ibkh.ai/.

Given a target entity of interest, the task is to discover the Top-N entities from different entity types (currently supporting gene, drug, symptom, and pathway entities) that potentially link to the target entity. 


Generally, the script contains 3 steps: 1) data preparation (triplet generation); 2) knowledge graph embedding learning; and 3) link prediction for knowledge discovery. For convenience, we have preprocessed the raw iBKH data to generate the triplet files, trained the knowledge graph embedding models, and produced the embedding vectors. You may also reproduce them by following the steps below.

All the input iBKH raw entity & relation files as well as the intermediate products, i.e., triplet files and embeddings can be downloaded at: https://wcm.box.com/s/1icrjj589nq7kx9bjpdz5sisxl3keerj

Please make sure to place the downloaded files according to the structure below.

```
.
├── ...
├── Knowledge Discovery.ipynb
├── Data
│   ├── iBKH                             # iBKH raw entity & relation data - INPUT
│   │   ├── Entity 
│   │   ├── Relation
│   ├── triplets                         # Extracted triplets 
│   │   ├── DDi_triplet.csv
│   │   ├── DG_triplet.csv
│   │   ├── DD_triplet.csv
│   │   ├── DPwy_triplet.csv
│   │   ├── DSE_triplet.csv
│   │   ├── DiDi_triplet.csv
│   │   ├── DiG_triplet.csv
│   │   ├── DiPwy_triplet.csv
│   │   ├── DiSy_triplet.csv
│   │   ├── GG_triplet.csv
│   │   ├── GPwy_triplet.csv
│   │   ├── triplet_whole.csv
│   ├── dataset
│   │   ├── training_triplet_whole.tsv   # Extracted triplets in DGL format
│   ├── UI_emb                           # KG embeddings 
│   │   ├── entities.tsv 
│   │   ├── relations.tsv
│   │   ├── DistMult                             
│   │   │   ├── iBKH_DistMult_entity.npy
│   │   │   ├── iBKH_DistMult_relation.npy
│   │   ├── ComplEx
│   │   │   ├── iBKH_ComplEx_entity.npy
│   │   │   ├── iBKH_ComplEx_relation.npy
│   │   ├── TransE_l2         
│   │   │   ├── iBKH_TransE_l2_entity.npy
│   │   │   ├── iBKH_TransE_l2_relation.npy
│   │   ├── TransR
│   │   │   ├── iBKH_TransR_entity.npy
│   │   │   ├── iBKH_TransR_relation.npy
│   └── ...
└── ...
```

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import torch as th
import torch.nn.functional as fn

### Step 1:  Generate Triplet Set from iBKH 

A triplet, i.e., (h, r, t), is the basic unit for a knowledge graph. We generate triplet set from iBKH, which will be used for knowledge graph embedding learning.

In [None]:
kg_folder = 'Data/iBKH/'  # Root of the raw iBKH data; the Entity/ and Relation/ sub-folders are read from here
triplet_path = 'Data/triplets/'  # Folder that receives the extracted triplet CSV files
# exist_ok=True replaces the separate os.path.exists() check and its
# check-then-create race.
os.makedirs(triplet_path, exist_ok=True)

In [None]:
# Extracting triplets between drug and disease entities

def DDi_triplets():
    """Extract drug-disease triplets and write ``Data/triplets/DDi_triplet.csv``.

    Reads ``Relation/D_Di_res.csv`` under ``kg_folder``. Every row whose flag
    column equals 1 yields a ``<flag>_DDi`` triplet, and every row with at
    least one semantic flag set yields a ``Semantic_Relation_DDi`` triplet.
    ``Inference_Score`` is kept only for ``Inferred_Relation_DDi`` rows and is
    blanked (NaN) for all other relations.
    """
    ddi = pd.read_csv(kg_folder + 'Relation/D_Di_res.csv')
    triplet_cols = ['Drug', 'Relation', 'Disease', 'Inference_Score']

    flag_columns = ['Treats', 'Palliates', 'Effect', 'Associate', 'Inferred_Relation']
    semantic_columns = [
        'treatment/therapy (including investigatory)',
        'inhibits cell growth (esp. cancers)',
        'alleviates, reduces',
        'biomarkers (of disease progression)',
        'prevents, suppresses',
        'role in disease pathogenesis',
    ]

    def labelled(mask, relation):
        # .assign() returns a new frame, so nothing is written into a filtered
        # slice of ``ddi`` (the pattern that triggers SettingWithCopyWarning).
        return ddi.loc[mask].assign(Relation=relation)[triplet_cols]

    frames = [labelled(ddi[flag] == 1, flag + '_DDi') for flag in flag_columns]
    frames.append(labelled((ddi[semantic_columns] == 1).any(axis=1), 'Semantic_Relation_DDi'))

    ddi_res = pd.concat(frames).rename(columns={'Drug': 'Head', 'Disease': 'Tail'})

    # Only inferred relations carry a meaningful score.
    ddi_res.loc[ddi_res['Relation'] != 'Inferred_Relation_DDi', 'Inference_Score'] = np.nan

    ddi_res.to_csv('Data/triplets/DDi_triplet.csv', index=False)

In [None]:
# Extracting triplets between drug and gene entities.

def DG_triplets():
    """Extract drug-gene triplets and write ``Data/triplets/DG_triplet.csv``.

    Reads ``Relation/D_G_res.csv`` under ``kg_folder``. Every row whose flag
    column equals 1 yields a ``<flag>_DG`` triplet, and every row with at
    least one semantic flag set yields a ``Semantic_Relation_DG`` triplet.
    ``Inference_Score`` is written as an empty string for every triplet.
    """
    dg = pd.read_csv(kg_folder + 'Relation/D_G_res.csv')
    triplet_cols = ['Drug', 'Relation', 'Gene', 'Inference_Score']

    flag_columns = ['Target', 'Transporter', 'Enzyme', 'Carrier', 'Downregulates',
                    'Upregulates', 'Associate', 'Binds', 'Interaction']
    semantic_columns = [
        'affects expression/production (neutral)',
        'agonism, activation',
        'inhibits',
        'metabolism, pharmacokinetics',
        'antagonism, blocking',
        'increases expression/production',
        'binding, ligand (esp. receptors)',
        'decreases expression/production',
        'transport, channels',
        'enzyme activity',
        'physical association',
    ]

    def labelled(mask, relation):
        # .assign() returns a new frame, so nothing is written into a filtered
        # slice of ``dg`` (the pattern that triggers SettingWithCopyWarning).
        return dg.loc[mask].assign(Relation=relation, Inference_Score='')[triplet_cols]

    # One frame per flag, each built from its own rows. The previous code built
    # the Upregulates frame from the Downregulates rows, which dropped every
    # Upregulates_DG triplet and duplicated the Downregulates_DG ones.
    frames = [labelled(dg[flag] == 1, flag + '_DG') for flag in flag_columns]
    frames.append(labelled((dg[semantic_columns] == 1).any(axis=1), 'Semantic_Relation_DG'))

    dg_res = pd.concat(frames).rename(columns={'Drug': 'Head', 'Gene': 'Tail'})

    dg_res.to_csv('Data/triplets/DG_triplet.csv', index=False)

In [None]:
# Extracting triplets between drug entities.

def DD_triplets():
    """Extract drug-drug triplets and write ``Data/triplets/DD_triplet.csv``.

    Reads ``Relation/D_D_res.csv`` under ``kg_folder``. Rows whose
    ``Interaction`` or ``Resemble`` column equals 1 yield ``Interaction_DD`` /
    ``Resemble_DD`` triplets; ``Inference_Score`` is written as an empty string.
    """
    dd = pd.read_csv(kg_folder + 'Relation/D_D_res.csv')
    triplet_cols = ['Drug_1', 'Relation', 'Drug_2', 'Inference_Score']

    # .assign() returns a new frame, so nothing is written into a filtered
    # slice of ``dd`` (the pattern that triggers SettingWithCopyWarning).
    frames = [
        dd.loc[dd[flag] == 1].assign(Relation=flag + '_DD', Inference_Score='')[triplet_cols]
        for flag in ('Interaction', 'Resemble')
    ]

    dd_res = pd.concat(frames).rename(columns={'Drug_1': 'Head', 'Drug_2': 'Tail'})

    dd_res.to_csv('Data/triplets/DD_triplet.csv', index=False)

In [None]:
# Extracting triplets between drug and pathway entities.

def DPwy_triplets():
    """Write drug-pathway triplets to ``Data/triplets/DPwy_triplet.csv``.

    Every row of ``Relation/D_Pwy_res.csv`` (under ``kg_folder``) becomes one
    ``Association_DPwy`` triplet with an empty ``Inference_Score``.
    """
    relation_table = pd.read_csv(kg_folder + 'Relation/D_Pwy_res.csv')

    tagged = relation_table.assign(Relation='Association_DPwy', Inference_Score='')
    triplets = tagged[['Drug', 'Relation', 'Pathway', 'Inference_Score']].rename(
        columns={'Drug': 'Head', 'Pathway': 'Tail'})

    triplets.to_csv('Data/triplets/DPwy_triplet.csv', index=False)

In [None]:
# Extracting triplets between drug and side-effect entities.

def DSE_triplets():
    """Write drug/side-effect triplets to ``Data/triplets/DSE_triplet.csv``.

    Every row of ``Relation/D_SE_res.csv`` (under ``kg_folder``) becomes one
    ``Cause_DSE`` triplet with an empty ``Inference_Score``.
    """
    relation_table = pd.read_csv(kg_folder + 'Relation/D_SE_res.csv')

    tagged = relation_table.assign(Relation='Cause_DSE', Inference_Score='')
    triplets = tagged[['Drug', 'Relation', 'Side_Effect', 'Inference_Score']].rename(
        columns={'Drug': 'Head', 'Side_Effect': 'Tail'})

    triplets.to_csv('Data/triplets/DSE_triplet.csv', index=False)

In [None]:
# Extracting triplets between disease entities.

def DiDi_triplets():
    """Extract disease-disease triplets and write ``Data/triplets/DiDi_triplet.csv``.

    Reads ``Relation/Di_Di_res.csv`` under ``kg_folder``. Rows whose ``is_a``
    or ``Resemble`` column equals 1 yield ``is_a_DiDi`` / ``Resemble_DiDi``
    triplets; ``Inference_Score`` is written as an empty string.
    """
    didi = pd.read_csv(kg_folder + 'Relation/Di_Di_res.csv')
    triplet_cols = ['Disease_1', 'Relation', 'Disease_2', 'Inference_Score']

    # .assign() returns a new frame, so nothing is written into a filtered
    # slice of ``didi`` (the pattern that triggers SettingWithCopyWarning).
    frames = [
        didi.loc[didi[flag] == 1].assign(Relation=flag + '_DiDi', Inference_Score='')[triplet_cols]
        for flag in ('is_a', 'Resemble')
    ]

    didi_res = pd.concat(frames).rename(columns={'Disease_1': 'Head', 'Disease_2': 'Tail'})

    didi_res.to_csv('Data/triplets/DiDi_triplet.csv', index=False)

In [None]:
# Extracting triplets between disease and gene entities.

def DiG_triplets():
    """Extract disease-gene triplets and write ``Data/triplets/DiG_triplet.csv``.

    Reads ``Relation/Di_G_res.csv`` under ``kg_folder``. Every row whose flag
    column equals 1 yields a ``<flag>_DiG`` triplet, and every row with at
    least one semantic flag set yields a ``Semantic_Relation_DiG`` triplet.
    ``Inference_Score`` is kept only for ``Inferred_Relation_DiG`` rows and is
    blanked (NaN) for all other relations.
    """
    dig = pd.read_csv(kg_folder + 'Relation/Di_G_res.csv')
    triplet_cols = ['Disease', 'Relation', 'Gene', 'Inference_Score']

    flag_columns = ['Associate', 'Downregulates', 'Upregulates', 'Inferred_Relation']
    semantic_columns = [
        'improper regulation linked to disease',
        'causal mutations',
        'polymorphisms alter risk',
        'role in pathogenesis',
        'possible therapeutic effect',
        'biomarkers (diagnostic)',
        'promotes progression',
        'drug targets',
        'overexpression in disease',
        'mutations affecting disease course',
    ]

    def labelled(mask, relation):
        # .assign() returns a new frame, so nothing is written into a filtered
        # slice of ``dig`` (the pattern that triggers SettingWithCopyWarning).
        return dig.loc[mask].assign(Relation=relation)[triplet_cols]

    frames = [labelled(dig[flag] == 1, flag + '_DiG') for flag in flag_columns]
    frames.append(labelled((dig[semantic_columns] == 1).any(axis=1), 'Semantic_Relation_DiG'))

    dig_res = pd.concat(frames).rename(columns={'Disease': 'Head', 'Gene': 'Tail'})

    # Only inferred relations carry a meaningful score.
    dig_res.loc[dig_res['Relation'] != 'Inferred_Relation_DiG', 'Inference_Score'] = np.nan

    dig_res.to_csv('Data/triplets/DiG_triplet.csv', index=False)

In [None]:
# Extracting triplets between disease and pathway entities.

def DiPwy_triplets():
    """Write disease-pathway triplets to ``Data/triplets/DiPwy_triplet.csv``.

    Every row of ``Relation/Di_Pwy_res.csv`` (under ``kg_folder``) becomes one
    ``Association_DiPwy`` triplet with an empty ``Inference_Score``.
    """
    relation_table = pd.read_csv(kg_folder + 'Relation/Di_Pwy_res.csv')

    tagged = relation_table.assign(Relation='Association_DiPwy', Inference_Score='')
    triplets = tagged[['Disease', 'Relation', 'Pathway', 'Inference_Score']].rename(
        columns={'Disease': 'Head', 'Pathway': 'Tail'})

    triplets.to_csv('Data/triplets/DiPwy_triplet.csv', index=False)

In [None]:
# Extracting triplets between disease and symptom entities.

def DiSy_triplets():
    """Write disease-symptom triplets to ``Data/triplets/DiSy_triplet.csv``.

    Every row of ``Relation/Di_Sy_res.csv`` (under ``kg_folder``) becomes one
    ``Present_DiSy`` triplet with an empty ``Inference_Score``.
    """
    relation_table = pd.read_csv(kg_folder + 'Relation/Di_Sy_res.csv')

    tagged = relation_table.assign(Relation='Present_DiSy', Inference_Score='')
    triplets = tagged[['Disease', 'Relation', 'Symptom', 'Inference_Score']].rename(
        columns={'Disease': 'Head', 'Symptom': 'Tail'})

    triplets.to_csv('Data/triplets/DiSy_triplet.csv', index=False)

In [None]:
# Extracting triplets between gene entities.

def GG_triplets():
    """Extract gene-gene triplets and write ``Data/triplets/GG_triplet.csv``.

    Reads ``Relation/G_G_res.csv`` under ``kg_folder``. Every row whose flag
    column equals 1 yields a ``<flag>_GG`` triplet, and every row with at
    least one semantic flag set yields a ``Semantic_Relation_GG`` triplet.
    ``Inference_Score`` is written as an empty string for every triplet.
    """
    gg = pd.read_csv(kg_folder + 'Relation/G_G_res.csv')
    triplet_cols = ['Gene_1', 'Relation', 'Gene_2', 'Inference_Score']

    flag_columns = ['Covaries', 'Interacts', 'Regulates', 'Associate']
    # Column names are reproduced exactly as the code reads them from the CSV
    # (including the spelling 'direct interation').
    semantic_columns = [
        'activates, stimulates',
        'production by cell population',
        'regulation',
        'binding, ligand (esp. receptors)',
        'signaling pathway',
        'increases expression/production',
        'same protein or complex',
        'enhances response',
        'affects expression/production (neutral)',
        'physical association',
        'association',
        'colocalization',
        'dephosphorylation reaction',
        'cleavage reaction',
        'direct interation',
        'ADP ribosylation reaction',
        'ubiquitination reaction',
        'phosphorylation reaction',
        'protein cleavage',
    ]

    def labelled(mask, relation):
        # .assign() returns a new frame, so nothing is written into a filtered
        # slice of ``gg`` (the pattern that triggers SettingWithCopyWarning).
        return gg.loc[mask].assign(Relation=relation, Inference_Score='')[triplet_cols]

    frames = [labelled(gg[flag] == 1, flag + '_GG') for flag in flag_columns]
    frames.append(labelled((gg[semantic_columns] == 1).any(axis=1), 'Semantic_Relation_GG'))

    gg_res = pd.concat(frames).rename(columns={'Gene_1': 'Head', 'Gene_2': 'Tail'})

    gg_res.to_csv('Data/triplets/GG_triplet.csv', index=False)

In [None]:
# Extracting triplets between gene and pathway entities.

def GPwy_triplets():
    """Write gene-pathway triplets to ``Data/triplets/GPwy_triplet.csv``.

    Every row of ``Relation/G_Pwy_res.csv`` (under ``kg_folder``) becomes one
    ``Associate_GPwy`` triplet with an empty ``Inference_Score``.
    """
    relation_table = pd.read_csv(kg_folder + 'Relation/G_Pwy_res.csv')

    tagged = relation_table.assign(Relation='Associate_GPwy', Inference_Score='')
    triplets = tagged[['Gene', 'Relation', 'Pathway', 'Inference_Score']].rename(
        columns={'Gene': 'Head', 'Pathway': 'Tail'})

    triplets.to_csv('Data/triplets/GPwy_triplet.csv', index=False)

Run functions to generate corresponding triplets

In [None]:
# Generate every per-relation triplet CSV that generate_triplet_set() reads.
DDi_triplets()
DG_triplets()
# DD_triplets() was defined but never invoked, so DD_triplet.csv was not
# produced by this cell even though later cells read it.
DD_triplets()
DPwy_triplets()
DSE_triplets()
DiDi_triplets()
DiG_triplets()
DiPwy_triplets()
DiSy_triplets()
GG_triplets()
GPwy_triplets()

Combine all the triplet sets extracted from the relation results among the entities, then convert the combined triplet set from the .csv format to the .tsv format required as DGL input.

In [None]:
# This function writes a csv file that combines all triplets extracted by the functions above.

def generate_triplet_set():
    """Concatenate the per-relation triplet files into ``triplet_whole.csv``.

    Only the ``Head``, ``Relation`` and ``Tail`` columns are kept; the result
    is written to ``Data/triplets/triplet_whole.csv`` without the index.
    """
    # Listed in the order the tables are concatenated.
    source_files = [
        'Data/triplets/DDi_triplet.csv',
        'Data/triplets/DG_triplet.csv',
        'Data/triplets/DiG_triplet.csv',
        'Data/triplets/GG_triplet.csv',
        'Data/triplets/DD_triplet.csv',
        'Data/triplets/DiDi_triplet.csv',
        'Data/triplets/GPwy_triplet.csv',
        'Data/triplets/DPwy_triplet.csv',
        'Data/triplets/DiPwy_triplet.csv',
        'Data/triplets/DiSy_triplet.csv',
        'Data/triplets/DSE_triplet.csv',
    ]
    tables = [pd.read_csv(csv_file) for csv_file in source_files]

    combined = pd.concat(tables)[['Head', 'Relation', 'Tail']]
    combined.to_csv('Data/triplets/triplet_whole.csv', index=False)

# Build Data/triplets/triplet_whole.csv from the per-relation triplet files.
generate_triplet_set()  

In [None]:
# This function converts the triplet csv file to tsv format, according to the DGL requirement

def generate_training_set():
    """Convert ``triplet_whole.csv`` into a tab-separated training file.

    Reads ``Data/triplets/triplet_whole.csv`` and writes the first three
    values of each row (head, relation, tail) as one tab-separated line to
    ``Data/dataset/training_triplet_whole.tsv``, creating ``Data/dataset/``
    when it does not exist. The number of triplets is printed twice, matching
    the original ``len(triples), len(train_set)`` output.
    """
    triplets_set = pd.read_csv('Data/triplets/triplet_whole.csv')
    triples = triplets_set.values.tolist()

    dataset_path = 'Data/dataset/'
    # The original makedirs call was not indented under its `if`, which is an
    # IndentationError; exist_ok=True removes the need for the check entirely.
    os.makedirs(dataset_path, exist_ok=True)

    print(len(triples), len(triples))
    with open("Data/dataset/training_triplet_whole.tsv", 'w') as f:
        f.writelines("{}\t{}\t{}\n".format(row[0], row[1], row[2]) for row in triples)
            
# Write Data/dataset/training_triplet_whole.tsv from Data/triplets/triplet_whole.csv.
generate_training_set()

### Step 2:  Knowledge graph embedding

We directly invoke the command line toolkit provided by DGL-KE to learn the embedding of entities and relations in iBKH. Here, we use four different models to learn the entity and edge representations of iBKH, namely TransE, TransR, DistMult, and ComplEx. To use other KGE model or AWS instances please refer to DGL-KE’s <a href="https://aws-dglke.readthedocs.io/en/latest/index.html" target="_blank">Document</a>.


Of note, for convenience, we have trained the knowledge graph embedding models and produced the embedding vectors which can be found in the downloaded folder. You may also run the command below to reproduce the embedding vectors. 

##### TransE

In [None]:
# Shell command (not Python): trains the TransE_l2 model with the `dglke_train` CLI.
# NOTE(review): --data_path ./dataset is relative to the working directory, while
# generate_training_set() writes Data/dataset/training_triplet_whole.tsv —
# presumably this is meant to be run from inside Data/; confirm.
DGLBACKEND=pytorch dglke_train --dataset iBKH --data_path ./dataset --data_files training_triplet_whole.tsv --format raw_udd_hrt --model_name TransE_l2 --batch_size 3000 --neg_sample_size 256 --hidden_dim 400 --gamma 12.0 --lr 0.1 --max_step 50000 --log_interval 100 --batch_size_eval 1000 -adv --regularization_coef 1.00E-09 --num_thread 1 --num_proc 8 --neg_sample_size_eval 1000

##### TransR

In [None]:
# Shell command (not Python): trains the TransR model with the `dglke_train` CLI.
# NOTE(review): --data_path ./dataset is relative to the working directory, while
# generate_training_set() writes Data/dataset/training_triplet_whole.tsv —
# presumably this is meant to be run from inside Data/; confirm.
DGLBACKEND=pytorch dglke_train --dataset iBKH --data_path ./dataset --data_files training_triplet_whole.tsv --format raw_udd_hrt --model_name TransR --batch_size 1024 --neg_sample_size 256 --hidden_dim 200 --gamma 12.0 --lr 0.005 --max_step 10000 --log_interval 100 --batch_size_eval 1000 -adv --regularization_coef 1.00E-07 --num_thread 1 --num_proc 8 --neg_sample_size_eval 1000

##### DistMult

In [None]:
# Shell command (not Python): trains the DistMult model with the `dglke_train` CLI.
# NOTE(review): --data_path ./dataset is relative to the working directory, while
# generate_training_set() writes Data/dataset/training_triplet_whole.tsv —
# presumably this is meant to be run from inside Data/; confirm.
DGLBACKEND=pytorch dglke_train --dataset iBKH --data_path ./dataset --data_files training_triplet_whole.tsv --format raw_udd_hrt --model_name DistMult --batch_size 1024 --neg_sample_size 256 --hidden_dim 400 --gamma 12.0 --lr 0.005 --max_step 10000 --log_interval 100 --batch_size_eval 1000 -adv --regularization_coef 1.00E-07 --num_thread 1 --num_proc 8 --neg_sample_size_eval 1000

##### ComplEx

In [None]:
# Shell command (not Python): trains the ComplEx model with the `dglke_train` CLI.
# NOTE(review): --data_path ./dataset is relative to the working directory, while
# generate_training_set() writes Data/dataset/training_triplet_whole.tsv —
# presumably this is meant to be run from inside Data/; confirm.
DGLBACKEND=pytorch dglke_train --dataset iBKH --data_path ./dataset --data_files training_triplet_whole.tsv --format raw_udd_hrt --model_name ComplEx --batch_size 1024 --neg_sample_size 256 --hidden_dim 200 --gamma 12.0 --lr 0.05 --max_step 10000 --log_interval 100 --batch_size_eval 1000 -adv --regularization_coef 1.00E-07 --num_thread 1 --num_proc 8 --neg_sample_size_eval 1000

### Step 3: Knowledge Discovery on iBKH

Inspired by https://www.dgl.ai/news/2020/06/09/covid.html, we used the following algorithms to calculate the edge scores. And the edge scores indicate the strength of association between candidate entities and queried entity.

In [None]:
def transE_l2(head, rel, tail, gamma=12.0):
    """Return the TransE edge score ``gamma - ||head + rel - tail||_2``.

    The L2 norm is taken over the last dimension of the operands.
    """
    # Paper link: https://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data
    translation_error = th.norm(head + rel - tail, p=2, dim=-1)
    return gamma - translation_error

In [None]:
def transR(head, rel, tail, proj, rel_idx, gamma=12.0):
    """Return the TransR edge score ``gamma - ||head_r + rel - tail_r||_1``.

    ``proj`` is reshaped to ``(-1, head.shape[1], rel.shape[0])`` and the
    matrix at ``rel_idx`` projects both entities before scoring.

    Args:
        head: 2-D batch of head embeddings (one row per entity).
        rel: 1-D relation embedding.
        tail: 1-D tail embedding (tensor or array-like).
        proj: projection tensor holding one matrix per relation.
        rel_idx: index of the relation's projection matrix.
        gamma: margin the L1 distance is subtracted from.
    """
    # Paper link: https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/download/9571/9523
    proj = proj.reshape(-1, head.shape[1], rel.shape[0])[rel_idx]
    head_r = th.einsum('ab,bc->ac', head, proj)
    # as_tensor() reuses an existing tensor; th.tensor(tensor) copies it and
    # emits a UserWarning on every call.
    tail_r = th.einsum('b,bc->c', th.as_tensor(tail), proj)
    score = head_r + rel - tail_r

    return gamma - th.norm(score, p=1, dim=-1)

In [None]:
def DistMult(head, rel, tail):
    """Return the DistMult edge score: the sum over the last dimension of
    the element-wise product ``head * rel * tail``."""
    # Paper link: https://arxiv.org/abs/1412.6575
    return (head * rel * tail).sum(dim=-1)

In [None]:
def complEx(head, rel, tail, gamma=12.0):
    """Return the ComplEx edge score.

    Each embedding is split in half along its last dimension into a real and
    an imaginary part; the score is the sum over the last dimension of the
    four real/imaginary product terms below.

    Args:
        head: head embedding(s).
        rel: relation embedding.
        tail: tail embedding(s) (tensor or array-like).
        gamma: accepted for signature parity with the other scorers; unused.
    """
    # Paper link: https://arxiv.org/abs/1606.06357
    real_head, img_head = th.chunk(head, 2, dim=-1)
    # as_tensor() reuses an existing tensor; th.tensor(tensor) copies it and
    # emits a UserWarning on every call.
    real_tail, img_tail = th.chunk(th.as_tensor(tail), 2, dim=-1)
    real_rel, img_rel = th.chunk(rel, 2, dim=-1)

    score = real_head * real_tail * real_rel \
            + img_head * img_tail * real_rel \
            + real_head * img_tail * img_rel \
            - img_head * real_tail * img_rel

    return th.sum(score, -1)

In [None]:
# Load every per-relation triplet table written in Step 1.
(DDi, DiG, DiSy, DiPwy, DG, DSE, DPwy, GPwy, GG, DiDi, DD) = (
    pd.read_csv('Data/triplets/' + stem + '_triplet.csv')
    for stem in ('DDi', 'DiG', 'DiSy', 'DiPwy', 'DG', 'DSE', 'DPwy', 'GPwy', 'GG', 'DiDi', 'DD')
)

def _load_vocab_maps(file_name, label_col):
    """Read one entity vocabulary under ``kg_folder + 'Entity/'`` and return
    ``(vocab, primary -> label dict, label -> primary dict)``."""
    vocab = pd.read_csv(kg_folder + 'Entity/' + file_name)
    id_to_label = vocab.set_index('primary')[label_col].to_dict()
    label_to_id = vocab.set_index(label_col)['primary'].to_dict()
    return vocab, id_to_label, label_to_id


# Genes are labelled by their 'symbol' column; every other entity type by 'name'.
drug_vocab, drug_dict, drug_name_dict = _load_vocab_maps('drug_vocab.csv', 'name')
disease_vocab, disease_dict, disease_name_dict = _load_vocab_maps('disease_vocab.csv', 'name')
gene_vocab, gene_dict, gene_symbol_dict = _load_vocab_maps('gene_vocab.csv', 'symbol')
symptom_vocab, symptom_dict, symptom_name_dict = _load_vocab_maps('symptom_vocab.csv', 'name')
pathway_vocab, pathway_dict, pathway_name_dict = _load_vocab_maps('pathway_vocab.csv', 'name')
se_vocab, se_dict, se_name_dict = _load_vocab_maps('side_effect_vocab.csv', 'name')

# Pre-processing dictionaries to help execute functions
# NOTE: pickle.load() can execute arbitrary code while deserialising; only load
# these .obj files from a trusted source.
# The `with` blocks already close each file, so no explicit f.close() is needed.

with open('Data/entity_map.obj', 'rb') as f:
    entity_map = pickle.load(f)
with open('Data/entity_id_map.obj', 'rb') as f:
    entity_id_map = pickle.load(f)
with open('Data/relation_map.obj', 'rb') as f:
    relation_map = pickle.load(f)

def _load_embedding_pair(model_dir, file_stem):
    """Load ``<file_stem>_entity.npy`` and ``<file_stem>_relation.npy`` from
    ``Data/UI_emb/<model_dir>/`` and return them as ``(entity, relation)``."""
    prefix = 'Data/UI_emb/' + model_dir + '/' + file_stem
    return np.load(prefix + '_entity.npy'), np.load(prefix + '_relation.npy')


# NOTE(review): the TransE files are read from Data/UI_emb/TransE/, whereas the
# directory layout described at the top of this notebook names the folder
# TransE_l2 — confirm which one matches the download.
entity_emb_TransE, rel_emb_TransE = _load_embedding_pair('TransE', 'iBKH_TransE_l2')

entity_emb_TransR, rel_emb_TransR = _load_embedding_pair('TransR', 'iBKH_TransR')
# TransR additionally needs its per-relation projection matrices.
proj_np = np.load('Data/UI_emb/TransR/iBKH_TransRprojection.npy')
proj_emb = th.tensor(proj_np)

entity_emb_ComplEx, rel_emb_ComplEx = _load_embedding_pair('ComplEx', 'iBKH_ComplEx')

entity_emb_DistMult, rel_emb_DistMult = _load_embedding_pair('DistMult', 'iBKH_DistMult')

# entities.tsv is listed under Data/UI_emb/ in the documented layout, and every
# other input in this notebook is read from Data/; the former Case_Study/
# prefix pointed outside that layout.
entity_df = pd.read_table('Data/UI_emb/entities.tsv', header=None)
# Drop rows with missing values and renumber the remaining rows from 0.
entity_df = entity_df.dropna().reset_index(drop=True)

In [None]:
# This function will return the ranked results based on edge scores from different embedding data.

def rank_result(candidate_ids, rel_ids, target_emb, rel_embs, candidate_emb, model_name, rel_type=None, proj_emb=None):
    """Rank candidate entities by their best edge score against the target.

    For every relation embedding the candidates are scored as heads with the
    target as tail; when ``rel_type == 'GG'`` the reverse direction (target
    as head, candidates as tails) is scored as well. All scores pass through
    ``logsigmoid``, are sorted in descending order, and each candidate keeps
    the position of its first (highest-scoring) occurrence.

    Args:
        candidate_ids: tensor of candidate entity ids (concatenated with
            ``th.cat``); presumably 1-D and aligned with the rows of
            ``candidate_emb`` — confirm against the caller.
        rel_ids: relation ids; only read for TransR, to select the projection.
        target_emb: embedding of the queried entity.
        rel_embs: list of relation embeddings to score with.
        candidate_emb: embeddings of the candidates.
        model_name: one of 'TransE', 'ComplEx', 'TransR', 'DistMult'.
        rel_type: 'GG' enables bidirectional scoring.
        proj_emb: TransR projection tensor (required for 'TransR').

    Returns:
        List of ``entity_id_map`` entries for the candidates, best first.

    Raises:
        ValueError: if ``model_name`` is not a supported model.
    """
    if model_name not in ('TransE', 'ComplEx', 'TransR', 'DistMult'):
        # Previously this surfaced later as an unbound-local error.
        raise ValueError('Unsupported model_name: {!r}'.format(model_name))

    def candidate_to_target(rel_emb, rid):
        if model_name == 'TransE':
            return transE_l2(candidate_emb, rel_emb, target_emb)
        if model_name == 'ComplEx':
            return complEx(candidate_emb, rel_emb, target_emb)
        if model_name == 'TransR':
            return transR(candidate_emb, rel_emb, target_emb, proj_emb, rel_ids[rid])
        return DistMult(candidate_emb, rel_emb, target_emb)

    def target_to_candidate(rel_emb, rid):
        if model_name == 'TransE':
            return transE_l2(target_emb, rel_emb, candidate_emb)
        if model_name == 'ComplEx':
            return complEx(target_emb, rel_emb, candidate_emb)
        if model_name == 'TransR':
            # The original code repeated the candidate -> target call here, so
            # the reverse direction was never scored for TransR. transR() only
            # accepts a 2-D head batch with a 1-D tail, so the swapped roles
            # are projected directly, using transR's default gamma of 12.0.
            proj = proj_emb.reshape(-1, candidate_emb.shape[1], rel_emb.shape[0])[rel_ids[rid]]
            head_r = th.einsum('b,bc->c', target_emb, proj)
            tail_r = th.einsum('ab,bc->ac', candidate_emb, proj)
            return 12.0 - th.norm(head_r + rel_emb - tail_r, p=1, dim=-1)
        return DistMult(target_emb, rel_emb, candidate_emb)

    if rel_type == 'GG':
        directions = (candidate_to_target, target_to_candidate)
    else:
        directions = (candidate_to_target,)

    scores_candidate = []
    dids = []
    for rid, rel_emb in enumerate(rel_embs):
        for score_edge in directions:
            scores_candidate.append(fn.logsigmoid(score_edge(rel_emb, rid)))
            dids.append(candidate_ids)

    scores = th.cat(scores_candidate)
    dids = th.cat(dids)

    # Descending order of score.
    order = th.flip(th.argsort(scores), dims=[0])
    ranked_ids = dids[order].numpy()

    # np.unique reports the first index of each id; sorting those indices keeps
    # each candidate at the rank of its best score.
    _, first_seen = np.unique(ranked_ids, return_index=True)
    proposed_dids = ranked_ids[np.sort(first_seen)]

    return [entity_id_map[int(entity_idx)] for entity_idx in proposed_dids]

We propose an ensemble model, which combines the results from TransE, TransR, DistMult, and ComplEx to calculate a combined score for each entity.

In [None]:
def ensemble_model(transE_res, transR_res, complEx_res, distMult_res):
    """Combine four model rankings into one list ordered by total vote.

    Within each ranking an entity earns ``len(ranking) - position`` votes
    (position of its first occurrence). The votes from all four rankings are
    summed for every entity of ``transE_res`` and the entities are returned
    by descending total; ties keep their ``transE_res`` order.

    Raises:
        ValueError: if an entity of ``transE_res`` is missing from any of the
            other rankings (same exception type ``list.index`` raised before).
    """
    def vote_table(ranking):
        # One pass per ranking instead of a list.index() scan per entity.
        total = len(ranking)
        votes = {}
        for position, entity in enumerate(ranking):
            votes.setdefault(entity, total - position)
        return votes

    tables = [vote_table(ranking) for ranking in (transE_res, transR_res, complEx_res, distMult_res)]

    scored = []
    for entity in transE_res:
        try:
            vote_score = sum(table[entity] for table in tables)
        except KeyError:
            raise ValueError('{!r} is not in list'.format(entity)) from None
        scored.append((vote_score, entity))

    # list.sort is stable, also with reverse=True.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    return [entity for _, entity in scored]

In [None]:
# This function will return the HTML content based on the ranked results.

def generate_html(candidate_ids, candidate_type, top_k):
    """Render the top-ranked candidates as an HTML table string.

    Args:
        candidate_ids: ranked primary ids, best first.
        candidate_type: 'Drug', 'Disease', 'Gene', 'Symptom', 'Pathway' or
            'Side_Effect'; selects the name dictionary and the link pattern.
        top_k: number of leading candidates to render (converted with int()).

    Returns:
        A ``<table>`` string with Rank, Type, Primary_ID, Name and Link cells
        per candidate; the header row also declares an Evidence column.

    Raises:
        ValueError: if a candidate has to be rendered for an unknown
            ``candidate_type`` (previously an unbound-local error).
        KeyError: if a candidate id is missing from its name dictionary.
    """
    col = ['Rank', 'Type', 'Primary_ID', 'Name', 'Link']

    rows = []
    for i, candidate_id in enumerate(candidate_ids[:int(top_k)]):
        if candidate_type == 'Drug':
            candidate_name = drug_dict[candidate_id]
            candidate_link = 'https://go.drugbank.com/drugs/' + candidate_id.replace('DrugBank:', '')
        elif candidate_type == 'Disease':
            candidate_name = disease_dict[candidate_id]
            candidate_link = 'https://disease-ontology.org/'
        elif candidate_type == 'Gene':
            candidate_name = gene_dict[candidate_id]
            candidate_link = 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/' + candidate_id
        elif candidate_type == 'Symptom':
            candidate_name = symptom_dict[candidate_id]
            candidate_link = 'https://www.ncbi.nlm.nih.gov/mesh/?term=' + candidate_id.replace('MESH:', '')
        elif candidate_type == 'Pathway':
            candidate_name = pathway_dict[candidate_id]
            if 'REACT' in candidate_id:
                # The base URL used to end in a hard-coded sample id
                # ('R-HSA-71403'), which was then glued in front of the real one.
                candidate_link = 'https://reactome.org/content/detail/' + candidate_id.replace('REACT:', '')
            else:
                candidate_link = 'https://www.genome.jp/pathway/' + candidate_id.replace('KEGG:', '')
        elif candidate_type == 'Side_Effect':
            candidate_name = se_dict[candidate_id]
            candidate_link = 'http://sideeffects.embl.de/se/' + candidate_id.replace('UMLS:', '')
        else:
            raise ValueError('Unsupported candidate_type: {!r}'.format(candidate_type))

        rows.append((str(i + 1), candidate_type, candidate_id, candidate_name, candidate_link))

    # Column widths for the header cells; any column not listed gets 270.
    widths = {'Link': 380, 'Rank': 80, 'Type': 120}

    parts = ['<table lay-filter="index_table" lay-size="lg"><thead><tr>']
    for col_name in col:
        parts.append("<th lay-data='{{field:\"{0}\", width:{1}}}'>{0}</th>".format(
            col_name, widths.get(col_name, 270)))
    parts.append("<th lay-data='{field:\"Evidence\", toolbar:\"#barDemo\", width:250}'>Evidence</th>")
    parts.append("</tr></thead><tbody>")

    # NOTE(review): values are inserted without HTML escaping, as before;
    # confirm the vocabularies cannot contain markup characters.
    for row in rows:
        parts.append("<tr>")
        for value in row[:-1]:
            parts.append("<td>" + str(value) + "</td>")
        candidate_link = row[-1]
        parts.append("<td><a href='" + candidate_link + "' target='_blank' rel='noopener noreferrer'>"
                     + candidate_link + "</a></td>")
        parts.append("</tr>")
    parts.append("</tbody></table>")

    return ''.join(parts)

In [None]:
def predict_triplets(target_entity, candidate_ids, candidate_type, rel_ids, top_k, rel_type=None):
    """Rank candidates against a target with every embedding model and render HTML tables.

    Each of the four models (TransE, TransR, ComplEx, DistMult) is scored through
    ``rank_result``, the four rankings are merged by ``ensemble_model``, and every
    ranking is rendered by ``generate_html``.

    Args:
        target_entity: Key looked up in ``entity_map`` to locate the target's embedding row.
        candidate_ids: Index object used to select the candidate rows from each entity
            embedding matrix (callers in this file pass a long tensor).
        candidate_type: Entity-type label forwarded to ``generate_html``.
        rel_ids: Iterable of relation indices; one relation embedding is loaded per index.
        top_k: Forwarded unchanged to ``generate_html``.
        rel_type: Only the value ``'GG'`` is acted upon and forwarded to ``rank_result``.
            For any other value no relation type is forwarded (TransR receives an
            explicit ``None`` because its projection argument follows positionally).

    Returns:
        Tuple ``(ensemble, TransE, TransR, ComplEx, DistMult)`` of HTML strings.
        All five are empty strings when ``target_entity`` is not in ``entity_map``.
    """
    # Guard clause: a target without an embedding cannot be scored.
    if target_entity not in entity_map:
        return '', '', '', '', ''

    target_row = entity_map[target_entity]

    def as_tensors(entity_matrix, relation_matrix):
        # (target embedding, per-relation embeddings, candidate embeddings) for one model.
        return (
            th.tensor(entity_matrix[target_row]),
            [th.tensor(relation_matrix[rid]) for rid in rel_ids],
            th.tensor(entity_matrix[candidate_ids]),
        )

    tgt_te, rels_te, cand_te = as_tensors(entity_emb_TransE, rel_emb_TransE)
    tgt_tr, rels_tr, cand_tr = as_tensors(entity_emb_TransR, rel_emb_TransR)
    tgt_cx, rels_cx, cand_cx = as_tensors(entity_emb_ComplEx, rel_emb_ComplEx)
    tgt_dm, rels_dm, cand_dm = as_tensors(entity_emb_DistMult, rel_emb_DistMult)

    # Gene-gene scoring is the only case in which a relation type is handed on.
    gene_gene = rel_type == 'GG'
    gg_kwargs = {'rel_type': 'GG'} if gene_gene else {}
    transr_rel_type = 'GG' if gene_gene else None

    ranked_te = rank_result(candidate_ids, rel_ids, tgt_te, rels_te, cand_te, 'TransE', **gg_kwargs)
    ranked_tr = rank_result(candidate_ids, rel_ids, tgt_tr, rels_tr, cand_tr, 'TransR',
                            transr_rel_type, proj_emb)
    ranked_cx = rank_result(candidate_ids, rel_ids, tgt_cx, rels_cx, cand_cx, 'ComplEx', **gg_kwargs)
    ranked_dm = rank_result(candidate_ids, rel_ids, tgt_dm, rels_dm, cand_dm, 'DistMult', **gg_kwargs)

    ranked_all = ensemble_model(ranked_te, ranked_tr, ranked_cx, ranked_dm)

    # Per-model tables are rendered first, the ensemble table last.
    model_tables = [
        generate_html(ranking, candidate_type, top_k)
        for ranking in (ranked_te, ranked_tr, ranked_cx, ranked_dm)
    ]
    ensemble_table = generate_html(ranked_all, candidate_type, top_k)

    return (ensemble_table, *model_tables)

In [None]:
def _known_partners(triplets, match_col, partner_col, target_id):
    """Return the `partner_col` values of the rows of `triplets` whose `match_col` equals `target_id`."""
    return list(triplets[triplets[match_col] == target_id][partner_col])


def _entity_candidate_ids(id_fragment, known):
    """Index tensor of the entities whose identifier contains `id_fragment` and is not in `known`."""
    pool = entity_df[(entity_df[1].str.contains(id_fragment)) & (~entity_df[1].isin(known))]
    return th.tensor([entity_map[ident] for ident in pool[1]]).long()


def _vocab_candidate_ids(vocab, known):
    """Index tensor of the `primary` identifiers of `vocab` that are not in `known`.

    Identifiers missing from `entity_map` are silently skipped.
    """
    pool = vocab[~vocab['primary'].isin(known)]
    return th.tensor([entity_map[ident] for ident in pool['primary'] if ident in entity_map]).long()


def _relation_names(triplets):
    """Distinct values of the `Relation` column of `triplets`, in order of first appearance."""
    return list(triplets.drop_duplicates(subset='Relation', keep='first')['Relation'])


def _relation_ids(relation_names):
    """Tensor of `relation_map` indices for the given relation names."""
    return th.tensor([relation_map[name] for name in relation_names])


def _render_panels(sections):
    """Build one layui collapse widget per model from `(title, htmls)` pairs.

    `htmls` is the 5-tuple returned by `predict_triplets`, ordered
    (ensemble, TransE, TransR, ComplEx, DistMult); `sections` fixes the display order.
    """
    panels = {}
    for position, model in enumerate(('Ensemble', 'TransE', 'TransR', 'ComplEx', 'DistMult')):
        items = ''.join(
            "<div class='layui-colla-item'><h2 class='layui-colla-title'>" + title
            + "</h2><div class='layui-colla-content'>" + htmls[position] + "</div></div>"
            for title, htmls in sections
        )
        panels[model] = "<div class='layui-collapse'>" + items + "</div>"
    return panels


# Based on the input target entity, the following function will generate the HTML content,
# which will be displayed in our iBKH portal.

def generate_predict_res(target_type, entity_name, top_k):
    """Generate the knowledge-discovery HTML for one target entity.

    For each entity type that can be linked to `target_type`, the candidates that are
    not already connected to the target in the corresponding triplet table are scored
    with `predict_triplets`, and the resulting tables are grouped per model.

    Args:
        target_type: One of 'Disease', 'Drug', 'Gene', 'Symptom', 'Side_Effect', 'Pathway'.
        entity_name: Name of the target; it is resolved through the name lookup that
            belongs to `target_type` (e.g. `disease_name_dict`).
        top_k: Forwarded to `predict_triplets`.

    Returns:
        Tuple `(predict_res, target_id)`. `predict_res` maps 'Ensemble', 'TransE',
        'TransR', 'ComplEx' and 'DistMult' to a layui collapse HTML string;
        `target_id` is the identifier `entity_name` resolved to.

    Raises:
        ValueError: If `target_type` is not supported. (This case used to fail with
            an UnboundLocalError on the return statement.)
        KeyError: Propagated from the name lookup when `entity_name` is unknown
            (assuming the lookups are plain dicts).
    """
    if target_type == 'Disease':
        target_id = disease_name_dict[entity_name]

        known_drugs = _known_partners(DDi, 'Tail', 'Head', target_id)
        drug_html = predict_triplets(target_id, _entity_candidate_ids('DrugBank', known_drugs), 'Drug',
                                     _relation_ids(_relation_names(DDi)), top_k)

        known_genes = _known_partners(DiG, 'Head', 'Tail', target_id)
        gene_html = predict_triplets(target_id, _entity_candidate_ids('HGNC', known_genes), 'Gene',
                                     _relation_ids(_relation_names(DiG)), top_k)

        known_symptoms = _known_partners(DiSy, 'Head', 'Tail', target_id)
        symptom_html = predict_triplets(target_id, _vocab_candidate_ids(symptom_vocab, known_symptoms), 'Symptom',
                                        _relation_ids(['Present_DiSy']), top_k)

        known_pathways = _known_partners(DiPwy, 'Head', 'Tail', target_id)
        pathway_html = predict_triplets(target_id, _vocab_candidate_ids(pathway_vocab, known_pathways), 'Pathway',
                                        _relation_ids(['Association_DiPwy']), top_k)

        sections = [('Drug', drug_html), ('Gene', gene_html),
                    ('Symptom', symptom_html), ('Pathway', pathway_html)]

    elif target_type == 'Drug':
        target_id = drug_name_dict[entity_name]

        known_diseases = _known_partners(DDi, 'Head', 'Tail', target_id)
        disease_html = predict_triplets(target_id, _entity_candidate_ids('DOID', known_diseases), 'Disease',
                                        _relation_ids(_relation_names(DDi)), top_k)

        known_genes = _known_partners(DG, 'Head', 'Tail', target_id)
        gene_html = predict_triplets(target_id, _entity_candidate_ids('HGNC', known_genes), 'Gene',
                                     _relation_ids(_relation_names(DG)), top_k)

        known_side_effects = _known_partners(DSE, 'Head', 'Tail', target_id)
        se_html = predict_triplets(target_id, _vocab_candidate_ids(se_vocab, known_side_effects), 'Side_Effect',
                                   _relation_ids(['Cause_DSE']), top_k)

        known_pathways = _known_partners(DPwy, 'Head', 'Tail', target_id)
        # NOTE(review): the drug-pathway section scores with the disease-pathway relation
        # name; kept as in the original - confirm whether a DPwy-specific relation exists.
        pathway_html = predict_triplets(target_id, _vocab_candidate_ids(pathway_vocab, known_pathways), 'Pathway',
                                        _relation_ids(['Association_DiPwy']), top_k)

        sections = [('Disease', disease_html), ('Gene', gene_html),
                    ('Side Effect', se_html), ('Pathway', pathway_html)]

    elif target_type == 'Gene':
        target_id = gene_symbol_dict[entity_name]

        known_diseases = _known_partners(DiG, 'Tail', 'Head', target_id)
        disease_html = predict_triplets(target_id, _entity_candidate_ids('DOID', known_diseases), 'Disease',
                                        _relation_ids(_relation_names(DiG)), top_k)

        known_drugs = _known_partners(DG, 'Tail', 'Head', target_id)
        drug_html = predict_triplets(target_id, _entity_candidate_ids('DrugBank', known_drugs), 'Drug',
                                     _relation_ids(_relation_names(DG)), top_k)

        known_pathways = _known_partners(GPwy, 'Head', 'Tail', target_id)
        # NOTE(review): the gene-pathway section scores with the disease-pathway relation
        # name; kept as in the original - confirm whether a GPwy-specific relation exists.
        pathway_html = predict_triplets(target_id, _vocab_candidate_ids(pathway_vocab, known_pathways), 'Pathway',
                                        _relation_ids(['Association_DiPwy']), top_k)

        # Gene-gene rows may hold the target on either side, so both columns are collected;
        # the resulting exclusion list therefore also contains the target gene itself.
        gg_rows = GG[(GG['Head'] == target_id) | (GG['Tail'] == target_id)]
        known_genes = list(set(list(gg_rows['Head']) + list(gg_rows['Tail'])))
        gene_html = predict_triplets(target_id, _entity_candidate_ids('HGNC', known_genes), 'Gene',
                                     _relation_ids(_relation_names(GG)), top_k, 'GG')

        # Display order differs from the computation order above.
        sections = [('Drug', drug_html), ('Gene', gene_html),
                    ('Disease', disease_html), ('Pathway', pathway_html)]

    elif target_type == 'Symptom':
        target_id = symptom_name_dict[entity_name]

        known_diseases = _known_partners(DiSy, 'Tail', 'Head', target_id)
        disease_html = predict_triplets(target_id, _entity_candidate_ids('DOID', known_diseases), 'Disease',
                                        _relation_ids(['Present_DiSy']), top_k)

        sections = [('Disease', disease_html)]

    elif target_type == 'Side_Effect':
        target_id = se_name_dict[entity_name]

        known_drugs = _known_partners(DSE, 'Tail', 'Head', target_id)
        drug_html = predict_triplets(target_id, _entity_candidate_ids('DrugBank', known_drugs), 'Drug',
                                     _relation_ids(['Cause_DSE']), top_k)

        sections = [('Drug', drug_html)]

    elif target_type == 'Pathway':
        target_id = pathway_name_dict[entity_name]

        known_diseases = _known_partners(DiPwy, 'Tail', 'Head', target_id)
        disease_html = predict_triplets(target_id, _entity_candidate_ids('DOID', known_diseases), 'Disease',
                                        _relation_ids(['Association_DiPwy']), top_k)

        # NOTE(review): the drug and gene sections below also use the disease-pathway
        # relation name; kept as in the original - confirm against the relation vocabulary.
        known_drugs = _known_partners(DPwy, 'Tail', 'Head', target_id)
        drug_html = predict_triplets(target_id, _entity_candidate_ids('DrugBank', known_drugs), 'Drug',
                                     _relation_ids(['Association_DiPwy']), top_k)

        known_genes = _known_partners(GPwy, 'Tail', 'Head', target_id)
        gene_html = predict_triplets(target_id, _entity_candidate_ids('HGNC', known_genes), 'Gene',
                                     _relation_ids(['Association_DiPwy']), top_k)

        # Display order differs from the computation order above.
        sections = [('Drug', drug_html), ('Gene', gene_html), ('Disease', disease_html)]

    else:
        raise ValueError(
            "Unsupported target_type %r; expected one of 'Disease', 'Drug', 'Gene', "
            "'Symptom', 'Side_Effect', 'Pathway'" % (target_type,)
        )

    return _render_panels(sections), target_id

In [None]:
# Example: discover the top 10 genes, drugs, symptoms, and pathways that potentially
# link to AD (Alzheimer's disease).

entity_type, entity_name, topk = 'Disease', "alzheimer's disease", 10
predict_res, entity_id = generate_predict_res(
    target_type=entity_type,
    entity_name=entity_name,
    top_k=topk,
)

![title](image/knowledge_discover.png)