In [1]:
import numpy as np
import pandas as pd
import os

import csv
import yaml

from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import Namespace

from tqdm import tqdm
from collections import defaultdict

### DRKG Import

In [2]:
# Read the DRKG triples from a tab-separated file and keep them as a list of rows.
drkg_file = 'data/drkg/drkg.tsv'
drkg_df = pd.read_csv(
    drkg_file,
    sep="\t",
    header=None,    # header fix to original code: treat the first line as data
)
drkg_triplets_list = drkg_df.to_numpy().tolist()

### DrugMechDB Import

In [3]:
# Parse the DrugMechDB YAML dump into plain Python objects.
drugmech_path = "data/drugmech/drugmechdb.yaml"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(drugmech_path, 'r', encoding='utf-8') as fh:
    dmdb_yaml = yaml.safe_load(fh)

In [None]:
# from dmdb_data_tools_analysis import *

# all_metapath_nodes = get_metapath_node(dmdb_yaml)
# all_metapath_edges = get_metapath_edges(dmdb_yaml)
# basic_stats = defaultdict(list)

# all_metaedges = []
# all_parings = []
# all_targets = []
# unique_metaedges = []
# first_edge_type = []
# all_nodes = []

# id_to_name = {}
# id_to_label = {}

# for i, p in enumerate(dmdb_yaml):
#     _id = (p["graph"]["_id"])
#     drug_id, dis_id = path_to_tup(p)
#     paths = get_all_paths(p)
#     G = path_to_G(p)
    
#     G = add_metaedges(G)
#     G = add_meanode_pairs(G)
    
#     basic_stats['idx'].append(i) #index
#     basic_stats['id'].append(p['graph']['_id']) #DrugMechDB id
#     basic_stats['drug'].append(drug_id) #Drug id
#     basic_stats['disease'].append(dis_id)#Disease id
#     basic_stats['nodes'].append((G.nodes)) #nodes in metapath
#     basic_stats['n_nodes'].append(len(G.nodes)) # number of nodes in metapath
#     basic_stats['n_edges'].append(len(G.edges)) #number of edges in metapath
#     basic_stats['n_paths'].append(len(all_metapath_nodes[_id])) #number of paths
#     basic_stats['metapath'].append(all_metapath_nodes[_id])
#     basic_stats['metapath_with_edges'].append(all_metapath_edges[_id])

    
#     this_metaedges = [G.edges[e]['metaedge'] for e in G.edges]
    
#     all_metaedges += this_metaedges
#     unique_metaedges += list(set(this_metaedges))
    
#     all_parings += [G.edges[e]['mn_pair'] for e in G.edges]
#     all_targets += get_targets(G)
#     first_edge_type += get_target_metaedges(G)
#     all_nodes += list(G.nodes)
    
#     id_to_label = {**id_to_label, **get_id_to_type(G)}
#     id_to_name = {**id_to_name, **get_id_to_name(G)}
    
# dmdb_df = pd.DataFrame(basic_stats)

<!-- ### DrugMechDB Import -->

## Entity Dictionaries

In [4]:
def insert_entry(entity,ent_type,dic):
    """Register `entity` under `ent_type` in `dic` and return `dic`.

    `dic` maps entity type -> {entity: integer id}. A new entity receives the
    current size of its type's sub-dictionary as id, so ids are numbered
    independently per type. An entity that is already registered keeps its id.
    The dictionary is modified in place.
    """
    per_type = dic.setdefault(ent_type, {})
    # len() is evaluated before the insertion, so the first entity of a type gets id 0.
    per_type.setdefault(entity, len(per_type))
    return dic

In [5]:
# Collect every DRKG entity, grouped by entity type, with a per-type integer id.
drkg_entity_dictionary = {}

for head, pred, tail in drkg_triplets_list:
    # Head and tail are written as '<type>::<name>'; the relation is not needed here.
    head_type, head_name = head.split('::')
    tail_type, tail_name = tail.split('::')
    for ent_name, ent_type in ((head_name, head_type), (tail_name, tail_type)):
        insert_entry(ent_name, ent_type, drkg_entity_dictionary)

In [None]:
# Collect every DrugMechDB node id, grouped by node label, with a per-label integer id.
# When a node is the entry's drug (its id equals the graph-level drug_mesh), the
# graph-level drugbank id is registered under the same label as well.
dmdb_entity_dictionary = {}

# DEBUG bookkeeping: entries whose non-null drug_mesh matches none of their nodes.
counter = 0
debug_set = set()

for entry in dmdb_yaml:
    graph_meta = entry['graph']
    drug_db_id = graph_meta['drugbank']
    drug_mesh_id = graph_meta['drug_mesh']       ## Can't trust the database, checking entity class
    accounted_for = False

    for node in entry['nodes']:
        ent_id, ent_type = node['id'], node['label']
        insert_entry(ent_id, ent_type, dmdb_entity_dictionary)

        if drug_mesh_id is None:
            # for DEBUG - if null, DB id is used as default
            accounted_for = True
        elif ent_id == drug_mesh_id:
            # Extra (duplicate) log with drugbank id (only if drugmesh is not null).
            # NOTE(review): drug_db_id is not checked for None here, so a null
            # drugbank field is registered as a `None` key under this label.
            insert_entry(drug_db_id, ent_type, dmdb_entity_dictionary)
            accounted_for = True

    if not accounted_for:   ## for DEBUG
        counter += 1
        debug_set.add(drug_db_id)

print(counter)
print(len(debug_set))

116
55


### Inverse Dicts for Entity Search 

In [79]:
# Inverse lookups: integer id -> entity string.
#
# insert_entry() numbers entities separately inside every entity type, so the
# same integer id occurs once per type. A flat id -> entity map therefore keeps
# only the entity of the last type that used a given id. The flat maps are
# still built (unchanged contents) for existing users, but lossless lookups
# should go through the *_by_type maps: {entity type: {id: entity}}.
dmdb_inverse_ent_dict = {}
dmdb_inverse_ent_dict_by_type = {}
for ent_type, subdict in dmdb_entity_dictionary.items():
    dmdb_inverse_ent_dict_by_type[ent_type] = {v: k for k, v in subdict.items()}
    dmdb_inverse_ent_dict.update(dmdb_inverse_ent_dict_by_type[ent_type])


drkg_inverse_ent_dict = {}
drkg_inverse_ent_dict_by_type = {}
for ent_type, subdict in drkg_entity_dictionary.items():
    drkg_inverse_ent_dict_by_type[ent_type] = {v: k for k, v in subdict.items()}
    drkg_inverse_ent_dict.update(drkg_inverse_ent_dict_by_type[ent_type])

## Discovering format-defying exceptions

In [None]:
# TODO: How to handle exceptions

def print_all_type(list,feat):
    """Print the `feat` field of every node in `list['nodes']`, one per line.

    Despite its name, `list` is expected to be a mapping with a 'nodes' key
    whose value is an iterable of mappings.
    """
    for node in list['nodes']:
        value = node[feat]
        print(value)

# Report paths that break the expected "Drug ... Disease" shape:
#  - paths that do not start with a Drug and end with a Disease (endpoint labels are printed), and
#  - paths with an extra Drug/Disease node in the interior (all labels are printed,
#    once per offending interior node).
for entry in dmdb_yaml:
    path_nodes = entry['nodes']
    first_label = path_nodes[0]['label']
    last_label = path_nodes[-1]['label']

    if first_label != 'Drug' or last_label != 'Disease':
        print("\t", first_label, last_label)
        print()
        continue

    # Skip the two endpoints and inspect only the interior nodes.
    for node in path_nodes[1:-1]:
        if node['label'] in ('Drug', 'Disease'):
            print_all_type(entry, 'label')
            print()


### Notes

        drug: Ammonium lactate
        drug_mesh: MESH:null

DrugMechDB contains both MESH:null and null for drug_mesh, and only one null in the drugbank field:

        drug: tazanolast
        drug_mesh: MESH:C106301
        drugbank: null

There are only two instances of double entries in drug_mesh:

        drug: acrisorcin
        drug_mesh: MESH:D000585, MESH:D006604
        drugbank: DB:DB11254

        drug: carfilzomib
        drug_mesh: MESH:C524865, MESH:C519125
        drugbank: DB:DB08889

In [22]:
# Entity counts, DMDB first and DRKG second, for drugs and then diseases
# (a blank line separates the two groups).
for position, (dmdb_type, drkg_type) in enumerate((('Drug', 'Compound'), ('Disease', 'Disease'))):
    if position > 0:
        print()
    print(len(dmdb_entity_dictionary[dmdb_type]))
    print(len(drkg_entity_dictionary[drkg_type]))

3135
24313

764
5103


In [23]:
def print_sample(dic, key):
    """Print the first entity stored under `dic[key]`; print nothing if it is empty."""
    first_item = next(iter(dic[key].items()), None)
    if first_item is not None:
        print(first_item[0])

# One sample identifier per source, DMDB first and DRKG second, for drugs and
# then diseases (a blank line separates the two groups).
for position, (dmdb_type, drkg_type) in enumerate((('Drug', 'Compound'), ('Disease', 'Disease'))):
    if position > 0:
        print()
    print_sample(dmdb_entity_dictionary, dmdb_type)
    print_sample(drkg_entity_dictionary, drkg_type)

MESH:D000068877
DB02573

MESH:D015464
SARS-CoV2 E


In [25]:
# 1 DrugBank ID is null

# Count entries whose graph-level drugbank id contains "DB"; report each null id.
cnt = 0
for entry in dmdb_yaml:
    drugbank_id = entry['graph']['drugbank']
    if drugbank_id is None:
        print("null")
    elif "DB" in drugbank_id:
        cnt += 1
cnt

null


4845

In DRKG:

Compound::DB00898
Compound::MESH:C106876      Disease::MESH:C537014
Compound::MESH:D000588      Disease::MESH:D013274

Compound::MESH:C554292|MESH:D013755


In DMDB:

drugbank: DB:DB09310
drug_mesh: MESH:D000068877  disease_mesh: MESH:C567691

drug_mesh: MESH:C524865, MESH:C519125

# Drugs, Diseases, & Commons

## Drugs

In [32]:
def insert_entry_eid(entity,ent_type,eid,dic):
    """Store `entity` under `ent_type` in `dic` with the caller-supplied id `eid`.

    `dic` maps entity type -> {entity: id} and is modified in place. An entity
    that is already present keeps its existing id. Returns `dic`.
    """
    dic.setdefault(ent_type, {}).setdefault(entity, eid)
    return dic

In [33]:
# Regroup the DMDB 'Drug' entities by identifier namespace, keeping their entity ids:
#   {'<namespace>': {'<local id>': eid, ...}, ...}
dmdb_drug_dict = {}

for drug, eid in dmdb_entity_dictionary['Drug'].items():
    if drug is None:
        # A null identifier has no '<namespace>:<id>' form. Skip it, so the
        # insert below never runs with values left over from another iteration.
        continue
    identifier, idx = drug.split(":")
    insert_entry_eid(idx, identifier, eid, dmdb_drug_dict)

In [34]:
# Look up the entity id stored under the `None` key of the 'Drug' dictionary.
# NOTE(review): presumably this key comes from an entry whose `drugbank` field is null - confirm.
dmdb_entity_dictionary['Drug'][None]

93

In [35]:
# Per identifier namespace: number of DMDB drugs, namespace name, and the full mapping.
for namespace, entities in dmdb_drug_dict.items():
    print(len(entities), namespace, entities)

1556 MESH {'D000068877': 0, 'D000082': 2, 'D001241': 4, 'D009288': 6, 'D009643': 8, 'D003348': 10, 'D000077562': 12, 'D008795': 14, 'D003707': 16, 'D000658': 18, 'C106856': 20, 'D000583': 22, 'D003907': 24, 'D002442': 26, 'D020110': 28, 'C101425': 30, 'D000865': 32, 'D064704': 34, 'D000077239': 36, 'D009553': 38, 'C007852': 40, 'D000068298': 42, 'D005476': 44, 'D016593': 46, 'C106538': 48, 'D009640': 50, 'C092292': 52, 'C004649': 54, 'D005640': 56, 'D002443': 58, 'D005672': 60, 'D014859': 62, 'C415771': 64, 'D015296': 66, 'D017336': 68, 'D010109': 70, 'D010634': 72, 'D008012': 74, 'D000068736': 76, 'D004319': 78, 'D002955': 80, 'D017291': 82, 'D008914': 84, 'D000068677': 86, 'D000077735': 88, 'D001623': 90, 'C106301': 92, 'D014221': 94, 'D000077206': 96, 'D015378': 98, 'D011239': 100, 'C005548': 102, 'D009355': 104, 'D000077727': 106, 'D006206': 108, 'D010868': 110, 'D016912': 113, 'D000069462': 115, 'D007444': 117, 'D011530': 119, 'C006208': 121, 'C076948': 123, 'C119141': 125, 'D0049

In [38]:
# Regroup the DRKG 'Compound' entities by identifier namespace, keeping their entity ids.
drkg_drug_dict = {}

for drug, eid in drkg_entity_dictionary['Compound'].items():
    if ":" in drug:
        # Prefixed identifier of the form '<namespace>:<id>'.
        if "|" in drug:       # Exceptions  --  MESH:C002480|MESH:, MESH:C011440|MESH:C046229,...
            # TODO: figure how to handle them appropriately
            continue
        identifier, idx = drug.split(":")
        insert_entry_eid(idx, identifier, eid, drkg_drug_dict)
    elif 'DB' in drug:        # un-prefixed identifiers: DB and CHEMBL keep their full string
        insert_entry_eid(drug, 'DB', eid, drkg_drug_dict)
    elif 'CHEMBL' in drug:
        insert_entry_eid(drug, 'CHEMBL', eid, drkg_drug_dict)
    else:
        print("DEBUG Outlier with no split:", drug)

In [39]:
# Per identifier namespace: number of DRKG compounds, namespace name, and the full mapping.
for namespace, entities in drkg_drug_dict.items():
    print(len(entities), namespace, entities)

10551 DB {'DB02573': 0, 'DB05105': 1, 'DB00244': 2, 'DB00684': 3, 'DB03118': 4, 'DB03678': 5, 'DB08715': 6, 'DB04298': 7, 'DB02972': 8, 'DB03000': 9, 'DB08459': 10, 'DB00669': 11, 'DB02102': 13, 'DB07935': 14, 'DB00997': 15, 'DB04595': 16, 'DB01087': 17, 'DB01048': 18, 'DB01732': 19, 'DB00369': 20, 'DB11575': 21, 'DB01072': 22, 'DB08379': 23, 'DB08706': 24, 'DB14122': 25, 'DB08115': 26, 'DB06817': 27, 'DB04029': 28, 'DB00391': 29, 'DB01243': 31, 'DB08580': 32, 'DB02683': 33, 'DB08709': 34, 'DB07910': 38, 'DB00694': 40, 'DB08012': 41, 'DB01880': 42, 'DB02704': 44, 'DB06874': 45, 'DB05961': 46, 'DB06408': 48, 'DB09552': 49, 'DB08281': 50, 'DB06133': 51, 'DB02768': 53, 'DB07762': 54, 'DB03017': 55, 'DB02331': 56, 'DB04565': 58, 'DB09101': 59, 'DB12996': 60, 'DB00946': 61, 'DB01887': 62, 'DB04139': 63, 'DB00409': 64, 'DB02600': 65, 'DB06236': 66, 'DB08017': 67, 'DB02668': 68, 'DB02375': 69, 'DB00773': 70, 'DB06290': 72, 'DB00558': 74, 'DB06198': 75, 'DB01392': 76, 'DB07414': 77, 'DB03312':

In [None]:
# Drugs present in both DMDB and DRKG, keyed by the re-prefixed identifier
# ('<namespace>:<id>') and mapped to (DMDB entity id, DRKG entity id).
common_drug_dict = defaultdict(tuple)

# One pass per identifier namespace; counter[i] is the number of matches for
# drug_namespaces[i].
drug_namespaces = ('MESH', 'DB', 'CHEBI')
counter = [0] * len(drug_namespaces)
for position, namespace in enumerate(drug_namespaces):
    # A namespace that is absent from either source simply yields zero matches
    # instead of raising KeyError.
    dmdb_ids = dmdb_drug_dict.get(namespace, {})
    drkg_ids = drkg_drug_dict.get(namespace, {})
    for drug, dmdb_id in dmdb_ids.items():
        if drug in drkg_ids:
            common_drug_dict[namespace + ':' + drug] = (dmdb_id, drkg_ids[drug])
            counter[position] += 1

print("Number of common drugs [MESH/DB/CHEBI]:", len(common_drug_dict), counter)

Number of common drugs [MESH/DB/CHEBI]: 1778 [214, 1564, 0]


## Disease

In [41]:
# Regroup the DMDB 'Disease' entities by identifier namespace, keeping their entity ids.
dmdb_disease_dict = {}

for disease_id, eid in dmdb_entity_dictionary['Disease'].items():
    # Identifiers are expected as '<namespace>:<id>'; any other shape raises ValueError here.
    namespace, local_id = disease_id.split(":")
    insert_entry_eid(local_id, namespace, eid, dmdb_disease_dict)

In [42]:
# Per identifier namespace: number of DMDB diseases, namespace name, and the full mapping.
for namespace, entities in dmdb_disease_dict.items():
    print(len(entities), namespace, entities)

758 MESH {'D015464': 0, 'D034721': 1, 'D010146': 2, 'D005334': 3, 'D013927': 4, 'D004405': 5, 'D007634': 6, 'D003586': 7, 'D018805': 8, 'D000073605': 9, 'D014069': 10, 'D011023': 11, 'D016920': 12, 'D013717': 13, 'D014552': 14, 'D006973': 15, 'D009181': 16, 'D012223': 17, 'D011704': 18, 'D013345': 19, 'D006996': 20, 'D001991': 21, 'D012559': 22, 'D017449': 23, 'D015658': 24, 'D008595': 25, 'D003233': 26, 'D016649': 27, 'D006069': 28, 'D000881': 29, 'D011655': 30, 'D008060': 31, 'D007710': 32, 'D065631': 33, 'D015508': 34, 'D004827': 35, 'D003865': 36, 'D000749': 37, 'D015523': 38, 'D000505': 39, 'D007172': 40, 'D018410': 41, 'D003424': 42, 'D012148': 43, 'D010538': 44, 'D012507': 45, 'D012871': 46, 'D005879': 47, 'D000856': 48, 'D009103': 49, 'D006943': 50, 'D006976': 51, 'D005764': 52, 'D059268': 53, 'D004414': 54, 'D008088': 55, 'D010392': 56, 'D013610': 57, 'D009101': 58, 'D013964': 59, 'D005705': 60, 'D004832': 61, 'D011471': 62, 'D007014': 63, 'D001943': 64, 'D010062': 65, 'D01277

In [43]:
# Regroup the DRKG 'Disease' entities by identifier namespace, keeping their entity ids.
# Anything that is not exactly '<namespace>:<id>' is filed under 'other' with its full string.
drkg_disease_dict = {}

for disease, eid in drkg_entity_dictionary['Disease'].items():
    parts = disease.split(":")
    if len(parts) == 2:
        namespace, local_id = parts
        insert_entry_eid(local_id, namespace, eid, drkg_disease_dict)
    else:
        insert_entry_eid(disease, 'other', eid, drkg_disease_dict)

In [44]:
# Per identifier namespace: number of DRKG diseases, namespace name, and the full mapping.
for namespace, entities in drkg_disease_dict.items():
    print(len(entities), namespace, entities)

27 other {'SARS-CoV2 E': 0, 'SARS-CoV2 M': 1, 'SARS-CoV2 N': 2, 'SARS-CoV2 Spike': 3, 'SARS-CoV2 nsp1': 4, 'SARS-CoV2 nsp10': 5, 'SARS-CoV2 nsp11': 6, 'SARS-CoV2 nsp12': 7, 'SARS-CoV2 nsp13': 8, 'SARS-CoV2 nsp14': 9, 'SARS-CoV2 nsp15': 10, 'SARS-CoV2 nsp2': 11, 'SARS-CoV2 nsp4': 12, 'SARS-CoV2 nsp5': 13, 'SARS-CoV2 nsp5_C145A': 14, 'SARS-CoV2 nsp6': 15, 'SARS-CoV2 nsp7': 16, 'SARS-CoV2 nsp8': 17, 'SARS-CoV2 nsp9': 18, 'SARS-CoV2 orf10': 19, 'SARS-CoV2 orf3a': 20, 'SARS-CoV2 orf3b': 21, 'SARS-CoV2 orf6': 22, 'SARS-CoV2 orf7a': 23, 'SARS-CoV2 orf8': 24, 'SARS-CoV2 orf9b': 25, 'SARS-CoV2 orf9c': 26}
4871 MESH {'D001351': 27, 'D045473': 28, 'D065207': 29, 'D028941': 30, 'D058957': 31, 'D006517': 32, 'D003550': 33, 'C063419': 34, 'D013167': 35, 'D006086': 36, 'D017497': 37, 'D001171': 38, 'D011565': 39, 'D015535': 40, 'D017511': 41, 'D001172': 42, 'D013262': 43, 'D054058': 44, 'D013927': 45, 'D001943': 46, 'D011629': 47, 'D004715': 48, 'D007246': 49, 'D010146': 50, 'D010262': 51, 'D007889':

In [59]:
# MESH
# Diseases present in both DMDB and DRKG, keyed by the re-prefixed identifier
# ('MESH:<id>') and mapped to (DMDB entity id, DRKG entity id).
common_disease_dict = defaultdict(tuple)

common_disease_dict.update(
    ('MESH:' + mesh_dis, (dmdb_id, drkg_disease_dict['MESH'][mesh_dis]))
    for mesh_dis, dmdb_id in dmdb_disease_dict['MESH'].items()
    if mesh_dis in drkg_disease_dict['MESH']
)

print("Number of common MESH diseases:", len(common_disease_dict))

Number of common MESH diseases: 714


# Common Pairs

TODO: Include entity dictionary IDs to keep track of entities, and use them in paths too.

In [66]:
# for entry in dmdb_yaml:
#     dis_mesh = entry['graph']['disease_mesh']
#     if len(dis_mesh.split(':')) != 2:
#         print(dis_mesh)

In [None]:
# Check if both drug and disease elements exist in common entities
# TODO: How is it possible that there are duplicates? (184 duplicates)

# count = [0,0,0]
# common_pairs = set()

# for entry in dmdb_yaml:
#     graph_dict = entry['graph']
#     count[2] += 1
#     db_id = graph_dict['drugbank']
#     mesh_id = graph_dict['drug_mesh']
#     dis_id = graph_dict['disease_mesh']

#     if db_id in common_drug_dict and db_id != None:
#         if dis_id in common_disease_dict:
#             if (db_id, dis_id) in common_pairs:
#                 print("Duplicate", (db_id, dis_id))
#             common_pairs.add((db_id, dis_id))
#             count[0] += 1
#         else: count[1] += 1
#     elif mesh_id in common_drug_dict and mesh_id != None and mesh_id != 'MESH:null':
#         if dis_id in common_disease_dict:
#             if (mesh_id, dis_id) in common_pairs:
#                 print("Duplicate", (mesh_id, dis_id))
#             common_pairs.add((mesh_id, dis_id))
#             count[0] += 1
#         else: count[1] += 1
#     else:
#         count[1] += 1


# print("Number of common drug-disease pairs:", len(common_pairs))
# print("[DB, MESH, Total]", count)

# Number of common drug-disease pairs: 4484
# [DB, MESH, Total] [4668, 178, 4846]

In [89]:
# (drug id, disease id) pairs whose drug and disease both occur in DMDB and DRKG.
# The value records which drug identifier was used: 'DB' or 'MESH'.
common_pairs_dict = defaultdict()

for entry in dmdb_yaml:
    graph_dict = entry['graph']
    db_id = graph_dict['drugbank']
    mesh_id = graph_dict['drug_mesh']
    dis_id = graph_dict['disease_mesh']

    # Pick the drug identifier: Drugbank ID has priority, Drug Mesh ID is the
    # fallback, so an entry never contributes two pairs.
    if db_id is not None and db_id in common_drug_dict:
        drug_key, id_source = db_id, 'DB'
    elif mesh_id is not None and mesh_id != 'MESH:null' and mesh_id in common_drug_dict:
        drug_key, id_source = mesh_id, 'MESH'
    else:
        continue

    if dis_id in common_disease_dict:
        common_pairs_dict[(drug_key, dis_id)] = id_source

print("Number of common drug-disease pairs:", len(common_pairs_dict),'/',len(dmdb_yaml))

Number of common drug-disease pairs: 4484 / 4846


In [105]:
# Testing reverse pair search
# Only the first common pair is checked: split each identifier back into
# namespace and local id, then look up its entity index in both sources.
for common_drug, common_disease in list(common_pairs_dict)[:1]:
    d_type, drug = common_drug.split(':')
    dis_type, disease = common_disease.split(':')
    print("(DRUG) DMDB Entity index, DRKG Entity index:",
          *(source[d_type][drug] for source in (dmdb_drug_dict, drkg_drug_dict)))
    print("(DISEASE) DMDB Entity index, DRKG Entity index:",
          *(source[dis_type][disease] for source in (dmdb_disease_dict, drkg_disease_dict)))
    print()

(DRUG) DMDB Entity index, DRKG Entity index: 1 5716
(DISEASE) DMDB Entity index, DRKG Entity index: 0 648



# Pathing strategy for DrugMechDB

Determine the data structure formatting to store the subgraph structure. Use integer IDs in entity dictionaries and focus on connections.

# GNN Training

In [None]:
# Functionize and import Laurie's code


In [None]:
# Format inputs appropriately (care splitting training/eval/test datasets)
