In [2]:
import numpy as np
import pandas as pd
import os

import csv
import yaml

from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import Namespace

from tqdm import tqdm

# Import Data

In [3]:
class stats_log():
    def __init__(self):
        self.entity = {'DRKG':[], 'DMDB':[]}
        self.predicate = {'DRKG':[], 'DMDB':[]}
        self.triple = {'DRKG':[], 'DMDB':[]}
        self.common = []

    def print(self):
        print("===\n Stats - Entity Logs \n===")
        for src,entry in self.entity.items():
            print(src)
            print(entry)
        print("\n===\nStats - Predicate Logs \n===")
        for src,entry in self.predicate.items():
            print(src)
            print(entry)
        print("\n===\nStats - Triple Logs \n===")
        for src,entry in self.triple.items():
            print(src)
            print(entry)
        print("\n===\nStats - Common Logs \n===")
        for entry in self.common:
            print(entry)

stats = stats_log()

## Import DRKG (with pandas)
Source: https://github.com/gnn4dr/DRKG/blob/master/raw_graph_analysis/Jaccard_scores_among_all_edge_types_in_DRKG.ipynb

In [4]:
drkg_file = 'data/drkg/drkg.tsv'
drkg_df = pd.read_csv(drkg_file, sep="\t", header=None)     # header fix to original code
drkg_triplets_list = drkg_df.values.tolist()

In [5]:
drkg_df.tail()

Unnamed: 0,0,1,2
5874256,Gene::29099,STRING::REACTION::Gene:Gene,Gene::1643
5874257,Gene::51645,STRING::REACTION::Gene:Gene,Gene::3183
5874258,Gene::865,STRING::CATALYSIS::Gene:Gene,Gene::983
5874259,Gene::1066,STRING::BINDING::Gene:Gene,Gene::7365
5874260,Gene::6118,STRING::BINDING::Gene:Gene,Gene::1111


### Entity dictionary
Dictionary format:  {**keys**: Entity Type (str), **values**: Dict{}}

Sub-dictionary format: {**keys**: Entity Name (str), **values**: Entity Index (int)}

Example: *Gene {'Gene::2157': 0, 'Gene::5264': 1, 'Gene::2158': 2, 'Gene::3309': 3, 'Gene::28912': 4, ...}*

In [6]:
drkg_entity_dictionary={}
def insert_entry(entry,ent_type,dic):
    if ent_type not in dic:
        dic[ent_type]={}
    ent_n_id=len(dic[ent_type])
    if entry not in dic[ent_type]:
         dic[ent_type][entry]=ent_n_id
    return dic

for triple in drkg_triplets_list:
    src = triple[0]
    split_src=src.split('::')
    src_type=split_src[0]
    dest = triple[2]
    split_dest=dest.split('::')
    dest_type=split_dest[0]
    insert_entry(src,src_type,drkg_entity_dictionary)
    insert_entry(dest,dest_type,drkg_entity_dictionary)

In [7]:
# DEBUG entity dict
print("DRKG Entity Summary\n---")
total = 0
for k,v in sorted(drkg_entity_dictionary.items(), key=lambda x:len(x[1]), reverse=True):
    # print(k,v)
    string = "{"+ f"({len(v)}) {k} :" + "{"
    for etype,content in enumerate(v.items()):
        ent, eid = content
        string = string + f"{ent}-{eid}"
        if etype == 3: break
        string += ", "
    print(string + ", ... }}")
    total += len(v)
print("\nTotal number of unique DRKG entities:", total)
stats.entity['DRKG'].append(f"Total number of unique DRKG entities: {total}")

DRKG Entity Summary
---
{(39220) Gene :{Gene::2157-0, Gene::5264-1, Gene::2158-2, Gene::3309-3, ... }}
{(24313) Compound :{Compound::DB02573-0, Compound::DB05105-1, Compound::DB00244-2, Compound::DB00684-3, ... }}
{(11381) Biological Process :{Biological Process::GO:0071357-0, Biological Process::GO:0098780-1, Biological Process::GO:0055088-2, Biological Process::GO:0010243-3, ... }}
{(5701) Side Effect :{Side Effect::C0032584-0, Side Effect::C0424024-1, Side Effect::C0235309-2, Side Effect::C0014724-3, ... }}
{(5103) Disease :{Disease::SARS-CoV2 E-0, Disease::SARS-CoV2 M-1, Disease::SARS-CoV2 N-2, Disease::SARS-CoV2 Spike-3, ... }}
{(4048) Atc :{Atc::B01AE02-0, Atc::L01XC06-1, Atc::R05CB13-2, Atc::L01XX29-3, ... }}
{(2884) Molecular Function :{Molecular Function::GO:0042803-0, Molecular Function::GO:0016274-1, Molecular Function::GO:0015179-2, Molecular Function::GO:0005085-3, ... }}
{(1822) Pathway :{Pathway::PC7_6941-0, Pathway::PC7_5330-1, Pathway::PC7_6994-2, Pathway::PC7_2926-3, 

In [8]:
drkg_db_drug_set = set()
for elem in drkg_entity_dictionary["Compound"].keys():
    elem = elem.split("::")[-1]
    if  "DB" in elem:  
        drkg_db_drug_set.add(elem)

len(drkg_db_drug_set)

10563

In [9]:
drkg_db_disease_set = set()
for elem in drkg_entity_dictionary["Disease"].keys():
    elem = elem.split("::")[-1]
    if "MESH:D" in elem:  
        drkg_db_disease_set.add(elem)

len(drkg_db_disease_set)

3521

### Node & Edge dictionary
Edge dictionary format: {*keys*: Predicate Type (str), *values*: List **[** head Entity Index (int), tail Entity Index (int) **]**}

Node dictionary format: {*keys*: Predicate Type (str), *values*: Set **(** Entity Index (int)* **)**}  
**head and tail entity index added separately*

In [10]:
drkg_edge_dictionary={}
drkg_node_dictionary={}

ctrl_set = set()
for triple in drkg_triplets_list:
    src = triple[0]
    split_src=src.split('::')
    src_type=split_src[0]
    dest = triple[2]
    split_dest=dest.split('::')
    dest_type=split_dest[0]
    
    src_int_id=drkg_entity_dictionary[src_type][src]
    dest_int_id=drkg_entity_dictionary[dest_type][dest]
    
    pair=[(src_int_id,dest_int_id)]
    etype=triple[1]
    if etype in drkg_edge_dictionary:
        drkg_edge_dictionary[etype]+=pair
    else:
        drkg_edge_dictionary[etype]=pair
    if etype in drkg_node_dictionary:
        drkg_node_dictionary[etype].add(src_int_id)
        drkg_node_dictionary[etype].add(dest_int_id)
    else:
        drkg_node_dictionary[etype]=set()   
        drkg_node_dictionary[etype].add(src_int_id)
        drkg_node_dictionary[etype].add(dest_int_id)

In [11]:
print(f"Number of DRKG edge types: {len(drkg_edge_dictionary)}\n")
stats.predicate['DRKG'].append(f"Number of DRKG edge types: {len(drkg_edge_dictionary)}")

for k, v in sorted(drkg_edge_dictionary.items(), key=lambda x: len(x[1]), reverse=True):
    print(f"{k} - Freq: {len(v)}")

Number of DRKG edge types: 107

DRUGBANK::ddi-interactor-in::Compound:Compound - Freq: 1379271
Hetionet::GpBP::Gene:Biological Process - Freq: 559504
Hetionet::AeG::Anatomy:Gene - Freq: 526407
STRING::REACTION::Gene:Gene - Freq: 400426
STRING::CATALYSIS::Gene:Gene - Freq: 343533
STRING::BINDING::Gene:Gene - Freq: 315875
STRING::OTHER::Gene:Gene - Freq: 310690
Hetionet::Gr>G::Gene:Gene - Freq: 265672
Hetionet::GiG::Gene:Gene - Freq: 147164
Hetionet::CcSE::Compound:Side Effect - Freq: 138944
INTACT::PHYSICAL ASSOCIATION::Gene:Gene - Freq: 129318
INTACT::ASSOCIATION::Gene:Gene - Freq: 112390
Hetionet::AdG::Anatomy:Gene - Freq: 102240
Hetionet::AuG::Anatomy:Gene - Freq: 97848
Hetionet::GpMF::Gene:Molecular Function - Freq: 97222
Hetionet::GpPW::Gene:Pathway - Freq: 84372
STRING::ACTIVATION::Gene:Gene - Freq: 81355
Hetionet::GpCC::Gene:Cellular Component - Freq: 73566
Hetionet::GcG::Gene:Gene - Freq: 61690
bioarx::HumGenHumGen:Gene:Gene - Freq: 58094
GNBR::T::Compound:Disease - Freq: 54020


In [12]:
total = 0
temp_str = "All triplets including 'treats' relation in DRKG\n"
print("All triplets including 'treats' relation in DRKG\n")
for treats_pred in ['GNBR::T::Compound:Disease', 'DRUGBANK::treats::Compound:Disease','Hetionet::CtD::Compound:Disease']: # TODO : pred_match_drkg2dmdb('treats'):
    print(treats_pred,"\t", len(drkg_edge_dictionary[treats_pred]))
    total += len(drkg_edge_dictionary[treats_pred])
    temp_str += (f"{treats_pred} \t {len(drkg_edge_dictionary[treats_pred])}")
print("\nTotal: ", total)
stats.entity['DRKG'].append(temp_str)

# TODO : GNBR assumed treats

All triplets including 'treats' relation in DRKG

GNBR::T::Compound:Disease 	 54020
DRUGBANK::treats::Compound:Disease 	 4968
Hetionet::CtD::Compound:Disease 	 755

Total:  59743


### Predicate Set

For debugging/analyzing reoccuring predicate names in DRKG.

Original predicate format: **[source]::[predicate name]:[head entity type]:[tail entity type]**

OR **[source]::[predicate name]::[head entity type]:[tail entity type]**

In [13]:
# utils function
def remove_null_from_list(list):
    # Removes '' elements from list
    # Output for drkg preds: [source, predicate name, head_entity_type, tail_entity_type]
    return [x for x in list if x != '']

In [14]:
drkg_pred_set = set()

for triple in drkg_triplets_list:
    temp = triple[1].split(":")
    temp = remove_null_from_list(temp)
    pred_name = triple[1].split("::")[1].split(":")[0]
    drkg_pred_set.add(pred_name)

print("Number of unique names (string) in relations:", len(drkg_pred_set))

Number of unique names (string) in relations: 99


In [15]:
# DEBUG For cross-examining the predicate glossary
pred_glossary_path = "data\\drkg\\relation_glossary.tsv"

drkg_pred_set2 = set()
recur_set = set()

with open(pred_glossary_path, 'r') as f: 
    next(f)
    print("Reoccuring predicates\n")
    for line in f:
        pred_name = line.split('::')[1]
        if pred_name in drkg_pred_set2:
            print(f"'{pred_name}' from {line.split('::')[0]}")
            recur_set.add(pred_name)
        drkg_pred_set2.add(pred_name)


print("\nNumber of unique predicates pulled from relation glossary:", len(drkg_pred_set2))

Reoccuring predicates

'B' from GNBR
'E+' from GNBR
'E' from GNBR
'J' from GNBR
'ASSOCIATION' from INTACT
'DIRECT INTERACTION' from INTACT
'PHYSICAL ASSOCIATION' from INTACT
'OTHER' from STRING

Number of unique predicates pulled from relation glossary: 99


recur_set = ['B', 'E+', 'E', 'J', 'ASSOCIATION', 'DIRECT INTERACTION', 'PHYSICAL ASSOCIATION', 'OTHER']

### Triplets Set
Unique triplets set.

In [16]:
# DRKG unique triples
drkg_triplets_set = {tuple(triple) for triple in drkg_triplets_list}
print(f"Ratio of unique triplets: {len(drkg_triplets_set)} / {len(drkg_triplets_list)}")

Ratio of unique triplets: 5874258 / 5874261


In [17]:
print(type(drkg_triplets_set))

<class 'set'>


## Import DrugMechDB (with pandas)

Source: https://github.com/SuLab/DrugMechDB/blob/main/data_analysis/figures_DMDB_manuscript.ipynb

In [18]:
drugmech_path = "data/drugmech/drugmechdb.yaml"

with open(drugmech_path, 'r') as fh:
        ind = yaml.safe_load(fh)

In [19]:
from dmdb_data_tools_analysis import *
from collections import defaultdict

In [20]:
all_metapath_nodes = get_metapath_node(ind)
all_metapath_edges = get_metapath_edges(ind)

In [21]:
basic_stats = defaultdict(list)

all_metaedges = []
all_parings = []
all_targets = []
unique_metaedges = []
first_edge_type = []
all_nodes = []

id_to_name = {}
id_to_label = {}

for i, p in enumerate(ind):
    _id = (p["graph"]["_id"])
    drug_id, dis_id = path_to_tup(p)
    paths = get_all_paths(p)
    G = path_to_G(p)
    
    G = add_metaedges(G)
    G = add_meanode_pairs(G)
    
    basic_stats['idx'].append(i) #index
    basic_stats['id'].append(p['graph']['_id']) #DrugMechDB id
    basic_stats['drug'].append(drug_id) #Drug id
    basic_stats['disease'].append(dis_id)#Disease id
    basic_stats['nodes'].append((G.nodes)) #nodes in metapath
    basic_stats['n_nodes'].append(len(G.nodes)) # number of nodes in metapath
    basic_stats['n_edges'].append(len(G.edges)) #number of edges in metapath
    basic_stats['n_paths'].append(len(all_metapath_nodes[_id])) #number of paths
    basic_stats['metapath'].append(all_metapath_nodes[_id])
    basic_stats['metapath_with_edges'].append(all_metapath_edges[_id])

    
    this_metaedges = [G.edges[e]['metaedge'] for e in G.edges]
    
    all_metaedges += this_metaedges
    unique_metaedges += list(set(this_metaedges))
    
    all_parings += [G.edges[e]['mn_pair'] for e in G.edges]
    all_targets += get_targets(G)
    first_edge_type += get_target_metaedges(G)
    all_nodes += list(G.nodes)
    
    id_to_label = {**id_to_label, **get_id_to_type(G)}
    id_to_name = {**id_to_name, **get_id_to_name(G)}
    
dmdb_df = pd.DataFrame(basic_stats)

In [22]:
dmdb_df.tail()

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
4841,4841,DB01234_MESH_D009404_1,DB:DB01234,MESH:D009404,"(MESH:D003907, UniProt:P04150, InterPro:IPR001...",7,6,1,[Drug - Protein - GeneFamily - BiologicalProce...,[Drug - positively regulates - Protein - incre...
4842,4842,DB01234_MESH_C562390_1,DB:DB01234,MESH:C562390,"(MESH:D003907, UniProt:P04150, InterPro:IPR001...",7,6,1,[Drug - Protein - GeneFamily - BiologicalProce...,[Drug - positively regulates - Protein - incre...
4843,4843,DB01234_MESH_D000312_1,DB:DB01234,MESH:D000312,"(MESH:D003907, UniProt:P04150, GO:0006702, MES...",4,3,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - positively regulates - Protein - negat...
4844,4844,DB01234_MESH_D000224_1,DB:DB01234,MESH:D000224,"(MESH:D003907, UniProt:P04150, GO:0034651, GO:...",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - positively regulates - Protein - posit...
4845,4845,DB01234_MESH_D000309_1,DB:DB01234,MESH:D000309,"(MESH:D003907, UniProt:P04150, GO:0120178, MES...",4,3,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - positively regulates - Protein - posit...


In [23]:
tempset = defaultdict()
for idx, row in dmdb_df.iterrows():
    if row['n_paths'] not in tempset.keys():
        tempset[row['n_paths']] = 1
    else:
        tempset[row['n_paths']] += 1
tempset

defaultdict(None,
            {1: 4258, 2: 440, 3: 41, 5: 15, 0: 29, 4: 48, 6: 13, 21: 1, 10: 1})

### Entity Set

Is an entity dictionary necessary? 

In [24]:
dmdb_entity_set = set()

for idx, row in dmdb_df.iterrows():
    for n_id, ent in enumerate(list(row['nodes'])):
        dmdb_entity_set.add(ent)

print("Number of unique entities in DrugMechDB:", len(dmdb_entity_set))

Number of unique entities in DrugMechDB: 5128


In [25]:
dmdb_drugs_set = set()

for idx, row in dmdb_df.iterrows():
    if row['drug'] is not None:
        dmdb_drugs_set.add(row['drug'].split(':')[-1])
print("Number of unique drugs in DrugMechDB:", len(dmdb_drugs_set))

Number of unique drugs in DrugMechDB: 1579


In [26]:
common_db_drugs = len(drkg_db_drug_set.intersection(dmdb_drugs_set))

In [27]:
dmdb_drugs_set.difference(drkg_db_drug_set)

# TODO: fix this in code DB:DBDB08902, and other format mistakes

{'B02362',
 'DB010473',
 'DB01527',
 'DB05768',
 'DB09162',
 'DB11152',
 'DB11739',
 'DB12354',
 'DB13163',
 'DB14649',
 'DBDB08902',
 'DBSALT001045',
 'DBSALT001065'}

In [28]:
dmdb_entity_dictionary = {}   # Dictionary for entity stats

def insert_entry(entry,ent_type,dic):
    if ent_type not in dic:
        dic[ent_type]={}
    ent_n_id=len(dic[ent_type])
    if entry not in dic[ent_type]:
         dic[ent_type][entry]=ent_n_id
    return dic

for entry in ind:
    for node in entry['nodes']:
        src = node['id']
        src_type = node['label']
        insert_entry(src,src_type,dmdb_entity_dictionary)

In [29]:
# DEBUG entity dict
print("DrugMechDB Entity Summary\n---")
total = 0
for k,v in sorted(dmdb_entity_dictionary.items(), key=lambda x:len(x[1]), reverse=True):
    # print(k,v)
    string = "{"+ f"({len(v)}) '{k}' : " + "{"
    for enum,content in enumerate(v.items()):
        ent, eid = content
        string = string + f"{ent}-{eid}"
        if enum == 3: break
        string += ", "
    print(string + ", ... }}")
    total += len(v)
print("\nTotal number of unique DMDB entities:", total)

DrugMechDB Entity Summary
---
{(1644) 'Drug' : {MESH:D000068877-0, MESH:D000082-1, MESH:D001241-2, MESH:D009288-3, ... }}
{(788) 'Protein' : {UniProt:P00519-0, UniProt:P10721-1, UniProt:P16234-2, UniProt:P23219-3, ... }}
{(760) 'Disease' : {MESH:D015464-0, MESH:D034721-1, MESH:D010146-2, MESH:D005334-3, ... }}
{(694) 'BiologicalProcess' : {GO:0008283-0, GO:0001516-1, GO:0001659-2, GO:0006954-3, ... }}
{(359) 'ChemicalSubstance' : {MESH:D011453-0, MESH:D013928-1, MESH:D015774-2, MESH:C029371-3, ... }}
{(274) 'PhenotypicFeature' : {HP:0000969-0, HP:0000738-1, HP:0012531-2, HP:0100033-3, ... }}
{(166) 'OrganismTaxon' : {taxonomy:622-0, taxonomy:10358-1, taxonomy:2-2, taxonomy:786-3, ... }}
{(137) 'GeneFamily' : {TIGR:02074-0, InterPro:IPR000265-1, InterPro:IPR005446-2, InterPro:IPR006028-3, ... }}
{(122) 'GrossAnatomicalStructure' : {UBERON:0000955-0, UBERON:0002046-1, UBERON:0000007-2, UBERON:0000004-3, ... }}
{(98) 'MolecularActivity' : {GO:0003746-0, GO:0004969-1, GO:0030284-2, GO:0022

In [30]:
dmdb_entity_dictionary["Disease"].keys()

dict_keys(['MESH:D015464', 'MESH:D034721', 'MESH:D010146', 'MESH:D005334', 'MESH:D013927', 'MESH:D004405', 'MESH:D007634', 'MESH:D003586', 'MESH:D018805', 'MESH:D000073605', 'MESH:D014069', 'MESH:D011023', 'MESH:D016920', 'MESH:D013717', 'MESH:D014552', 'MESH:D006973', 'MESH:D009181', 'MESH:D012223', 'MESH:D011704', 'MESH:D013345', 'MESH:D006996', 'MESH:D001991', 'MESH:D012559', 'MESH:D017449', 'MESH:D015658', 'MESH:D008595', 'MESH:D003233', 'MESH:D016649', 'MESH:D006069', 'MESH:D000881', 'MESH:D011655', 'MESH:D008060', 'MESH:D007710', 'MESH:D065631', 'MESH:D015508', 'MESH:D004827', 'MESH:D003865', 'MESH:D000749', 'MESH:D015523', 'MESH:D000505', 'MESH:D007172', 'MESH:D018410', 'MESH:D003424', 'MESH:D012148', 'MESH:D010538', 'MESH:D012507', 'MESH:D012871', 'MESH:D005879', 'MESH:D000856', 'MESH:D009103', 'MESH:D006943', 'MESH:D006976', 'MESH:D005764', 'MESH:D059268', 'MESH:D004414', 'MESH:D008088', 'MESH:D010392', 'MESH:D013610', 'MESH:D009101', 'MESH:D013964', 'MESH:D005705', 'MESH:D004

In [31]:
dmdb_mesh_disease_set = set()
for elem in dmdb_entity_dictionary["Disease"].keys():
    if "MESH:D" in str(elem):
        dmdb_mesh_disease_set.add(elem)
# DB:DB00100 MESH:C000599709
len(dmdb_mesh_disease_set)

734

In [32]:
len(drkg_db_disease_set.intersection(dmdb_mesh_disease_set))

702

In [33]:
len(dmdb_mesh_disease_set.difference(drkg_db_disease_set))

32

### Predicate Set

TODO : How are there 29 samples with no relations or metapath?

In [34]:
dmdb_pred_set = set()

count = 0

def add_dmdb_preds(row):
    is_pred = False
    if row['metapath_with_edges'] == []:
        # print("flag", row)
        # print(row['nodes'])
        # count += 1
        return 1
    try:
        subj = row['metapath_with_edges'][0].split(" - ")
    except Exception as e:
        print(type(row['metapath_with_edges']))
        pass

    for elem in row['metapath_with_edges'][0].split(" - "):
        if is_pred:
            dmdb_pred_set.add(elem)
            is_pred = False
        else:
            is_pred = True

ret = dmdb_df.apply(add_dmdb_preds, axis=1)

for i in ret:
    if i == 1:
        count += 1

print("Number of unique predicates in DMDB:", len(dmdb_pred_set))
# print(dmdb_pred_set)
# TODO : print("\nNumber of samples with no relations or metapath:", count)


Number of unique predicates in DMDB: 56


In [35]:
# P.S DrugBank ID in drug, MESH ID in nodes
# 'metapath's are python lists with one element 
# Metapath indicates basic structure (e.g D - X - Di & D - Y - Di == Drug - Protein - Disease)

def compare_ids(row):
    return row['drug'] == list(row['nodes'])[0]

result = dmdb_df.apply(compare_ids, axis=1)
cnt = 0
for res in result:
    if res:
        cnt += 1
        # print(res)
print(f"Number of ['drug'] column drug IDs matching ['nodes'] column drug ID: {cnt}/{len(result)}\n")

# for elem in dmdb_df.loc[0,'metapath_with_edges'][0].split(" - "):   # metapath testing
#     print(elem)

Number of ['drug'] column drug IDs matching ['nodes'] column drug ID: 166/4846



### Triplets Set

In [None]:
# P.S These triplets do not include attribute triples with 'name' relation in DrugMechDB
dmdb_drug_disease_pairs = []
dmdb_triplets_list = []
dmdb_triplets_set = set()

counter = [0,0]
for entry in ind:
    counter[0] += 1
    # Drug - disease pair MESH 
    dmdb_drug_disease_pairs.append((entry['graph']['drug_mesh'], entry['graph']['disease_mesh']))

    for link in entry['links']:
        dmdb_triplets_set.add((link['source'], link['key'], link['target']))
        dmdb_triplets_list.append((link['source'], link['key'], link['target']))
        counter[1] += 1

print(f"Number of unique triples in DrugMechDB: {len(dmdb_triplets_set)}/{len(dmdb_triplets_list)} (out of {counter[0]} samples)")

Number of unique triples in DrugMechDB: 10791/32641 (out of 4846 samples)


# Analysis

## 0 - Notes

In [None]:
temp_set = set()    # Tax debug
for row in drkg_triplets_set:
    if 'Tax' in row[0]:
        temp_set.add(len(row[0].split(":")))
    elif 'Tax' in row[2]:
        temp_set.add(len(row[2].split(":")))
temp_set
# for i in drkg_entity_dictionary:
#     print(i)
# print()
# for j in dmdb_entity_dictionary:
#     print(j)

{3}

In [None]:
# Search all triplets including target ID
target_ID = "Side Effect::C0009763"
cnt = 0
for triplet in drkg_triplets_set:
    if target_ID == triplet[0] or target_ID == triplet[2]:
        print(triplet)
        cnt+=1
cnt

In [None]:
# TODO : edge dictionaries are triplet-focused right? So do one for dmdb - then we check github again for visuals cause drkg wont need it

## Step 1: Compare Entities

In [39]:
# No format transformation
common_entities_dict = {}
for e_type,dict in drkg_entity_dictionary.items():
    for elem,eid in dict.items():
        _, e_name = elem.split("::")     # TODO : This is just an integer for some classes
        # print(elem)
        if e_name in dmdb_entity_set:
            # print("Common element found: ", elem)
            if e_type in common_entities_dict:
                common_entities_dict[e_type]+=[str(e_name)]
            else:
                common_entities_dict[e_type]=[e_name]

for k,v in sorted(common_entities_dict.items(), key=lambda x: len(x[1]), reverse=True):
    print(f"({len(v)}) {k}  ", v[:5])

print("\nNumber of total common entities:", sum(len(k) for k in common_entities_dict.values()))

(724) Disease   ['MESH:D003550', 'MESH:D013167', 'MESH:D017497', 'MESH:D001171', 'MESH:D011565']
(501) Biological Process   ['GO:0006898', 'GO:0008015', 'GO:0006470', 'GO:0043647', 'GO:0001508']
(279) Compound   ['MESH:C026098', 'CHEBI:15356', 'CHEBI:15889', 'CHEBI:16134', 'CHEBI:16136']
(77) Molecular Function   ['GO:0004674', 'GO:0004115', 'GO:0016209', 'GO:0004857', 'GO:0003916']
(45) Anatomy   ['UBERON:0002110', 'UBERON:0000178', 'UBERON:0000955', 'UBERON:0001003', 'UBERON:0000992']
(27) Cellular Component   ['GO:0043025', 'GO:0005874', 'GO:0098793', 'GO:0032993', 'GO:0031012']

Number of total common entities: 1653


In [None]:
# No format transformation
common_entities_dict = {}
for e_type,dict in drkg_entity_dictionary.items():
    for elem,eid in dict.items():
        _, e_name = elem.split("::")     # TODO : This is just an integer for some classes
        # print(elem)
        if e_name in dmdb_entity_set:
            # print("Common element found: ", elem)
            if e_type in common_entities_dict:
                common_entities_dict[e_type]+=[str(e_name)]
            else:
                common_entities_dict[e_type]=[e_name]
        else:
            if elem.split("::")[0] == "Compound":
                print(elem)

# for k,v in sorted(common_entities_dict.items(), key=lambda x: len(x[1]), reverse=True):
#     print(f"({len(v)}) {k}  ", v)

# print("\nNumber of total common entities:", sum(len(k) for k in common_entities_dict.values()))

Compound::DB02573
Compound::DB05105
Compound::DB00244
Compound::DB00684
Compound::DB03118
Compound::DB03678
Compound::DB08715
Compound::DB04298
Compound::DB02972
Compound::DB03000
Compound::DB08459
Compound::DB00669
Compound::molport:MolPort-046-762-962
Compound::DB02102
Compound::DB07935
Compound::DB00997
Compound::DB04595
Compound::DB01087
Compound::DB01048
Compound::DB01732
Compound::DB00369
Compound::DB11575
Compound::DB01072
Compound::DB08379
Compound::DB08706
Compound::DB14122
Compound::DB08115
Compound::DB06817
Compound::DB04029
Compound::DB00391
Compound::brenda:207529
Compound::DB01243
Compound::DB08580
Compound::DB02683
Compound::DB08709
Compound::gtopdb:10100
Compound::bindingdb:50225285
Compound::pubchem:49866494
Compound::DB07910
Compound::brenda:6989
Compound::DB00694
Compound::DB08012
Compound::DB01880
Compound::brenda:102188
Compound::DB02704
Compound::DB06874
Compound::DB05961
Compound::brenda:61880
Compound::DB06408
Compound::DB09552
Compound::DB08281
Compound::DB0613

In [82]:
# Checking for original drug names with DrugBank ID - (DEBUG)
cnt = 0
common_debug_dict = {}
for e_type,dict in drkg_entity_dictionary.items():
    for elem,eid in dict.items():
        e_type, e_name = elem.split("::")     # TODO : This is just an integer for some classes
        # print(elem)
        if e_name in dmdb_drugs_set:
            # print("Common element found: ", elem)
            cnt += 1
            if e_type in common_debug_dict:
                common_debug_dict[e_type]+=[str(e_name)]
            else:
                common_debug_dict[e_type]=[e_name]

print("Number of hits with ['drugs'] (DrugBank IDs):", sum(len(v) for v in common_debug_dict.values()))

Number of hits with ['drugs'] (DrugBank IDs): 1566


### Pseudo-entity-alignment (with GPT 5 - Thinking - Deep Research mode)

Mapping Biomedical Concept Types (Group1 to Group2)

Below we map each concept type from Group 1 to the most semantically equivalent concept in Group 2, indicating whether the match is exact, broader, or narrower. Each mapping is justified with references to established biomedical ontologies (UMLS, Biolink Model, GO, etc.):

Genetic Entities

Gene → GeneFamily (narrower): The GeneFamily category represents a grouping of multiple genes or gene products with shared ancestry
biolink.github.io
. An individual gene is a member of such a family, making Gene a narrower concept under the broader GeneFamily category. In other words, a single gene is one element of a gene family.

Chemical/Drug Entities

Compound → ChemicalSubstance (exact): A chemical compound is essentially a chemical substance. For instance, in DBpedia’s ontology, ChemicalCompound is defined as a subclass of ChemicalSubstance
dbpedia.org
. Thus, Compound (a chemical compound) maps exactly to ChemicalSubstance – they refer to the same kind of entity (a chemical entity in biomedical context).

Atc → Drug (exact): ATC refers to the Anatomical Therapeutic Chemical classification system, which is a drug classification scheme. Each ATC code corresponds to a specific drug or a group of related drug substances
en.wikipedia.org
. Therefore, an Atc concept (an ATC-coded entity) is essentially a Drug. This is an exact match to the Drug category (with ATC being a coding scheme for drugs).

Pharmacologic Class → Drug (broader): A pharmacologic class is a group of drugs that share certain properties (such as mechanism of action or therapeutic effect). The FDA defines a pharmacologic class as “a group of drugs that share scientifically documented properties”
fda.gov
. Since Group 2 doesn’t list a separate “DrugClass” category, the best semantic fit is Drug. However, the relation is broader – Pharmacologic Class encompasses multiple drugs (it’s a higher-level grouping), whereas Drug can refer to individual drug entities.

Diseases and Phenotypes

Disease → Disease (exact): The concept of Disease maps directly and exactly to Disease. Both Group1 and Group2 use the same term, referring to a disorder or illness. For example, Biolink defines a disease as “a disorder of structure or function... that produces specific signs, phenotypes or symptoms”
biolink.github.io
, which is the standard meaning of a disease. This indicates an identical semantic category in both groups.

Symptom → PhenotypicFeature (narrower): A Symptom (clinical sign or symptom) is a type of PhenotypicFeature. In biomedical ontologies like HPO and Biolink, phenotypic features encompass all observable characteristics of an individual, including clinical signs and symptoms
johnsnowlabs.com
. Thus, Symptom is narrower than PhenotypicFeature – it represents one kind of phenotypic feature. (All symptoms are phenotypic features, but not all phenotypic features are symptoms.)

Side Effect → PhenotypicFeature (narrower): A Side Effect (an adverse effect of a drug) is also represented as a phenotypic outcome. In Biolink/Translator contexts, drug side effects are modeled as either diseases or phenotypic features, since they are secondary, undesirable clinical effects
biolink.github.io
. Many side effects manifest as symptoms or clinical phenotypes, so Side Effect fits under PhenotypicFeature. This is a narrower match (side effects are specific phenotypic features resulting from drug exposure).

Biological Processes & Functions

Biological Process → BiologicalProcess (exact): Biological Process corresponds exactly to BiologicalProcess. This concept refers to a series of biologically orchestrated events or functions. For example, the Gene Ontology defines a biological process as “one or more causally connected executions of molecular functions”
biolink.github.io
. The terms are essentially synonymous, so this is an exact match.

Molecular Function → MolecularActivity (exact): Molecular Function (as used in GO) is the same concept as MolecularActivity in Group 2. Biolink describes MolecularActivity as the execution of a molecular function by a gene product or complex
biolink.github.io
. In other words, it’s the biochemical activity that a molecule (like a protein) performs. This is an exact semantic match (different phrasing for the GO molecular function category).

Pathway → Pathway (exact): Pathway in Group1 maps exactly to Pathway in Group2, since they are identical concepts. A biological pathway is a series of interactions or actions among molecules leading to a certain product or change in a cell
genome.gov
. Both groups refer to this same idea of a biochemical or signaling pathway, making it an exact match.

Anatomy & Cellular Components

Anatomy → GrossAnatomicalStructure (exact): Anatomy (in this context) refers to anatomical entities at the gross (macroscopic) level, such as organs or tissues. This aligns with GrossAnatomicalStructure in Group2. The Biolink model, for instance, lists organ and tissue as aliases of GrossAnatomicalStructure
biolink.github.io
. Thus, Anatomy is an exact match to GrossAnatomicalStructure (both representing macroscopic anatomical structures).

Cellular Component → CellularComponent (exact): Cellular Component corresponds directly to CellularComponent (same term). Both denote a location or structural component of a cell (e.g., an organelle, membrane, or other part of a cell). The Gene Ontology defines a cellular component as “a location in or around a cell”
biolink.github.io
. Therefore, this mapping is exact – the concept is identical in both groups.

Taxonomy

Tax → OrganismTaxon (exact): Tax (short for taxonomy or taxon) maps to OrganismTaxon. OrganismTaxon denotes a taxonomic classification for organisms (for example, Homo sapiens is an organism taxon in the NCBI taxonomy hierarchy)
biolink.github.io
. This is an exact alignment: Tax represents an organism’s taxonomic group, which is precisely what OrganismTaxon is.

In [42]:
# DRKG entity types are keys, DMDB entity types are values (Gene was an erroneous type in DMDB)
etype_match_dict_old = {'Gene': 'Gene',
'Compound': 'ChemicalSubstance',
'Disease': 'Disease',
'Atc': 'Drug',
'Tax': 'OrganismTaxon',
'Biological Process': 'BiologicalProcess',
'Symptom': 'PhenotypicFeature',
'Anatomy': 'GrossAnatomicalStructure',
'Molecular Function': 'MolecularActivity',
'Pharmacologic Class': 'Drug',
'Cellular Component': 'CellularComponent',
'Pathway': 'Pathway',
'Side Effect': 'PhenotypicFeature'}

In [None]:
# DRKG entity types are keys, DMDB entity types are values
etype_match_dict = {'Gene': 'Protein',
'Compound': 'Drug',
'Disease': 'Disease',
#'Atc': 'Drug',
'Tax': 'OrganismTaxon',
'Biological Process': 'BiologicalProcess',
'Symptom': 'PhenotypicFeature',
'Anatomy': 'GrossAnatomicalStructure',
'Molecular Function': 'MolecularActivity',
'Pharmacologic Class': 'ChemicalSubstance',
'Cellular Component': 'CellularComponent',
'Pathway': 'Pathway'
#'Side Effect': 'PhenotypicFeature'
}

##### Testing LLM Entity Type Matches

In [None]:
for drkg_etype in drkg_entity_dictionary.keys():
    if drkg_etype in etype_match_dict:
        # print(drkg_etype, etype_match_dict[drkg_etype])
        dmdb_etype = etype_match_dict[drkg_etype]
        dmdb_dic = dmdb_entity_dictionary[dmdb_etype]
        drkg_dic = drkg_entity_dictionary[drkg_etype]
        print(len(dmdb_dic), dmdb_dic)
        print(len(drkg_dic), drkg_dic)
        #print()
        cnt = 0

        for sample in drkg_dic.keys():
            #print(drkg_dic.keys())
            sample = sample.split("::")[-1]
            if sample in dmdb_dic.keys():
                # print(sample)
                cnt+=1
        print(cnt, dmdb_etype, [key for key, val in etype_match_dict.items() if val == dmdb_etype])
        print()
        # No one-to-one matches found.
        # if set(dmdb_entity_dictionary[dmdb_etype].keys()).intersection(set(drkg_entity_dictionary[drkg_etype].keys())):
        #     temp_common = dmdb_entity_dictionary[dmdb_etype].keys().intersection(drkg_entity_dictionary[drkg_etype].keys())
        #     print(temp_common)

## Step 2: Compare Predicates

### Local LLM Outputs - underwhelming results

In [71]:
import subprocess

# model = 'deepseek-r1:32b'
# model = 'qwen3:32b'
model = 'llama3.1'

In [72]:
!ollama --version

ollama version is 0.11.4


In [73]:
# Basic LLM alignment of relationships
# An adapted version of the Autoalign entity type matching prompt template

prompt = f"Now you are an expert in biomedicine, linguistics and knowledge graphs. \
    I will give you two sets of words, indicating the predicate types from two knowledge graphs. \
    You need to identify all the word pairs from the two sets that are semantic synonyms. For example, \
    if the first set has the word ‘people’ and the second set has the word 'person', you need to \
    identify the two words being synonyms and return me the pair (people, person). Now the following \
    are the two sets: Set 1: {drkg_pred_set} Set 2: {dmdb_pred_set} Please return all the pairs that are synonyms \
    from the two sets regarding predicate types. Do not output the pairs if they are exactly the same. \
    Remember you only need to return the pairs, each pair in one line. Each pair contain two types, \
    one from Set 1 and another from Set 2, in the format (type1, type2)."


with open(pred_glossary_path, 'r') as f: next(f); drkg_pred_gloss_str = f.read()
drkg_pred_gloss_str = drkg_pred_gloss_str.expandtabs()

In [74]:
final_prompt = prompt + "\n Below is a glossary that corresponds to Set 1. Utilize the information to optimize the matching process.\n" + drkg_pred_gloss_str

In [75]:
# result = subprocess.run(
#     ["ollama", "run", model],
#     input=final_prompt,
#     capture_output=True,
#     text=True,
#     check=True
# )

# if result.stdout:
#     print("Output:\n", result.stdout)

### GPT-o3 (Deep Research Mode) Output 
Note: There are two reoccuring DRKG predicates: "Y" and "GpMF". Also the predicates that are identical in name are not yet included.
"Y" : "affects risk for" and "predisposes".
"GpMF" : "enables" and "capable of".


General Regulation

    (MODULATOR, regulates)

    (Gr>G, regulates)

    (Rg, regulates)

    (E, regulates)

Positive Regulation / Activation

    (ACTIVATOR, increases activity of)

    (AGONIST, increases activity of)

    (POSITIVE ALLOSTERIC MODULATOR, increases activity of)

    (ACTIVATION, positively regulates)

    (A+, increases activity of)

    (V+, increases activity of)

    (W, positively regulates)

Negative Regulation / Inhibition

    (ANTAGONIST, decreases activity of)

    (BLOCKER, decreases activity of)

    (CHANNEL BLOCKER, decreases activity of)

    (INHIBITOR, decreases activity of)

    (INHIBITION, decreases activity of)

    (A-, negatively regulates)

    (N, decreases activity of)

Gene Expression Changes (Upregulation/Downregulation)

    (E+, increases expression of)

    (E-, decreases expression of)

    (CuG, increases expression of)

    (CdG, decreases expression of)

    (AuG, increases expression of)

    (AdG, decreases expression of)

    (DuG, increases expression of)

    (DdG, decreases expression of)

    (EXPRESSION, positively correlated with)

Physical Binding and Association

    (BINDING, physically interacts with)

    (BINDER, physically interacts with)

    (PHYSICAL ASSOCIATION, physically interacts with)

    (DIRECT INTERACTION, physically interacts with)

    (ANTIBODY, physically interacts with)

    (B, physically interacts with)

Molecular and Generic Interactions

    (HumGenHumGen, molecularly interacts with)

    (GiG, interacts with)

    (DrugHumGen, interacts with)

    (DrugVirGen, interacts with)

    (VirGenHumGen, interacts with)

    (ddi-interactor-in, interacts with)

    (target, interacts with)

Therapeutic and Disease Influence Relations

    (T, treats)

    (CtD, treats)

    (treats, treats)

    (CpD, ameliorates)

    (Pa, ameliorates)

    (Sa, causes)

    (U, causes)

    (G, exacerbates)

    (J, contributes to)

    (Pr, prevents)

    (Y, affects risk for)

    (Y, predisposes)

Similarity Relations

    (CrC, chemically similar to)

    (DrD, similar to)

Taxonomic Classification

    (in_tax, in taxon)

Gene Function and Participation

    (GpBP, participates in)

    (GpCC, participates in)

    (GpPW, actively involved in)

    (GpMF, enables)

    (GpMF, capable of)



In [45]:
llm_pred_pairs = [("MODULATOR", "regulates"),
  ("Gr>G", "regulates"), 
  ("Rg", "regulates"), 
  ("E", "regulates"),
  ("ACTIVATOR", "increases activity of"),
  ("AGONIST", "increases activity of"),
  ("POSITIVE ALLOSTERIC MODULATOR", "increases activity of"),
  ("ACTIVATION", "positively regulates"),
  ("A+", "increases activity of"),
  ("V+", "increases activity of"),
  ("W", "positively regulates"),
  ("ANTAGONIST", "decreases activity of"),
  ("BLOCKER", "decreases activity of"),
  ("CHANNEL BLOCKER", "decreases activity of"),
  ("INHIBITOR", "decreases activity of"),
  ("INHIBITION", "decreases activity of"),
  ("A-", "negatively regulates"),
  ("N", "decreases activity of"),
  ("E+", "increases expression of"),
  ("E-", "decreases expression of"),
  ("CuG", "increases expression of"),
  ("CdG", "decreases expression of"),
  ("AuG", "increases expression of"),
  ("AdG", "decreases expression of"),
  ("DuG", "increases expression of"),
  ("DdG", "decreases expression of"),
  ("EXPRESSION", "positively correlated with"),
  ("BINDING", "physically interacts with"),
  ("BINDER", "physically interacts with"),
  ("PHYSICAL ASSOCIATION", "physically interacts with"),
  ("DIRECT INTERACTION", "physically interacts with"),
  ("ANTIBODY", "physically interacts with"),
  ("B", "physically interacts with"),
  ("HumGenHumGen", "molecularly interacts with"),
  ("GiG", "interacts with"),
  ("DrugHumGen", "interacts with"),
  ("DrugVirGen", "interacts with"),
  ("VirGenHumGen", "interacts with"),
  ("ddi-interactor-in", "interacts with"),
  ("target", "interacts with"),
  ("T", "treats"),
  ("CtD", "treats"),
  ("treats", "treats"),
  ("CpD", "ameliorates"),
  ("Pa", "ameliorates"),
  ("Sa", "causes"),
  ("U", "causes"),
  ("G", "exacerbates"),
  ("J", "contributes to"),
  ("Pr", "prevents"),
  ("Y", "affects risk for"),
  ("Y", "predisposes"),
  ("CrC", "chemically similar to"),
  ("DrD", "similar to"),
  ("in_tax", "in taxon"),
  ("GpBP", "participates in"),
  ("GpCC", "participates in"),
  ("GpPW", "actively involved in"),
  ("GpMF", "enables"),
  ("GpMF", "capable of")
]

In [48]:
test_set = {elem[1] for elem in llm_pred_pairs}

print(f"From all given unique predicates (DRKG:{len(drkg_pred_set)} DrugMech:{len(dmdb_pred_set)}) - {len(llm_pred_pairs)} for DRKG to {len(test_set)} for DrugMechDB matched.")

From all given unique predicates (DRKG:99 DrugMech:56) - 60 for DRKG to 26 for DrugMechDB matched.


### Predicate Conversion Functions

In [None]:
#Predicate matching dictionary: {"drkg_pred_1": "dmdb_pred_1"}
# Have a llm_pred_pairs dictionary prepared with the desired method.

pred_match_dict = {k:v for (k,v) in llm_pred_pairs}
print("DEBUG", pred_match_dict["U"])

DEBUG causes


## Step 3: Compare Pairs & Triples

### Acelerated Entity Pairing

Improved matching speed with ahocorasick library, a Python implementation of the Aho–Corasick algorithm.

In [None]:
import ahocorasick
from builtins import tuple

H, T = defaultdict(list), defaultdict(list)
for i, (s, p, o) in enumerate(dmdb_triplets_set):
    H[s].append(i)
    T[o].append(i)

A = ahocorasick.Automaton()
for k, v in H.items():
    A.add_word(k, tuple(v))
A.make_automaton()

B = ahocorasick.Automaton()
for k, v in T.items():
    B.add_word(k, tuple(v))
B.make_automaton()

common_entity_pair_hits = []
ai, bi = A.iter, B.iter
for (s_drkg, p_drkg, o_drkg) in drkg_triplets_set:
    Hhit = {i for _, idxs in ai(str(s_drkg)) for i in idxs}
    Thit = {i for _, idxs in bi(str(o_drkg)) for i in idxs}
    for i in Hhit.intersection(Thit):
        common_entity_pair_hits.append((i, s_drkg, p_drkg, o_drkg))
common_entity_pair_hits

[(859,
  'Compound::CHEBI:36500',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D005776'),
 (7892,
  'Compound::MESH:D019343',
  'GNBR::Sa::Compound:Disease',
  'Disease::MESH:D010146'),
 (4480,
  'Compound::MESH:D000728',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D000152'),
 (87,
  'Compound::MESH:D013963',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D006980'),
 (5603,
  'Compound::MESH:D004967',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D059268'),
 (1020,
  'Compound::MESH:D004967',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D059268'),
 (466,
  'Compound::MESH:D004967',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D005348'),
 (3202,
  'Compound::MESH:D011453',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D004412'),
 (625,
  'Compound::MESH:D004967',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D010024'),
 (10212,
  'Compound::CHEBI:28044',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D010661'),
 (4401,
  'Compound::MESH:D014280',
  'GNBR::T::Compoun

In [None]:
"Number of matched triplets: " + len(common_entity_pair_hits)

### Predicate-wise Comparison

Utilizes the looping of prior section. Predicates of matches stored and converted here for cross-examination.

In [50]:
# utils function
def get_drkg_pred_name(pred):
    # Input: pred: DRKG native predicate
    # Output: Predicate name string
    return remove_null_from_list(pred.split(":"))[1]

In [None]:
# Common triplet search
for s_dmdb, p_dmdb, o_dmdb in dmdb_triplets_set:
    for _, s_drkg, p_drkg, o_drkg in common_entity_pair_hits:
        if s_dmdb in s_drkg and o_dmdb in o_drkg:
            if p_dmdb == pred_match_dict[get_drkg_pred_name(p_drkg)]:
                print(s_drkg, p_drkg, p_dmdb, o_drkg)

Compound::MESH:D004967 GNBR::T::Compound:Disease treats Disease::MESH:D010024


### **Manual Improvements in matching common elements**

#### Tax == Taxonomy
DRKG format: Tax::10090
DrugMechDB format: taxonomy:622

In [83]:
# DMDB ("taxonomy") to DRKG ("Tax")
dmdb_tax_dict = defaultdict(list)
C = ahocorasick.Automaton()

for i, (s, p, o) in enumerate(dmdb_triplets_set):
    if 'taxonomy' in s:
        recon = f'Tax::{s.split(':')[-1]}'
        dmdb_tax_dict[recon].append(i)
    if 'taxonomy' in o:
        recon = f'Tax::{o.split(':')[-1]}'
        dmdb_tax_dict[recon].append(i)

for tax_e, idx in dmdb_tax_dict.items():
    C.add_word(tax_e, tuple(idx))
C.make_automaton()

ci = C.iter

tax_hits = []
for (s_drkg, p_drkg, o_drkg) in drkg_triplets_set:
    Hhit = {i for _, idxs in ci(str(s_drkg)) for i in idxs}
    Thit = {i for _, idxs in ci(str(o_drkg)) for i in idxs}
    for i in (Hhit | Thit):
        tax_hits.append((i, p_drkg, s_drkg, o_drkg))
# tax_hits
len(tax_hits)

10375

In [84]:
etype_match_dict

{'Gene': 'Protein',
 'Compound': 'Drug',
 'Disease': 'Disease',
 'Atc': 'Drug',
 'Tax': 'OrganismTaxon',
 'Biological Process': 'BiologicalProcess',
 'Symptom': 'PhenotypicFeature',
 'Anatomy': 'GrossAnatomicalStructure',
 'Molecular Function': 'MolecularActivity',
 'Pharmacologic Class': 'ChemicalSubstance',
 'Cellular Component': 'CellularComponent',
 'Pathway': 'Pathway',
 'Side Effect': 'PhenotypicFeature'}

In [None]:
# Re-matching common entity pairs with Tax transformation
import ahocorasick
from builtins import tuple

dmdb_tax_H, dmdb_tax_T = defaultdict(list), defaultdict(list)

H, T = defaultdict(list), defaultdict(list)
for i, (s, p, o) in enumerate(dmdb_triplets_set):
    H[s].append(i)
    T[o].append(i)

    # Manual Format Rules #
    if 'taxonomy' in s:
        recon = f'Tax::{s.split(':')[-1]}'
        dmdb_tax_H[recon].append(i)
    if 'taxonomy' in o:
        recon = f'Tax::{o.split(':')[-1]}'
        dmdb_tax_T[recon].append(i)

A = ahocorasick.Automaton()
for k, v in H.items():
    A.add_word(k, tuple(v))
for tax_e, idx in dmdb_tax_H.items():
    A.add_word(tax_e, tuple(idx))
A.make_automaton()

B = ahocorasick.Automaton()
for k, v in T.items():
    B.add_word(k, tuple(v))
for tax_e, idx in dmdb_tax_T.items():
    B.add_word(tax_e, tuple(idx))
B.make_automaton()

common_entity_pair_hits = []
ai, bi = A.iter, B.iter
for (s_drkg, p_drkg, o_drkg) in drkg_triplets_set:
    Hhit = {i for _, idxs in ai(str(s_drkg)) for i in idxs}
    Thit = {i for _, idxs in bi(str(o_drkg)) for i in idxs}
    for i in Hhit.intersection(Thit):
        common_entity_pair_hits.append((i, s_drkg, p_drkg, o_drkg))
common_entity_pair_hits

[(3768,
  'Compound::MESH:D010710',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D010024'),
 (10072,
  'Compound::MESH:D004967',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D010024'),
 (4892,
  'Compound::CHEBI:28044',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D010661'),
 (1567,
  'Compound::MESH:D014280',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D006949'),
 (10126,
  'Compound::MESH:D001647',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D002769'),
 (2746,
  'Compound::MESH:D014280',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D015228'),
 (4799,
  'Compound::MESH:D014280',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D015228'),
 (1329,
  'Compound::CHEBI:36500',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D005776'),
 (4993,
  'Compound::MESH:D000728',
  'GNBR::T::Compound:Disease',
  'Disease::MESH:D000152'),
 (10088,
  'Compound::MESH:D011453',
  'GNBR::Sa::Compound:Disease',
  'Disease::MESH:D010146'),
 (10726,
  'Compound::MESH:D004083',
  'GNBR::T:

In [None]:
len(common_entity_pair_hits)

20

## Step 4: Synonym Comparison & Pairing

In [None]:
# ! python -m pip install requests

In [None]:
# NodeNormalization Requests #
# Sample ID Requests #
#
# result = requests.get('https://nodenormalization-sri.renci.org/get_normalized_nodes',
#                      params={'curie':"MESH:D014867"})
# result = json.dumps( result.json(), indent = 2)
# result = json.loads(result)
# print(type(result))

# Sample Multi ID Request
# 
# result = requests.post('https://nodenormalization-sri.renci.org/get_normalized_nodes',
#                      json={"curies":["HP:0007354", "HGNC:613", "CURIE:NOTHING"]})
# print( json.dumps( result.json(), indent = 2))

In [40]:
import json 
import requests

def synonym_request(ID='MESH:D014867', out_dict=True):
    # Input: Identifier in database
    # Returns all corresponding IDs from onthologies
    try:
        result = requests.get('https://nodenormalization-sri.renci.org/get_normalized_nodes',
                            params={'curie':f"{ID}"})
        result = json.dumps(result.json(), indent=2)
        if out_dict:
            return request2dict(json.loads(result))
        else:
            return json.loads(result)
    except:
        print(f"The queried element ({ID}) is not available in database.")
        
def synonym_list_request(ID_list):
    # Input: List of identifiers in database
    # Returns dictionary of all corresponding IDs from onthologies
    
    try:
        result = requests.post('https://nodenormalization-sri.renci.org/get_normalized_nodes',
                               json={"curies":ID_list})
        result = json.dumps(result.json(), indent=2)
        return request2dict(json.loads(result))
    except:
        print("An element in the list is not available in the database")

def request2dict(result):
    # Input: Identifier synonyms in database
    # Returns dictionary of synonym identifiers
    tempdict = {}
    for id,v in result.items():
        if v is not None:
            temp_set = set()
            for ide_dict in v['equivalent_identifiers']:
                temp_set.add(ide_dict['identifier'])
            tempdict.update({id:temp_set})
    # print(len(list(tempdict.values())[0]))
    # print(len(tempdict.values()), "is the length of result in req2dict")
    return tempdict

**TODO :** Not all DRKG entities have neat IDs, and thus, have normalization. Reanalyze DRKG for entity types compatible with transformation. Afterwards, check formats since we aren't performing partial string comparison...

### Group Synonym Requests 

Original ID is included in synonym values.

#### DrugMechDB  Synonym Requests

In [44]:
dmdb_synonyms_dict = defaultdict()

for etype, edict in dmdb_entity_dictionary.items():
    print("Processing entity class:", etype)
    result = synonym_list_request(list(edict.keys()))
    print(f"Number of Matching IDs:\t{len(result)}/{len(edict)}\n")
    dmdb_synonyms_dict.update(result)

print("Number of IDs with found synonyms:", len(dmdb_synonyms_dict))

Processing entity class: Drug
Number of Matching IDs:	1555/1644

Processing entity class: Protein
Number of Matching IDs:	7/788

Processing entity class: Disease
Number of Matching IDs:	756/760

Processing entity class: BiologicalProcess
Number of Matching IDs:	653/694

Processing entity class: ChemicalSubstance
Number of Matching IDs:	321/359

Processing entity class: Pathway
Number of Matching IDs:	0/81

Processing entity class: GrossAnatomicalStructure
Number of Matching IDs:	122/122

Processing entity class: MolecularActivity
Number of Matching IDs:	95/98

Processing entity class: OrganismTaxon
Number of Matching IDs:	0/166

Processing entity class: GeneFamily
Number of Matching IDs:	7/137

Processing entity class: CellularComponent
Number of Matching IDs:	41/45

Processing entity class: PhenotypicFeature
Number of Matching IDs:	269/274

Processing entity class: Cell
Number of Matching IDs:	37/37

Processing entity class: MacromolecularComplex
Number of Matching IDs:	2/2

Number of

#### DRKG Synonym Requests

In [127]:
drkg_synonyms_dict = defaultdict()

for etype, edict in drkg_entity_dictionary.items():
    print("Processing entity class: ", etype)
    elist = [x.split("::")[-1] for x in edict.keys()]
    result = synonym_list_request(elist)
    print(f"Number of Matching IDs: {len(result)}/{len(edict)}\n")
    drkg_synonyms_dict.update(result)

print("Number of IDs with found synonyms:", len(drkg_synonyms_dict))

Processing entity class:  Gene
Number of Matching IDs: 0/39220

Processing entity class:  Compound
Number of Matching IDs: 8660/24313

Processing entity class:  Disease
Number of Matching IDs: 4973/5103

Processing entity class:  Atc
Number of Matching IDs: 0/4048

Processing entity class:  Tax
Number of Matching IDs: 0/215

Processing entity class:  Biological Process
Number of Matching IDs: 10207/11381

Processing entity class:  Symptom
Number of Matching IDs: 0/415

Processing entity class:  Anatomy
Number of Matching IDs: 400/400

Processing entity class:  Molecular Function
Number of Matching IDs: 2622/2884

Processing entity class:  Pharmacologic Class
Number of Matching IDs: 0/345

Processing entity class:  Cellular Component
Number of Matching IDs: 1291/1391

Processing entity class:  Pathway
Number of Matching IDs: 0/1822

Processing entity class:  Side Effect
Number of Matching IDs: 0/5701

Number of IDs with found synonyms: 28150


#### Analysis

In [None]:
dmdb_syn_stats = [0,0]
loners = []

for k,v in dmdb_synonyms_dict.items():
    dmdb_syn_stats[1] += len(v)
    if len(v) == 1:
        loners.append(k)
        dmdb_syn_stats[0] += 1

drkg_syn_stats = [0,0]

for k,v in drkg_synonyms_dict.items():
    drkg_syn_stats[1] += len(v)
    if len(v) == 1:
        loners.append(k)
        drkg_syn_stats[0] += 1

print(f"DrugMechDB synonyms found: {dmdb_syn_stats[1]} for {len(dmdb_synonyms_dict)} original entities ({dmdb_syn_stats[0]} exist without synonyms)")
print(f"DRKG synonyms found: {drkg_syn_stats[1]} for {len(drkg_synonyms_dict)} original entities ({drkg_syn_stats[0]} exist without synonyms)")
print(drkg_synonyms_dict[loners[421]])
print(synonym_request(list(drkg_synonyms_dict[loners[421]]).pop()))

DrugMechDB synonyms found: 37936 for 3790 original entities (421 exist without synonyms)
DRKG synonyms found: 139247 for 28150 original entities (5673 exist without synonyms)
['RHEA:58100']
{'RHEA:58100': {'RHEA:58100'}}


### Test  0 synonym entity classes

TODO Figure out if they do not exist or it is a formatting issue. 

In [None]:
# Synonym dictionary samples
for i,dict in enumerate(dmdb_synonyms_dict.items()):
    k,v = dict
    print(k,v)
    if i == 10: break
print() 
for i,dict in enumerate(drkg_synonyms_dict.items()):
    k,v = dict
    print(k,v)
    if i == 10: break

### Matching with Synonyms 

In [None]:
# DrugMechDB Number of IDs with found synonyms: 3790 - synonyms found: 37936  (421 exist without synoynms)
# DRKG Number of IDs with found synonyms: 28150 - synonyms found: 139247  (5673 exist without synoynms)

def crosskg_matches(dmdb_synonyms_dict, drkg_synonyms_dict):
    # Build inverted index : identifier -> {DRKG original IDs}
    identifier_to_drkg = defaultdict()
    for drkg_orig, id_set in drkg_synonyms_dict.items():
        for identifier in id_set:
            identifier_to_drkg[identifier] = drkg_orig

    # For each DMDB original ID, match DRKG originals sharing any identifier
    #dmdb_to_drkg = defaultdict(set)
    dmdb_to_drkg = defaultdict()
    for dmdb_orig, dmdb_id_set in dmdb_synonyms_dict.items():
        for dmdb_syn_ide in dmdb_id_set:
            if dmdb_syn_ide in identifier_to_drkg:
                #dmdb_to_drkg[dmdb_orig].add(identifier_to_drkg[dmdb_syn_ide])
                dmdb_to_drkg[dmdb_orig] = identifier_to_drkg[dmdb_syn_ide]

    return dmdb_to_drkg

# Returns dictionary - {[DMDB ID]:[DRKG ID]}
synonym_matches = crosskg_matches(dmdb_synonyms_dict, drkg_synonyms_dict)
print(len(synonym_matches))

1834


In [None]:
count = [0,0]
for k,v in synonym_matches.items():
    if k == v:
        count[0] += 1
    else:
        count[1] += 1
    
print(count)


[1550, 284]


In [151]:
type_dict = {}

debug_bio = []

for dmdb_id,drkg_id in synonym_matches.items():
    # for ent_type, ent_dict in dmdb_entity_dictionary.items():
    #     if dmdb_id in ent_dict:
    #         #print(dmdb_id, "in DMDB", ent_type)
    #         if ent_type in type_dict:
    #             type_dict[ent_type] += 1
    #         else:
    #             type_dict[ent_type] = 1

    # TODO : FIX - drkg_id in synonyms is not directly applicable to drkg_ dictionaries 
    #              due to format change in NodeNormalization requests ("[source]::" removed)
    #
    #
    # for ent_type, ent_dict in drkg_entity_dictionary.items():
    #     if drkg_id in ent_dict:
    #         print(drkg_id, "in DRKG", ent_type)
    #         if ent_type in type_dict:
    #             type_dict[ent_type] += 1
    #         else:
    #             type_dict[ent_type] = 1

    for ent_type, ent_dict in drkg_entity_dictionary.items():
        for e_id in ent_dict.keys():
            if drkg_id in e_id:
                #print(drkg_id, "in DRKG", ent_type)
                if ent_type in type_dict:
                    type_dict[ent_type] += 1
                else:
                    type_dict[ent_type] = 1

                # DEBUG
                if ent_type == "Biological Process":
                    debug_bio.append(e_id)


for k,v in sorted(type_dict.items(), key=lambda x: x[1], reverse=True):
    print(f"({v}) {k}")

(857) Disease
(483) Biological Process
(346) Compound
(77) Molecular Function
(47) Anatomy
(27) Cellular Component


For comparison, the regular entity matching results (in DRKG Entity Classes)

(724) Disease  
(501) Biological Process  
(279) Compound  
(77) Molecular Function  
(45) Anatomy  
(27) Cellular Component

In [None]:
# for cent in common_entities_dict["Biological Process"]:
#     cent = cent.split("::")
#     if cent in debug_bio:
#         print(cent)

In [None]:
# DrugMechDB synonyms found: 37936  (421 exist without synoynms)
# DRKG synonyms found: 139247  (5673 exist without synoynms)
# TODO : Discrepancy of 90 matches between methods

# def crosskg_matches(dmdb_synonyms_dict, drkg_synonyms_dict):
#     # Build inverted index : identifier -> {DRKG original IDs}
#     identifier_to_drkg = defaultdict()
#     for drkg_orig, id_set in drkg_synonyms_dict.items():
#         for identifier in id_set:
#             identifier_to_drkg[identifier] = drkg_orig

#     # For each DMDB original ID, match DRKG originals sharing any identifier
#     dmdb_to_drkg = defaultdict(set)
#     for dmdb_orig, dmdb_id_set in dmdb_synonyms_dict.items():
#         for dmdb_syn_ide in dmdb_id_set:
#             if dmdb_syn_ide in identifier_to_drkg:
#                 # dmdb_to_drkg[dmdb_orig].add(identifier_to_drkg[dmdb_syn_ide])
#                 # Dictionary format: {[Matched item]:([DMDB Original ID],[DRKG Original ID])}
#                 dmdb_to_drkg[dmdb_syn_ide] = (dmdb_orig, identifier_to_drkg[dmdb_syn_ide])
#                 break

#     return dmdb_to_drkg

# synonym_matches = crosskg_matches(dmdb_synonyms_dict, drkg_synonyms_dict)
# print(len(synonym_matches))

1744


In [None]:
# tempset = set()
# for i,dict in enumerate(synonym_matches.items()):
#     k,v = dict
#     dmdb_s, drkg_s = v
#     if not dmdb_s == drkg_s:
#         print(k,v)
#         tempset.add(v)
        
# print(len(tempset))
# synonym_request("MESH:D000068877")

### Synonym DrugMech's Drug-Disease Pair Matching

In [29]:
# 
for i in dmdb_drug_disease_pairs:
    print(i)
    break

('MESH:D000068877', 'MESH:D015464')


In [None]:
# Re-matching common entity pairs with Tax transformation
import ahocorasick
from builtins import tuple

dmdb_tax_H, dmdb_tax_T = defaultdict(list), defaultdict(list)
dmdb_syn_H, dmdb_syn_T = defaultdict(list), defaultdict(list)

H, T = defaultdict(list), defaultdict(list)
for i, (s, p, o) in enumerate(dmdb_triplets_set):

    H[s].append(i)
    T[o].append(i)

    # Manual Format Rules #
    if 'taxonomy' in s:
        recon = f'Tax::{s.split(':')[-1]}'
        dmdb_tax_H[recon].append(i)
    if 'taxonomy' in o:
        recon = f'Tax::{o.split(':')[-1]}'
        dmdb_tax_T[recon].append(i)

    if s in synonym_matches:
        dmdb_syn_H[synonym_matches[s]].append(i)
    if o in synonym_matches:
        dmdb_syn_T[synonym_matches[o]].append(i)
    #########

A = ahocorasick.Automaton()
for k, v in H.items():
    A.add_word(k, tuple(v))
#
for tax_e, idx in dmdb_tax_H.items():
    A.add_word(tax_e, tuple(idx))
for syn_e, idx in dmdb_syn_H.items():
    A.add_word(syn_e, tuple(idx))
A.make_automaton()

B = ahocorasick.Automaton()
for k, v in T.items():
    B.add_word(k, tuple(v))
#
for tax_e, idx in dmdb_tax_T.items():
    B.add_word(tax_e, tuple(idx))
for syn_e, idx in dmdb_syn_T.items():
    B.add_word(syn_e, tuple(idx))
B.make_automaton()

syn_common_entity_pair_hits = []
ai, bi = A.iter, B.iter
for (s_drkg, p_drkg, o_drkg) in drkg_triplets_set:
    Hhit = {i for _, idxs in ai(str(s_drkg)) for i in idxs}
    Thit = {i for _, idxs in bi(str(o_drkg)) for i in idxs}
    for i in Hhit.intersection(Thit):
        syn_common_entity_pair_hits.append((i, s_drkg, p_drkg, o_drkg))
syn_common_entity_pair_hits

In [None]:
len(syn_common_entity_pair_hits)

24

In [None]:
for tup in syn_common_entity_pair_hits:
    if tup not in common_entity_pair_hits:
        print(tup)

(7508, 'Disease::MESH:D003866', 'Hetionet::DrD::Disease:Disease', 'Disease::DOID:5419')
(6213, 'Disease::MESH:D003866', 'Hetionet::DrD::Disease:Disease', 'Disease::DOID:5419')
(9642, 'Compound::MESH:D011453', 'GNBR::Sa::Compound:Disease', 'Disease::MESH:D010146')
(7564, 'Compound::MESH:D011453', 'GNBR::Sa::Compound:Disease', 'Disease::MESH:D010146')


# Visualization
Visualization template provided by Rishabh Jakhar.

In [105]:
from pyvis.network import Network
from IPython.display import IFrame

def plot_pyvis_network(G, labeled_nodes=None, notebook=True, output_file="graph.html"):
    """
    Plot a network graph using PyVis with optimized handling of labeled nodes.
    
    Parameters:
    -----------
    G : networkx.Graph
        The graph to visualize
    labeled_nodes : list, optional
        List of node IDs that should display labels
    notebook : bool, default=True
        Whether to render in a Jupyter notebook
    output_file : str, default="graph.html"
        Path to save the HTML output
        
    Returns:
    --------
    IFrame or None
        IFrame object if in notebook mode, otherwise None
    """
    net = Network(
        height="750px", 
        width="100%", 
        directed=True, 
        notebook=notebook, 
        cdn_resources="in_line"
    )
    
    # Define node colors by type
    node_color_map = {
        "drug": "#00817a",
        "indication": "#e69138",
        "protein": "#b71c1c",
        "biological_function": "#6d9eeb",
    }
    
    # Add nodes to the network
    
    for node in G.nodes():
        ntype = G.nodes[node].get("type", "unknown")
        
        # Determine if this is a labeled node
        is_labeled = labeled_nodes and node in labeled_nodes
        
        # Set node properties based on whether it's labeled
        size = 120 if is_labeled else 30
        
        # Set tooltip for all nodes
        title = f"""{G.nodes[node].get("label", "")} ({node})"""
        
        # For unlabeled nodes, use a space character with font size 0 to effectively hide labels
        # Using None or empty string would show the node ID
        if is_labeled:
            label = f"""{G.nodes[node].get("name", "")} ({node})"""
            font_settings = {'size': 200, 'strokeWidth': 2, 'strokeColor': 'white'}
            physics = True
            mass = 10
        else:
            label = " "  # Space character instead of None
            font_settings = {'size': 0}  # Size 0 makes it invisible
            physics = True
            mass = .5
        
        # Add the node with configured properties
        net.add_node(
            node,
            label=label,
            title=title,
            color=node_color_map.get(ntype, "gray"),
            size=size,
            font=font_settings,
            shape="dot", 
            physics=physics, 
            mass=mass
        )
    
    # Add edges to the network
    for u, v, edata in G.edges(data=True):
        rel = edata.get("type", "related_to")
        width = 1
        dashes = True
        color = 'gray'
        
        # Highlight edges that connect sequential labeled nodes
        if labeled_nodes and len(labeled_nodes) > 1:
            for i in range(len(labeled_nodes) - 1):
                if u == labeled_nodes[i] and v == labeled_nodes[i+1]:
                    width = 20
                    dashes = False
                    color = 'orange'
                    break
        
        net.add_edge(
            u,
            v,
            color=color,               
            width=width,
            dashes=dashes,
            title=rel,
            arrowStrikethrough=False,
        )
    
   # Set display and physics options
    net.set_options("""
    var options = {
      "nodes": { 
        "font": {
          "size": 16,
          "strokeWidth": 2,
          "strokeColor": "white"
        },
        "scaling": {
          "label": {
            "enabled": false
          }
        }
      },
      "edges": { 
        "smooth": true,
        "arrows": {
          "to": {
            "enabled": true,
            "scaleFactor": 2
          }
        }
      },
      "physics": {
        "enabled": true,
        "solver": "barnesHut",
        "barnesHut": {
          "gravitationalConstant": -60000,
          "centralGravity": 0.6,
          "springLength": 150,
          "springConstant": 0.04,
          "damping": 0.6,
          "avoidOverlap": 0.4
        },
        "minVelocity": 0.75,
        "stabilization": {
          "enabled": true,
          "iterations": 100,
          "updateInterval": 1,
          "fit": true
        }
      },
      "interaction": {
        "hover": true,
        "tooltipDelay": 100,
        "hideEdgesOnDrag": false,
        "hideNodesOnDrag": false,
        "zoomView": true,
        "dragNodes": true,
        "dragView": true
      },
      "layout": {
        "improvedLayout": true,
        "hierarchical": {
          "enabled": false
        }
      }
    }
    """)
    
    # Show the network
    net.show(output_file)
    
    # Return an IFrame for notebook display
    if notebook:
        return IFrame(output_file, width="100%", height="1000px")