In [44]:
import numpy
import os

import csv
import yaml

from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import Namespace

from tqdm import tqdm

#### Import DRKG

In [2]:
drkg_path = "data/drkg/drkg.tsv"
ns = Namespace("http://drkg.org/graph/")

In [3]:
drkg = Graph()

print("Loading DRKG database from tsv...")
with open(drkg_path, 'r') as tsv_file:
    reader = csv.reader(tsv_file, delimiter='\t')
    for row in reader:
        if len(row) == 3:
            subject = ns[row[0].replace(' ', '_').replace('|', '_')]
            predicate = ns[row[1].replace(' ', '_').replace('>', '_')]
            # Determine if object is a URI or a Literal
            object = URIRef(row[2].replace(' ', '_').replace('|', '_')) if row[2].startswith('http') else Literal(row[2].replace(' ', '_').replace('|', '_'))
            drkg.add((subject, predicate, object))
            #print(f'Added:\n {subject}, \n {predicate}, \n {object}.')

Loading DRKG database from tsv...


In [46]:
cnt = 0

def raw2str(row):
    # remove uri and clean predicate
    s,p,o = row
    if "drkg" in s:
        return str(s).split("/")[-1], clean_pred(str(p).split("/")[-1]), str(o).split("/")[-1]
    else:
        return str(s).split("/")[-1], str(p).split("/")[-1], str(o).split("/")[-1]

def clean_pred(p):
    return str(p).split("::")[1]    # 0 - source, 1 - predicate, 2 - head & tail entity types

In [None]:
# Standard DRKG loop
for row in drkg:
    s,p,o = raw2str(row)

    if "MESH" in s or "MESH" in o:
        print(s,p,o)
        cnt += 1
    if cnt > 50: break

#### Import DrugMechDB

In [5]:
dmdb_path = "data/drugmech/drugmechdb.yaml"
ns = Namespace("http://drugmech.org/graph/")

In [6]:
dmdb = Graph()

with open(dmdb_path, 'r') as yaml_file:
    data = yaml.safe_load(yaml_file)
    print('YAML loaded')

    count = [0,0,0]
    
    for entry in data:
        # graph_id = entry['graph']['_id']

        for node in entry['nodes']:
            node_id = ns[node['id']]
            label = Literal(node['label'])
            name = Literal(node['name'])
            dmdb.add((node_id, RDF.type, URIRef(ns + 'Node')))
            dmdb.add((node_id, URIRef(ns + 'label'), label))
            dmdb.add((node_id, URIRef(ns + 'name'), name))
            count[1] += 1
            # print('Node :', node)
            
        for link in entry['links']:
            source = URIRef(ns + link['source'])
            target = URIRef(ns + link['target'])
            predicate = URIRef(ns + link['key'].replace(" ", "_"))
            dmdb.add((source, predicate, target))
            count[2] += 1
            # print('Link :', link)
        count[0] += 1
        # print('Entry :', entry)
    print(count)

YAML loaded
[4846, 33009, 32641]


### Step 1
How much overlap between DMDB and DRKG entities?

In [39]:
dmdb_mesh = set()

for row in dmdb:
    s,p,o = raw2str(row)

    if "MESH" in s:
        dmdb_mesh.add(s)
    elif "MESH" in o:
        dmdb_mesh.add(o)

print("Number of MESH-coded entities in DrugMechDB:", len(dmdb_mesh), "/", len(dmdb))

Number of MESH-coded entities in DrugMechDB: 2539 / 27228


In [None]:
common_mesh = set()


drkg_mesh_cnt = 0
for row in drkg:
    s,p,o = raw2str(row)

    if "MESH" in s:
        drkg_mesh_cnt += 1
        if s in dmdb_mesh:
            common_mesh.add(s)
    elif "MESH" in o:
        drkg_mesh_cnt += 1
        if o in dmdb_mesh:
            common_mesh.add(o)
print("Number of MESH-coded entities in DRKG:", drkg_mesh_cnt, "/", len(drkg))

Number of MESH-coded entities in DRKG: 239803 / 5874258


In [34]:
for i in common_mesh:
    print(i)

print(len(dmdb_mesh))

14767


### Getting Unique Entities

In [51]:
# Get DRKG's unique entities
drkg_entities = set()

for row in tqdm(drkg):
    s,p,o = raw2str(row)

    for ent in [s,o]:
        if not isinstance(ent, Literal):
            drkg_entities.add(ent)

print(f"Number of stored unique entities: {len(drkg_entities)}")


100%|██████████| 5874258/5874258 [00:39<00:00, 148673.11it/s]

Number of stored unique entities: 97238





In [None]:
# DEBUG
for idx, i in enumerate(drkg_entities):
    print(i)
    if idx == 30: break

In [56]:
# Get DMDB's Unique entities
dmdb_entities = set()

for row in tqdm(dmdb):
    s,p,o = raw2str(row)

    for ent in [s,o]:
        if not isinstance(ent, Literal):
            # print(ent)
            dmdb_entities.add(ent)

print(f"Number of stored unique entities: {len(dmdb_entities)}")

100%|██████████| 27228/27228 [00:00<00:00, 246468.18it/s]

Number of stored unique entities: 10962





In [None]:
# DEBUG         TODO : Literals in entities?
for idx, i in enumerate(dmdb_entities):
    print(i)
    if idx == 30: break

vitamin C
purine nucleotide biosynthetic process
Wheezing
MESH:D001943
Chloramphenicol
MESH:C536777
Paroxysmal nocturnal hemoglobinuria
Cytokine production
MESH:D003907
Protein Kinase C
Cataract
imipenem
Streptococcal infectious disease
Giant Cell Arteritis
Pneumococcal meningitis
UniProt:P0DJD9
MESH:D056806
MESH:D002177
UniProt:P00488
MESH:D003607
Rosuvastatin
UniProt:Q8ZB62
DB:DB01430
Abnormality of temperature regulation
MESH:D008118
Cytokine receptor common subunit gamma
5-hydroxytryptamine receptor 1D
regulation of synaptic transmission, dopaminergic
Lymphoma, Mantle-Cell
Diazepam
Magnesium


In [None]:
# Comparing unique entity sets
common_entities = set()

for elem in tqdm(dmdb_entities, desc="Comparison in progress"):
    if elem in drkg_entities:
        common_entities.add(elem)

print("Number of common entities:", len(common_entities), "/", (len(drkg_entities) + len(dmdb_entities)))

Comparison in progress: 100%|██████████| 10962/10962 [00:00<00:00, 1811867.92it/s]

Number of common entities: 10962 / 119162





In [None]:
# DEBUG
for idx, i in enumerate(common_entities):
    if ":" in i:        # hard-coded filter...
        print(i)
    if idx == 300: break

MESH:D001943
MESH:C536777
MESH:D003907
UniProt:P0DJD9
MESH:D056806
MESH:D002177
UniProt:P00488
MESH:D003607
UniProt:Q8ZB62
DB:DB01430
MESH:D008118
UniProt:O75116
MESH:D000069470
MESH:D005871
UBERON:0001555
taxonomy:76773
taxonomy:10407
MESH:D005998
MESH:D003493
MESH:C579652
GO:0014051
GO:0099536
MESH:D005230
MESH:D017984
MESH:D000069445
HP:0030088
CHEBI:16412
HP:0003124
MESH:D004967
MESH:C047681
UBERON:0001811
MESH:C036006
MESH:D002470
UniProt:P78559
InterPro:IPR001054
GO:0008219
GO:0031594
UBERON:0000007
UniProt:P9WQA9
MESH:D006143
MESH:C000589393
MESH:D013322
MESH:D018784
MESH:D009266
MESH:D014747
reactome:R-HSA-390651
HP:0012534
MESH:C073323
UniProt:P17948
UBERON:0001291
MESH:D011906
MESH:D031901
MESH:D000069580
MESH:D061268
UniProt:P06653
MESH:D014290
MESH:C074679
UBERON:0008915
MESH:C011685
UniProt:B2ISJ9
MESH:D007638
UBERON:0002018
UniProt:A0A717UR96
MESH:C004280
MESH:D019584
MESH:D006493
HP:0025300
CHEBI:46345
HP:0100543
UniProt:P62942
taxonomy:1817
MESH:D015232
taxonomy:1392
GO

In [None]:
# Inefficient entity comparison
common_entities = set()

for row in tqdm(dmdb, desc="Hour-long comparison in progress..."):
    s,p,o = raw2str(row)

    for row2 in drkg:
        s2,p2,o2 = raw2str(row2)

        if s in s2 or s in o2:
            common_entities.add(s)
        if o in s2 or o in o2:
            common_entities.add(o)

print("Number of common entities:", len(common_entities), "/", (len(drkg) + len(dmdb)))