<a href="https://colab.research.google.com/github/wikipathways/BioThings_Explorer_PFOCR_clustering/blob/main/bte_clustering_AA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import required packages
import requests
import json
from copy import copy,deepcopy
import pandas as pd

## Get BTE TRAPI Results
Imatinib - [Gene] - Asthma: https://arax.ncats.io/api/arax/v1.3/response/7b14f961-9066-41f7-9e3b-d76b2b4a7fac (83kB, 7 results); "results" in n1

In [2]:
# Build the TRAPI query graph: a two-hop path n0 -(e0)- n1 -(e1)- n2.
#   n0: drug / molecular mixture pinned to MESH:D000068877
#   n1: any gene (no ids given, so this is the node to be filled in)
#   n2: disease / phenotypic feature pinned to MESH:D001249
# Both edges ask for the biolink:related_to predicate.
# Key order inside each edge ("object" before "subject") is kept as-is because
# later cells iterate over these dicts in insertion order.
query = {
    "message": {
        "query_graph": {
            "edges": {
                "e0": {"object": "n1", "predicates": ["biolink:related_to"], "subject": "n0"},
                "e1": {"object": "n2", "predicates": ["biolink:related_to"], "subject": "n1"},
            },
            "nodes": {
                "n0": {
                    "categories": ["biolink:Drug", "biolink:MolecularMixture"],
                    "ids": ["MESH:D000068877"],
                },
                "n1": {"categories": ["biolink:Gene"]},
                "n2": {
                    "categories": ["biolink:Disease", "biolink:PhenotypicFeature"],
                    "ids": ["MESH:D001249"],
                },
            },
        }
    }
}

In [3]:
#get the TRAPI results
# Endpoint the query is POSTed to. Other endpoints that were tried with this
# notebook (swap the constant to use one of them):
#   "http://localhost:3000/v1/query"
#   "https://arax.ncats.io/api/rtxkg2/v1.2/query"
#   "https://arax.ncats.io/api/arax/v1.2/query"
TRAPI_QUERY_URL = "https://api.bte.ncats.io/v1/query"

# Seconds to wait for the server before giving up. Without a timeout,
# requests waits indefinitely and a stalled server hangs the notebook.
# NOTE(review): 600 s is a generous guess for a two-hop query; tune as needed.
TRAPI_TIMEOUT_SECONDS = 600

trapi_response = requests.post(
    TRAPI_QUERY_URL,
    json=query,
    timeout=TRAPI_TIMEOUT_SECONDS,
)
# Raise requests.HTTPError right here on a 4xx/5xx status instead of letting a
# later cell fail while trying to parse an error page as a TRAPI message.
trapi_response.raise_for_status()

## Process TRAPI results into a data frame

In [4]:
# Decode the JSON body of the HTTP response and keep its "message" member;
# the cells below read "knowledge_graph" and "results" from it.
trapi_response_body = trapi_response.json()
trapi_message = trapi_response_body["message"]

In [5]:
# Seed the category collection with every category named on a node of the
# query graph. The next cell additionally unions in the categories reported on
# the knowledge-graph nodes of the TRAPI response.
#
# TODO: Some categories are supersets of others. Should we handle
# this systematically?

query_graph_nodes = query["message"]["query_graph"]["nodes"]
curie_categories = {
    category
    for q_node in query_graph_nodes.values()
    if "categories" in q_node
    for category in q_node["categories"]
}

In [6]:
# Walk the knowledge-graph nodes of the TRAPI response and, for every CURIE
# listed in a node's biolink:xref attribute, record:
#   curie_to_unified_curie : CURIE -> the one CURIE chosen to represent the node
#   curie_to_name          : CURIE -> node name
#   curie_to_categories    : CURIE -> node categories
# The representative ("unified") CURIE is the first xref whose prefix is in
# preferred_prefixes, unless one of the xrefs was already chosen for an
# earlier node, in which case that earlier choice is reused.

# for genes/gene products, chemicals and diseases
preferred_prefixes = {"NCBIGene", "MESH"}
trapi_results_unified_curies = set()
unification_failed_curies = set()
unified_prefixes = set()
all_prefixes = set()
curie_to_name = dict()
curie_to_categories = dict()
curie_to_unified_curie = dict()
for kg_node_id, kg_node in trapi_message["knowledge_graph"]["nodes"].items():
    # Tolerate nodes where these members are missing or null rather than
    # crashing on the first such node.
    name = kg_node.get("name")
    categories = kg_node.get("categories") or []
    curie_categories |= set(categories)

    # NOTE(review): a node without any biolink:xref attribute is neither
    # unified nor counted as failed; results touching it are skipped later.
    for attribute in kg_node.get("attributes") or []:
        if attribute["attribute_type_id"] != "biolink:xref":
            continue
        curies = attribute["value"]

        # the node's own key should always be one of its xref curies
        if kg_node_id not in curies:
            raise Exception(f"key {kg_node_id} not in {curies}")

        unified_curie = None
        intersecting_trapi_results_unified_curies = trapi_results_unified_curies.intersection(
            curies
        )
        if len(intersecting_trapi_results_unified_curies) > 1:
            multiple_matches = list(intersecting_trapi_results_unified_curies)
            raise Exception(f"matching multiple: {kg_node_id} to {multiple_matches}")
        elif len(intersecting_trapi_results_unified_curies) == 1:
            unified_curie = next(iter(intersecting_trapi_results_unified_curies))
        else:
            # get curie for preferred prefix. usually this is the node key, but not always.
            for curie in curies:
                # Split on the first colon only: the local identifier may
                # itself contain colons, which a plain split(":") unpacked
                # into two names would choke on.
                if curie.split(":", 1)[0] in preferred_prefixes:
                    unified_curie = curie
                    trapi_results_unified_curies.add(unified_curie)
                    break

        if not unified_curie:
            if kg_node_id in curie_to_unified_curie:
                unified_curie = curie_to_unified_curie[kg_node_id]
            else:
                # give up on this node: remember the failure and stop
                # looking at its remaining attributes
                unification_failed_curies.add(kg_node_id)
                break

        unified_prefixes.add(unified_curie.split(":", 1)[0])

        for curie in curies:
            all_prefixes.add(curie.split(":", 1)[0])
            # first writer wins: never overwrite an existing mapping
            if curie not in curie_to_unified_curie:
                curie_to_unified_curie[curie] = unified_curie
            if curie not in curie_to_name:
                curie_to_name[curie] = name
                curie_to_categories[curie] = categories
            elif curie_to_name[curie] != name:
                print(f"curie {curie} has multiple primary names: {curie_to_name[curie]} and {name}")


print("all CURIE prefixes found:")
print(all_prefixes)
print("")
print("unified CURIE prefixes found:")
print(unified_prefixes)
print("")
print(f"failed to unify {len(unification_failed_curies)} CURIEs")

all CURIE prefixes found:
{'HGNC', 'CHEMBL.COMPOUND', 'MESH', 'NCBIGene', 'UMLS', 'CHEBI', 'PR', 'OMIM', 'PUBCHEM.COMPOUND', 'INCHIKEY', 'HP', 'CAS', 'ENSEMBL', 'NCIT', 'MEDDRA', 'SNOMEDCT', 'UniProtKB', 'UNII'}

unified CURIE prefixes found:
{'MESH', 'NCBIGene'}

failed to unify 0 CURIEs


In [7]:
# Display the mapping from each cross-referenced CURIE to its unified CURIE.
curie_to_unified_curie

{'NCBIGene:5156': 'NCBIGene:5156',
 'ENSEMBL:ENSG00000134853': 'NCBIGene:5156',
 'HGNC:8803': 'NCBIGene:5156',
 'OMIM:173490': 'NCBIGene:5156',
 'UMLS:C1335201': 'NCBIGene:5156',
 'UniProtKB:P16234': 'NCBIGene:5156',
 'PR:P16234': 'NCBIGene:5156',
 'UMLS:C3853694': 'NCBIGene:5156',
 'PUBCHEM.COMPOUND:123596': 'MESH:D000068877',
 'PUBCHEM.COMPOUND:90091706': 'MESH:D000068877',
 'CHEMBL.COMPOUND:CHEMBL1642': 'MESH:D000068877',
 'UNII:8A1O1M485B': 'MESH:D000068877',
 'CHEBI:31690': 'MESH:D000068877',
 'MESH:D000068877': 'MESH:D000068877',
 'CAS:220127-57-1': 'MESH:D000068877',
 'INCHIKEY:YLMAHDNUQAMNNX-UHFFFAOYSA-N': 'MESH:D000068877',
 'UMLS:C0906802': 'MESH:D000068877',
 'UMLS:C0939537': 'MESH:D000068877',
 'NCBIGene:4067': 'NCBIGene:4067',
 'ENSEMBL:ENSG00000254087': 'NCBIGene:4067',
 'HGNC:6735': 'NCBIGene:4067',
 'OMIM:165120': 'NCBIGene:4067',
 'UMLS:C0812307': 'NCBIGene:4067',
 'UniProtKB:A8K379': 'NCBIGene:4067',
 'UniProtKB:P07948': 'NCBIGene:4067',
 'PR:P07948': 'NCBIGene:4067',

In [8]:
# Derive the DataFrame column layout from the query graph.
#   q_edge_ids / q_node_ids : query edge / node ids in order of first appearance
#   columns                 : node and edge columns, then per-node CURIE
#                             columns, then "unified_curie_set"
columns = []
q_node_id_keys = {"object", "subject"}
q_node_ids = []
q_edge_ids = []
for q_edge_id, q_edge in query["message"]["query_graph"]["edges"].items():
    q_edge_ids.append(q_edge_id)
    edge_column_added = False
    for field, value in q_edge.items():
        # only the string-valued "object"/"subject" members name query nodes
        if field not in q_node_id_keys or type(value) is not str:
            continue
        if value not in columns:
            q_node_ids.append(value)
            columns.append(value)
        # the edge column goes right after the first node seen on this edge
        if not edge_column_added:
            edge_column_added = True
            columns.append(q_edge_id)

# query nodes that were pinned to specific ids in the query
query_nodes_with_ids = {
    q_node_key
    for q_node_key, q_node in query['message']['query_graph']['nodes'].items()
    if q_node.get('ids', [])
}

columns.extend(
    f"{node_id}{suffix}"
    for node_id in q_node_ids
    for suffix in ("_original_curie", "_unified_curie")
)

columns.append("unified_curie_set")

# interleave node ids with edge ids position by position: node, edge, node, ...
trapi_result_columns = []
for position, node_id in enumerate(q_node_ids):
    trapi_result_columns.append(node_id)
    # the slice is empty once the edge ids run out
    trapi_result_columns.extend(q_edge_ids[position:position + 1])

unified_curie_columns = [f"{node_id}_unified_curie" for node_id in q_node_ids]

In [10]:
# Flatten the TRAPI results into one DataFrame row per (result, predicate
# combination). For every result:
#   1. index which query node(s) each bound CURIE belongs to,
#   2. walk the bound knowledge-graph edges, collecting the predicates per
#      query edge and the name / original CURIE / unified CURIE per query node,
#   3. skip the result unless it yielded exactly one distinct unified CURIE
#      per query node,
#   4. emit one row for every combination of predicates across query edges.
trapi_results = trapi_message["results"]

result_row_data = []
# `or []`: iterate zero times if "results" is null instead of raising TypeError
for trapi_result in trapi_results or []:
    curie_to_qnode_ids = dict()
    for qnode_id, node_bindings in trapi_result["node_bindings"].items():
        for node_binding in node_bindings:
            curie_to_qnode_ids.setdefault(node_binding["id"], []).append(qnode_id)

    row_data_template = dict()
    q_edge_id_to_predicates = dict()
    trapi_result_curie_set = set()
    for qedge_id, edge_bindings in trapi_result["edge_bindings"].items():
        for edge_binding in edge_bindings:
            kg_edge = trapi_message["knowledge_graph"]["edges"][edge_binding["id"]]

            # drop the prefix ("biolink:related_to" -> "related_to"); split on
            # the first colon only so extra colons cannot break the unpacking
            predicate_identifier = kg_edge["predicate"].split(":", 1)[-1]
            q_edge_id_to_predicates.setdefault(qedge_id, set()).add(predicate_identifier)

            for node_curie in (kg_edge["subject"], kg_edge["object"]):
                # nodes that could not be unified contribute nothing
                if node_curie not in curie_to_unified_curie:
                    continue
                unified_curie = curie_to_unified_curie[node_curie]
                name = curie_to_name[node_curie]

                # .get: an edge endpoint that is not bound to any query node
                # in this result is ignored instead of raising KeyError
                for qnode_id in curie_to_qnode_ids.get(node_curie, []):
                    row_data_template[qnode_id] = name
                    trapi_result_curie_set.add(unified_curie)
                    row_data_template[qnode_id + "_original_curie"] = node_curie
                    row_data_template[qnode_id + "_unified_curie"] = unified_curie

    if len(trapi_result_curie_set) != len(q_node_ids):
        #print(f'skipping {list(curie_to_qnode_ids.keys())}')
        continue

    row_data_template["unified_curie_set"] = trapi_result_curie_set
    row_datas = [row_data_template]
    for q_edge_id, predicates in q_edge_id_to_predicates.items():
        next_row_datas = []
        for row_data in row_datas:
            # sorted(): set iteration order is not stable between runs, so
            # sort to keep the row order reproducible
            for predicate in sorted(predicates):
                # deepcopy so each row owns its own "unified_curie_set"
                next_row_data = deepcopy(row_data)
                next_row_data[q_edge_id] = predicate
                next_row_datas.append(next_row_data)
        row_datas = next_row_datas
    result_row_data += row_datas

print("warning: predicate direction(s) may be switched")
trapi_results_df = pd.DataFrame.from_records(result_row_data, columns=columns)
trapi_results_df



Unnamed: 0,n1,e0,n0,n2,e1,n1_original_curie,n1_unified_curie,n0_original_curie,n0_unified_curie,n2_original_curie,n2_unified_curie,unified_curie_set
0,LYN,response_affected_by,Imatinib mesylate,Asthma,related_to,NCBIGene:4067,NCBIGene:4067,PUBCHEM.COMPOUND:123596,MESH:D000068877,HP:0002099,MESH:D001249,"{MESH:D001249, NCBIGene:4067, MESH:D000068877}"
1,LYN,response_decreased_by,Imatinib mesylate,Asthma,related_to,NCBIGene:4067,NCBIGene:4067,PUBCHEM.COMPOUND:123596,MESH:D000068877,HP:0002099,MESH:D001249,"{MESH:D001249, NCBIGene:4067, MESH:D000068877}"
2,PDGFRA,response_affected_by,Imatinib mesylate,Asthma,gene_associated_with_condition,NCBIGene:5156,NCBIGene:5156,PUBCHEM.COMPOUND:123596,MESH:D000068877,HP:0002099,MESH:D001249,"{MESH:D001249, NCBIGene:5156, MESH:D000068877}"
3,PDGFRA,response_increased_by,Imatinib mesylate,Asthma,gene_associated_with_condition,NCBIGene:5156,NCBIGene:5156,PUBCHEM.COMPOUND:123596,MESH:D000068877,HP:0002099,MESH:D001249,"{MESH:D001249, NCBIGene:5156, MESH:D000068877}"


In [11]:
# Display the raw TRAPI results (node bindings, edge bindings and score of each result).
trapi_results

[{'node_bindings': {'n1': [{'id': 'NCBIGene:4067'}],
   'n2': [{'id': 'HP:0002099'}],
   'n0': [{'id': 'PUBCHEM.COMPOUND:123596'}]},
  'edge_bindings': {'e1': [{'id': '5fd91eeb6d827684f35f037870260046'}],
   'e0': [{'id': 'fdf3b7ea048504930c98447f9f41fff5'},
    {'id': '4d6d6d501d8c0a2f5db2ea4319996329'}]},
  'score': 0.8510301411652301},
 {'node_bindings': {'n1': [{'id': 'NCBIGene:5156'}],
   'n2': [{'id': 'HP:0002099'}],
   'n0': [{'id': 'PUBCHEM.COMPOUND:123596'}]},
  'edge_bindings': {'e1': [{'id': 'ddf2f612e98a1763d52394ac4a3768e6'}],
   'e0': [{'id': 'c24ce08d1925cfe53cffd609f2ebfb0e'},
    {'id': 'd7949fe28a6d3362a91c2b08d5a8dfe0'}]},
  'score': 0}]