<a href="https://colab.research.google.com/github/wikipathways/pathway-figure-ocr/blob/master/notebooks/bte_with_pfocr_cooccurrence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas requests requests_cache



# BTE with PFOCR Coocurrence

In [2]:
import requests
import requests_cache


requests_cache.install_cache("pfocr_cache")

In [3]:
query = {
    "message": {
        "query_graph": {
            "edges": {
                "e01": {
                    "constraints": [],
                    "object": "n0",
                    "predicates": ["biolink:related_to"],
                    "subject": "n1",
                },
                "e02": {
                    "constraints": [],
                    "object": "n1",
                    "predicates": ["biolink:related_to"],
                    "subject": "n2",
                },
            },
            "nodes": {
                "n0": {
                    "categories": ["biolink:Gene"],
                    "constraints": [],
                    "ids": ["NCBIGene:3855", "NCBIGene:211", "NCBIGene:26995"],
                    "is_set": False,
                },
                "n1": {
                    "categories": ["biolink:Gene"],
                    "constraints": [],
                    "is_set": False,
                },
                "n2": {
                    "categories": ["biolink:SmallMolecule"],
                    "constraints": [],
                    "ids": ["PUBCHEM.COMPOUND:3121"],
                    "is_set": False,
                },
            },
        }
    }
}

bte_r = requests.post("https://api.bte.ncats.io/v1/query", json=query)
print(bte_r.status_code)
if bte_r.status_code != 200:
    print(bte_r.text)

200


In [4]:
gene_nodes = []
for query_node_id, v in query["message"]["query_graph"]["nodes"].items():
    if "biolink:Gene" in v["categories"]:
        gene_nodes.append(query_node_id)
print(gene_nodes)

['n0', 'n1']


In [5]:
bte_json = bte_r.json()
bte_results = bte_json["message"]["results"]
all_genes = set()
prefixes = set()
ncbi_genes = set()
genes_to_bte_results = dict()
for bte_result in bte_results:
    bte_result_genes = []
    for gene_node in gene_nodes:
        for entry in bte_result["node_bindings"][gene_node]:
            id = entry["id"]
            all_genes.add(id)
            prefixes.add(entry["id"].split(":")[0])
            target_prefix = "NCBIGene:"
            if id.startswith(target_prefix):
                ncbi_genes.add(entry["id"][len(target_prefix):])
                bte_result_genes.append("NCBIGene:" + entry["id"][len(target_prefix):])
    genes_key = tuple(sorted(bte_result_genes))
    if not genes_key in genes_to_bte_results:
        genes_to_bte_results[genes_key] = []
    genes_to_bte_results[genes_key].append(bte_result)
prefixes

{'NCBIGene'}

In [6]:
query_param_limit = 4000
pfocr_results = []
q_term = ""
for ncbi_gene in ncbi_genes:
    if q_term != "":
        concatenator = " OR "
    else:
        concatenator = ""

    next_q = concatenator + "associatedWith.mentions.genes.ncbigene:" + ncbi_gene

    if len(q_term + next_q) < query_param_limit:
        q_term += next_q
    else:
        payload = {"q": q_term}
        pfocr_r = requests.get("https://biothings.ncats.io/pfocr/query", params=payload)
        print(f"status_code: {pfocr_r.status_code}")
        if pfocr_r.status_code != 200:
            print(pfocr_r.text)
        pfocr_results.append(pfocr_r.json())
        q_term = "associatedWith.mentions.genes.ncbigene:" + ncbi_gene

if q_term:
    print(f"q_term: {q_term}")
    payload = {"q": q_term}
    pfocr_r = requests.get("https://biothings.ncats.io/pfocr/query", params=payload)
    print(f"status_code: {pfocr_r.status_code}")
    if pfocr_r.status_code != 200:
        print(pfocr_r.text)
    pfocr_results.append(pfocr_r.json())
    q_term = ""

print(len(pfocr_results))

status_code: 200
status_code: 200
status_code: 200
q_term: associatedWith.mentions.genes.ncbigene:4585 OR associatedWith.mentions.genes.ncbigene:23657 OR associatedWith.mentions.genes.ncbigene:3558 OR associatedWith.mentions.genes.ncbigene:2167 OR associatedWith.mentions.genes.ncbigene:36 OR associatedWith.mentions.genes.ncbigene:11118 OR associatedWith.mentions.genes.ncbigene:2169 OR associatedWith.mentions.genes.ncbigene:5950 OR associatedWith.mentions.genes.ncbigene:10038 OR associatedWith.mentions.genes.ncbigene:54556 OR associatedWith.mentions.genes.ncbigene:7450 OR associatedWith.mentions.genes.ncbigene:5111 OR associatedWith.mentions.genes.ncbigene:11083 OR associatedWith.mentions.genes.ncbigene:5927 OR associatedWith.mentions.genes.ncbigene:1026 OR associatedWith.mentions.genes.ncbigene:4582 OR associatedWith.mentions.genes.ncbigene:1543 OR associatedWith.mentions.genes.ncbigene:79633 OR associatedWith.mentions.genes.ncbigene:231 OR associatedWith.mentions.genes.ncbigene:2660 O

In [7]:
hits = []
for pfocr_result in pfocr_results:
    hits += pfocr_result["hits"]
print(len(hits))

40


In [8]:
pfocr_id_to_genes = dict()
pfocr_id_to_pfocr_result = dict()
for hit in hits:
    pfocr_id = hit["_id"]
    pfocr_id_to_genes[pfocr_id] = set(
        "NCBIGene:" + g for g in hit["associatedWith"]["mentions"]["genes"]["ncbigene"]
        )
    pfocr_id_to_pfocr_result[pfocr_id] = {
        "figureUrl": hit["associatedWith"]["figureUrl"],
        "pmc": hit["associatedWith"]["pmc"],
        "score": hit["_score"],
    }

In [9]:
print(len(pfocr_id_to_genes.keys()))
len(genes_to_bte_results.keys())

36


356

In [10]:
from copy import copy


cooccurring_bte_results = []
for g0_tuple, genes_bte_results in genes_to_bte_results.items():
    g0 = set(g0_tuple)
    for pfocr_id, g1 in pfocr_id_to_genes.items():
        if len(g0.intersection(g1)) >= 2:
            for bte_result in genes_bte_results:
                matches = list()
                nodes = set()
                for query_node_id, values in bte_result["node_bindings"].items():
                    for value in values:
                        id = value["id"]
                        if id in g0:
                            nodes.add(query_node_id)
                bte_result["pfocr"] = copy(pfocr_id_to_pfocr_result[pfocr_id])
                bte_result["pfocr"]["nodes"] = sorted(nodes)
            cooccurring_bte_results += genes_bte_results
cooccurring_bte_results

[{'edge_bindings': {'e01': [{'id': '5fc6cc476f4bcb068460c5d299db52dd'}],
   'e02': [{'id': 'fc671d4b62d2d983372271323c3e8be3'},
    {'id': '7b6da63564f96e9b4d0aef2581eb3ba3'},
    {'id': '3e93479f91fd45c376d4851eea747174'},
    {'id': 'e495f8080658dd1cd0acbec8c4cfeaf7'},
    {'id': '551bd928ba58c60f465375306b81bbac'},
    {'id': '6f6dd5467485ea133e64cb76b405c7d2'}]},
  'node_bindings': {'n0': [{'id': 'NCBIGene:211'}],
   'n1': [{'id': 'NCBIGene:5599'}],
   'n2': [{'id': 'PUBCHEM.COMPOUND:3121'}]},
  'pfocr': {'figureUrl': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5354998/bin/nihms843846f10.jpg',
   'nodes': ['n0', 'n1'],
   'pmc': 'PMC5354998',
   'score': 52.49851},
  'score': 0},
 {'edge_bindings': {'e01': [{'id': '0eb02f5d33311f1ab445417b47fd6e11'},
    {'id': '0ccc29e5e4605fdc6131e3f091905b66'},
    {'id': 'ed99dfcff0914197197743a6674bf275'},
    {'id': '099133a2b625d0e9768bca884698670a'}],
   'e02': [{'id': '11777d2063e7b250c311ffe794add320'},
    {'id': '101db421fc6adc0af509d

In [11]:
print(f"{len(cooccurring_bte_results)} BTE results (out of {len(bte_results)} total) have pfocr figures")

49 BTE results (out of 356 total) have pfocr figures
