<a href="https://colab.research.google.com/github/wikipathways/pathway-figure-ocr/blob/master/notebooks/bte_with_pfocr_cooccurrence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from copy import copy, deepcopy
import json

# BTE with PFOCR Cooccurrence

I modified a query from the 3/17 Question of the Month to find relationships like this:

`1 of 3 selected genes` -> `any gene` -> `Valproic Acid`

In [2]:
import requests
import requests_cache


# Install a cache named "pfocr_cache" for HTTP calls made through `requests`,
# so re-running the cells below can reuse earlier responses instead of
# re-issuing the same requests.
requests_cache.install_cache("pfocr_cache")

## Get BTE Results

In [3]:
# TRAPI query graph with two `biolink:related_to` edges:
#   e02: n2 (the small molecule, Valproic Acid per the text above) -> n1 (any gene)
#   e01: n1 (any gene) -> n0 (one of the three selected genes)
query_edges = {
    "e01": {
        "object": "n0",
        "predicates": ["biolink:related_to"],
        "subject": "n1",
    },
    "e02": {
        "object": "n1",
        "predicates": ["biolink:related_to"],
        "subject": "n2",
    },
}
query_nodes = {
    "n0": {
        "categories": ["biolink:Gene"],
        "ids": ["NCBIGene:3855", "NCBIGene:211", "NCBIGene:26995"],
    },
    "n1": {
        "categories": ["biolink:Gene"],
    },
    "n2": {
        "categories": ["biolink:SmallMolecule"],
        "ids": ["PUBCHEM.COMPOUND:3121"],
    },
}
query = {"message": {"query_graph": {"edges": query_edges, "nodes": query_nodes}}}

# The query goes to a locally running BTE instance; the commented-out line
# shows the alternative remote endpoint.
# bte_r = requests.post("https://api.bte.ncats.io/v1/query", json=query)
bte_r = requests.post("http://localhost:3000/v1/query", json=query)
# To clear the cache for this request
# requests_cache.get_cache().delete(bte_r.cache_key)

# Always report the status; show the response body only when the call failed.
bte_status = bte_r.status_code
print(bte_status)
if bte_status != 200:
    print(bte_r.text)

200


In [4]:
# Query-graph node ids whose categories include biolink:Gene, in query order.
gene_q_node_ids = [
    node_key
    for node_key, node_spec in query["message"]["query_graph"]["nodes"].items()
    if "categories" in node_spec and "biolink:Gene" in node_spec["categories"]
]
print(gene_q_node_ids)

['n0', 'n1']


In [5]:
bte_message = bte_r.json()["message"]
bte_results = bte_message["results"]

# Group the TRAPI results by the sorted tuple of NCBIGene CURIEs bound to the
# gene query nodes, so results sharing the same gene set are looked up together.
target_prefix = "NCBIGene:"
genes_to_bte_results = dict()
for bte_result in bte_results:
    bte_result_genes = []
    for gene_q_node_id in gene_q_node_ids:
        for entry in bte_result["node_bindings"][gene_q_node_id]:
            # Named `node_id` rather than `id` so the builtin is not shadowed
            # in the kernel namespace.
            node_id = entry["id"]
            # Only NCBIGene CURIEs are kept; they are already in the form
            # used for the PFOCR gene sets, so no re-prefixing is needed.
            if node_id.startswith(target_prefix):
                bte_result_genes.append(node_id)
    genes_key = tuple(sorted(bte_result_genes))
    genes_to_bte_results.setdefault(genes_key, []).append(bte_result)
print(f'BTE TRAPI result count: {len(bte_results)}')

BTE TRAPI result count: 367


In [6]:
# Pretty-print the first result that already carries a "pfocr" key, if any.
first_with_pfocr = next(
    (candidate for candidate in bte_results if "pfocr" in candidate),
    None,
)
if first_with_pfocr is not None:
    print(json.dumps(first_with_pfocr, indent=4))

{
    "node_bindings": {
        "n1": [
            {
                "id": "NCBIGene:5599"
            }
        ],
        "n0": [
            {
                "id": "NCBIGene:211"
            }
        ],
        "n2": [
            {
                "id": "PUBCHEM.COMPOUND:3121"
            }
        ]
    },
    "edge_bindings": {
        "e01": [
            {
                "id": "5fc6cc476f4bcb068460c5d299db52dd"
            }
        ],
        "e02": [
            {
                "id": "fc671d4b62d2d983372271323c3e8be3"
            },
            {
                "id": "7b6da63564f96e9b4d0aef2581eb3ba3"
            },
            {
                "id": "3e93479f91fd45c376d4851eea747174"
            },
            {
                "id": "8353796517ac7652afabf4042e944971"
            },
            {
                "id": "e495f8080658dd1cd0acbec8c4cfeaf7"
            },
            {
                "id": "551bd928ba58c60f465375306b81bbac"
            },
            {


## Get PFOCR Data

Download the entire NDJSON file (one JSON record per line) that we gave to BTE.

In [7]:
pfocr_url = "https://www.dropbox.com/s/1f14t5zaseocyg6/bte_chemicals_diseases_genes.ndjson?dl=1"
pfocr_request = requests.get(pfocr_url)
print(f"status_code: {pfocr_request.status_code}")
if pfocr_request.status_code != 200:
    print(pfocr_request.text)
# Stop here on an HTTP error instead of trying to parse the error body as
# NDJSON below, which would fail with an unrelated JSON decoding error.
pfocr_request.raise_for_status()

# Lookup tables built from the download:
#   genes_to_figids:       sorted tuple of gene CURIEs -> figure ids with exactly that gene set
#   figid_to_genes:        figure id -> set of gene CURIEs
#   figid_to_pfocr_result: figure id -> full parsed record
genes_to_figids = {}
figid_to_genes = {}
figid_to_pfocr_result = {}
pfocr_result_count = 0
# Split on "\n" only: str.splitlines() also breaks on characters such as
# U+2028 that may appear unescaped inside a JSON string, which would cut a
# record in half.
for line in pfocr_request.text.split("\n"):
    if not line.strip():
        # Skip blank lines (e.g. after a trailing newline).
        continue
    pfocr_result_count += 1
    pfocr_result = json.loads(line)
    figid = pfocr_result["_id"]
    # Prefix the bare NCBI gene ids so they compare equal to the BTE CURIEs.
    genes = {
        "NCBIGene:" + g
        for g in pfocr_result["associatedWith"]["mentions"]["genes"]["ncbigene"]
    }
    figid_to_pfocr_result[figid] = pfocr_result
    figid_to_genes[figid] = genes

    genes_key = tuple(sorted(genes))
    genes_to_figids.setdefault(genes_key, []).append(figid)
print(f'pfocr_result_count: {pfocr_result_count}')

status_code: 200
pfocr_result_count: 77719


#### How many CURIEs are in both?

In [8]:
# Pool every gene that appears in at least one BTE result.
all_bte_result_genes_keys = genes_to_bte_results.keys()
all_bte_result_genes = set()
for bte_genes_key in all_bte_result_genes_keys:
    all_bte_result_genes |= set(bte_genes_key)
print(f'BTE TRAPI result unique gene count: {len(all_bte_result_genes)}')

# Collect the pooled BTE genes that also occur in at least one PFOCR figure.
all_common_genes = set()
for pfocr_genes_key in genes_to_figids:
    all_common_genes |= all_bte_result_genes.intersection(pfocr_genes_key)
print(len(all_common_genes))
print(f'Genes common to BTE TRAPI results and PFOCR figures count: {len(all_common_genes)}')

BTE TRAPI result unique gene count: 346
333
Genes common to BTE TRAPI results and PFOCR figures count: 333


## Connect BTE Results & PFOCR

### Compare Algo Performance

The following three algorithms all match the BTE gene sets against the PFOCR gene sets, but their run times differ (approximate):
- Brute Force: 47s
- Check All BTE Results Genes Set: 5s
- SetSimilaritySearch: 2s

#### TRAPI Results x Figs (Brute Force)

In [9]:
# Brute force: compare every BTE gene set with every PFOCR gene set and keep
# the (BTE key, figure key) pairs that share at least two genes.
all_figure_genes_keys = genes_to_figids.keys()
bf_overlaps_2_plus = set()
for bte_key in genes_to_bte_results:
    bte_gene_set = set(bte_key)
    for fig_key in all_figure_genes_keys:
        shared_genes = bte_gene_set & set(fig_key)
        if len(shared_genes) >= 2:
            bf_overlaps_2_plus.add((bte_key, fig_key))
print(len(bf_overlaps_2_plus))

196


#### TRAPI Results x Figs with check against set of all genes in any BTE result

Before checking every BTE result against the current figure, first check whether the figure shares at least two genes with the set of all genes found in any BTE result. If it doesn't, no individual BTE result can match, so we can skip the entire figure.

In [10]:
# Union of all genes across BTE results, used as a cheap pre-filter per figure.
all_bte_result_genes_keys = genes_to_bte_results.keys()
all_bte_result_genes = set().union(*map(set, all_bte_result_genes_keys))

bf_overlaps_2_plus = set()
matched_figures = set()
matched_bte_results = set()

for fig_key, fig_ids in genes_to_figids.items():
    fig_gene_set = set(fig_key)
    # A figure sharing fewer than two genes with the pooled BTE genes cannot
    # share two genes with any single BTE result, so skip it outright.
    if len(fig_gene_set & all_bte_result_genes) < 2:
        continue
    for bte_key in all_bte_result_genes_keys:
        if len(fig_gene_set.intersection(bte_key)) < 2:
            continue
        matched_figures.update(fig_ids)
        matched_bte_results.add(bte_key)
        bf_overlaps_2_plus.add((bte_key, fig_key))

print(len(bf_overlaps_2_plus))
print(f'matched_figures count: {len(matched_figures)}')
print(f'matched BTE results count: {len(matched_bte_results)}')

196
matched_figures count: 63
matched BTE results count: 71


#### SetSimilaritySearch

This algorithm is sometimes too permissive, so the explicit overlap check is still needed. Even so, it's faster and yields the same results once that check is applied.

In [None]:
from SetSimilaritySearch import SearchIndex

# Index the distinct PFOCR gene sets for containment-similarity search.
pfocr_gene_sets = list(genes_to_figids)
index = SearchIndex(
    pfocr_gene_sets,
    similarity_func_name="containment",
    similarity_threshold=0.8,
)

sss_overlaps_2_plus = set()
for bte_key in genes_to_bte_results:
    query_genes = set(bte_key)
    for hit in index.query(query_genes):
        # hit[0] is the position of the matched gene set in pfocr_gene_sets.
        candidate_genes = pfocr_gene_sets[hit[0]]
        # Keep only candidates that truly share at least two genes.
        if len(query_genes.intersection(candidate_genes)) < 2:
            continue
        sss_overlaps_2_plus.add((bte_key, tuple(sorted(candidate_genes))))
print(len(sss_overlaps_2_plus))

### Choose and Apply SetSimilaritySearch

SetSimilaritySearch was fastest, so let's choose it and use it to augment the BTE results.

In [None]:
from SetSimilaritySearch import SearchIndex

# Index the distinct PFOCR gene sets for containment-similarity search.
pfocr_gene_sets = list(genes_to_figids.keys())
index = SearchIndex(pfocr_gene_sets, similarity_func_name="containment",
    similarity_threshold=0.8)

# This cell annotates the BTE result dicts in place. Remove annotations left
# by a previous run first, so re-running the cell does not append duplicate
# figures to each result.
for genes_bte_results in genes_to_bte_results.values():
    for bte_result in genes_bte_results:
        bte_result.pop("pfocr_notebook", None)

# Counts (BTE gene set, figure) pairs that share at least two genes.
overlaps_2_plus_count = 0
for bte_result_genes_key, genes_bte_results in genes_to_bte_results.items():
    bte_result_genes = set(bte_result_genes_key)
    for result in index.query(bte_result_genes):
        # result[0] is the position of the matched gene set in
        # pfocr_gene_sets; result[1] is the score reported by the index.
        figure_genes_key = pfocr_gene_sets[result[0]]
        score = result[1]
        # A set gives constant-time membership tests below.
        figure_genes = set(figure_genes_key)

        # The index alone can be too permissive, so require a real overlap
        # of at least two genes.
        if len(bte_result_genes & figure_genes) < 2:
            continue

        # Every figure that has exactly this gene set.
        for figid in genes_to_figids[figure_genes_key]:
            overlaps_2_plus_count += 1
            pfocr_result = figid_to_pfocr_result[figid]
            for bte_result in genes_bte_results:
                # Query nodes with at least one bound id that appears in the figure.
                matched_q_nodes = sorted(
                    q_node_id
                    for q_node_id, bindings in bte_result["node_bindings"].items()
                    if any(binding["id"] in figure_genes for binding in bindings)
                )
                # Shallow copy: only top-level keys are added, so the shared
                # PFOCR record itself stays untouched.
                pfocr_entry = copy(pfocr_result)
                pfocr_entry["nodes"] = matched_q_nodes
                pfocr_entry["score"] = score
                bte_result.setdefault("pfocr_notebook", []).append(pfocr_entry)
print(overlaps_2_plus_count)

In [None]:
import pandas as pd


kg_nodes = bte_message["knowledge_graph"]["nodes"]

# Flatten the results into rows: one row per (result, figure) pair, or a
# single figure-less row for a result that has no figures.
bte_rows = []
for bte_result in bte_results:
    # Columns shared by every row produced for this result.
    bte_row_template = {}
    for q_node_id, bindings in bte_result["node_bindings"].items():
        bound_ids = [binding["id"] for binding in bindings]
        bte_row_template[q_node_id] = ",".join(
            kg_nodes[node_id]["name"] for node_id in bound_ids
        )
        # just taking the first one for now
        bte_row_template[q_node_id + "_identifier"] = bound_ids[0]

    bte_row_template["score"] = bte_result["score"]

    figure_rows = []
    # Figures attached under the "pfocr" key of the result.
    for pfocr_result in bte_result.get("pfocr") or []:
        figure_rows.append({
            **bte_row_template,
            "figure_url": pfocr_result["figureUrl"],
            "pmc": pfocr_result["pmc"],
            "nodes": pfocr_result["nodes"],
        })
    # Figures attached by this notebook under the "pfocr_notebook" key.
    for pfocr_result in bte_result.get("pfocr_notebook") or []:
        figure_rows.append({
            **bte_row_template,
            "figure_title_notebook": pfocr_result["associatedWith"]["title"],
            "figid_notebook": pfocr_result["_id"],
            "figure_url_notebook": pfocr_result["associatedWith"]["figureUrl"],
            "pfocr_score_notebook": pfocr_result["score"],
        })

    # Keep a row for every result, including one whose figure lists are
    # present but empty, so no result silently drops out of the table.
    bte_rows.extend(figure_rows or [bte_row_template])

bte_df = pd.DataFrame(bte_rows)
bte_df

In [None]:
# Distinct non-null figure URLs: first the "figure_url" column, then
# "figure_url_notebook".
for url_column in ("figure_url", "figure_url_notebook"):
    distinct_urls = bte_df[url_column].dropna().drop_duplicates()
    print(len(distinct_urls))

In [None]:
# Distinct (n0, n1, n2) identifier triples that have at least one figure,
# first for "figure_url", then for "figure_url_notebook".
identifier_columns = ["n0_identifier", "n1_identifier", "n2_identifier"]
for url_column in ("figure_url", "figure_url_notebook"):
    has_figure = bte_df[url_column].notnull()
    distinct_triples = bte_df.loc[has_figure, identifier_columns].drop_duplicates()
    print(len(distinct_triples))

In [None]:
# Count the non-null "figure_url" values per (n0, n1, n2) label triple and
# list the triples with the most figures first.
figure_counts = (
    bte_df
    .groupby(["n0", "n1", "n2"])[["figure_url"]]
    .count()
    .rename(columns={"figure_url": "figure_count"})
)
figure_counts.sort_values("figure_count", ascending=False)

## Display some figures

In [None]:
# Import everything from the public IPython.display module, including
# `display`, rather than relying on the name the kernel injects.
from IPython.display import HTML, Image, display


# One row per distinct (url, title) pair among the notebook-matched figures.
notebook_figures = bte_df[
    bte_df["figure_url_notebook"].notnull() & bte_df["figure_title_notebook"].notnull()
][["figure_url_notebook", "figure_title_notebook"]].drop_duplicates()

for _, figure_row in notebook_figures.iterrows():
    display(Image(url=figure_row["figure_url_notebook"]))
    print(figure_row["figure_title_notebook"])
    # Three blank lines separate consecutive figures.
    print("")
    print("")
    print("")
