In [1]:
from copy import copy
import json
import pandas as pd
import textwrap

# BTE Sleeve

Some queries result in an explosion of the number of results. This is bad for performance of the server and also not very useful to the user. This notebook explores using PFOCR as a "compression sleeve" to dampen these types of explosions. I don't have a CC-licensed image of a compression sleeve, so instead here's an image of a heat-shrink tube.

<a title="oomlout, CC BY-SA 2.0 &lt;https://creativecommons.org/licenses/by-sa/2.0&gt;, via Wikimedia Commons" href="https://commons.wikimedia.org/wiki/File:HESH-06-L-STAN-01_(9601242883).jpg"><img width="512" alt="HESH-06-L-STAN-01 (9601242883)" src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/HESH-06-L-STAN-01_%289601242883%29.jpg/512px-HESH-06-L-STAN-01_%289601242883%29.jpg"></a>

Let's try BTE Sleeve on the test query [A.2_RHOBTB2_twohop](https://github.com/NCATSTranslator/minihackathons/blob/main/2021-12_demo/workflowA/A.2_RHOBTB2_twohop.json):

`RHOBTB2 Gene`--entity_regulates_entity|genetically_interacts_with--`Any Gene`--related_to--`Any SmallMolecule`

`n0`--e01--`n1`--e02--`n2`


Note BTE Sleeve currently only works on queries with nodes of categories supported by PFOCR, e.g., `biolink:Gene`, `biolink:SmallMolecule`, `biolink:Disease`. TODO: update the Fisher's Exact test to work with node categories not in PFOCR.

Also, BTE Sleeve currently assumes `n0` has an `ids` parameter. TODO: update to handle `ids` params on any q_node.

In [2]:
import requests
import requests_cache


requests_cache.install_cache("pfocr_cache", allowable_methods=['GET', 'POST'])

## Get BTE TRAPI Results

In [41]:
query = {
    "message": {
        "query_graph": {
            "edges": {
                "e01": {
                    "object": "n0",
                    "subject": "n1",
                    "predicates": [
                        "biolink:entity_regulates_entity",
                        "biolink:genetically_interacts_with"
                    ]
                },
                "e02": {
                    "object": "n1",
                    "subject": "n2",
                    "predicates": [
                        "biolink:related_to"
                    ]
                }
            },
            "nodes": {
                "n0": {
                    "ids": [
                        "NCBIGene:23221"
                    ],
                    "categories": [
                        "biolink:Gene"
                    ]
                },
                "n1": {
                    "categories": [
                        "biolink:Gene"
                    ]
                },
                "n2": {
                    "categories": [
                        "biolink:SmallMolecule"
                    ]
                }
            }
        }
    }
}

trapi_response = requests.post("http://localhost:3000/v1/query", json=query)
#trapi_response = requests.post("https://api.bte.ncats.io/v1/query", json=query)
#trapi_response = requests.post("https://arax.ncats.io/api/rtxkg2/v1.2/query", json=query)
#trapi_response = requests.post("https://arax.ncats.io/api/arax/v1.2/query", json=query)
print(trapi_response.status_code)
if trapi_response.status_code != 200:
    print(trapi_response.text)
if trapi_response.from_cache:
    print(f"got response from cache")

trapi_message = trapi_response.json()["message"]

#print(json.dumps(query))

200
got response from cache


In [4]:
# note we're also including any categories from the results
curie_categories = set()
for v in  query["message"]["query_graph"]["nodes"].values():
    if "categories" in v:
        for category in v["categories"]:
            curie_categories.add(category)

In [5]:
# for genes/gene products, chemicals and diseases
preferred_prefixes = set(["NCBIGene", "MESH"])
unified_names = set()
unified_curies = set()
unification_failed_curies = set()
unified_prefixes = set()
all_prefixes = set()
curie_to_name = dict()
curie_to_unified_curie = dict()
for k, v in trapi_message["knowledge_graph"]["nodes"].items():
    name = v["name"]
    for category in v["categories"]:
        curie_categories.add(category)
        
    for a in v["attributes"]:
        if a["attribute_type_id"] == "biolink:xref":
            curies = a["value"]
            
            # k should always be one of the curies
            if not k in curies:
                raise Exception(f"key {k} not in {curies}")
            
            unified_curie = None
            intersecting_unified_curies = unified_curies.intersection(set(curies))
            if len(intersecting_unified_curies) > 1:
                raise Exception(f"matching multiple: {k} to {list(intersecting_unified_curies)}")
            elif len(intersecting_unified_curies) == 1:
                unified_curie = list(intersecting_unified_curies)[0]
            else:
                # get curie for preferred prefix. usually this is k, but not always.
                for curie in curies:
                    [prefix, identifier] = curie.split(":")
                    if prefix in preferred_prefixes:
                        unified_curie = curie
                        unified_curies.add(unified_curie)
                        break
                        
            if not unified_curie:
                if k in curie_to_unified_curie:
                    unified_curie = curie_to_unified_curie[k]
                else:
                    unification_failed_curies.add(k)
                    break
                    
                #raise Exception(f"failed to find a unified curie for {k} in {curies}")
                
            [unified_prefix, unified_identifier] = unified_curie.split(":")
            unified_prefixes.add(unified_prefix)
            
            for curie in curies:
                [prefix, identifier] = curie.split(":")
                all_prefixes.add(prefix)
                if not curie in curie_to_unified_curie:
                    curie_to_unified_curie[curie] = unified_curie
                if not curie in curie_to_name:
                    curie_to_name[curie] = name
                    unified_names.add(name)
                elif curie_to_name[curie] != name:
                    print(f"curie {curie} has multiple primary names: {curie_to_name[curie]} and {name}")
                    #raise Exception(f"curie{curie} has multiple names: {curie_to_name[curie]} and {name}")


print(f'curie_categories: {curie_categories}')
print("")
print(f"curie_to_name key count: {len(curie_to_name.keys())}")
print(f"curie_to_unified_curie key count: {len(curie_to_unified_curie.keys())}")
print("")
print("all curie prefixes found:")
print(all_prefixes)
print("")
print("unified curie prefixes found:")
print(unified_prefixes)
print("")
print(f"failed to unify {len(unification_failed_curies)} curies")

curie_categories: {'biolink:Gene', 'biolink:ComplexMolecularMixture', 'biolink:Polypeptide', 'biolink:MolecularMixture', 'biolink:ChemicalEntity', 'biolink:SmallMolecule', 'biolink:Protein'}

curie_to_name key count: 13680
curie_to_unified_curie key count: 13680

all curie prefixes found:
{'INCHIKEY', 'GTOPDB', 'KEGG.COMPOUND', 'OMIM', 'CAS', 'UMLS', 'UniProtKB', 'PUBCHEM.COMPOUND', 'PR', 'MESH', 'CHEMBL.COMPOUND', 'CHEBI', 'DrugCentral', 'HGNC', 'UNII', 'HMDB', 'NCBIGene', 'ENSEMBL', 'DRUGBANK'}

unified curie prefixes found:
{'MESH', 'NCBIGene'}

failed to unify 446 curies


In [6]:
columns = []
q_node_id_keys = set(["object", "subject"])
q_node_ids = []
q_edge_ids = []
for q_edge_id,edge_v in query["message"]["query_graph"]["edges"].items():
    q_edge_ids.append(q_edge_id)
    q_node_id_found = False
    for k,v in edge_v.items():
        if (k in q_node_id_keys) and (type(v) is str):
            if v not in columns:
                q_node_ids.append(v)
                columns.append(v)
            if not q_node_id_found:
                q_node_id_found = True
                columns.append(q_edge_id)
                
for q_node_id in q_node_ids:
    columns.append(f"{q_node_id}_curie")
    columns.append(f"{q_node_id}_unified_curie")
    
columns.append("trapi_result_curie_combo")
    
print(columns)
print(q_node_ids)
print(q_edge_ids)

['n0', 'e01', 'n1', 'e02', 'n2', 'n0_curie', 'n0_unified_curie', 'n1_curie', 'n1_unified_curie', 'n2_curie', 'n2_unified_curie', 'trapi_result_curie_combo']
['n0', 'n1', 'n2']
['e01', 'e02']


In [7]:
trapi_result_columns = []
for i in range(len(q_node_ids)):
    trapi_result_columns.append(q_node_ids[i])
    if i < len(q_edge_ids):
        trapi_result_columns.append(q_edge_ids[i])
print(trapi_result_columns)

['n0', 'e01', 'n1', 'e02', 'n2']


In [8]:
from copy import deepcopy


trapi_results = trapi_message["results"]

result_row_data = []
for trapi_result in trapi_results:
    curie_to_qnode_ids = dict()
    for qnode_id, entries in trapi_result["node_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            if curie not in curie_to_qnode_ids:
                curie_to_qnode_ids[curie] = []
            curie_to_qnode_ids[curie].append(qnode_id)
        
    row_data_template = dict()
    q_edge_id_to_predicates = dict()
    trapi_result_curies = set()
    for qedge_id, entries in trapi_result["edge_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            kg_entry = trapi_message["knowledge_graph"]["edges"][curie]
            subject_curie = kg_entry["subject"]
            object_curie = kg_entry["object"]
            predicate_curie = kg_entry["predicate"]
            [predicate_prefix, predicate_identifier] = predicate_curie.split(":")
            
            if qedge_id not in q_edge_id_to_predicates:
                q_edge_id_to_predicates[qedge_id] = set()
            q_edge_id_to_predicates[qedge_id].add(predicate_identifier)

            for curie in [subject_curie, object_curie]:
                for qnode_id in curie_to_qnode_ids[curie]:
                    if curie in curie_to_unified_curie:
                        unified_curie = curie_to_unified_curie[curie]
                    else:
                        break

                    name = curie_to_name[curie]
                    row_data_template[qnode_id] = name

                    trapi_result_curies.add(unified_curie)
                    row_data_template[qnode_id + "_curie"] = curie
                    row_data_template[qnode_id + "_unified_curie"] = unified_curie
                    
    if len(trapi_result_curies) != len(q_node_ids):
        #print(f'skipping {list(curie_to_qnode_ids.keys())}')
        continue
        
    row_data_template["trapi_result_curie_combo"] = tuple(sorted(trapi_result_curies))
    q_edge_ids_processed = set()
    row_datas = [row_data_template]
    for q_edge_id,predicates in q_edge_id_to_predicates.items():
        next_row_datas = []
        for row_data in row_datas:
            for predicate in predicates:
                next_row_data = deepcopy(row_data)
                next_row_data[q_edge_id] = predicate
                next_row_datas.append(
                    next_row_data 
                )
        row_datas = next_row_datas
    result_row_data += row_datas
        
print("warning: predicate direction(s) may be switched")
results_df = pd.DataFrame.from_records(result_row_data, columns=columns).drop_duplicates()
results_df



Unnamed: 0,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,trapi_result_curie_combo
0,RHOBTB2,entity_negatively_regulates_entity,STK11,interacts_with,4-Hydroxy-3-methoxycinnamaldehyde,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:5280536,MESH:C075384,"(MESH:C075384, NCBIGene:23221, NCBIGene:6794)"
1,RHOBTB2,entity_negatively_regulates_entity,STK11,coexists_with,Ditiomustine,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:127547,MESH:C033713,"(MESH:C033713, NCBIGene:23221, NCBIGene:6794)"
2,RHOBTB2,entity_negatively_regulates_entity,STK11,interacts_with,Benzene Derivatives,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,UMLS:C0005037,MESH:D001555,"(MESH:D001555, NCBIGene:23221, NCBIGene:6794)"
3,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Puerarin,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:5281807,MESH:C033607,"(MESH:C033607, NCBIGene:23221, NCBIGene:6794)"
4,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_positively_regulates_entity,"3-Chloro-1,2-propanediol",NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:7290,MESH:D000517,"(MESH:D000517, NCBIGene:23221, NCBIGene:6794)"
...,...,...,...,...,...,...,...,...,...,...,...,...
5271,RHOBTB2,genetically_interacts_with,CUL3,interacts_with,Hydrogen peroxide,NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,PUBCHEM.COMPOUND:784,MESH:D006861,"(MESH:D006861, NCBIGene:23221, NCBIGene:8452)"
5272,RHOBTB2,genetically_interacts_with,CUL3,entity_negatively_regulated_by_entity,MicroRNAs,NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,UMLS:C1101610,MESH:D035683,"(MESH:D035683, NCBIGene:23221, NCBIGene:8452)"
5273,RHOBTB2,genetically_interacts_with,CUL3,entity_positively_regulated_by_entity,Eloxatin (TN),NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,PUBCHEM.COMPOUND:9887054,MESH:C030110,"(MESH:C030110, NCBIGene:23221, NCBIGene:8452)"
5274,RHOBTB2,genetically_interacts_with,CUL3,decreases_expression_of,Folic acid,NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,PUBCHEM.COMPOUND:135398658,MESH:D005492,"(MESH:D005492, NCBIGene:23221, NCBIGene:8452)"


In [9]:
for q_node_id in q_node_ids:
    print(len(set(results_df[q_node_id])))

1
10
1108


In [10]:
len(set(results_df["trapi_result_curie_combo"]))

3738

Total results count (excluding any that couldn't be unified): 3,738. That's too many for a researcher to efficiently go through manually, so let's try filtering to make that more manageable.

## Match up BTE TRAPI Results & PFOCR

We're going to try using PFOCR to filter and prioritize the results.

First we need to get the PFOCR Data. We could get it from the API, but for now, we'll just go ahead and download the entire JSON file we gave to BTE.

In [11]:
pfocr_url = "https://www.dropbox.com/s/1f14t5zaseocyg6/bte_chemicals_diseases_genes.ndjson?dl=1"
pfocr_request = requests.get(pfocr_url)
print(f"status_code: {pfocr_request.status_code}")
if pfocr_request.status_code != 200:
    print(pfocr_request.text)

status_code: 200


In [12]:
curies_to_figure_ids = {}
figure_id_to_curies = {}
figure_id_to_pfocr_result = {}
for line in pfocr_request.text.splitlines():
    pfocr_result = json.loads(line)
    figure_id = pfocr_result["_id"]

    curies = set()
    for identifier in pfocr_result["associatedWith"]["mentions"]["chemicals"]["mesh"]:
        curie = "MESH:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)
    for identifier in pfocr_result["associatedWith"]["mentions"]["diseases"]["mesh"]:
        curie = "MESH:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)
    for identifier in pfocr_result["associatedWith"]["mentions"]["genes"]["ncbigene"]:
        curie = "NCBIGene:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)

    figure_id_to_pfocr_result[figure_id] = pfocr_result
    figure_id_to_curies[figure_id] = curies

    curies_key = tuple(sorted(curies))
    if curies_key not in curies_to_figure_ids:
        curies_to_figure_ids[curies_key] = []
    curies_to_figure_ids[curies_key].append(figure_id)

In [13]:
from SetSimilaritySearch import SearchIndex

# the reference sets are supposed to be from PFOCR only
reference_sets = list()
for curies in set(curies_to_figure_ids.keys()):
    reference_sets.append(set(curies))
# but in order to calculate the scores correctly,
# the SetSimilaritySearch library requires that
# every curie from the bte results need to be
# mentioned at least once in the reference set.
# That's the only reason we add them below, as
# one large set.

unified_curie_columns = [
    q_node_id + "_unified_curie" for q_node_id in q_node_ids
]

trapi_result_curie_combos = set()
for i, df in results_df[unified_curie_columns].drop_duplicates().iterrows():
    trapi_result_curies = []
    for unified_curie_column in unified_curie_columns:
        trapi_result_curies.append(
            df[unified_curie_column]
        )
    trapi_result_curie_combos.add(tuple(sorted(
        trapi_result_curies
    )))
    
reference_set = set()
for trapi_result_curie_combo in trapi_result_curie_combos:
    reference_set |= set(trapi_result_curie_combo)
reference_sets.append(reference_set)

matchable_node_min = 2
matchable_node_count = len(q_node_ids)
index = SearchIndex(reference_sets, similarity_func_name="containment", 
    similarity_threshold=matchable_node_min/matchable_node_count)

trapi_figure_overlap_rows = []
for trapi_result_curie_combo in trapi_result_curie_combos:
    trapi_curies = set(trapi_result_curie_combo)
    results = index.query(trapi_curies)
    for result in results:
        figure_curie_combos = reference_sets[result[0]]
        figure_curie_key = tuple(sorted(figure_curie_combos))
        
        # needed to not match bte results to themselves
        if figure_curie_key in curies_to_figure_ids:
            figure_ids = curies_to_figure_ids[figure_curie_key]
        else:
            continue
            
        score = result[1]
        
        common_curies = trapi_curies.intersection(figure_curie_combos)
        for figure_id in figure_ids:
            pfocr_result = figure_id_to_pfocr_result[figure_id]
            trapi_figure_overlap_rows.append({
                "figure_id": figure_id,
                "figure_url": pfocr_result["associatedWith"]["figureUrl"],
                "figure_title": pfocr_result["associatedWith"]["title"],
                "trapi_result_curie_combo": trapi_result_curie_combo,
                "figure_curie_combo": figure_curie_key,
                "overlap_curie_combo": tuple(sorted(common_curies)),
                "score": score,
            })


trapi_figure_overlap_df = pd.DataFrame.from_records(trapi_figure_overlap_rows)
trapi_figure_overlap_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score
0,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C000706872, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667
1,PMC7642974__fphys-11-01022-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Action and classification of E3 ubiquitin liga...,"(MESH:C580998, NCBIGene:23221, NCBIGene:8452)","(MESH:C017937, MESH:C039244, MESH:C119604, MES...","(NCBIGene:23221, NCBIGene:8452)",0.666667
2,PMC7642974__fphys-11-01022-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Action and classification of E3 ubiquitin liga...,"(MESH:D013395, NCBIGene:23221, NCBIGene:8452)","(MESH:C017937, MESH:C039244, MESH:C119604, MES...","(NCBIGene:23221, NCBIGene:8452)",0.666667
3,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C568210, NCBIGene:22836, NCBIGene:23221)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:22836, NCBIGene:23221)",0.666667
4,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:D024506, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667
...,...,...,...,...,...,...,...
2167,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C545373, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667
2168,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C522973, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667
2169,PMC8113272__41598_2021_89476_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,Genomic network analysis showing the central r...,"(MESH:D014212, NCBIGene:23221, NCBIGene:595)","(MESH:C063610, MESH:C086827, MESH:C103303, MES...","(MESH:D014212, NCBIGene:595)",0.666667
2170,PMC3618522__nihms433145f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Molecular Pathways: Current Role and Future Di...,"(MESH:D014212, NCBIGene:23221, NCBIGene:595)","(MESH:C059630, MESH:D014212, MESH:D014801, NCB...","(MESH:D014212, NCBIGene:595)",0.666667


In [14]:
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 2]["trapi_result_curie_combo"]
)))
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 3]["trapi_result_curie_combo"]
)))
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 4]["trapi_result_curie_combo"]
)))
print(len(set(trapi_figure_overlap_df["figure_id"])))

887
0
0
695


### trapi_curie_combo_count_by_figure_df

In [15]:
trapi_curie_combo_count_by_figure_df = trapi_figure_overlap_df[["figure_id", "trapi_result_curie_combo"]].rename(columns={
    "trapi_result_curie_combo": "trapi_curie_combo_count"
}).groupby("figure_id").count().sort_values(
    by="trapi_curie_combo_count", ascending=False)
trapi_curie_combo_count_by_figure_df

Unnamed: 0_level_0,trapi_curie_combo_count
figure_id,Unnamed: 1_level_1
PMC6089851__gr1.jpg,326
PMC7642974__fphys-11-01022-g001.jpg,174
PMC4706528__cbm-12-04-342-f1.jpg,30
PMC5820294__fonc-08-00038-g001.jpg,21
PMC2861144__nihms195501f1.jpg,19
...,...
PMC5550862__cddis2017315f8.jpg,1
PMC5546324__nihms866088f6.jpg,1
PMC3811010__nihms-496184-f0004.jpg,1
PMC5543040__fendo-08-00187-g001.jpg,1


### figure count/score by trapi curie combo

In [16]:
trapi_figure_overlap_df[["trapi_result_curie_combo", "figure_id"]].rename(columns={
    "figure_id": "figure_count"
}).groupby("trapi_result_curie_combo").count().sort_values(
    by="figure_count", ascending=False).head(10)

Unnamed: 0_level_0,figure_count
trapi_result_curie_combo,Unnamed: 1_level_1
"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)",123
"(MESH:D000255, NCBIGene:23221, NCBIGene:6794)",107
"(MESH:D008687, NCBIGene:23221, NCBIGene:6794)",57
"(MESH:D020123, NCBIGene:23221, NCBIGene:6794)",51
"(NCBIGene:23221, NCBIGene:5111, NCBIGene:595)",42
"(MESH:D009243, NCBIGene:23221, NCBIGene:6794)",42
"(MESH:D002118, NCBIGene:23221, NCBIGene:595)",40
"(MESH:D002118, NCBIGene:23221, NCBIGene:6794)",33
"(MESH:D012694, NCBIGene:23221, NCBIGene:595)",31
"(MESH:D012694, NCBIGene:23221, NCBIGene:6794)",23


In [17]:
trapi_result_figure_score_df = trapi_figure_overlap_df[["trapi_result_curie_combo", "score"]].rename(
    columns={"score": "cumulative_trapi_result_curie_combo_figure_score"}
).groupby("trapi_result_curie_combo").sum().sort_values(
    by="cumulative_trapi_result_curie_combo_figure_score", ascending=False)
trapi_result_figure_score_df.head(10)

Unnamed: 0_level_0,cumulative_trapi_result_curie_combo_figure_score
trapi_result_curie_combo,Unnamed: 1_level_1
"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)",82.0
"(MESH:D000255, NCBIGene:23221, NCBIGene:6794)",71.333333
"(MESH:D008687, NCBIGene:23221, NCBIGene:6794)",38.0
"(MESH:D020123, NCBIGene:23221, NCBIGene:6794)",34.0
"(NCBIGene:23221, NCBIGene:5111, NCBIGene:595)",28.0
"(MESH:D009243, NCBIGene:23221, NCBIGene:6794)",28.0
"(MESH:D002118, NCBIGene:23221, NCBIGene:595)",26.666667
"(MESH:D002118, NCBIGene:23221, NCBIGene:6794)",22.0
"(MESH:D012694, NCBIGene:23221, NCBIGene:595)",20.666667
"(MESH:D012694, NCBIGene:23221, NCBIGene:6794)",15.333333


In [18]:
results_with_figures_df = trapi_figure_overlap_df.merge(
    results_df,
    on="trapi_result_curie_combo",
    how="left").merge(
    trapi_result_figure_score_df.reset_index(),
    on="trapi_result_curie_combo",
    how="left",
    validate="many_to_one"
).merge(
    trapi_curie_combo_count_by_figure_df.reset_index().rename(columns={
        "trapi_curie_combo_count": "trapi_curie_combo_count_by_figure"
    }),
    on="figure_id",
    how="left",
    validate="many_to_one"
).sort_values(
    by="cumulative_trapi_result_curie_combo_figure_score", ascending=False
)

results_with_figures_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,cumulative_trapi_result_curie_combo_figure_score,trapi_curie_combo_count_by_figure
3175,PMC3515750__cr2012108f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,AMPK is phosphorylated and activated by LKB1 i...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C000589078, MESH:D000249, MESH:D005978, ...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,1
3046,PMC7538784__fphar-11-558474-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Current mechanisms proposed for metformin in p...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D010146, NCBIGene:10645, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,1
3295,PMC2815088__nihms-136354-f0005.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Hexosamine Signaling Pathway: O-GlcNAc cycling...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D000255, NCBIGene:10000, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
3294,PMC2815088__nihms-136354-f0005.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Hexosamine Signaling Pathway: O-GlcNAc cycling...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D000255, NCBIGene:10000, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_positively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
3293,PMC1817805__tjp0574-0063-f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,AMPK and cell proliferation – AMPK as a therap...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D000255, NCBIGene:10000, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,correlated_with,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C423915, NCBIGene:22836, NCBIGene:23221)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:22836, NCBIGene:23221)",0.666667,RHOBTB2,genetically_interacts_with,RHOBTB3,resistance_associated_with,"N-(2,2,2-Trifluoroethyl)-N-{4-[2,2,2-trifluoro...",NCBIGene:23221,NCBIGene:23221,NCBIGene:22836,NCBIGene:22836,PUBCHEM.COMPOUND:447912,MESH:C423915,0.666667,326
1708,PMC6858516__nihms-1054030-f0001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Towards precision oncology in advanced prostat...,"(MESH:C089740, NCBIGene:23221, NCBIGene:595)","(MESH:C035086, MESH:C089740, MESH:C540278, MES...","(MESH:C089740, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,entity_positively_regulates_entity,Abiraterone,NCBIGene:23221,NCBIGene:23221,NCBIGene:595,NCBIGene:595,PUBCHEM.COMPOUND:132971,MESH:C089740,0.666667,9
1707,PMC7522543__WJCO-11-679-g006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,The prostate cancer pathway,"(MESH:D013196, NCBIGene:23221, NCBIGene:595)","(MESH:C111716, MESH:D009369, MESH:D013196, MES...","(MESH:D013196, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,entity_negatively_regulates_entity,Androstanolone,NCBIGene:23221,NCBIGene:23221,NCBIGene:595,NCBIGene:595,PUBCHEM.COMPOUND:10635,MESH:D013196,0.666667,2
1706,PMC7522543__WJCO-11-679-g006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,The prostate cancer pathway,"(MESH:D013196, NCBIGene:23221, NCBIGene:595)","(MESH:C111716, MESH:D009369, MESH:D013196, MES...","(MESH:D013196, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,entity_positively_regulates_entity,Androstanolone,NCBIGene:23221,NCBIGene:23221,NCBIGene:595,NCBIGene:595,PUBCHEM.COMPOUND:10635,MESH:D013196,0.666667,2


## Fisher's Exact Test

In [19]:
all_genes = set()
gene_instance_count = 0
all_chemicals = set()
chemical_instance_count = 0
all_diseases = set()
disease_instance_count = 0
for figure_id, pfocr_result in figure_id_to_pfocr_result.items():
    genes = pfocr_result["associatedWith"]["mentions"]["genes"]["ncbigene"]
    for gene in genes:
        gene_instance_count += 1
        all_genes.add(gene)
        
    chemicals = pfocr_result["associatedWith"]["mentions"]["chemicals"]["mesh"]
    for chemical in chemicals:
        chemical_instance_count += 1
        all_chemicals.add(chemical)
        
    diseases = pfocr_result["associatedWith"]["mentions"]["diseases"]["mesh"]
    for disease in diseases:
        disease_instance_count += 1
        all_diseases.add(disease)
    
print("total instance counts:")
print(gene_instance_count)
print(chemical_instance_count)
print(disease_instance_count)

print("")

print("unique counts:")
print(len(all_genes))
print(len(all_chemicals))
print(len(all_diseases))

print("")

curie_category_to_curies = {
    "biolink:ChemicalEntity": all_chemicals,
    "biolink:SmallMolecule": all_chemicals,
    "biolink:Disease": all_diseases,
    "biolink:Gene": all_genes,
}


all_figure_curies = set()
for curie_category in set(curie_category_to_curies.keys()).intersection(curie_categories):
    all_figure_curies |= curie_category_to_curies[curie_category]
all_figure_curie_count = len(all_figure_curies)

print(all_figure_curie_count)

total instance counts:
1369680
275456
20465

unique counts:
14253
14482
1430

28735


In [20]:
# trapi_curies_in_pwy
results_with_figures_df["yes_pwy_yes_trapi"] = (
    results_with_figures_df["overlap_curie_combo"].map(len)
)

# q_node_id_count - trapi_curies_in_pwy
results_with_figures_df["no_pwy_yes_trapi"] = (
    len(q_node_ids) - results_with_figures_df["yes_pwy_yes_trapi"]
)

# curies_in_pwy - trapi_curies_in_pwy
results_with_figures_df["yes_pwy_no_trapi"] = (
    results_with_figures_df["figure_curie_combo"].map(len) - 
    results_with_figures_df["yes_pwy_yes_trapi"]
)

# 28735 - curies_in_pwy - q_node_id_count - trapi_curies_in_pwy
results_with_figures_df["no_pwy_no_trapi"] = (
    all_figure_curie_count -
    results_with_figures_df["figure_curie_combo"].map(len) -
    len(q_node_ids) -
    results_with_figures_df["yes_pwy_yes_trapi"]
)

In [21]:
import scipy.stats as stats


results_with_figures_df["fishers_exact"] = results_with_figures_df[
    ["yes_pwy_yes_trapi",
     "yes_pwy_no_trapi",
     "no_pwy_yes_trapi",
     "no_pwy_no_trapi"]
].apply(
    lambda r: stats.fisher_exact([[
        r.yes_pwy_yes_trapi,
        r.yes_pwy_no_trapi
    ], [
        r.no_pwy_yes_trapi,
        r.no_pwy_no_trapi,
    ]]),
    axis=1)

results_with_figures_df["p_value"] = results_with_figures_df["fishers_exact"].apply(
    lambda x: x[1]
)

In [22]:
results_with_figures_df["p_value"].sort_values()

4039    2.180600e-08
1448    1.090224e-07
1446    1.090224e-07
1447    1.090224e-07
1445    1.090224e-07
            ...     
308     4.788013e-04
3072    4.788013e-04
3073    4.788013e-04
3074    4.788013e-04
1748    4.788013e-04
Name: p_value, Length: 4348, dtype: float64

In [23]:
results_with_figures_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score,n0,e01,n1,...,n2_curie,n2_unified_curie,cumulative_trapi_result_curie_combo_figure_score,trapi_curie_combo_count_by_figure,yes_pwy_yes_trapi,no_pwy_yes_trapi,yes_pwy_no_trapi,no_pwy_no_trapi,fishers_exact,p_value
3175,PMC3515750__cr2012108f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,AMPK is phosphorylated and activated by LKB1 i...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C000589078, MESH:D000249, MESH:D005978, ...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,1,2,1,14,28714,"(4102.0, 8.719769682728204e-07)",8.719770e-07
3046,PMC7538784__fphar-11-558474-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Current mechanisms proposed for metformin in p...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D010146, NCBIGene:10645, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,1,2,1,41,28687,"(1399.3658536585365, 6.557514198398454e-06)",6.557514e-06
3295,PMC2815088__nihms-136354-f0005.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Hexosamine Signaling Pathway: O-GlcNAc cycling...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D000255, NCBIGene:10000, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,1,52,28676,"(1102.923076923077, 1.0389153086920265e-05)",1.038915e-05
3294,PMC2815088__nihms-136354-f0005.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Hexosamine Signaling Pathway: O-GlcNAc cycling...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D000255, NCBIGene:10000, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,1,52,28676,"(1102.923076923077, 1.0389153086920265e-05)",1.038915e-05
3293,PMC1817805__tjp0574-0063-f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,AMPK and cell proliferation – AMPK as a therap...,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:D000249, MESH:D000255, NCBIGene:10000, N...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,1,44,28684,"(1303.8181818181818, 7.515563963719965e-06)",7.515564e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C423915, NCBIGene:22836, NCBIGene:23221)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:22836, NCBIGene:23221)",0.666667,RHOBTB2,genetically_interacts_with,RHOBTB3,...,PUBCHEM.COMPOUND:447912,MESH:C423915,0.666667,326,2,1,153,28575,"(373.52941176470586, 8.644554850490196e-05)",8.644555e-05
1708,PMC6858516__nihms-1054030-f0001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Towards precision oncology in advanced prostat...,"(MESH:C089740, NCBIGene:23221, NCBIGene:595)","(MESH:C035086, MESH:C089740, MESH:C540278, MES...","(MESH:C089740, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,...,PUBCHEM.COMPOUND:132971,MESH:C089740,0.666667,9,2,1,26,28702,"(2207.846153846154, 2.745962336039939e-06)",2.745962e-06
1707,PMC7522543__WJCO-11-679-g006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,The prostate cancer pathway,"(MESH:D013196, NCBIGene:23221, NCBIGene:595)","(MESH:C111716, MESH:D009369, MESH:D013196, MES...","(MESH:D013196, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,...,PUBCHEM.COMPOUND:10635,MESH:D013196,0.666667,2,2,1,96,28632,"(596.5, 3.447181364924254e-05)",3.447181e-05
1706,PMC7522543__WJCO-11-679-g006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,The prostate cancer pathway,"(MESH:D013196, NCBIGene:23221, NCBIGene:595)","(MESH:C111716, MESH:D009369, MESH:D013196, MES...","(MESH:D013196, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,...,PUBCHEM.COMPOUND:10635,MESH:D013196,0.666667,2,2,1,96,28632,"(596.5, 3.447181364924254e-05)",3.447181e-05


## Analysis of results with vs. without BTE Sleeve

With the current query graph, requiring co-occurrence of at least 2 nodes in at least one PFOCR figure cuts the number of TRAPI result curie combos down to 887 from 3,738.

When requiring 3 nodes, it went down all the way to 0, but I've seen other queries where that yield 9 to 30 results.

Total number of curie combos returned:

In [24]:
len(results_df["trapi_result_curie_combo"].drop_duplicates())

3738

TRAPI result curie combos where at least `x` curies are also found in at least one figure:

In [25]:
for i in range(2, len(q_node_ids) + 1):
    curie_combo_count = len(set(
        trapi_figure_overlap_df[
            trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= i
        ]["trapi_result_curie_combo"]
    ))
    print(f'when {i}+ overlapping curies: {curie_combo_count}')

when 2+ overlapping curies: 887
when 3+ overlapping curies: 0


curie combos where curies for each pair of nodes are found in at least one figure:

In [26]:
from itertools import combinations


overlap_columns = results_with_figures_df["overlap_curie_combo"].apply(pd.Series)

for (a, b) in combinations(q_node_ids, 2):
    overlap_df = results_with_figures_df[((
        results_with_figures_df[f"{a}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{a}_unified_curie"] == overlap_columns[1]
    )) & ((
        results_with_figures_df[f"{b}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{b}_unified_curie"] == overlap_columns[1]
    ))]
    print(f'{a} & {b}')
    print(f'  {len(overlap_df["trapi_result_curie_combo"].drop_duplicates())} TRAPI result(s)')
    print(f'  {len(overlap_df["figure_id"].drop_duplicates())} figure(s)')

n0 & n1
  491 TRAPI result(s)
  2 figure(s)
n0 & n2
  9 TRAPI result(s)
  1 figure(s)
n1 & n2
  417 TRAPI result(s)
  693 figure(s)


## View TRAPI results with figures

Note you can click the figures to go to the paper.

Compare these results with the `results_df` table earlier in this notebook and also with the [ARAX UI](https://arax.ncats.io/?r=44922) (click "Load" and then "Results".).

Does requiring co-occurrence for specific nodes like `n0` and `n2` help? Does ranking by p-value help?

### By lowest p-value

In [27]:
lowest_p_value_df = results_with_figures_df.sort_values(
    "p_value"
)

#### By TRAPI result

In [42]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in lowest_p_value_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <a target="_blank" href="{paper_url}">
                <img src="{figure_url}" style="height: {row_height}">
              </a>
              <br>
              <figcaption style="font-size: small;">
                  {figure_title_limited}<br>
                  p-value: {p_value:.1e}, 
                  containment: {score:.2}
              </figcaption>
            </figure>
        ''')
        
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(lowest_p_value_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 10 more figures



...

plus 882 more TRAPI results



#### By figure

In [29]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in lowest_p_value_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(lowest_p_value_df["figure_id"])) - limit} more figures')
        print("")
        break








...

plus 692 more figures



### co-occurrence: `n0` and (`n1` or `n2`)

In [30]:
n0_cooccurrence_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: r.n0_unified_curie in set(r.overlap_curie_combo),
    axis=1
)].sort_values(
    "p_value"
)

#### By TRAPI result

In [43]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <a target="_blank" href="{paper_url}">
                <img src="{figure_url}" style="height: {row_height}">
              </a>
              <br>
              <figcaption style="font-size: small;">
                  {figure_title_limited}<br>
                  p-value: {p_value:.1e}, 
                  containment: {score:.2}
              </figcaption>
            </figure>
        ''')
        
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(n0_cooccurrence_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 495 more TRAPI results



#### By figure

In [32]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(n0_cooccurrence_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 169 more TRAPI results



...

plus 321 more TRAPI results



### co-occurrence: `n0` and `n1`

In [33]:
n0_n1_cooccurrence_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: (
        r.n0_unified_curie in set(r.overlap_curie_combo)
    ) and (
        r.n1_unified_curie in set(r.overlap_curie_combo)
    ),
    axis=1
)].sort_values(
    "p_value"
)

#### By TRAPI result

In [44]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_n1_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <a target="_blank" href="{paper_url}">
                <img src="{figure_url}" style="height: {row_height}">
              </a>
              <br>
              <figcaption style="font-size: small;">
                  {figure_title_limited}<br>
                  p-value: {p_value:.1e}, 
                  containment: {score:.2}
              </figcaption>
            </figure>
        ''')
        
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(n0_n1_cooccurrence_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 486 more TRAPI results



#### By figure

In [35]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_n1_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(n0_n1_cooccurrence_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 160 more TRAPI results



...

plus 321 more TRAPI results



### co-occurrence: `n0` and `n2`

In [36]:
n0_n2_cooccurrence_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: (
        r.n0_unified_curie in set(r.overlap_curie_combo)
    ) and (
        r.n2_unified_curie in set(r.overlap_curie_combo)
    ),
    axis=1
)].sort_values(
    "p_value"
)

#### By TRAPI result

In [45]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_n2_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <a target="_blank" href="{paper_url}">
                <img src="{figure_url}" style="height: {row_height}">
              </a>
              <br>
              <figcaption style="font-size: small;">
                  {figure_title_limited}<br>
                  p-value: {p_value:.1e}, 
                  containment: {score:.2}
              </figcaption>
            </figure>
        ''')
        
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(n0_n2_cooccurrence_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 4 more TRAPI results



#### By figure

In [38]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_n2_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(n0_n2_cooccurrence_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 4 more TRAPI results

