In [1]:
from copy import copy
import json
import pandas as pd
import textwrap

# BTE Sleeve

Some queries result in an explosion of the number of results. This is bad for performance of the server and also not very useful to the user. This notebook explores using PFOCR as a "compression sleeve" to dampen these types of explosions. I don't have a CC-licensed image of a compression sleeve, so instead here's an image of a heat-shrink tube.

<a title="oomlout, CC BY-SA 2.0 &lt;https://creativecommons.org/licenses/by-sa/2.0&gt;, via Wikimedia Commons" href="https://commons.wikimedia.org/wiki/File:HESH-06-L-STAN-01_(9601242883).jpg"><img width="512" alt="HESH-06-L-STAN-01 (9601242883)" src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/HESH-06-L-STAN-01_%289601242883%29.jpg/512px-HESH-06-L-STAN-01_%289601242883%29.jpg"></a>

Let's try BTE Sleeve on the test query [A.2_RHOBTB2_twohop](https://github.com/NCATSTranslator/minihackathons/blob/main/2021-12_demo/workflowA/A.2_RHOBTB2_twohop.json):

`RHOBTB2 Gene`--entity_regulates_entity|genetically_interacts_with--`Any Gene`--related_to--`Any SmallMolecule`

`n0`--e01--`n1`--e02--`n2`


Note BTE Sleeve currently only works on queries with nodes of categories supported by PFOCR, e.g., `biolink:Gene`, `biolink:SmallMolecule`, `biolink:Disease`. TODO: update the Fisher's Exact test to work with node categories not in PFOCR.

Also, BTE Sleeve currently assumes `n0` has an `ids` parameter. TODO: update to handle `ids` params on any q_node.

In [2]:
import requests
import requests_cache


requests_cache.install_cache("pfocr_cache", allowable_methods=['GET', 'POST'])

## Get BTE TRAPI Results

In [3]:
query = {
    "message": {
        "query_graph": {
            "edges": {
                "e01": {
                    "object": "n0",
                    "subject": "n1",
                    "predicates": [
                        "biolink:entity_regulates_entity",
                        "biolink:genetically_interacts_with"
                    ]
                },
                "e02": {
                    "object": "n1",
                    "subject": "n2",
                    "predicates": [
                        "biolink:related_to"
                    ]
                }
            },
            "nodes": {
                "n0": {
                    "ids": [
                        "NCBIGene:7454"
                    ],
                    "categories": [
                        "biolink:Gene"
                    ]
                },
                "n1": {
                    "categories": [
                        "biolink:Gene"
                    ],
                },
                "n2": {
                    "categories": [
                        "biolink:SmallMolecule"
                    ]
                }
            }
        }
    }
}

trapi_response = requests.post("http://localhost:3000/v1/query", json=query)
#trapi_response = requests.post("https://api.bte.ncats.io/v1/query", json=query)
#trapi_response = requests.post("https://arax.ncats.io/api/rtxkg2/v1.2/query", json=query)
#trapi_response = requests.post("https://arax.ncats.io/api/arax/v1.2/query", json=query)
print(trapi_response.status_code)
if trapi_response.status_code != 200:
    print(trapi_response.text)
if trapi_response.from_cache:
    print(f"got response from cache")

trapi_message = trapi_response.json()["message"]

#print(json.dumps(query))

200
got response from cache


In [4]:
# note we're also including any categories from the results
curie_categories = set()
for v in  query["message"]["query_graph"]["nodes"].values():
    if "categories" in v:
        for category in v["categories"]:
            curie_categories.add(category)

In [5]:
# for genes/gene products, chemicals and diseases
preferred_prefixes = set(["NCBIGene", "MESH"])
unified_names = set()
unified_curies = set()
unification_failed_curies = set()
unified_prefixes = set()
all_prefixes = set()
curie_to_name = dict()
curie_to_unified_curie = dict()
for k, v in trapi_message["knowledge_graph"]["nodes"].items():
    name = v["name"]
    for category in v["categories"]:
        curie_categories.add(category)
        
    for a in v["attributes"]:
        if a["attribute_type_id"] == "biolink:xref":
            curies = a["value"]
            
            # k should always be one of the curies
            if not k in curies:
                raise Exception(f"key {k} not in {curies}")
            
            unified_curie = None
            intersecting_unified_curies = unified_curies.intersection(set(curies))
            if len(intersecting_unified_curies) > 1:
                raise Exception(f"matching multiple: {k} to {list(intersecting_unified_curies)}")
            elif len(intersecting_unified_curies) == 1:
                unified_curie = list(intersecting_unified_curies)[0]
            else:
                # get curie for preferred prefix. usually this is k, but not always.
                for curie in curies:
                    [prefix, identifier] = curie.split(":")
                    if prefix in preferred_prefixes:
                        unified_curie = curie
                        unified_curies.add(unified_curie)
                        break
                        
            if not unified_curie:
                if k in curie_to_unified_curie:
                    unified_curie = curie_to_unified_curie[k]
                else:
                    unification_failed_curies.add(k)
                    break
                    
                #raise Exception(f"failed to find a unified curie for {k} in {curies}")
                
            [unified_prefix, unified_identifier] = unified_curie.split(":")
            unified_prefixes.add(unified_prefix)
            
            for curie in curies:
                [prefix, identifier] = curie.split(":")
                all_prefixes.add(prefix)
                if not curie in curie_to_unified_curie:
                    curie_to_unified_curie[curie] = unified_curie
                if not curie in curie_to_name:
                    curie_to_name[curie] = name
                    unified_names.add(name)
                elif curie_to_name[curie] != name:
                    print(f"curie {curie} has multiple primary names: {curie_to_name[curie]} and {name}")
                    #raise Exception(f"curie{curie} has multiple names: {curie_to_name[curie]} and {name}")


print(f'curie_categories: {curie_categories}')
print("")
print(f"curie_to_name key count: {len(curie_to_name.keys())}")
print(f"curie_to_unified_curie key count: {len(curie_to_unified_curie.keys())}")
print("")
print("all curie prefixes found:")
print(all_prefixes)
print("")
print("unified curie prefixes found:")
print(unified_prefixes)
print("")
print(f"failed to unify {len(unification_failed_curies)} curies")

curie UMLS:C0010592 has multiple primary names: Cyclosporin A and Cyclosporine
curie UMLS:C0086135 has multiple primary names: Desmopressin and Desmopressin Acetate
curie UMLS:C0021641 has multiple primary names: Iletin and Insulin
curie UMLS:C0003779 has multiple primary names: Desmopressin and Argipressin
curie UMLS:C0245109 has multiple primary names: Anakinra and anakinra
curie_categories: {'biolink:Gene', 'biolink:MolecularMixture', 'biolink:Polypeptide', 'biolink:PhenotypicFeature', 'biolink:SmallMolecule', 'biolink:ComplexMolecularMixture', 'biolink:Protein', 'biolink:ChemicalEntity'}

curie_to_name key count: 36205
curie_to_unified_curie key count: 36205

all curie prefixes found:
{'CAS', 'KEGG.COMPOUND', 'DRUGBANK', 'PR', 'GTOPDB', 'NCBIGene', 'ENSEMBL', 'OMIM', 'HMDB', 'CHEBI', 'INCHIKEY', 'UNII', 'HGNC', 'MESH', 'UniProtKB', 'CHEMBL.COMPOUND', 'DrugCentral', 'PUBCHEM.COMPOUND', 'UMLS'}

unified curie prefixes found:
{'NCBIGene', 'MESH'}

failed to unify 5616 curies


In [6]:
columns = []
q_node_id_keys = set(["object", "subject"])
q_node_ids = []
q_edge_ids = []
for q_edge_id,edge_v in query["message"]["query_graph"]["edges"].items():
    q_edge_ids.append(q_edge_id)
    q_node_id_found = False
    for k,v in edge_v.items():
        if (k in q_node_id_keys) and (type(v) is str):
            if v not in columns:
                q_node_ids.append(v)
                columns.append(v)
            if not q_node_id_found:
                q_node_id_found = True
                columns.append(q_edge_id)
                
for q_node_id in q_node_ids:
    columns.append(f"{q_node_id}_curie")
    columns.append(f"{q_node_id}_unified_curie")
    
columns.append("trapi_result_curie_combo")
    
print(columns)
print(q_node_ids)
print(q_edge_ids)

['n0', 'e01', 'n1', 'e02', 'n2', 'n0_curie', 'n0_unified_curie', 'n1_curie', 'n1_unified_curie', 'n2_curie', 'n2_unified_curie', 'trapi_result_curie_combo']
['n0', 'n1', 'n2']
['e01', 'e02']


In [7]:
trapi_result_columns = []
for i in range(len(q_node_ids)):
    trapi_result_columns.append(q_node_ids[i])
    if i < len(q_edge_ids):
        trapi_result_columns.append(q_edge_ids[i])
print(trapi_result_columns)

['n0', 'e01', 'n1', 'e02', 'n2']


In [8]:
from copy import deepcopy


trapi_results = trapi_message["results"]

result_row_data = []
for trapi_result in trapi_results:
    curie_to_qnode_ids = dict()
    for qnode_id, entries in trapi_result["node_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            if curie not in curie_to_qnode_ids:
                curie_to_qnode_ids[curie] = []
            curie_to_qnode_ids[curie].append(qnode_id)
        
    row_data_template = dict()
    q_edge_id_to_predicates = dict()
    trapi_result_curies = set()
    for qedge_id, entries in trapi_result["edge_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            kg_entry = trapi_message["knowledge_graph"]["edges"][curie]
            subject_curie = kg_entry["subject"]
            object_curie = kg_entry["object"]
            predicate_curie = kg_entry["predicate"]
            [predicate_prefix, predicate_identifier] = predicate_curie.split(":")
            
            if qedge_id not in q_edge_id_to_predicates:
                q_edge_id_to_predicates[qedge_id] = set()
            q_edge_id_to_predicates[qedge_id].add(predicate_identifier)

            for curie in [subject_curie, object_curie]:
                for qnode_id in curie_to_qnode_ids[curie]:
                    if curie in curie_to_unified_curie:
                        unified_curie = curie_to_unified_curie[curie]
                    else:
                        break

                    name = curie_to_name[curie]
                    row_data_template[qnode_id] = name

                    trapi_result_curies.add(unified_curie)
                    row_data_template[qnode_id + "_curie"] = curie
                    row_data_template[qnode_id + "_unified_curie"] = unified_curie
                    
    if len(trapi_result_curies) != len(q_node_ids):
        #print(f'skipping {list(curie_to_qnode_ids.keys())}')
        continue
        
    row_data_template["trapi_result_curie_combo"] = tuple(sorted(trapi_result_curies))
    q_edge_ids_processed = set()
    row_datas = [row_data_template]
    for q_edge_id,predicates in q_edge_id_to_predicates.items():
        next_row_datas = []
        for row_data in row_datas:
            for predicate in predicates:
                next_row_data = deepcopy(row_data)
                next_row_data[q_edge_id] = predicate
                next_row_datas.append(
                    next_row_data 
                )
        row_datas = next_row_datas
    result_row_data += row_datas
        
print("warning: predicate direction(s) may be switched")
results_df = pd.DataFrame.from_records(result_row_data, columns=columns).drop_duplicates()
results_df



Unnamed: 0,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,trapi_result_curie_combo
0,WAS,entity_positively_regulates_entity,MANF,increases_expression_of,"(E)-N-[(2R,3R,4R,5R,6R)-2-[(2R,3R,4R,5S,6R)-3-...",NCBIGene:7454,NCBIGene:7454,NCBIGene:7873,NCBIGene:7873,PUBCHEM.COMPOUND:5282055,MESH:D014415,"(MESH:D014415, NCBIGene:7454, NCBIGene:7873)"
1,WAS,entity_positively_regulates_entity,MANF,entity_positively_regulates_entity,"(E)-N-[(2R,3R,4R,5R,6R)-2-[(2R,3R,4R,5S,6R)-3-...",NCBIGene:7454,NCBIGene:7454,NCBIGene:7873,NCBIGene:7873,PUBCHEM.COMPOUND:5282055,MESH:D014415,"(MESH:D014415, NCBIGene:7454, NCBIGene:7873)"
2,WAS,entity_positively_regulates_entity,MANF,resistance_associated_with,Tretinoin,NCBIGene:7454,NCBIGene:7454,NCBIGene:7873,NCBIGene:7873,PUBCHEM.COMPOUND:444795,MESH:D014212,"(MESH:D014212, NCBIGene:7454, NCBIGene:7873)"
3,WAS,entity_positively_regulates_entity,MANF,entity_negatively_regulates_entity,Tretinoin,NCBIGene:7454,NCBIGene:7454,NCBIGene:7873,NCBIGene:7873,PUBCHEM.COMPOUND:444795,MESH:D014212,"(MESH:D014212, NCBIGene:7454, NCBIGene:7873)"
4,WAS,entity_positively_regulates_entity,MANF,entity_negatively_regulates_entity,Diclofenac,NCBIGene:7454,NCBIGene:7454,NCBIGene:7873,NCBIGene:7873,PUBCHEM.COMPOUND:3033,MESH:D004008,"(MESH:D004008, NCBIGene:7454, NCBIGene:7873)"
...,...,...,...,...,...,...,...,...,...,...,...,...
35263,WAS,genetically_interacts_with,ASB15,resistance_associated_with,Pelitinib,NCBIGene:7454,NCBIGene:7454,NCBIGene:142685,NCBIGene:142685,PUBCHEM.COMPOUND:6445562,MESH:C413879,"(MESH:C413879, NCBIGene:142685, NCBIGene:7454)"
35264,WAS,genetically_interacts_with,ASB15,resistance_associated_with,"2-(4-(Benzo[d][1,3]dioxol-5-yl)-2-tert-butyl-1...",NCBIGene:7454,NCBIGene:7454,NCBIGene:142685,NCBIGene:142685,PUBCHEM.COMPOUND:9858940,MESH:C519132,"(MESH:C519132, NCBIGene:142685, NCBIGene:7454)"
35265,WAS,genetically_interacts_with,ASB15,resistance_associated_with,Linifanib,NCBIGene:7454,NCBIGene:7454,NCBIGene:142685,NCBIGene:142685,PUBCHEM.COMPOUND:11485656,MESH:C513486,"(MESH:C513486, NCBIGene:142685, NCBIGene:7454)"
35266,WAS,genetically_interacts_with,ASB15,part_of,"Early Promoters, Genetic",NCBIGene:7454,NCBIGene:7454,NCBIGene:142685,NCBIGene:142685,UMLS:C0013462,MESH:D011401,"(MESH:D011401, NCBIGene:142685, NCBIGene:7454)"


In [9]:
for q_node_id in q_node_ids:
    print(len(set(results_df[q_node_id])))

1
101
3248


In [10]:
len(set(results_df["trapi_result_curie_combo"]))

22366

Total results count (excluding any that couldn't be unified): 3,738. That's too many for a researcher to efficiently go through manually, so let's try filtering to make that more manageable.

## Match up BTE TRAPI Results & PFOCR

We're going to try using PFOCR to filter and prioritize the results.

First we need to get the PFOCR Data. We could get it from the API, but for now, we'll just go ahead and download the entire JSON file we gave to BTE.

In [11]:
pfocr_url = "https://www.dropbox.com/s/1f14t5zaseocyg6/bte_chemicals_diseases_genes.ndjson?dl=1"
pfocr_request = requests.get(pfocr_url)
print(f"status_code: {pfocr_request.status_code}")
if pfocr_request.status_code != 200:
    print(pfocr_request.text)

status_code: 200


In [12]:
curies_to_figure_ids = {}
figure_id_to_curies = {}
figure_id_to_pfocr_result = {}
for line in pfocr_request.text.splitlines():
    pfocr_result = json.loads(line)
    figure_id = pfocr_result["_id"]

    curies = set()
    for identifier in pfocr_result["associatedWith"]["mentions"]["chemicals"]["mesh"]:
        curie = "MESH:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)
    for identifier in pfocr_result["associatedWith"]["mentions"]["diseases"]["mesh"]:
        curie = "MESH:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)
    for identifier in pfocr_result["associatedWith"]["mentions"]["genes"]["ncbigene"]:
        curie = "NCBIGene:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)

    figure_id_to_pfocr_result[figure_id] = pfocr_result
    figure_id_to_curies[figure_id] = curies

    curies_key = tuple(sorted(curies))
    if curies_key not in curies_to_figure_ids:
        curies_to_figure_ids[curies_key] = []
    curies_to_figure_ids[curies_key].append(figure_id)

In [13]:
from SetSimilaritySearch import SearchIndex

# the reference sets are supposed to be from PFOCR only
reference_sets = list()
for curies in set(curies_to_figure_ids.keys()):
    reference_sets.append(set(curies))
# but in order to calculate the scores correctly,
# the SetSimilaritySearch library requires that
# every curie from the bte results need to be
# mentioned at least once in the reference set.
# That's the only reason we add them below, as
# one large set.

unified_curie_columns = [
    q_node_id + "_unified_curie" for q_node_id in q_node_ids
]

trapi_result_curie_combos = set()
for i, df in results_df[unified_curie_columns].drop_duplicates().iterrows():
    trapi_result_curies = []
    for unified_curie_column in unified_curie_columns:
        trapi_result_curies.append(
            df[unified_curie_column]
        )
    trapi_result_curie_combos.add(tuple(sorted(
        trapi_result_curies
    )))
    
reference_set = set()
for trapi_result_curie_combo in trapi_result_curie_combos:
    reference_set |= set(trapi_result_curie_combo)
reference_sets.append(reference_set)

matchable_node_min = 2
matchable_node_count = len(q_node_ids)
index = SearchIndex(reference_sets, similarity_func_name="containment", 
    similarity_threshold=matchable_node_min/matchable_node_count)

trapi_figure_overlap_rows = []
for trapi_result_curie_combo in trapi_result_curie_combos:
    trapi_curies = set(trapi_result_curie_combo)
    results = index.query(trapi_curies)
    for result in results:
        figure_curie_combos = reference_sets[result[0]]
        figure_curie_key = tuple(sorted(figure_curie_combos))
        
        # needed to not match bte results to themselves
        if figure_curie_key in curies_to_figure_ids:
            figure_ids = curies_to_figure_ids[figure_curie_key]
        else:
            continue
            
        score = result[1]
        
        common_curies = trapi_curies.intersection(figure_curie_combos)
        for figure_id in figure_ids:
            pfocr_result = figure_id_to_pfocr_result[figure_id]
            trapi_figure_overlap_rows.append({
                "figure_id": figure_id,
                "figure_url": pfocr_result["associatedWith"]["figureUrl"],
                "figure_title": pfocr_result["associatedWith"]["title"],
                "trapi_result_curie_combo": trapi_result_curie_combo,
                "figure_curie_combo": figure_curie_key,
                "overlap_curie_combo": tuple(sorted(common_curies)),
                "score": score,
            })


trapi_figure_overlap_df = pd.DataFrame.from_records(trapi_figure_overlap_rows)
trapi_figure_overlap_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score
0,PMC4662426__bbi-9-2015-153f5.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Difference in expression of the calcium signal...,"(MESH:D002166, NCBIGene:5335, NCBIGene:7454)","(MESH:C024376, MESH:D000275, MESH:D002118, MES...","(NCBIGene:5335, NCBIGene:7454)",0.666667
1,PMC3328896__CDI2012-692639.002.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Cbl-b functions as central gate keeper of T-ce...,"(MESH:D002166, NCBIGene:5335, NCBIGene:7454)","(MESH:D017260, NCBIGene:10000, NCBIGene:10725,...","(NCBIGene:5335, NCBIGene:7454)",0.666667
2,PMC2944409__nihms233031f6.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,"Calcium Signaling Pathway (KEGG, Kanehisa Labo...","(MESH:D002166, NCBIGene:5335, NCBIGene:7454)","(MESH:C024376, MESH:D000255, MESH:D002118, MES...","(NCBIGene:5335, NCBIGene:7454)",0.666667
3,PMC3988951__gr6.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Insights into TREM2-related signaling using da...,"(MESH:D002166, NCBIGene:5335, NCBIGene:7454)","(NCBIGene:1072, NCBIGene:1073, NCBIGene:10766,...","(NCBIGene:5335, NCBIGene:7454)",0.666667
4,PMC7525051__acmi-2-082-g005.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Chemokine signalling (KEGG) pathway regulation...,"(MESH:D002166, NCBIGene:5335, NCBIGene:7454)","(MESH:D000249, NCBIGene:107, NCBIGene:108, NCB...","(NCBIGene:5335, NCBIGene:7454)",0.666667
...,...,...,...,...,...,...,...
351365,PMC6620384__bsr-39-bsr20190513-g1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Regulatory mechanisms of Robo4 and their effec...,"(MESH:C516969, NCBIGene:28964, NCBIGene:7454)","(MESH:C516025, NCBIGene:146850, NCBIGene:2321,...","(NCBIGene:28964, NCBIGene:7454)",0.666667
351366,PMC4762141__41048_2015_5_Fig5_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Different gene expressions in regulation of ac...,"(MESH:C516969, NCBIGene:28964, NCBIGene:7454)","(MESH:C035086, MESH:C043055, MESH:C113338, MES...","(NCBIGene:28964, NCBIGene:7454)",0.666667
351367,PMC4597143__zjw0101551330004.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Quantification of the full chemotaxis pathway,"(MESH:C516969, NCBIGene:28964, NCBIGene:7454)","(MESH:D014001, NCBIGene:10000, NCBIGene:10152,...","(NCBIGene:28964, NCBIGene:7454)",0.666667
351368,PMC3603002__nihms207342f6.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,T cell signaling pathways and protein phosphor...,"(MESH:C520365, NCBIGene:50618, NCBIGene:7454)","(NCBIGene:10451, NCBIGene:10725, NCBIGene:1432...","(NCBIGene:50618, NCBIGene:7454)",0.666667


In [14]:
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 2]["trapi_result_curie_combo"]
)))
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 3]["trapi_result_curie_combo"]
)))
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 4]["trapi_result_curie_combo"]
)))
print(len(set(trapi_figure_overlap_df["figure_id"])))

16179
139
0
11723


### trapi_curie_combo_count_by_figure_df

In [15]:
trapi_curie_combo_count_by_figure_df = trapi_figure_overlap_df[["figure_id", "trapi_result_curie_combo"]].rename(columns={
    "trapi_result_curie_combo": "trapi_curie_combo_count"
}).groupby("figure_id").count().sort_values(
    by="trapi_curie_combo_count", ascending=False)
trapi_curie_combo_count_by_figure_df

Unnamed: 0_level_0,trapi_curie_combo_count
figure_id,Unnamed: 1_level_1
PMC2642659__zpq9990969120002.jpg,6944
PMC4538774__BMRI2015-949514.004.jpg,6384
PMC7263378__nihms-1585779-f0004.jpg,6182
PMC5501852__41598_2017_5280_Fig6_HTML.jpg,6056
PMC5118094__nihms825299f7.jpg,5163
...,...
PMC4909440__kmco-03-03-1160174-g001.jpg,1
PMC4908701__12014_2016_9114_Fig5_HTML.jpg,1
PMC4907806__nihms770844f3.jpg,1
PMC4905575__nihms791846f5.jpg,1


### figure count/score by trapi curie combo

In [16]:
trapi_figure_overlap_df[["trapi_result_curie_combo", "figure_id"]].rename(columns={
    "figure_id": "figure_count"
}).groupby("trapi_result_curie_combo").count().sort_values(
    by="figure_count", ascending=False).head(10)

Unnamed: 0_level_0,figure_count
trapi_result_curie_combo,Unnamed: 1_level_1
"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)",2774
"(MESH:D002118, NCBIGene:5595, NCBIGene:7454)",1161
"(MESH:D002118, NCBIGene:5335, NCBIGene:7454)",928
"(NCBIGene:1950, NCBIGene:1956, NCBIGene:7454)",777
"(MESH:D002118, NCBIGene:207, NCBIGene:7454)",762
"(MESH:D000255, NCBIGene:207, NCBIGene:7454)",622
"(NCBIGene:7040, NCBIGene:7124, NCBIGene:7454)",620
"(NCBIGene:1956, NCBIGene:2064, NCBIGene:7454)",556
"(MESH:D020123, NCBIGene:207, NCBIGene:7454)",478
"(MESH:D012694, NCBIGene:207, NCBIGene:7454)",365


In [17]:
trapi_result_figure_score_df = trapi_figure_overlap_df[["trapi_result_curie_combo", "score"]].rename(
    columns={"score": "cumulative_trapi_result_curie_combo_figure_score"}
).groupby("trapi_result_curie_combo").sum().sort_values(
    by="cumulative_trapi_result_curie_combo_figure_score", ascending=False)
trapi_result_figure_score_df.head(10)

Unnamed: 0_level_0,cumulative_trapi_result_curie_combo_figure_score
trapi_result_curie_combo,Unnamed: 1_level_1
"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)",1853.333333
"(MESH:D002118, NCBIGene:5595, NCBIGene:7454)",781.333333
"(MESH:D002118, NCBIGene:5335, NCBIGene:7454)",625.333333
"(NCBIGene:1950, NCBIGene:1956, NCBIGene:7454)",518.666667
"(MESH:D002118, NCBIGene:207, NCBIGene:7454)",509.666667
"(MESH:D000255, NCBIGene:207, NCBIGene:7454)",415.333333
"(NCBIGene:7040, NCBIGene:7124, NCBIGene:7454)",414.333333
"(NCBIGene:1956, NCBIGene:2064, NCBIGene:7454)",372.0
"(MESH:D020123, NCBIGene:207, NCBIGene:7454)",318.666667
"(MESH:D012694, NCBIGene:207, NCBIGene:7454)",243.333333


### TODO: why is NCBIGene:5111 returned as a SmallMolecule?

In [18]:
results_df[results_df["trapi_result_curie_combo"] == ("NCBIGene:23221", "NCBIGene:5111", "NCBIGene:595")]

Unnamed: 0,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,trapi_result_curie_combo


In [19]:
results_with_figures_df = trapi_figure_overlap_df.merge(
    results_df,
    on="trapi_result_curie_combo",
    how="left").merge(
    trapi_result_figure_score_df.reset_index(),
    on="trapi_result_curie_combo",
    how="left",
    validate="many_to_one"
).merge(
    trapi_curie_combo_count_by_figure_df.reset_index().rename(columns={
        "trapi_curie_combo_count": "trapi_curie_combo_count_by_figure"
    }),
    on="figure_id",
    how="left",
    validate="many_to_one"
).sort_values(
    by="cumulative_trapi_result_curie_combo_figure_score", ascending=False
)

results_with_figures_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,cumulative_trapi_result_curie_combo_figure_score,trapi_curie_combo_count_by_figure
427559,PMC3696710__cjcr-25-03-346-f2.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Different models of Hh signaling pathway activ...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(MESH:D009369, NCBIGene:1950, NCBIGene:1956, N...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,part_of,TGFB2,NCBIGene:7454,NCBIGene:7454,NCBIGene:7040,NCBIGene:7040,NCBIGene:7042,NCBIGene:7042,1853.333333,3
427497,PMC7709261__40364_2020_252_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Function of lncRNAs in regulating TGFB signali...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:11025, NCBIGene:3880, NCBIGene:4086,...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,part_of,TGFB2,NCBIGene:7454,NCBIGene:7454,NCBIGene:7040,NCBIGene:7040,NCBIGene:7042,NCBIGene:7042,1853.333333,1
427489,PMC5241340__10549_2016_4079_Fig8_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Hypothesis of miR-567 modulation in healthy an...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:4087, NCBIGene:4088, NCBIGene:4089, ...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,part_of,TGFB2,NCBIGene:7454,NCBIGene:7454,NCBIGene:7040,NCBIGene:7040,NCBIGene:7042,NCBIGene:7042,1853.333333,1
427490,PMC2857646__nihms191420f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Schematic of the TGFB signaling pathway,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:4087, NCBIGene:4088, NCBIGene:4089, ...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,part_of,TGFB2,NCBIGene:7454,NCBIGene:7454,NCBIGene:7040,NCBIGene:7040,NCBIGene:7042,NCBIGene:7042,1853.333333,1
427491,PMC4682714__ata0021300280002.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Provides a schematic and abbreviated overview ...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:4087, NCBIGene:4088, NCBIGene:4089, ...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,part_of,TGFB2,NCBIGene:7454,NCBIGene:7454,NCBIGene:7040,NCBIGene:7040,NCBIGene:7042,NCBIGene:7042,1853.333333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293815,PMC4487197__40303_2015_13_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Ephrin receptor signaling pathway (genes hypom...,"(MESH:C000706872, NCBIGene:10006, NCBIGene:7454)","(MESH:C043055, MESH:C105630, MESH:D006153, NCB...","(NCBIGene:10006, NCBIGene:7454)",0.666667,WAS,entity_positively_regulates_entity,ABI1,sensitivity_associated_with,N-[4-[(4-Ethylpiperazin-1-yl)methyl]-3-(triflu...,NCBIGene:7454,NCBIGene:7454,NCBIGene:10006,NCBIGene:10006,PUBCHEM.COMPOUND:53340664,MESH:C000706872,0.666667,4195
293729,PMC5319840__elife-22914-fig3-figsupp2.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Tgfbr2 cKO CD34+ SCC cells upregulate genes im...,"(MESH:C474722, NCBIGene:6503, NCBIGene:7454)","(MESH:C004322, MESH:C045346, MESH:C054171, MES...","(NCBIGene:6503, NCBIGene:7454)",0.666667,WAS,entity_negatively_regulates_entity,SLA,resistance_associated_with,N-[4-[[6-Methoxy-7-[3-(4-morpholinyl)propoxy]-...,NCBIGene:7454,NCBIGene:7454,NCBIGene:6503,NCBIGene:6503,PUBCHEM.COMPOUND:9914412,MESH:C474722,0.666667,302
293728,PMC5319840__elife-22914-fig3-figsupp2.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Tgfbr2 cKO CD34+ SCC cells upregulate genes im...,"(MESH:C474722, NCBIGene:6503, NCBIGene:7454)","(MESH:C004322, MESH:C045346, MESH:C054171, MES...","(NCBIGene:6503, NCBIGene:7454)",0.666667,WAS,entity_negatively_regulates_entity,SLA,sensitivity_associated_with,N-[4-[[6-Methoxy-7-[3-(4-morpholinyl)propoxy]-...,NCBIGene:7454,NCBIGene:7454,NCBIGene:6503,NCBIGene:6503,PUBCHEM.COMPOUND:9914412,MESH:C474722,0.666667,302
630465,PMC5118094__nihms825299f7.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Cocaine regulation of specific Canonical Pathw...,"(MESH:C526575, NCBIGene:10095, NCBIGene:7454)","(MESH:C035086, NCBIGene:10092, NCBIGene:10093,...","(NCBIGene:10095, NCBIGene:7454)",0.666667,WAS,genetically_interacts_with,ARPC1B,resistance_associated_with,Masitinib,NCBIGene:7454,NCBIGene:7454,NCBIGene:10095,NCBIGene:10095,PUBCHEM.COMPOUND:10074640,MESH:C526575,0.666667,5163


## Fisher's Exact Test

### TODO: vectorize
For better performance, try vectorizing the calculation of Fisher's Exact test, e.g.:
https://stackoverflow.com/questions/34947578/how-to-vectorize-fishers-exact-test

In [20]:
all_genes = set()
gene_instance_count = 0
all_chemicals = set()
chemical_instance_count = 0
all_diseases = set()
disease_instance_count = 0
for figure_id, pfocr_result in figure_id_to_pfocr_result.items():
    genes = pfocr_result["associatedWith"]["mentions"]["genes"]["ncbigene"]
    for gene in genes:
        gene_instance_count += 1
        all_genes.add(gene)
        
    chemicals = pfocr_result["associatedWith"]["mentions"]["chemicals"]["mesh"]
    for chemical in chemicals:
        chemical_instance_count += 1
        all_chemicals.add(chemical)
        
    diseases = pfocr_result["associatedWith"]["mentions"]["diseases"]["mesh"]
    for disease in diseases:
        disease_instance_count += 1
        all_diseases.add(disease)
    
print("total instance counts:")
print(gene_instance_count)
print(chemical_instance_count)
print(disease_instance_count)

print("")

print("unique counts:")
print(len(all_genes))
print(len(all_chemicals))
print(len(all_diseases))

print("")

curie_category_to_curies = {
    "biolink:ChemicalEntity": all_chemicals,
    "biolink:SmallMolecule": all_chemicals,
    "biolink:Disease": all_diseases,
    "biolink:Gene": all_genes,
}


all_figure_curies = set()
for curie_category in set(curie_category_to_curies.keys()).intersection(curie_categories):
    all_figure_curies |= curie_category_to_curies[curie_category]
all_figure_curie_count = len(all_figure_curies)

print(all_figure_curie_count)

total instance counts:
1369680
275456
20465

unique counts:
14253
14482
1430

28735


In [21]:
# trapi_curies_in_pwy
results_with_figures_df["yes_pwy_yes_trapi"] = (
    results_with_figures_df["overlap_curie_combo"].map(len)
)

# q_node_id_count - trapi_curies_in_pwy
results_with_figures_df["no_pwy_yes_trapi"] = (
    len(q_node_ids) - results_with_figures_df["yes_pwy_yes_trapi"]
)

# curies_in_pwy - trapi_curies_in_pwy
results_with_figures_df["yes_pwy_no_trapi"] = (
    results_with_figures_df["figure_curie_combo"].map(len) - 
    results_with_figures_df["yes_pwy_yes_trapi"]
)

# 28735 - curies_in_pwy - q_node_id_count - trapi_curies_in_pwy
results_with_figures_df["no_pwy_no_trapi"] = (
    all_figure_curie_count -
    results_with_figures_df["figure_curie_combo"].map(len) -
    len(q_node_ids) -
    results_with_figures_df["yes_pwy_yes_trapi"]
)

In [22]:
import scipy.stats as stats


results_with_figures_df["fishers_exact"] = results_with_figures_df[
    ["yes_pwy_yes_trapi",
     "no_pwy_yes_trapi",
     "yes_pwy_no_trapi",
     "no_pwy_no_trapi"]
].apply(
    lambda r: stats.fisher_exact([[
        r.yes_pwy_yes_trapi,
        r.no_pwy_yes_trapi
    ], [
        r.yes_pwy_no_trapi,
        r.no_pwy_no_trapi,
    ]]),
    axis=1)

results_with_figures_df["p_value"] = results_with_figures_df["fishers_exact"].apply(
    lambda x: x[1]
)

In [23]:
results_with_figures_df["p_value"].sort_values()

243319    2.530667e-13
243314    2.530667e-13
243316    2.530667e-13
243317    2.530667e-13
243318    2.530667e-13
              ...     
28919     4.788013e-04
666040    4.788013e-04
666039    4.788013e-04
531907    4.788013e-04
28920     4.788013e-04
Name: p_value, Length: 673602, dtype: float64

In [24]:
results_with_figures_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score,n0,e01,n1,...,n2_curie,n2_unified_curie,cumulative_trapi_result_curie_combo_figure_score,trapi_curie_combo_count_by_figure,yes_pwy_yes_trapi,no_pwy_yes_trapi,yes_pwy_no_trapi,no_pwy_no_trapi,fishers_exact,p_value
427559,PMC3696710__cjcr-25-03-346-f2.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Different models of Hh signaling pathway activ...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(MESH:D009369, NCBIGene:1950, NCBIGene:1956, N...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,...,NCBIGene:7042,NCBIGene:7042,1853.333333,3,2,1,13,28715,"(4417.692307692308, 7.629975582113902e-07)",7.629976e-07
427497,PMC7709261__40364_2020_252_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Function of lncRNAs in regulating TGFB signali...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:11025, NCBIGene:3880, NCBIGene:4086,...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,...,NCBIGene:7042,NCBIGene:7042,1853.333333,1,2,1,14,28714,"(4102.0, 8.719769682728204e-07)",8.719770e-07
427489,PMC5241340__10549_2016_4079_Fig8_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Hypothesis of miR-567 modulation in healthy an...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:4087, NCBIGene:4088, NCBIGene:4089, ...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,...,NCBIGene:7042,NCBIGene:7042,1853.333333,1,2,1,4,28724,"(14362.0, 1.0902242242363447e-07)",1.090224e-07
427490,PMC2857646__nihms191420f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Schematic of the TGFB signaling pathway,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:4087, NCBIGene:4088, NCBIGene:4089, ...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,...,NCBIGene:7042,NCBIGene:7042,1853.333333,1,2,1,4,28724,"(14362.0, 1.0902242242363447e-07)",1.090224e-07
427491,PMC4682714__ata0021300280002.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Provides a schematic and abbreviated overview ...,"(NCBIGene:7040, NCBIGene:7042, NCBIGene:7454)","(NCBIGene:4087, NCBIGene:4088, NCBIGene:4089, ...","(NCBIGene:7040, NCBIGene:7042)",0.666667,WAS,entity_negatively_regulates_entity,TGFB1,...,NCBIGene:7042,NCBIGene:7042,1853.333333,1,2,1,4,28724,"(14362.0, 1.0902242242363447e-07)",1.090224e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293815,PMC4487197__40303_2015_13_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Ephrin receptor signaling pathway (genes hypom...,"(MESH:C000706872, NCBIGene:10006, NCBIGene:7454)","(MESH:C043055, MESH:C105630, MESH:D006153, NCB...","(NCBIGene:10006, NCBIGene:7454)",0.666667,WAS,entity_positively_regulates_entity,ABI1,...,PUBCHEM.COMPOUND:53340664,MESH:C000706872,0.666667,4195,2,1,65,28663,"(881.9384615384615, 1.6047155573859988e-05)",1.604716e-05
293729,PMC5319840__elife-22914-fig3-figsupp2.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Tgfbr2 cKO CD34+ SCC cells upregulate genes im...,"(MESH:C474722, NCBIGene:6503, NCBIGene:7454)","(MESH:C004322, MESH:C045346, MESH:C054171, MES...","(NCBIGene:6503, NCBIGene:7454)",0.666667,WAS,entity_negatively_regulates_entity,SLA,...,PUBCHEM.COMPOUND:9914412,MESH:C474722,0.666667,302,2,1,65,28663,"(881.9384615384615, 1.6047155573859988e-05)",1.604716e-05
293728,PMC5319840__elife-22914-fig3-figsupp2.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Tgfbr2 cKO CD34+ SCC cells upregulate genes im...,"(MESH:C474722, NCBIGene:6503, NCBIGene:7454)","(MESH:C004322, MESH:C045346, MESH:C054171, MES...","(NCBIGene:6503, NCBIGene:7454)",0.666667,WAS,entity_negatively_regulates_entity,SLA,...,PUBCHEM.COMPOUND:9914412,MESH:C474722,0.666667,302,2,1,65,28663,"(881.9384615384615, 1.6047155573859988e-05)",1.604716e-05
630465,PMC5118094__nihms825299f7.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Cocaine regulation of specific Canonical Pathw...,"(MESH:C526575, NCBIGene:10095, NCBIGene:7454)","(MESH:C035086, NCBIGene:10092, NCBIGene:10093,...","(NCBIGene:10095, NCBIGene:7454)",0.666667,WAS,genetically_interacts_with,ARPC1B,...,PUBCHEM.COMPOUND:10074640,MESH:C526575,0.666667,5163,2,1,129,28599,"(443.3953488372093, 6.17088606859295e-05)",6.170886e-05


## Analysis of results with vs. without BTE Sleeve

With the current query graph, requiring co-occurrence of at least 2 nodes in at least one PFOCR figure cuts the number of TRAPI result curie combos down to 887 from 3,738.

When requiring 3 nodes, it went down all the way to 0, but I've seen other queries where that yield 9 to 30 results.

Total number of curie combos returned:

In [25]:
len(results_df["trapi_result_curie_combo"].drop_duplicates())

22366

TRAPI result curie combos where at least `x` curies are also found in at least one figure:

In [26]:
for i in range(2, len(q_node_ids) + 1):
    curie_combo_count = len(set(
        trapi_figure_overlap_df[
            trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= i
        ]["trapi_result_curie_combo"]
    ))
    print(f'when {i}+ overlapping curies: {curie_combo_count}')

when 2+ overlapping curies: 16179
when 3+ overlapping curies: 139


curie combos where curies for each pair of nodes are found in at least one figure:

In [27]:
from itertools import combinations


overlap_columns = results_with_figures_df["overlap_curie_combo"].apply(pd.Series)

for q_node_id in q_node_ids:
    overlap_df = results_with_figures_df[(
        results_with_figures_df[f"{q_node_id}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{q_node_id}_unified_curie"] == overlap_columns[1]
    )]
    print(f'{q_node_id}')
    print(f'  {len(overlap_df["trapi_result_curie_combo"].drop_duplicates())} TRAPI result(s)')
    print(f'  {len(overlap_df["figure_id"].drop_duplicates())} figure(s)')

for (a, b) in combinations(q_node_ids, 2):
    overlap_df = results_with_figures_df[((
        results_with_figures_df[f"{a}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{a}_unified_curie"] == overlap_columns[1]
    )) & ((
        results_with_figures_df[f"{b}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{b}_unified_curie"] == overlap_columns[1]
    ))]
    print(f'{a} & {b}')
    print(f'  {len(overlap_df["trapi_result_curie_combo"].drop_duplicates())} TRAPI result(s)')
    print(f'  {len(overlap_df["figure_id"].drop_duplicates())} figure(s)')

n0
  16075 TRAPI result(s)
  210 figure(s)
n1
  15842 TRAPI result(s)
  11681 figure(s)
n2
  3438 TRAPI result(s)
  11677 figure(s)
n0 & n1
  15681 TRAPI result(s)
  168 figure(s)
n0 & n2
  1446 TRAPI result(s)
  157 figure(s)
n1 & n2
  2624 TRAPI result(s)
  11606 figure(s)


## View TRAPI results with figures

Note you can click the figures to go to the paper.

Compare these results with the `results_df` table earlier in this notebook and also with the [ARAX UI](https://arax.ncats.io/?r=44922) (click "Load" and then "Results".).

Does requiring co-occurrence for specific nodes like `n0` and `n2` help? Does ranking by p-value help?

### By lowest p-value

In [28]:
lowest_p_value_df = results_with_figures_df.sort_values(
    "p_value"
)

#### By TRAPI result

In [29]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in lowest_p_value_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        # TODO: why does the commented out block below not display correctly
        # on GitHub? The figcaption appears to the right of the image.
        # The div solution below is a kludge.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;>
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                  <img src="{figure_url}" style="height: {row_height}">
              </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e}, 
                containment: {score:.2}
            </div>
        </div>
        ''')
        
#        figures.append(f'''
#            <figure style="margin: 5px !important;">
#                <a target="_blank" href="{paper_url}">
#                  <img src="{figure_url}" style="height: {row_height}">
#                </a>
#                <figcaption style="font-size: small;">
#                      {figure_title_limited}<br>
#                      p-value: {p_value:.1e}, 
#                      containment: {score:.2}
#                </figcaption>
#            </figure>
#        ''')

    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(lowest_p_value_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 101 more figures



...

plus 206 more figures



...

plus 2764 more figures



...

plus 67 more figures



...

plus 5 more figures

...

plus 16174 more TRAPI results



#### By figure

In [30]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in lowest_p_value_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(lowest_p_value_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 392 more TRAPI results



...

plus 1056 more TRAPI results



...

plus 1270 more TRAPI results

...

plus 11720 more figures



### co-occurrence: `n0` and (`n1` or `n2`)

In [31]:
n0_cooccurrence_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: r.n0_unified_curie in set(r.overlap_curie_combo),
    axis=1
)].sort_values(
    "p_value"
)

In [32]:
print(len(set(results_with_figures_df["figure_id"])))
print(len(set(n0_cooccurrence_df["figure_id"])))

11723
210


#### By TRAPI result

In [33]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        # TODO: why does the commented out block below not display correctly
        # on GitHub? The figcaption appears to the right of the image.
        # The div solution below is a kludge.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;>
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                  <img src="{figure_url}" style="height: {row_height}">
              </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e}, 
                containment: {score:.2}
            </div>
        </div>
        ''')
        
#        figures.append(f'''
#            <figure style="margin: 5px !important;">
#                <a target="_blank" href="{paper_url}">
#                  <img src="{figure_url}" style="height: {row_height}">
#                </a>
#                <figcaption style="font-size: small;">
#                      {figure_title_limited}<br>
#                      p-value: {p_value:.1e}, 
#                      containment: {score:.2}
#                </figcaption>
#            </figure>
#        ''')

    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(n0_cooccurrence_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 51 more figures



...

plus 16 more figures



...

plus 5 more figures



...

plus 47 more figures



...

plus 4 more figures

...

plus 16070 more TRAPI results



#### By figure

In [34]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(n0_cooccurrence_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 392 more TRAPI results



...

plus 1056 more TRAPI results



...

plus 1270 more TRAPI results

...

plus 207 more figures



### co-occurrence: `n0` and `n1`

In [35]:
n0_n1_cooccurrence_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: (
        r.n0_unified_curie in set(r.overlap_curie_combo)
    ) and (
        r.n1_unified_curie in set(r.overlap_curie_combo)
    ),
    axis=1
)].sort_values(
    "p_value"
)

#### By TRAPI result

In [36]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_n1_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        # TODO: why does the commented out block below not display correctly
        # on GitHub? The figcaption appears to the right of the image.
        # The div solution below is a kludge.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;>
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                  <img src="{figure_url}" style="height: {row_height}">
              </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e}, 
                containment: {score:.2}
            </div>
        </div>
        ''')
        
#        figures.append(f'''
#            <figure style="margin: 5px !important;">
#                <a target="_blank" href="{paper_url}">
#                  <img src="{figure_url}" style="height: {row_height}">
#                </a>
#                <figcaption style="font-size: small;">
#                      {figure_title_limited}<br>
#                      p-value: {p_value:.1e}, 
#                      containment: {score:.2}
#                </figcaption>
#            </figure>
#        ''')

    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(n0_n1_cooccurrence_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 45 more figures



...

plus 4 more figures



...

plus 4 more figures



...

plus 45 more figures



...

plus 4 more figures

...

plus 15677 more TRAPI results



#### By figure

In [37]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_n1_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(n0_n1_cooccurrence_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 387 more TRAPI results



...

plus 1054 more TRAPI results



...

plus 1270 more TRAPI results

...

plus 165 more figures



### co-occurrence: `n0` and `n2`

In [38]:
n0_n2_cooccurrence_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: (
        r.n0_unified_curie in set(r.overlap_curie_combo)
    ) and (
        r.n2_unified_curie in set(r.overlap_curie_combo)
    ),
    axis=1
)].sort_values(
    "p_value"
)

#### By TRAPI result

In [39]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_n2_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]
    
    ordered_trapi_result_curie_combo = []
    for q_node_id in q_node_ids:
        ordered_trapi_result_curie_combo.append(list(set(
            df1[f"{q_node_id}_unified_curie"]
        ))[0])

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{", ".join(ordered_trapi_result_curie_combo)}</span> 
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_result_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_result_row_count = len(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{"<br>".join(trapi_result_rows)}
</div>
'''))

    figures = []
    row_height='100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        [pmc,filename] = figure_id.split("__")
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]
        figure_title_limited = textwrap.shorten(figure_title, width=45, placeholder="...")

        p_value = list(set(
            df0["p_value"]
        ))[0]
        score = list(set(df0["score"]))[0]

        # TODO: why does the commented out block below not display correctly
        # on GitHub? The figcaption appears to the right of the image.
        # The div solution below is a kludge.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;>
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                    <img src="{figure_url}" style="height: {row_height}">
                </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e}, 
                containment: {score:.2}
            </div>
        </div>
        ''')
        
#        figures.append(f'''
#            <figure style="margin: 5px !important;">
#                <a target="_blank" href="{paper_url}">
#                  <img src="{figure_url}" style="height: {row_height}">
#                </a>
#                <figcaption style="font-size: small;">
#                      {figure_title_limited}<br>
#                      p-value: {p_value:.1e}, 
#                      containment: {score:.2}
#                </figcaption>
#            </figure>
#        ''')

    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")


    count += 1
    if count >= limit:
        print("...")
        print("")
        remaining = len(set(n0_n2_cooccurrence_df["trapi_result_curie_combo"])) - limit
        print(f'plus {remaining} more TRAPI results')
        print("")
        break

...

plus 1 more figures



...

plus 4 more figures



...

plus 3 more figures



...

plus 1472 more TRAPI results



#### By figure

In [40]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_n2_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values(
        "p_value"
    )
    [pmc,filename] = figure_id.split("__")
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
                
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}</span>'
            )
            
        p_value = list(set(df1["p_value"]))[0]
        score = list(set(df1["score"]))[0]
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-weight: bold; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print("")
        print(f'plus {len(set(n0_n2_cooccurrence_df["figure_id"])) - limit} more figures')
        print("")
        break

...

plus 1 more TRAPI results







...

plus 161 more figures

